## Training

based on https://github.com/ShinyQ/Thesis_University-Feedback-Sentiment-Model_IndoBERT/blob/main/IndoBERT%20Classification%20Training.ipynb

In [1]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
#from datasets import load_metric

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(r"dataset/processed/processed_data_sentiment.csv")
df = df[['text_cleaned_stemmed', 'Type']]
df.rename(columns={"text_cleaned_stemmed": "text", "Type": "label"}, inplace=True)

In [3]:
df = df[['text', 'label']]
df.sample(5)

Unnamed: 0,text,label
1359,pangan lokal makin mati berkat ada makanbergiz...,Positif
4145,ka mau tanya kronologi kena uu ite nya gimana ...,Netral
3505,proyek makanbergizigratis bawa angin segar unt...,Positif
3999,program makanbergizigratis mbg telah mulai ind...,Positif
3167,lebih dari sekadar lezat mbg dorong inovasi pa...,Positif


In [4]:
df["label"].unique()

array(['Negatif', 'Positif', 'Netral'], dtype=object)

In [5]:
"""
plt.figure(figsize=(5, 5))
sns.countplot(x=df['text'])
plt.show()
"""

"\nplt.figure(figsize=(5, 5))\nsns.countplot(x=df['text'])\nplt.show()\n"

In [6]:
df["label"] = df["label"].map({"Negatif": 0, "Netral": 1, "Positif": 2})

In [7]:
RANDOM_SEED = 241

df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

df_val, df_test = train_test_split(
    df_test,
    test_size=0.1,
    random_state=RANDOM_SEED,
)

In [10]:
df_val.groupby("label").count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,257
1,103
2,455


In [None]:
plt.figure(figsize=(5, 5))
sns.countplot(x=df_train['label'])
plt.show()

In [None]:
plt.figure(figsize=(5, 5))
sns.countplot(x=df_val['label'])
plt.show()

In [None]:
plt.figure(figsize=(5, 5))
sns.countplot(x=df_test['label'])
plt.show()

In [None]:
print(f'Total Train: {len(df_train)}')
print(f'Total Val  : {len(df_val)}')
print(f'Total Test : {len(df_test)}')

In [None]:
actual_label = df_test['label']

In [None]:
df_train.to_csv(r"dataset/train/train.csv", index=False)
df_val.to_csv(r"dataset/train/eval.csv", index=False)
df_test.to_csv(r"dataset/train/test.csv", index=False)

In [None]:
from datasets import load_dataset

files = {
    "train": r"dataset/train/train.csv", 
    "eval": r"dataset/train/eval.csv", 
    "test": r"dataset/train/test.csv",
}

dataset = load_dataset('csv', data_files=files)

## Tokenize Model

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")

In [None]:
df.head()

In [None]:
def tokenize_function(text):
    return tokenizer(text["text"], padding='max_length', max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["eval"]

## Train Model

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p2", num_labels=3)

In [None]:
from transformers import TrainingArguments

In [None]:
training_args = TrainingArguments(
    "test_trainer", 
    per_device_train_batch_size=4,
    num_train_epochs=1
)

In [None]:
import evaluate

In [None]:
accuracy = evaluate.load("accuracy")

In [None]:
#metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return accuracy.compute(predictions=predictions, references=labels)


In [None]:
f1_metric = evaluate.load("f1")

def compute_f1(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return f1_metric.compute(predictions=predictions, references=labels)


In [None]:
eval_dataset

In [None]:
torch.cuda.empty_cache()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

training_history = trainer.train()

In [None]:
trainer.state.log_history

In [None]:
steps = []
losses = []

for log in trainer.state.log_history:
    if "loss" in log: 
        steps.append(log["step"])
        losses.append(log["loss"])

plt.figure(figsize=(10, 5))
plt.plot(steps, losses, label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training Loss over Steps")
plt.legend()
plt.grid()
plt.show()

In [None]:
evaluation_history = trainer.evaluate()
evaluation_history

In [None]:
test_dataset = tokenized_datasets["test"]

prediction = trainer.predict(test_dataset)
prediction = prediction.predictions.argmax(1)

In [None]:
print(classification_report(prediction, actual_label, target_names=["Negatif", "Netral", "Positif"]))

In [None]:
def show_confusion_matrix(confusion_matrix):
        hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
        hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
        hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')

        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')

cm = confusion_matrix(prediction, actual_label)
show_confusion_matrix(cm)

In [None]:
model.save_pretrained("model")

In [None]:
#one of [None, 'micro', 'macro', 'weighted']
f1_metric.compute(predictions=prediction, references=actual_label, average="weighted")