In [1]:
import json
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainerCallback
from transformers import TrainingArguments

import torch
# Ask PyTorch to release cached GPU memory, e.g. left over from an earlier
# run in the same kernel.
torch.cuda.empty_cache()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the first CUDA GPU when one is visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [3]:
# One CSV file per split; the keys are the split names used to index the
# loaded dataset further down ("train", "eval", "test").
files = dict(
    train=r"dataset/train/train.csv",
    eval=r"dataset/train/eval.csv",
    test=r"dataset/train/test.csv",
)

# Load all three splits with the generic CSV builder.
dataset = load_dataset("csv", data_files=files)

## Load Tokenizer and Model

In [4]:
# Checkpoints tried or considered so far:
#   indolem/indobert-base-uncased  ==> can't predict class 1
#   ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa
#   cahya/bert-base-indonesian-522M
#   google-bert/bert-base-uncased

# Short alias -> Hugging Face Hub checkpoint id.
# The alias (key) is the value to assign to `model_choice`.
model_name = {
    "indobert_base": "indobenchmark/indobert-base-p2",
    "ayamerushia": "ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa",
    "cahya": "cahya/bert-base-indonesian-522M",
    "google-bert": "google-bert/bert-base-uncased",
}

In [5]:
# Run configuration.
NUMEPOCHTRAIN = 10        # number of training epochs
model_choice = "cahya"    # key into `model_name`

# Run identifier "<model_choice>_<yymmdd-HHMMSS>"; it is reused in the names
# of every artefact this notebook writes.
run_timestamp = datetime.now().strftime("%y%m%d-%H%M%S")
model_serial = f"{model_choice}_{run_timestamp}"

print(model_serial)

cahya_250209-183047


In [6]:
# Load the tokenizer that belongs to the checkpoint selected by `model_choice`.
tokenizer = AutoTokenizer.from_pretrained(model_name[model_choice])

In [7]:
def tokenize_function(text):
    """Tokenize the ``"text"`` column of a batch to a fixed length of 256.

    Each sequence is padded up to 256 tokens and, if longer, truncated to
    256 tokens. Without truncation, longer texts would produce rows of
    different lengths, which cannot be stacked into a batch.

    Args:
        text: A batch of examples as passed by ``dataset.map(..., batched=True)``;
            only its ``"text"`` field is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        text["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )


# Tokenize every split; the tokenizer output is added to the dataset columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [8]:
# Tokenized splits used for training and for the per-epoch evaluation.
train_dataset, eval_dataset = tokenized_datasets["train"], tokenized_datasets["eval"]

In [9]:
# Sequence-classification model built from the selected checkpoint:
# three output classes, dropout of 0.3 on hidden states and attention weights.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name[model_choice],
    attention_probs_dropout_prob=0.3,
    hidden_dropout_prob=0.3,
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cahya/bert-base-indonesian-522M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments(
    f"test_trainer_{model_serial}",  # output directory, unique per run
    # --- schedule and batch sizes ---
    num_train_epochs=NUMEPOCHTRAIN,
    per_device_train_batch_size=4,  # author's note: no effect on training time
    per_device_eval_batch_size=4,   # author's note: no effect on training time
    # --- log, evaluate and checkpoint once per epoch ---
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # --- best-checkpoint bookkeeping: the lowest eval_loss is "best" ---
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=False,
    # Disabled options kept for reference:
    #   label_names=["label"]
    #   save_strategy="no"
    #   report_to="tensorboard"
)



In [11]:
# Accuracy metric object; `compute_metrics` calls its `compute` method.
accuracy = evaluate.load("accuracy")

In [12]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        of the logits against the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [13]:
f1_metric = evaluate.load("f1")


def compute_f1(eval_pred, average="weighted"):
    """Compute the F1 score for one evaluation pass.

    The classifier has three classes, so an explicit averaging mode is
    required: the metric's default binary averaging is rejected for
    multiclass targets. ``"weighted"`` matches the averaging used for the
    test-set F1 later in this notebook.

    Args:
        eval_pred: A ``(logits, labels)`` pair.
        average: Averaging mode forwarded to the F1 metric
            (for example ``"weighted"``, ``"macro"`` or ``"micro"``).

    Returns:
        The result of ``f1_metric.compute`` for the arg-max class of each row
        of the logits against the labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return f1_metric.compute(predictions=predictions, references=labels, average=average)

In [14]:
class CustomCallback(TrainerCallback):
    """Evaluate on the training split whenever an epoch-end evaluation is due.

    The extra evaluation is logged under the ``"train"`` metric prefix.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        # Trainer this callback is attached to; supplies evaluate() and train_dataset.
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Run the training-split evaluation and return the saved control flags.

        Returns ``None`` when no evaluation is scheduled for this epoch.
        """
        if not control.should_evaluate:
            return None

        # Snapshot the control object before the extra evaluate() call and
        # return the snapshot (presumably because evaluate() may alter the
        # flags; TODO confirm).
        control_snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return control_snapshot

## Start Training

In [15]:
# Start TensorBoard before training to monitor it in progress
#%load_ext tensorboard
#%tensorboard --logdir '{model_output_dir}'/runs

In [None]:
# Clear PyTorch's CUDA cache before the training run starts.
torch.cuda.empty_cache()

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # accuracy, reported at every evaluation
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Optional: also report metrics on the training split after each epoch.
#trainer.add_callback(CustomCallback(trainer))

training_history = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8534,0.991715,0.740872
2,0.7567,1.080311,0.706714
3,0.6622,1.092546,0.760895
4,0.5767,1.109026,0.760895
5,0.4868,1.200353,0.773852
6,0.409,1.303684,0.763251
7,0.3184,1.399158,0.758539
8,0.2773,1.471204,0.772674


In [None]:
# After training, access the path of the best checkpoint like this
# After training, the Trainer state holds the path of the best checkpoint.
# NOTE(review): the training arguments set load_best_model_at_end=False, so the
# in-memory `model` is presumably the last-epoch weights, not this checkpoint.
# Confirm before relying on it.
best_ckpt_path = trainer.state.best_model_checkpoint

In [None]:
# Display the best checkpoint path recorded by the Trainer.
best_ckpt_path

In [None]:
#trainer.evaluate()

In [None]:
# Display the raw Trainer log; the next cell extracts the loss curves from it.
trainer.state.log_history

In [None]:
# Extract the loss curves from the Trainer log.
log_records = trainer.state.log_history

# Entries carrying "eval_loss" provide the epoch axis and the validation loss.
eval_records = [record for record in log_records if "eval_loss" in record]
epochs = [record["epoch"] for record in eval_records]
val_losses = [record["eval_loss"] for record in eval_records]

# Entries carrying "loss" provide the training loss.
losses = [record["loss"] for record in log_records if "loss" in record]

# Never filled; kept so the name stays defined.
steps = []

In [None]:
# Plot training and validation loss against the epoch.
# The two series come from different log entries, so their lengths can differ
# (for example when a run stops between the training-loss log and the
# evaluation log). Trim to the shared length; plotting x and y of different
# sizes would raise.
n_points = min(len(epochs), len(losses), len(val_losses))

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epochs[:n_points], losses[:n_points], label="training loss")
ax.plot(epochs[:n_points], val_losses[:n_points], label="validation loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
# The figure shows both curves, so the title names both.
ax.set_title("Training and Validation Loss over Epochs")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Write the in-memory model to a folder named after this run.
model.save_pretrained(f"model_{model_serial}")

## Evaluation

In [None]:
test_dataset = tokenized_datasets["test"]

# Run the model on the test split and keep the highest-scoring class per row.
prediction = trainer.predict(test_dataset).predictions.argmax(1)

In [None]:
# Save the predicted class ids for this run. The output folder is created
# first because DataFrame.to_csv does not create missing directories.
prediction_path = Path("result_prediction") / f"prediction_{model_serial}.csv"
prediction_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(prediction).to_csv(prediction_path, index=False)

In [None]:
# Ground-truth labels, read from the same CSV file that backs the "test" split.
df_test = pd.read_csv(r"dataset/train/test.csv")
actual_label = df_test["label"]

In [None]:
# classification_report takes (y_true, y_pred). Passing them the other way
# round swaps precision with recall and counts support from the predictions.
# Keyword arguments make the roles explicit.
# NOTE(review): target_names assumes label ids 0/1/2 mean Negatif/Netral/Positif;
# confirm against the dataset.
print(
    classification_report(
        y_true=actual_label,
        y_pred=prediction,
        target_names=["Negatif", "Netral", "Positif"],
    )
)

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw a confusion matrix as an annotated heat map.

    Cells are annotated with integer counts on a blue colour map. The y axis
    is captioned "True Label" and the x axis "Predicted Label".

    Args:
        confusion_matrix: The matrix of counts to plot.
    """
    heatmap_axes = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

    # Row tick labels stay horizontal; column tick labels are tilted by 30 degrees.
    for axis, angle in ((heatmap_axes.yaxis, 0), (heatmap_axes.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')

    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

# confusion_matrix takes (y_true, y_pred): rows are the actual labels and
# columns the predictions, which is what the "True Label" / "Predicted Label"
# axis captions of the plot claim.
cm = confusion_matrix(y_true=actual_label, y_pred=prediction)
show_confusion_matrix(cm)

In [None]:
# Weighted-average F1 of the test predictions against the actual labels.
f1_metric.compute(
    references=actual_label,
    predictions=prediction,
    average="weighted",
)

## Save Training Result

In [None]:
# Everything needed to describe this run: the training log, the training
# arguments and the model configuration, each as a plain dictionary or list.
save_dict = {
    "trainer_history": trainer.state.log_history,
    "trainer_args": trainer.args.to_dict(),
    "trainer_model_config": trainer.model.config.to_dict(),
}

In [None]:
# Write the run summary as JSON. The output folder is created first so a
# missing directory does not make the write fail after training has finished.
result_path = Path("result_model") / f"result_{model_serial}.json"
result_path.parent.mkdir(parents=True, exist_ok=True)
with result_path.open("w", encoding="utf-8") as f:
    json.dump(save_dict, f, ensure_ascii=False, indent=4)

In [None]:
# Display the run identifier used in the names of the saved artefacts.
model_serial