In [None]:
import json
import numpy as np
import evaluate

from datasets import load_dataset
from scipy.special import softmax
from scipy.special import expit

from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import TrainingArguments, Trainer

In [None]:
# helper functions

def tokenizerFunction(example):
    """Tokenize a batch of examples as (title + summary, genres) text pairs.

    Parameters
    ----------
    example : mapping
        A batch that is indexed with the keys 'title', 'summary' and 'genres'.
        The 'title' and 'summary' entries are iterated in parallel.

    Returns
    -------
    The value returned by the module-level `tokenizer` for the batch.
    """
    # first segment: every title joined to its summary with a literal "[SEP]" marker
    first_segments = []
    for title, summary in zip(example['title'], example['summary']):
        first_segments.append(f"{title}[SEP]{summary}")

    # second segment: the genres entry, passed through unchanged
    # NOTE(review): padding = 'max_length' and truncation = True are used without an explicit
    # max_length, so the effective length depends on the tokenizer's configuration -- confirm
    # the loaded checkpoint defines one.
    return tokenizer(
        first_segments,
        example['genres'],
        padding = 'max_length',
        truncation = True,
    )

In [None]:
# load the train / validation / test splits from JSON files with Hugging Face datasets

data_files = {
    'train' : '../datasets/training',
    'val' : '../datasets/validation',
    'test' : '../datasets/test',
}

# one load_dataset call per split, evaluated in the same order as the names on the left
training, validation, test = (
    load_dataset('json', data_files = data_files, split = split_name)
    for split_name in ('train', 'val', 'test')
)

In [None]:
# train using PyTorch Trainer API

# DeBERTaV3 tokenizer; tokenizerFunction reads this module-level name when it is called
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

# tokenize the training and validation splits in batches (training first, then validation)
tokenized_training, tokenized_validation = (
    split.map(tokenizerFunction, batched = True)
    for split in (training, validation)
)

In [None]:
# define the model: DeBERTaV3 base checkpoint with a 3-label sequence classification head

model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-base',
    num_labels = 3,
)

In [None]:
# TrainingArguments holds the hyperparameters the Trainer uses for training and evaluation
# output_dir is the directory passed to the Trainer for its outputs
# evaluation_strategy = "epoch" requests an evaluation pass per epoch

training_args = TrainingArguments(
    output_dir = "test_trainer",
    evaluation_strategy = "epoch",
)

In [None]:
# compute_metrics() function to calculate a metric when evaluating the model during training 
# (otherwise the evaluation would just print the loss, which is not a very intuitive number).


def computeMetrics(eval_pred):
    """Compute classification metrics from the Trainer's evaluation output.

    Parameters
    ----------
    eval_pred : pair
        Unpacked as (logits, labels). The predicted class of each sample is the
        arg-max of its logits along the last axis.

    Returns
    -------
    dict
        'accuracy', 'balanced_accuracy', macro-averaged 'precision', 'recall' and
        'f1', plus 'classification_report' (the per-class report as a nested dict
        for the labels 0, 1, 2).
    """
    logits, labels = eval_pred
    # when the model returns several outputs they arrive as a tuple; logits come first
    if isinstance(logits, tuple):
        logits = logits[0]

    # convert the logits to predicted class ids before computing any metric
    predictions = np.argmax(logits, axis = -1)

    accuracy = accuracy_score(y_true = labels, y_pred = predictions)
    balanced_accuracy = balanced_accuracy_score(y_true = labels, y_pred = predictions)
    precision = precision_score(y_true = labels, y_pred = predictions, average = 'macro')
    recall = recall_score(y_true = labels, y_pred = predictions, average = 'macro')
    f1 = f1_score(y_true = labels, y_pred = predictions, average = 'macro')

    # The result is bound to `report`, not `classification_report`: assigning to the
    # imported function's name would make it a local and raise UnboundLocalError here.
    # `predictions` is already arg-maxed, so it is passed as-is (no second argmax).
    report = classification_report(labels, predictions,
                                   output_dict = True, labels = [0, 1, 2],
                                   target_names = ['bad_rating', 'average_rating', 'good_rating'])

    return {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'classification_report': report,
    }

In [None]:
# TRAINER
# build the Trainer from the model, its training arguments, the tokenized
# training / validation datasets and the metric function used during evaluation

trainer = Trainer(
    args = training_args,
    model = model,
    compute_metrics = computeMetrics,
    train_dataset = tokenized_training,
    eval_dataset = tokenized_validation,
)

In [None]:
# fine-tune the model
# the call is the last expression of the cell, so its return value is shown as the cell output

trainer.train()

In [None]:
# persist the fine-tuned model, then the tokenizer, each in its own directory
trainer.save_model('model/fine_tuned_DeBERTaV3')
tokenizer.save_pretrained(save_directory = 'model/tokenizer')