In [205]:
import pandas as pd
import json
import tensorflow as tf
import numpy as np
import evaluate

from datasets import load_dataset
from scipy.special import softmax
from scipy.special import expit

from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import TrainingArguments, Trainer


In [2]:
# helper functions

def tokenizerFunction(example):
    
    title_mod = [f"{t}<[SEP]>{s}" for t, s in zip(example['title'], example['summary'])]
    
    return tokenizer(title_mod, example['genres'], padding = 'max_length', truncation = True)
        

In [3]:
# load datasets using hugging face

data_files = {
        "train" : "../datasets/training",
        "val" : "../datasets/validation",
        "test" : "../datasets/test",
        } 

training = load_dataset("json", data_files = data_files, split = "train")
validation = load_dataset("json", data_files = data_files, split = "val")
test = load_dataset("json", data_files = data_files, split = "test")

In [7]:
training

Dataset({
    features: ['title', 'summary', 'genres', 'label'],
    num_rows: 370940
})

In [79]:
training[0]

{'title': 'Beast',
 'summary': 'The movie tells about the tragedy of miserable people who turned to dogs. A group of people who desperately seeks for a job is abducted and forced to labor like slaves.',
 'genres': 'Drama',
 'label': 2}

In [80]:
training['summary'][0]

'The movie tells about the tragedy of miserable people who turned to dogs. A group of people who desperately seeks for a job is abducted and forced to labor like slaves.'

### Train using PyTorch Trainer API

In [10]:
# tokenize datasets using BERT tokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [10]:
# title_mod = [f"{t}<[SEP]>{s}" for t, s in zip(training['title'], training['summary'])]

In [11]:
# tokenizerFunction(training[:5])

In [45]:
tokenized_training = training.map(tokenizerFunction, batched = True)
tokenized_validation = validation.map(tokenizerFunction, batched = True)

Map:   0%|          | 0/61824 [00:00<?, ? examples/s]

In [81]:
tokenized_training.shape

(370940, 7)

In [82]:
tokenized_validation.shape

(61824, 7)

In [54]:
# create a smaller subset of the training dataset to reduce the time it takes to fine-tune
small_training = tokenized_training.select(range(100))
small_evaluation = tokenized_validation.select(range(20))

#### Train

In [None]:
# define a TrainingArguments class that will contain all the hyperparameters the Trainer will use for training and
# evaluation
# the model will be saved in the input directory

training_args = TrainingArguments(output_dir = "test_trainer")

In [84]:
# define the model

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels = 3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [191]:
# compute_metrics() function to calculate a metric when evaluating the model during training 
# (otherwise the evaluation would just print the loss, which is not a very intuitive number).


def computeMetrics(eval_pred):
#     convert the logits to predictions before passing the predictions to compute
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
        
    accuracy = accuracy_score(y_true = labels, y_pred = predictions)
    balanced_accuracy = balanced_accuracy_score(y_true = labels, y_pred = predictions)
    precision = precision_score(y_true = labels, y_pred = predictions, average = 'macro')
    recall = recall_score(y_true = labels, y_pred = predictions, average = 'macro')
    f1 = f1_score(y_true = labels, y_pred = predictions, average = 'macro')
    
    return {'accuracy': accuracy, 
            'balanced_accuracy': balanced_accuracy, 
            'precision': precision, 
            'recall': recall, 
            'f1': f1,}
    

In [192]:
training_args = TrainingArguments(output_dir = "test_trainer", evaluation_strategy = "epoch")



In [193]:
# TRAINER
# define the trainer object
# use a small slice of the training dataset and validation dataset

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = small_training,
    eval_dataset = small_evaluation,
    compute_metrics = computeMetrics,
)

In [194]:
# fine-tune the model

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Balanced Accuracy,Precision,Recall,F1
1,No log,2.002673,0.65,0.270833,0.254902,0.270833,0.262626
2,No log,2.12706,0.7,0.291667,0.259259,0.291667,0.27451
3,No log,1.824384,0.75,0.3125,0.263158,0.3125,0.285714


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=39, training_loss=0.2194660137861203, metrics={'train_runtime': 844.7606, 'train_samples_per_second': 0.355, 'train_steps_per_second': 0.046, 'total_flos': 78934025318400.0, 'train_loss': 0.2194660137861203, 'epoch': 3.0})

In [209]:
trainer.evaluate(tokenized_training.select(range(2)))

{'eval_loss': 0.008453688584268093,
 'eval_accuracy': 1.0,
 'eval_balanced_accuracy': 1.0,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 0.8725,
 'eval_samples_per_second': 2.292,
 'eval_steps_per_second': 1.146,
 'epoch': 3.0}

In [210]:
trainer.predict(tokenized_training.select(range(2)))

PredictionOutput(predictions=array([[-2.9955692, -0.2523113,  3.9158812],
       [-3.1896608,  5.384198 , -2.500912 ]], dtype=float32), label_ids=array([2, 1]), metrics={'test_loss': 0.008453688584268093, 'test_accuracy': 1.0, 'test_balanced_accuracy': 1.0, 'test_precision': 1.0, 'test_recall': 1.0, 'test_f1': 1.0, 'test_runtime': 0.8858, 'test_samples_per_second': 2.258, 'test_steps_per_second': 1.129})

In [197]:
trainer.predict(tokenized_training.select(range(2))).label_ids

array([2, 1])

In [198]:
predictions = trainer.predict(tokenized_training.select(range(2))).predictions
labels = trainer.predict(tokenized_training.select(range(2))).label_ids

In [199]:
labels

array([2, 1])

In [200]:
predictions

array([[-2.9955692, -0.2523113,  3.9158812],
       [-3.1896608,  5.384198 , -2.500912 ]], dtype=float32)

In [201]:
np.argmax(predictions, axis = -1)

array([2, 1])

In [206]:
classification_report(labels, np.argmax(predictions, axis = -1), output_dict = True, labels = [0, 1, 2], 
                      target_names = ['bad_rating', 'average_rating', 'good_rating'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'bad_rating': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 0.0},
 'average_rating': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 1.0},
 'good_rating': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 1.0},
 'micro avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 2.0},
 'macro avg': {'precision': 0.6666666666666666,
  'recall': 0.6666666666666666,
  'f1-score': 0.6666666666666666,
  'support': 2.0},
 'weighted avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 2.0}}

In [204]:
multilabel_confusion_matrix(labels, np.argmax(predictions, axis = -1))

array([[[1, 0],
        [0, 1]],

       [[1, 0],
        [0, 1]]])

In [213]:
trainer.save_model(output_dir = 'model/fine_tuned_BERT')
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')