In [182]:
import pandas as pd
import json
import tensorflow as tf
import numpy as np
import evaluate

from datasets import load_dataset
from scipy.special import softmax
from scipy.special import expit

from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import TrainingArguments, Trainer


In [2]:
# helper functions


def tokenizerFunction(example):
    
    title_mod = [f"{t}<[SEP]>{s}" for t, s in zip(example['title'], example['summary'])]
    
    return tokenizer(title_mod, example['genres'], padding = 'max_length', truncation = True)
    
    
    

In [3]:
# load datasets using hugging face

data_files = {
        "train" : "../datasets/training",
        "val" : "../datasets/validation",
        "test" : "../datasets/test",
        } 

training = load_dataset("json", data_files = data_files, split = "train")
validation = load_dataset("json", data_files = data_files, split = "val")
test = load_dataset("json", data_files = data_files, split = "test")

In [7]:
training

Dataset({
    features: ['title', 'summary', 'genres', 'label'],
    num_rows: 370940
})

In [79]:
training[0]

{'title': 'Beast',
 'summary': 'The movie tells about the tragedy of miserable people who turned to dogs. A group of people who desperately seeks for a job is abducted and forced to labor like slaves.',
 'genres': 'Drama',
 'label': 2}

In [80]:
training['summary'][0]

'The movie tells about the tragedy of miserable people who turned to dogs. A group of people who desperately seeks for a job is abducted and forced to labor like slaves.'

### Train using PyTorch Trainer API

In [10]:
# tokenize datasets using BERT tokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [10]:
# title_mod = [f"{t}<[SEP]>{s}" for t, s in zip(training['title'], training['summary'])]

In [11]:
# tokenizerFunction(training[:5])

In [45]:
tokenized_training = training.map(tokenizerFunction, batched = True)
tokenized_validation = validation.map(tokenizerFunction, batched = True)

Map:   0%|          | 0/61824 [00:00<?, ? examples/s]

In [81]:
tokenized_training.shape

(370940, 7)

In [82]:
tokenized_validation.shape

(61824, 7)

In [54]:
# create a smaller subset of the training dataset to reduce the time it takes to fine-tune
small_training = tokenized_training.select(range(100))
small_evaluation = tokenized_validation.select(range(20))

#### Train

In [None]:
# define a TrainingArguments class that will contain all the hyperparameters the Trainer will use for training and
# evaluation
# the model will be saved in the input directory

training_args = TrainingArguments(output_dir = "test_trainer")

In [84]:
# define the model

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels = 3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [176]:
# EVALUATE

# Trainer does not automatically evaluate model performance during training. 
# the functions computeMetrics defined below will be passed to Trainer to compute and report metrics

# define the metrics 

# metric_1 = evaluate.load("hyperml/balanced_accuracy")
# metric_2 = evaluate.load("precision", average = "macro")
# metric_3 = evaluate.load("recall")
# metric_4 = evaluate.load("f1")

In [191]:
# call compute on metric to calculate the accuracy of the predictions

def computeMetrics(eval_pred):
#     convert the logits to predictions before passing the predictions to compute
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    
#     balanced_accuracy = metric_1.compute(predictions = predictions, references = labels)['balanced_accuracy']
#     precision = metric_2.compute(predictions = predictions, references = labels)['precision']
#     recall = metric_3.compute(predictions = predictions, references = labels)['recall']
#     f1 = metric_4.compute(predictions = predictions, references = labels)['f1']
    
    accuracy = accuracy_score(y_true = labels, y_pred = predictions)
    balanced_accuracy = balanced_accuracy_score(y_true = labels, y_pred = predictions)
    precision = precision_score(y_true = labels, y_pred = predictions, average = 'macro')
    recall = recall_score(y_true = labels, y_pred = predictions, average = 'macro')
    f1 = f1_score(y_true = labels, y_pred = predictions, average = 'macro')
    
    return {'accuracy': accuracy, 
            'balanced_accuracy': balanced_accuracy, 
            'precision': precision, 
            'recall': recall, 
            'f1': f1,}
    

In [178]:
#   def compute_metrics(eval_pred):

#       logits, labels = eval_pred
#       probabilities = expit(logits)
#       predictions = (probabilities > 0.5).astype(int)

#       res = classification_report(labels, predictions, output_dict = True, zero_division = 0, target_names = genre_list)

#       confusion = multilabel_confusion_matrix(labels, predictions)

#       macro_mcc = 0
#       micro_mcc = compute_mcc(np.sum(confusion, axis = 0))

#       for class_name, conf_matrix in zip(genre_list, confusion):

#          class_mcc = compute_mcc(conf_matrix)

#          res[class_name]["mcc"] = class_mcc
#          macro_mcc += class_mcc

#       macro_mcc = macro_mcc / len(genre_list)

#       res["macro avg"]["mcc"] = macro_mcc
#       res["micro avg"]["mcc"] = micro_mcc

#       logging.info(res)

#       return res

In [192]:
training_args = TrainingArguments(output_dir = "test_trainer", evaluation_strategy = "epoch")



In [193]:
# TRAINER
# define the trainer object
# use a small slice of the training dataset and validation dataset

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = small_training,
    eval_dataset = small_evaluation,
    compute_metrics = computeMetrics,
)

In [None]:
# fine-tune the model

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Balanced Accuracy,Precision,Recall,F1
1,No log,2.002673,0.65,0.270833,0.254902,0.270833,0.262626
2,No log,2.12706,0.7,0.291667,0.259259,0.291667,0.27451


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [169]:
trainer.evaluate(tokenized_training.select(range(2)))

{'eval_loss': 1.7979985475540161,
 'eval_accuracy': 0.5,
 'eval_runtime': 1.2484,
 'eval_samples_per_second': 1.602,
 'eval_steps_per_second': 0.801,
 'epoch': 3.0}

In [117]:
trainer.predict(tokenized_training.select(range(2)))

PredictionOutput(predictions=array([[-1.2641485,  2.467274 , -1.0243659],
       [-1.2568437,  2.4920769, -1.0423361]], dtype=float32), label_ids=array([2, 1]), metrics={'test_loss': 1.7979985475540161, 'test_accuracy': 0.5, 'test_runtime': 0.8578, 'test_samples_per_second': 2.332, 'test_steps_per_second': 1.166})

In [121]:
trainer.predict(tokenized_training.select(range(2))).label_ids

array([2, 1])

In [146]:
predictions = trainer.predict(tokenized_training.select(range(2))).predictions
labels = trainer.predict(tokenized_training.select(range(2))).label_ids

In [153]:
labels

array([2, 1])

In [145]:
predictions

array([[-1.2641485,  2.467274 , -1.0243659],
       [-1.2568437,  2.4920769, -1.0423361]], dtype=float32)

In [158]:
expit(predictions)

array([[0.22026058, 0.9218155 , 0.26417786],
       [0.22151771, 0.9235845 , 0.2606995 ]], dtype=float32)

In [160]:
(expit(predictions) > 0.5).astype(int)

array([[0, 1, 0],
       [0, 1, 0]])

In [163]:
np.argmax(predictions, axis = -1)

array([1, 1])

In [167]:
classification_report(labels, np.argmax(predictions, axis = -1), output_dict = True, labels = [0, 1, 2], 
                      target_names = ['bad', 'average', 'good'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'bad': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0.0},
 'average': {'precision': 0.5,
  'recall': 1.0,
  'f1-score': 0.6666666666666666,
  'support': 1.0},
 'good': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0},
 'micro avg': {'precision': 0.5,
  'recall': 0.5,
  'f1-score': 0.5,
  'support': 2.0},
 'macro avg': {'precision': 0.16666666666666666,
  'recall': 0.3333333333333333,
  'f1-score': 0.2222222222222222,
  'support': 2.0},
 'weighted avg': {'precision': 0.25,
  'recall': 0.5,
  'f1-score': 0.3333333333333333,
  'support': 2.0}}

In [116]:
np.argmax(predictions, axis = -1)

array([1, 1])

In [168]:
multilabel_confusion_matrix(labels, np.argmax(predictions, axis = -1))

array([[[0, 1],
        [0, 1]],

       [[1, 0],
        [1, 0]]])