In [18]:
import pandas as pd
import json
import tensorflow as tf
import numpy as np
import evaluate

from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import TrainingArguments, Trainer


In [2]:
# helper functions


def tokenizerFunction(example):
    
    title_mod = [f"{t}<[SEP]>{s}" for t, s in zip(example['title'], example['summary'])]
    
    return tokenizer(title_mod, example['genres'], padding = 'max_length', truncation = True)
    
    
    

In [3]:
# load datasets using hugging face

data_files = {
        "train" : "../datasets/training",
        "val" : "../datasets/validation",
        "test" : "../datasets/test",
        } 

training = load_dataset("json", data_files = data_files, split = "train")
validation = load_dataset("json", data_files = data_files, split = "val")
test = load_dataset("json", data_files = data_files, split = "test")

In [7]:
training

Dataset({
    features: ['title', 'summary', 'genres', 'label'],
    num_rows: 370940
})

In [8]:
training[0]

{'title': 'Beast',
 'summary': 'The movie tells about the tragedy of miserable people who turned to dogs. A group of people who desperately seeks for a job is abducted and forced to labor like slaves.',
 'genres': 'Drama',
 'label': 2}

In [9]:
training['summary'][0]

'The movie tells about the tragedy of miserable people who turned to dogs. A group of people who desperately seeks for a job is abducted and forced to labor like slaves.'

### Train using PyTorch Trainer API

In [10]:
# tokenize datasets using BERT tokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [10]:
# title_mod = [f"{t}<[SEP]>{s}" for t, s in zip(training['title'], training['summary'])]

In [11]:
# tokenizerFunction(training[:5])

In [45]:
tokenized_training = training.map(tokenizerFunction, batched = True)
tokenized_validation = validation.map(tokenizerFunction, batched = True)

Map:   0%|          | 0/61824 [00:00<?, ? examples/s]

In [35]:
tokenized_training.shape

(370940, 7)

In [47]:
tokenized_validation.shape

(61824, 7)

In [54]:
# create a smaller subset of the training dataset to reduce the time it takes to fine-tune
small_training = tokenized_training.select(range(100))
small_evaluation = tokenized_validation.select(range(20))

#### Train

In [55]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels = 3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
training_args = TrainingArguments(output_dir = "test_trainer")

In [57]:
# EVALUATE

# Trainer does not automatically evaluate model performance during training. 
# the functions computeMetrics defined below will be passed to Trainer to compute and report metrics

metric = evaluate.load("accuracy")

In [58]:
# call compute on metric to calculate the accuracy of the predictions

def computeMetrics(eval_pred):
#     convert the logits to predictions before passing the predictions to compute
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    
    return metric.compute(predictions = predictions, references = labels)


In [None]:
#   def compute_metrics(eval_pred):

#       logits, labels = eval_pred
#       probabilities = expit(logits)
#       predictions = (probabilities > 0.5).astype(int)

#       res = classification_report(labels, predictions, output_dict = True, zero_division = 0, target_names = genre_list)

#       confusion = multilabel_confusion_matrix(labels, predictions)

#       macro_mcc = 0
#       micro_mcc = compute_mcc(np.sum(confusion, axis = 0))

#       for class_name, conf_matrix in zip(genre_list, confusion):

#          class_mcc = compute_mcc(conf_matrix)

#          res[class_name]["mcc"] = class_mcc
#          macro_mcc += class_mcc

#       macro_mcc = macro_mcc / len(genre_list)

#       res["macro avg"]["mcc"] = macro_mcc
#       res["micro avg"]["mcc"] = micro_mcc

#       logging.info(res)

#       return res

In [59]:
training_args = TrainingArguments(output_dir = "test_trainer", evaluation_strategy = "epoch")

In [60]:
# TRAINER
# use a small slice of the training dataset and validation dataset

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = small_training,
    eval_dataset = small_evaluation,
    compute_metrics = computeMetrics,
)

In [61]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.705717,0.8
2,No log,0.747443,0.8
3,No log,0.688825,0.8


TrainOutput(global_step=39, training_loss=0.7716258611434546, metrics={'train_runtime': 784.8856, 'train_samples_per_second': 0.382, 'train_steps_per_second': 0.05, 'total_flos': 78934025318400.0, 'train_loss': 0.7716258611434546, 'epoch': 3.0})