# Q5 - Context vectors using BERT (b-c)

## Installing Necessary Libraries and Data Cleaning

In [1]:
pip install convokit

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [4]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to C:\Users\sahre\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [5]:
utterances = corpus.get_utterances_dataframe()
utterances = utterances.sample(n=20000, random_state=42) #Sample random 20000 rows in utterances
conversations = corpus.get_conversations_dataframe()
conversations.rename(columns={'meta.movie_idx': 'meta.movie_id'}, inplace=True) #rename columns for convenience
speakers = corpus.get_speakers_dataframe()
speakers.rename(columns={'meta.movie_idx': 'meta.movie_id'}, inplace=True)

In [6]:
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "drama" if x == "['drama']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "horror" if x == "['horror']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "thriller" if x == "['thriller']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "comedy" if x == "['comedy']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "documentary" if x == "['documentary']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "romcom" if x == "['comedy', 'romance']" else x)

In [7]:
conversations = conversations.loc[conversations['meta.genre'].isin(['drama','horror','thriller', 'comedy','romcom','documentary'])]
conversations = conversations.drop(columns=['meta.release_year', 'meta.rating', 'meta.votes'], axis=1)
speakers = speakers.drop(columns=['meta.credit_pos', 'meta.gender', 'meta.character_name'], axis=1)

In [8]:
#Merging the three dataframes
import pandas as pd
merged_df = conversations.merge(speakers, on='meta.movie_id').merge(utterances, on='meta.movie_id')
from functools import reduce
dfs = [conversations, utterances, speakers]
final_df = dfs[0]
for df in dfs[1:]:
    final_df = pd.merge(final_df, df, on=['meta.movie_id'], how='inner')

In [9]:
#Sampling 10000 rows
final_df = final_df.sample(n=10000, random_state=42)

In [10]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
y = final_df['meta.genre']

#Split the data into training and the rest with 60:40 split
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=0) 
for train_index, test_val_index in stratified_split.split(final_df, y):
    train = final_df.iloc[train_index]
    test_val = final_df.iloc[test_val_index]

#Split the remaining 40% data into testing and validation 50:50 split
y_test_val = y.iloc[test_val_index]
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
for test_index,val_index in stratified_split.split(test_val, y_test_val):
    test = test_val.iloc[test_index]
    val = test_val.iloc[val_index]

In [11]:
#to save the original unchanged
BERT_train = train.copy()
BERT_val = val.copy()
BERT_test = test.copy()

In [12]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support

#evaluation function to print metrics

def evaluate(val, val_predictions):
    val_precision, val_recall, val_f1, support = precision_recall_fscore_support(val['meta.genre'], val_predictions, average='macro', zero_division=True)
    val_accuracy = (val['meta.genre'] == val_predictions).mean()
    print(f'Validation accuracy: {val_accuracy:.3f}')
    print(f'Validation precision (macro-averaged): {val_precision:.3f}')
    print(f'Validation recall (macro-averaged): {val_recall:.3f}')
    print(f'Validation F1-score (macro-averaged): {val_f1:.3f}')

In [15]:
pip install torch




In [16]:
from transformers import RobertaModel, RobertaTokenizer

In [96]:
from datasets import Dataset
import numpy as np
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizer
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
np.random.seed(42)

In [18]:
pip install pipeline

Note: you may need to restart the kernel to use updated packages.


## Trainer Function using Transformers

In [19]:
import transformers

In [97]:
#roberta model and tokeniser
from transformers import RobertaForSequenceClassification
num_labels = 6
model = RobertaForSequenceClassification.from_pretrained('roberta-base',num_labels=num_labels)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


loading configuration file config.json from cache at C:\Users\sahre/.cache\huggingface\hub\models--roberta-base\snapshots\bc2764f8af2e92b6eb5679868df33e224075ca68\config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": 

In [165]:
#return values for labels
def get_tag_id(tag):
    tag_id_map = {'drama': 0, 'horror': 1, 'thriller': 2, 'comedy': 3, 'documentary': 4}
    return tag_id_map.get(tag, 5)

In [166]:
#encode text
train_encodings = tokenizer(list(BERT_train['text']), truncation=True, padding=True)
val_encodings = tokenizer(list(BERT_val['text']), truncation=True, padding=True)

In [167]:
y_Btrain = [get_tag_id(tag) for tag in BERT_train['meta.genre']]
y_Btrain = np.array(y_Btrain)

y_Bval = [get_tag_id(tag) for tag in BERT_val['meta.genre']]
y_Bval = np.array(y_Bval)

In [170]:
train_dataset = [{'input_ids': train_encodings['input_ids'][i], 
                  'attention_mask': train_encodings['attention_mask'][i], 
                  'labels': y_Btrain[i]} for i in range(len(BERT_train))]
val_dataset = [{'input_ids': val_encodings['input_ids'][i], 
                'attention_mask': val_encodings['attention_mask'][i], 
                'labels': y_Bval[i]} for i in range(len(BERT_val))]

In [168]:
train_features = {
    'input_ids': np.array([f['input_ids'] for f in train_dataset]), 
    'attention_mask': np.array([f['attention_mask'] for f in train_dataset]),
    'labels': np.array([f['labels'] for f in train_dataset])
}


In [169]:
val_features = {'input_ids': np.array([f['input_ids'] for f in val_dataset]), 
                'attention_mask': np.array([f['attention_mask'] for f in val_dataset]),
                'labels': np.array([f['labels'] for f in val_dataset])}


In [171]:
train_dataset = Dataset.from_dict(train_features)
val_dataset = Dataset.from_dict(val_features)

In [172]:
from transformers import Trainer

In [173]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0,
    max_steps=5,
    logging_steps = 700
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [174]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

import numpy as np
def compute_metrics(pred):
    predictions, genre_labels = pred
    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = genre_labels.flatten()
    indices = np.where(labels_flat != -100)
    preds = preds_flat[indices]
    true_labels = labels_flat[indices]
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='weighted')
    accuracy = accuracy_score(true_labels, preds)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [175]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


max_steps is given, it will override any value given in num_train_epochs


In [176]:
trainer.train()

***** Running training *****
  Num examples = 6000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  Number of trainable parameters = 124650246


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.447133,0.4335,0.353099,0.390453,0.4335


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5, training_loss=0.9356801986694336, metrics={'train_runtime': 1159.8549, 'train_samples_per_second': 0.069, 'train_steps_per_second': 0.004, 'total_flos': 9497005721280.0, 'train_loss': 0.9356801986694336, 'epoch': 0.01})

## Tuning Hyperparamater 1

In [184]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0,
    max_steps=5,
    logging_steps = 700
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [185]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


max_steps is given, it will override any value given in num_train_epochs


In [186]:
trainer.train()

***** Running training *****
  Num examples = 6000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  Number of trainable parameters = 124650246


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.942421,0.44,0.338055,0.475648,0.44


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5, training_loss=0.35717976093292236, metrics={'train_runtime': 1045.4999, 'train_samples_per_second': 0.077, 'train_steps_per_second': 0.005, 'total_flos': 9497005721280.0, 'train_loss': 0.35717976093292236, 'epoch': 0.01})

## Tuning Hyperparameter 2

In [190]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=5e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=0,
    weight_decay=0,
    max_steps=5,
    logging_steps = 700
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [191]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

max_steps is given, it will override any value given in num_train_epochs


In [192]:
trainer.train()

***** Running training *****
  Num examples = 6000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  Number of trainable parameters = 124650246


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.578725,0.455,0.28457,0.207025,0.455


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5, training_loss=1.6278072357177735, metrics={'train_runtime': 1576.4727, 'train_samples_per_second': 0.051, 'train_steps_per_second': 0.003, 'total_flos': 9497005721280.0, 'train_loss': 1.6278072357177735, 'epoch': 0.01})

## Tuning Hyperparameter 3

In [22]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0,
    max_steps=5,
    logging_steps = 700
)

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

max_steps is given, it will override any value given in num_train_epochs


In [26]:
trainer.train()

***** Running training *****
  Num examples = 6000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  Number of trainable parameters = 124650246


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,1.53164,0.455,0.28457,0.207025,0.455


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5, training_loss=1.6109735488891601, metrics={'train_runtime': 871.2369, 'train_samples_per_second': 0.092, 'train_steps_per_second': 0.006, 'total_flos': 9497005721280.0, 'train_loss': 1.6109735488891601, 'epoch': 0.01})