# Q5 - Context vectors using BERT (a)

## Installing Necessary Libraries and Data Cleaning

In [1]:
pip install convokit

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [4]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to C:\Users\sahre\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [5]:
utterances = corpus.get_utterances_dataframe()
utterances = utterances.sample(n=20000, random_state=42) #Sample random 20000 rows in utterances
conversations = corpus.get_conversations_dataframe()
conversations.rename(columns={'meta.movie_idx': 'meta.movie_id'}, inplace=True) #rename columns for convenience
speakers = corpus.get_speakers_dataframe()
speakers.rename(columns={'meta.movie_idx': 'meta.movie_id'}, inplace=True)

In [6]:
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "drama" if x == "['drama']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "horror" if x == "['horror']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "thriller" if x == "['thriller']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "comedy" if x == "['comedy']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "documentary" if x == "['documentary']" else x)
conversations['meta.genre'] = conversations['meta.genre'].apply(lambda x: "romcom" if x == "['comedy', 'romance']" else x)

In [7]:
conversations = conversations.loc[conversations['meta.genre'].isin(['drama','horror','thriller', 'comedy','romcom','documentary'])]
conversations = conversations.drop(columns=['meta.release_year', 'meta.rating', 'meta.votes'], axis=1)
speakers = speakers.drop(columns=['meta.credit_pos', 'meta.gender', 'meta.character_name'], axis=1)

In [8]:
#Merging the three dataframes
import pandas as pd
merged_df = conversations.merge(speakers, on='meta.movie_id').merge(utterances, on='meta.movie_id')
from functools import reduce
dfs = [conversations, utterances, speakers]
final_df = dfs[0]
for df in dfs[1:]:
    final_df = pd.merge(final_df, df, on=['meta.movie_id'], how='inner')

In [9]:
#Sampling 10000 rows
final_df = final_df.sample(n=10000, random_state=42)

In [10]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
y = final_df['meta.genre']

#Split the data into training and the rest with 60:40 split
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=0) 
for train_index, test_val_index in stratified_split.split(final_df, y):
    train = final_df.iloc[train_index]
    test_val = final_df.iloc[test_val_index]

#Split the remaining 40% data into testing and validation 50:50 split
y_test_val = y.iloc[test_val_index]
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
for test_index,val_index in stratified_split.split(test_val, y_test_val):
    test = test_val.iloc[test_index]
    val = test_val.iloc[val_index]

In [11]:
BERT_train = train.copy()
BERT_val = val.copy()
BERT_test = test.copy()

In [12]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [14]:
#Function to evaluate metrics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support

def evaluate(val, val_predictions):
    val_precision, val_recall, val_f1, support = precision_recall_fscore_support(val['meta.genre'], val_predictions, average='macro', zero_division=True)
    val_accuracy = (val['meta.genre'] == val_predictions).mean()
    print(f'Validation accuracy: {val_accuracy:.3f}')
    print(f'Validation precision (macro-averaged): {val_precision:.3f}')
    print(f'Validation recall (macro-averaged): {val_recall:.3f}')
    print(f'Validation F1-score (macro-averaged): {val_f1:.3f}')

    


In [15]:
from transformers import RobertaModel, RobertaTokenizer

In [16]:

model = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score, classification_report, confusion_matrix
from transformers import pipeline
from transformers import RobertaTokenizer, TFRobertaModel

In [18]:
pip install pipeline

Note: you may need to restart the kernel to use updated packages.


## Feature Extractor using Pipeline

In [19]:
feature_extractor = pipeline('feature-extraction', model=model, tokenizer=tokenizer, truncation=True)

In [20]:
BERT_train.columns

Index(['vectors_x', 'meta.movie_id', 'meta.movie_name_x', 'meta.genre',
       'timestamp', 'text', 'speaker', 'reply_to', 'conversation_id',
       'meta.parsed', 'vectors_y', 'vectors', 'meta.movie_name_y'],
      dtype='object')

In [21]:
#Encodings for tokenisation
import numpy as np
X_train = np.array([feature_extractor(text)[0][0] for text in BERT_train['text']])
X_val = np.array([feature_extractor(text)[0][0] for text in BERT_val['text']])

## Logistic Regression

In [22]:
#Vectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

clf = LogisticRegression(random_state=0).fit(X_train, BERT_train['meta.genre'])

val_predictions = clf.predict(X_val)
evaluate(BERT_val, val_predictions)

Validation accuracy: 0.533
Validation precision (macro-averaged): 0.715
Validation recall (macro-averaged): 0.253
Validation F1-score (macro-averaged): 0.249


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
