# Custom NER with BERT on MIT Movie Corpus

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Mount google drive
from google.colab import drive
drive.mount('/content/gdrive')
# this creates a symbolic link so that now the path /content/gdrive/My\ Drive/ is equal to /mydrive
!ln -s /content/gdrive/My\ Drive/ /mydrive
!ls /mydrive/MIT_Movie_corpus_NER

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
ln: failed to create symbolic link '/mydrive/My Drive': File exists
 BERT_NER.ipynb  'MIT Movie Corpus.txt'


Data can be downloaded from https://groups.csail.mit.edu/sls/downloads/movie/

In [None]:
!pip install simpletransformers



## Data Preparation

In [None]:
# Read the raw corpus as a list of lines: one "<label>\t<word>" line per token,
# with a blank line between sentences.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('/mydrive/MIT_Movie_corpus_NER/MIT Movie Corpus.txt', 'r', encoding='utf-8') as file:
    df_list = file.readlines()
df_list[:50]

['B-Actor\tsteve\n',
 'I-Actor\tmcqueen\n',
 'O\tprovided\n',
 'O\ta\n',
 'B-Plot\tthrilling\n',
 'I-Plot\tmotorcycle\n',
 'I-Plot\tchase\n',
 'I-Plot\tin\n',
 'I-Plot\tthis\n',
 'B-Opinion\tgreatest\n',
 'I-Opinion\tof\n',
 'I-Opinion\tall\n',
 'B-Plot\tww\n',
 'I-Plot\t2\n',
 'I-Plot\tprison\n',
 'I-Plot\tescape\n',
 'I-Plot\tmovies\n',
 '\n',
 'B-Actor\tliza\n',
 'I-Actor\tminnelli\n',
 'O\tand\n',
 'B-Actor\tjoel\n',
 'I-Actor\tgray\n',
 'B-Award\twon\n',
 'I-Award\toscars\n',
 'O\tfor\n',
 'O\ttheir\n',
 'O\troles\n',
 'O\tin\n',
 'O\tthis\n',
 'B-Year\t1972\n',
 'O\tmovie\n',
 'B-Plot\tthat\n',
 'I-Plot\tfollows\n',
 'I-Plot\tnightclub\n',
 'I-Plot\tentertainers\n',
 'I-Plot\tin\n',
 'I-Plot\tberlin\n',
 'I-Plot\tas\n',
 'I-Plot\tthe\n',
 'I-Plot\tnazis\n',
 'I-Plot\tcome\n',
 'I-Plot\tto\n',
 'I-Plot\tpower\n',
 '\n',
 'O\twhat\n',
 'O\tis\n',
 'O\tthat\n',
 'B-Actor\ttom\n',
 'I-Actor\thanks\n']

In [None]:
# Add sentence id
# Sentences are separated by blank lines; every token line is prefixed with the
# index of the sentence it belongs to: "<sentence_id>\t<label>\t<word>\n".
sentence_id = 0
sentence_has_tokens = False
new_df_list = []
for line in df_list:
    if not line.strip():
        # Separator line. Advance the id only once per finished sentence so that
        # whitespace-only, repeated or leading blank lines cannot leave gaps in
        # the numbering or end up as token rows.
        if sentence_has_tokens:
            sentence_id += 1
            sentence_has_tokens = False
        continue
    # Normalise the line ending so a final line without a trailing newline is
    # still written as its own row.
    new_df_list.append(str(sentence_id) + '\t' + line.rstrip('\n') + '\n')
    sentence_has_tokens = True
new_df_list[:50]

['0\tB-Actor\tsteve\n',
 '0\tI-Actor\tmcqueen\n',
 '0\tO\tprovided\n',
 '0\tO\ta\n',
 '0\tB-Plot\tthrilling\n',
 '0\tI-Plot\tmotorcycle\n',
 '0\tI-Plot\tchase\n',
 '0\tI-Plot\tin\n',
 '0\tI-Plot\tthis\n',
 '0\tB-Opinion\tgreatest\n',
 '0\tI-Opinion\tof\n',
 '0\tI-Opinion\tall\n',
 '0\tB-Plot\tww\n',
 '0\tI-Plot\t2\n',
 '0\tI-Plot\tprison\n',
 '0\tI-Plot\tescape\n',
 '0\tI-Plot\tmovies\n',
 '1\tB-Actor\tliza\n',
 '1\tI-Actor\tminnelli\n',
 '1\tO\tand\n',
 '1\tB-Actor\tjoel\n',
 '1\tI-Actor\tgray\n',
 '1\tB-Award\twon\n',
 '1\tI-Award\toscars\n',
 '1\tO\tfor\n',
 '1\tO\ttheir\n',
 '1\tO\troles\n',
 '1\tO\tin\n',
 '1\tO\tthis\n',
 '1\tB-Year\t1972\n',
 '1\tO\tmovie\n',
 '1\tB-Plot\tthat\n',
 '1\tI-Plot\tfollows\n',
 '1\tI-Plot\tnightclub\n',
 '1\tI-Plot\tentertainers\n',
 '1\tI-Plot\tin\n',
 '1\tI-Plot\tberlin\n',
 '1\tI-Plot\tas\n',
 '1\tI-Plot\tthe\n',
 '1\tI-Plot\tnazis\n',
 '1\tI-Plot\tcome\n',
 '1\tI-Plot\tto\n',
 '1\tI-Plot\tpower\n',
 '2\tO\twhat\n',
 '2\tO\tis\n',
 '2\tO\tthat\n',

In [None]:
# Save the transformed data (tab-separated: sentence id, label, word).
# Written explicitly as UTF-8, which is the encoding pd.read_csv assumes by
# default when the file is loaded again.
with open('Converted_MIT_Movie_Corpus_to_BIO_format.csv', 'w', encoding='utf-8') as file:
    file.writelines(new_df_list)

In [None]:
# Load the converted file (tab-separated, no header row), then reorder the
# columns to sentence_id, words, labels.
df = pd.read_csv(
    'Converted_MIT_Movie_Corpus_to_BIO_format.csv',
    sep='\t',
    names=['sentence_id', 'labels', 'words'],
)
df = df[['sentence_id', 'words', 'labels']]
df

Unnamed: 0,sentence_id,words,labels
0,0,steve,B-Actor
1,0,mcqueen,I-Actor
2,0,provided,O
3,0,a,O
4,0,thrilling,B-Plot
...,...,...,...
158818,7815,on,I-Origin
158819,7815,a,I-Origin
158820,7815,nicholas,I-Origin
158821,7815,sparks,I-Origin


In [None]:
# Keep only the rows that have a word.
# NOTE(review): pd.read_csv turns tokens such as "null" or "NA" into missing
# values by default, so a genuine word may be dropped here - confirm.
df = df.dropna(subset=['words'])

In [None]:
# Number of tokens per tag
label_counts = df['labels'].value_counts()
label_counts

I-Plot              62107
O                   55895
B-Plot               6468
I-Actor              6121
B-Actor              5010
B-Genre              3384
I-Origin             3340
B-Year               2702
I-Genre              2283
B-Director           1787
I-Director           1653
I-Relationship       1206
B-Character_Name     1024
I-Quote               817
B-Opinion             810
B-Origin              779
I-Character_Name      760
I-Award               719
B-Relationship        580
I-Opinion             539
B-Award               309
I-Year                195
I-Soundtrack          158
B-Quote               126
B-Soundtrack           50
Name: labels, dtype: int64

In [None]:
# Look at the tokens of sentence 6000
df.loc[df['sentence_id'].eq(6000)]

Unnamed: 0,sentence_id,words,labels
122247,6000,im,O
122248,6000,thinking,O
122249,6000,of,O
122250,6000,the,O
122251,6000,sci,B-Genre
122252,6000,fi,I-Genre
122253,6000,horror,I-Genre
122254,6000,movie,I-Genre
122255,6000,has,O
122256,6000,discovery,B-Plot


In [None]:
# Train test split
# Split on a sentence boundary instead of a hard-coded row position, so a
# sentence can never be divided between the two sets if the number of rows
# changes. Sentences 0..6000 go to training, the rest to testing.
LAST_TRAIN_SENTENCE_ID = 6000
is_train = df['sentence_id'] <= LAST_TRAIN_SENTENCE_ID
df_train = df[is_train]
df_test = df[~is_train]
print(df_train.shape,df_test.shape)

(122269, 3) (36553, 3)


In [None]:
# Preview the training split
df_train

Unnamed: 0,sentence_id,words,labels
0,0,steve,B-Actor
1,0,mcqueen,I-Actor
2,0,provided,O
3,0,a,O
4,0,thrilling,B-Plot
...,...,...,...
122265,6000,site,I-Plot
122266,6000,starring,O
122267,6000,mary,B-Actor
122268,6000,elizabeth,I-Actor


In [None]:
# Preview the test split
df_test

Unnamed: 0,sentence_id,words,labels
122270,6001,im,O
122271,6001,thinking,O
122272,6001,of,O
122273,6001,the,O
122274,6001,third,B-Relationship
...,...,...,...
158818,7815,on,I-Origin
158819,7815,a,I-Origin
158820,7815,nicholas,I-Origin
158821,7815,sparks,I-Origin


In [None]:
# Distinct tag names in order of first appearance; passed to the models as
# their label set.
label = df["labels"].drop_duplicates().tolist()
label

['B-Actor',
 'I-Actor',
 'O',
 'B-Plot',
 'I-Plot',
 'B-Opinion',
 'I-Opinion',
 'B-Award',
 'I-Award',
 'B-Year',
 'B-Genre',
 'B-Origin',
 'I-Origin',
 'B-Director',
 'I-Director',
 'I-Genre',
 'I-Year',
 'B-Soundtrack',
 'I-Soundtrack',
 'B-Relationship',
 'I-Relationship',
 'B-Character_Name',
 'I-Character_Name',
 'B-Quote',
 'I-Quote']

## Model training and evaluation

In [None]:
from simpletransformers.ner import NERModel , NERArgs

In [None]:
# Training configuration shared by all models trained in this notebook
model_args = NERArgs()
# Fix the random seed so repeated runs (and the three models) are comparable.
model_args.manual_seed = 42
model_args.num_train_epochs = 2
model_args.learning_rate = 1e-4
model_args.overwrite_output_dir = True
model_args.train_batch_size = 32
model_args.eval_batch_size = 8
model_args.evaluate_during_training = True
# NOTE(review): the recorded training logs show 188 steps per epoch (376 in
# total), so a 1000-step interval is never reached - confirm this is intended.
model_args.evaluate_during_training_steps = 1000
model_args.evaluate_during_training_verbose = True

### BERT

In [None]:
# BERT (cased) token classifier over the tag set collected in `label`
model_bert = NERModel(
    model_type='bert',
    model_name='bert-base-cased',
    labels=label,
    args=model_args,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
# Fine-tune BERT on the training split, evaluating on df_test during training.
# A dedicated output_dir keeps this model's files from being overwritten by the
# other models in this notebook, which share model_args (overwrite_output_dir
# is True).
model_bert.train_model(df_train, eval_data=df_test, output_dir='outputs/bert')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/188 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

(376,
 {'eval_loss': [0.3644317648156099, 0.32709936003327894],
  'f1_score': [0.7136417556346382, 0.7504427628401222],
  'global_step': [188, 376],
  'precision': [0.6856100896520285, 0.7330921673482227],
  'recall': [0.7440633245382586, 0.7686345646437994],
  'train_loss': [0.296103835105896, 0.20454014837741852]})

In [None]:
# Evaluate the fine-tuned BERT model on the test split
bert_eval_outputs = model_bert.eval_model(df_test)
result_bert, model_outputs, preds_list = bert_eval_outputs

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

In [None]:
# Metrics returned by model_bert.eval_model on the test split
result_bert

{'eval_loss': 0.32709936003327894,
 'f1_score': 0.7504427628401222,
 'precision': 0.7330921673482227,
 'recall': 0.7686345646437994}

In [None]:
# Tag a single example sentence with the fine-tuned BERT model
example_sentences = ["What 2011 animated movie starred the voices of johnny deep and rahul poddar"]
prediction, model_output = model_bert.predict(example_sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Predicted tag for each word of the example sentence
prediction

[[{'What': 'O'},
  {'2011': 'B-Year'},
  {'animated': 'B-Genre'},
  {'movie': 'O'},
  {'starred': 'O'},
  {'the': 'O'},
  {'voices': 'O'},
  {'of': 'O'},
  {'johnny': 'B-Actor'},
  {'deep': 'I-Actor'},
  {'and': 'O'},
  {'rahul': 'B-Actor'},
  {'poddar': 'I-Actor'}]]

### DISTILBERT

In [None]:
# DistilBERT (cased) token classifier over the tag set collected in `label`
model_distilbert = NERModel(
    model_type='distilbert',
    model_name='distilbert-base-cased',
    labels=label,
    args=model_args,
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

In [None]:
# Fine-tune DistilBERT on the training split, evaluating on df_test during
# training. A dedicated output_dir keeps this model's files from being
# overwritten by the other models in this notebook, which share model_args
# (overwrite_output_dir is True).
model_distilbert.train_model(df_train, eval_data=df_test, output_dir='outputs/distilbert')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/188 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

(376,
 {'eval_loss': [0.35073166360282687, 0.33242911034743694],
  'f1_score': [0.7213774237669965, 0.7317859445519022],
  'global_step': [188, 376],
  'precision': [0.7043205027494108, 0.7156368221941992],
  'recall': [0.7392810026385225, 0.7486807387862797],
  'train_loss': [0.37429410219192505, 0.20431213080883026]})

In [None]:
# Evaluate the fine-tuned DistilBERT model on the test split
distilbert_eval_outputs = model_distilbert.eval_model(df_test)
result_distilbert, model_outputs, preds_list = distilbert_eval_outputs

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

In [None]:
# Metrics returned by model_distilbert.eval_model on the test split
result_distilbert

{'eval_loss': 0.33242911034743694,
 'f1_score': 0.7317859445519022,
 'precision': 0.7156368221941992,
 'recall': 0.7486807387862797}

### ROBERTA

In [None]:
# RoBERTa token classifier over the tag set collected in `label`
model_roberta = NERModel(
    model_type='roberta',
    model_name='roberta-base',
    labels=label,
    args=model_args,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

In [None]:
# Fine-tune RoBERTa on the training split, evaluating on df_test during
# training. A dedicated output_dir keeps this model's files from being
# overwritten by the other models in this notebook, which share model_args
# (overwrite_output_dir is True).
model_roberta.train_model(df_train, eval_data=df_test, output_dir='outputs/roberta')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/188 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

(376,
 {'eval_loss': [0.37698345723262444, 0.33136952405852893],
  'f1_score': [0.7088967971530249, 0.7443657437218287],
  'global_step': [188, 376],
  'precision': [0.6810515119282784, 0.7270440251572327],
  'recall': [0.7391160949868074, 0.762532981530343],
  'train_loss': [0.30008870363235474, 0.22751405835151672]})

In [None]:
# Evaluate the fine-tuned RoBERTa model on the test split
roberta_eval_outputs = model_roberta.eval_model(df_test)
result_roberta, model_outputs, preds_list = roberta_eval_outputs

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/227 [00:00<?, ?it/s]

In [None]:
# Metrics returned by model_roberta.eval_model on the test split
result_roberta

{'eval_loss': 0.33136952405852893,
 'f1_score': 0.7443657437218287,
 'precision': 0.7270440251572327,
 'recall': 0.762532981530343}