In [11]:
! pip install transformers # transformers library from huggingface
! pip install datasets # datasets library from huggingface

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [72]:
import os
import torch
import csv
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments

# Define the available device.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.device(device)

device(type='cpu')

In [13]:
# git lfs install
!git clone https://huggingface.co/datasets/rungalileo/MIT_movies_fixed

fatal: destination path 'MIT_movies_fixed' already exists and is not an empty directory.


In [73]:
# Showing some examples
df = pd.read_csv("MIT_movies_fixed/MIT_movies_fixed_train.tsv", sep="\t", header=None, quoting=csv.QUOTE_NONE)
df.head(60)

Unnamed: 0,0,1
0,what,O
1,movies,O
2,star,O
3,bruce,B-ACTOR
4,willis,E-ACTOR
5,show,O
6,me,O
7,films,O
8,with,O
9,drew,B-ACTOR


In [74]:
class MyPOSTaggingDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[index])
        return item

    def __len__(self):
        return len(self.labels)

In [75]:
def alignLabels(labels, encodedData):
  """
  This function aligns labels with subwords because length of labels is not the same as length of encoded data (because of subwords).
  """
  alignedLabels = []

  for i, tags in enumerate(labels): # Loop over sentences
    wordIds = encodedData.word_ids(i) # get real word indices of the sentence i.
    previousWordId = None
    alignedTags = []

    for wordId in wordIds: # Loop over tokens
      if wordId is None or wordId == previousWordId: # If it is special token (word id == None) or is 2nd/3rd subword.
        alignedTags.append(-100) # then labeled as -100.
      else: # If it is not special token and is 1st subword.
        alignedTags.append(tags[wordId]) # then labeled as usual.

      previousWordId = wordId

    alignedLabels.append(alignedTags)

  return alignedLabels

In [93]:
def loadDataset(type, tagList):
  """
  This function loads the dataset.
  type: "train" and "test".
  tagList: list of tag categories.
  """

  sentences = []
  labels = []
  with open(f"MIT_movies_fixed/MIT_movies_fixed_{type}.tsv") as lines: # Read the document.
    sentence = []
    tags = []
    for line in lines: # Loop over lines. One line contains a token and its tag.
      line = line.strip("\n") # IMPORTANT!
      if len(line) == 0: # If it's an end of a sentence.
        if len(sentence) > 0:
          # Pool the tokens and their labels.
          sentences.append(sentence)
          labels.append(tags)
        # Re-init new sentence.
        sentence = []
        tags = []
      else: # If it's not an end of a sentence.
        token, tag = line.split()
        # Pool a token and its label.
        sentence.append(token)
        tags.append(tagList.index(tag))

  return sentences, labels

In [96]:
trainDf = pd.read_csv("MIT_movies_fixed/MIT_movies_fixed_train.tsv", sep="\t", header=None)
testDf = pd.read_csv("MIT_movies_fixed/MIT_movies_fixed_test.tsv", sep="\t", header=None)

# Pooling unique tags.
TAGSET = set()
for key, rows in trainDf.iterrows():
  TAGSET.add(rows[1])

for key, rows in testDf.iterrows():
  TAGSET.add(rows[1])

TAGLIST = list(TAGSET)
print(TAGLIST)
print(len(TAGLIST), "tags")

[nan, 'S-RATINGS_AVERAGE', 'I-SONG', 'I-PLOT', 'S-CHARACTER', 'E-YEAR', 'I-GENRE', 'I-TRAILER', 'S-RATING', 'E-CHARACTER', 'B-GENRE', 'B-REVIEW', 'E-SONG', 'B-DIRECTOR', 'E-TRAILER', 'E-REVIEW', 'E-GENRE', 'E-RATING', 'S-PLOT', 'I-TITLE', 'S-YEAR', 'I-RATINGS_AVERAGE', 'I-RATING', 'I-CHARACTER', 'B-PLOT', 'I-REVIEW', 'S-SONG', 'I-DIRECTOR', 'E-RATINGS_AVERAGE', 'I-ACTOR', 'E-DIRECTOR', 'S-REVIEW', 'B-RATING', 'B-ACTOR', 'E-TITLE', 'E-PLOT', 'B-TRAILER', 'S-DIRECTOR', 'S-ACTOR', 'E-ACTOR', 'B-YEAR', 'I-YEAR', 'B-TITLE', 'B-SONG', 'S-TITLE', 'B-CHARACTER', 'S-GENRE', 'S-TRAILER', 'B-RATINGS_AVERAGE', 'O']
50 tags


In [98]:
trainSentences, trainLabels =  loadDataset("train", TAGLIST)
testSentences, testLabels =  loadDataset("test", TAGLIST)

print(trainSentences[0])
print(trainLabels[0])

['what', 'movies', 'star', 'bruce', 'willis']
[49, 49, 49, 33, 39]


In [103]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [106]:
trainEncodings = tokenizer(trainSentences, is_split_into_words=True, padding=True)
testEncodings = tokenizer(testSentences, is_split_into_words=True, padding=True)

# Aligning labels with subwords.
trainLabels = alignLabels(trainLabels, trainEncodings)
testLabels = alignLabels(testLabels, testEncodings)

In [107]:
trainDataset = MyPOSTaggingDataset(trainEncodings, trainLabels)
testDataset = MyPOSTaggingDataset(testEncodings, testLabels)

In [109]:
# Loading the model.
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(TAGSET))
model.to(device) # Send to GPU if available.
# Pay attention on the log messege:
# "Some weights of BertForTokenClassification were not initialized from the model checkpoint and are newly initialized."
# "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."
# It means that this model is not ready to use and needs fine-tuning.
print()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas




In [110]:
# Print the architecture
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [111]:
# https://huggingface.co/docs/transformers/master/en/main_classes/trainer#transformers.TrainingArguments
trainingArgs = TrainingArguments(
  output_dir = "MIT_movies_fixed_checkpoints",
  evaluation_strategy = "epoch",
  logging_strategy = "epoch",
  save_strategy = "epoch",
  overwrite_output_dir = True,
  per_device_train_batch_size = 16, # 16 or 32 is recommended.
  per_device_eval_batch_size = 1,
  learning_rate = 2e-5, # 5e-5, 2e-5, or 1e-5 is recommended.
  weight_decay = 0.01, # 0, 0.01, 0.05, 0.1, 0.15, or 0.2.
  num_train_epochs = 4,
  logging_steps = 1, # To print training loss in each epoch.
  load_best_model_at_end = True,
  metric_for_best_model = "f1",
  greater_is_better = True,
)

def computeMetrics(evalPreds):
  predictions, labels = evalPreds
  predictions = np.argmax(predictions, axis=2)

  # Removing predictions that correspond to label -100.
  cleanPreds = []
  cleanLabels = []
  for pred, label in zip(predictions, labels): # Loop over sentences.
    for p, l in zip(pred, label): # Loop over tokens.
      if l != -100: # If not -100, then pool.
        cleanPreds.append(p)
        cleanLabels.append(l)

  accuracy = accuracy_score(cleanLabels, cleanPreds)
  precision = precision_score(cleanLabels, cleanPreds, average="macro")
  recall = recall_score(cleanLabels, cleanPreds, average="macro")
  f1Score = f1_score(cleanLabels, cleanPreds, average="macro")

  return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1Score}

# https://huggingface.co/docs/transformers/master/en/main_classes/trainer
trainer = Trainer(
  model = model,
  args = trainingArgs,
  train_dataset = trainDataset,
  eval_dataset = testDataset,
  compute_metrics = computeMetrics,
  # The default optimizer used by the trainer is AdamW. So, no need to specify.
  # If you want to change the optimizer, please read https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer.optimizers .
)

trainer.train()

***** Running training *****
  Num examples = 9774
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2444


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5591,0.305919,0.937832,0.647339,0.585156,0.605674
2,0.214,0.271524,0.944195,0.650643,0.615661,0.629625


***** Running Evaluation *****
  Num examples = 2442
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to MIT_movies_fixed_checkpoints/checkpoint-611
Configuration saved in MIT_movies_fixed_checkpoints/checkpoint-611/config.json
Model weights saved in MIT_movies_fixed_checkpoints/checkpoint-611/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2442
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to MIT_movies_fixed_checkpoints/checkpoint-1222
Configuration saved in MIT_movies_fixed_checkpoints/checkpoint-1222/config.json
Model weights saved in MIT_movies_fixed_checkpoints/checkpoint-1222/pytorch_model.bin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5591,0.305919,0.937832,0.647339,0.585156,0.605674
2,0.214,0.271524,0.944195,0.650643,0.615661,0.629625
3,0.1584,0.266187,0.946545,0.679418,0.640058,0.652146
4,0.1282,0.263117,0.947112,0.682068,0.648247,0.658057


***** Running Evaluation *****
  Num examples = 2442
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to MIT_movies_fixed_checkpoints/checkpoint-1833
Configuration saved in MIT_movies_fixed_checkpoints/checkpoint-1833/config.json
Model weights saved in MIT_movies_fixed_checkpoints/checkpoint-1833/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2442
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to MIT_movies_fixed_checkpoints/checkpoint-2444
Configuration saved in MIT_movies_fixed_checkpoints/checkpoint-2444/config.json
Model weights saved in MIT_movies_fixed_checkpoints/checkpoint-2444/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from MIT_movies_fixed_checkpoints/checkpoint-2444 (score: 0.6580568786206757).


TrainOutput(global_step=2444, training_loss=0.2649374991509021, metrics={'train_runtime': 27852.005, 'train_samples_per_second': 1.404, 'train_steps_per_second': 0.088, 'total_flos': 1177705826488800.0, 'train_loss': 0.2649374991509021, 'epoch': 4.0})

In [112]:
text = "there any good romantic comedies out right now"
# text = "what movies star bruce willis show me films with drew barrymore from the 1980s"
encodedData = tokenizer(text, return_tensors="pt")
encodedData.to(device)

model.eval() # IMPORTANT! Set the model as evaluation mode.
with torch.no_grad(): # IMPORTANT! Do not computing gradient!
  outputs = model(encodedData["input_ids"], attention_mask=encodedData["attention_mask"]) # Feed forward. Without calculating loss.

logits = outputs.logits.detach().cpu() # Getting logits, moving to CPU.
predictions = torch.argmax(logits, dim=2).numpy() # Getting most probable prediction.
predictions = list(predictions[0])
print("Predictions: ", predictions)

wordIndices = encodedData.word_ids()
print("Word Indices: ", wordIndices)

subwords = tokenizer.convert_ids_to_tokens(encodedData["input_ids"].tolist()[0])
print("Subwords: ", subwords)
print()

lastWordIndex = None
lastWord = None
lastTag = None
for index, (wordIndex, subword, tagIndex) in enumerate(zip(wordIndices, subwords, predictions)):
  if index == 0:
    continue

  if lastWordIndex == wordIndex:
    if subword.startswith("##"):
      subword = subword[2:]
    lastWord += subword
  else:
    if lastWord != None:
      print(lastWord, lastTag)
    lastWord = subword
    lastTag = TAGLIST[tagIndex]

  lastWordIndex = wordIndex

Predictions:  [49, 49, 49, 49, 10, 16, 49, 49, 49, 49]
Word Indices:  [None, 0, 1, 2, 3, 4, 5, 6, 7, None]
Subwords:  ['[CLS]', 'there', 'any', 'good', 'romantic', 'comedies', 'out', 'right', 'now', '[SEP]']

there O
any O
good O
romantic B-GENRE
comedies E-GENRE
out O
right O
now O
