In [1]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
import torch
import torch.nn.functional as F
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score

# Dataset Loading and Exploration

In this section, we will first load the dataset, generate two datasets for training, then explore the dataset.

We choose Hindi language. Since it's not english, have a Hugging Face BERT-base model, and it the dataset contains at least 7000 sentences, therefore it's a valid choice of language.

We created 2 datasets:
1. dataset_1 with 1000 sentences
2. dataset_2 with 3000 sentences

In [3]:
# download ner dataset
# chosen language: hindi
full_dataset = load_dataset("polyglot_ner", "hi")

Downloading builder script:   0%|          | 0.00/6.01k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/401648 [00:00<?, ? examples/s]

In [4]:
full_dataset.keys()

dict_keys(['train'])

In [5]:
# check if chosen dataset is a valid dataset
# Hindi is not English : therefore Valid
# Hugging Face BERT-base model for the language exist (using multilinguial bert base model)
# The dataset contains at least 7000 sentences : therefore Valid
len(full_dataset["train"])

401648

In [6]:
# EXTRACT 2 DATASETS FOR TRAINING, 1 FOR EVALUATION

# extract dataset 1 with 1000 sentences
train_dataset_1 = full_dataset["train"].shuffle(seed=42).select([i for i in range(1000)])
print(len(train_dataset_1))
# extract dataset 2 with 3000 sentences
train_dataset_2 = full_dataset["train"].shuffle(seed=42).select([i for i in range(3000)])
print(len(train_dataset_2))
# an evaluation dataset
eval_dataset_1 = full_dataset["train"].shuffle(seed=42).select([i for i in range(3000, 5000)])
print(len(eval_dataset_1))

1000
3000
2000


In [7]:
train_dataset_1[0].keys()

dict_keys(['id', 'lang', 'words', 'ner'])

In [8]:
train_dataset_1[0:2]["words"]

[['ये', 'मेक्सिको', 'राष्ट्र', 'से', 'थे', '।'],
 ['2006',
  'में',
  ',',
  'किडमैन',
  'को',
  'ऑस्ट्रेलिया',
  'के',
  'सर्वोच्च',
  'नागरिक',
  'सम्मान',
  'कम्पानियन',
  'ऑफ़',
  'द',
  'ऑर्डर',
  'ऑफ़',
  'ऑस्ट्रेलिया',
  'से',
  'नवाज़ा',
  'गया',
  '.']]

In [9]:
train_dataset_1[0:2]["ner"]

[['O', 'LOC', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'PER',
  'O',
  'LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'LOC',
  'O',
  'O',
  'O',
  'O']]

In [10]:
all_labels = [label for sentence_labels in full_dataset["train"]["ner"] for label in sentence_labels]
unique_labels = set(all_labels)
print(unique_labels)
label2id = {k: v for v, k in enumerate(unique_labels)}
print(label2id)

{'O', 'LOC', 'ORG', 'PER'}
{'O': 0, 'LOC': 1, 'ORG': 2, 'PER': 3}


# Initialization and important functions

In this section, we will initialize several things and define all the important functions we wanna use for this assignment.
1. Initialize tokenizer and bert-base model
2. Define custom dataset which retuns input ids, attention masks, labels of the dataset
3. Defining dataloaders
4. Training function
5. Evaluation function


In [11]:
# Load pretrained Hugging Face BERT-base model for multilingual languages
tokenizer = BertTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")
model = BertForTokenClassification.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")

tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/709M [00:00<?, ?B/s]

In [12]:
# GENERATE CUSTOM DATASET WHICH RETURNS INPUT_IDS, ATTENTION_MASKS, AND LABELS OF THE DATASET
from torch.nn.utils.rnn import pad_sequence
class CustomDataset(Dataset):
  def __init__(self, texts, labels):
    self.texts = texts
    self.labels = labels
    self.label_encoder = LabelEncoder()

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    tokenized_sentence = []
    labels = []

    # step 1: tokenize (and adapt corresponding labels)
    for word, label in zip(self.texts[idx], self.labels[idx]):
      # Tokenize the word and count # of subwords the word is broken into

      tokenized_word = tokenizer.tokenize(word)
      n_subwords = len(tokenized_word)

      # Add the tokenized word to the final tokenized word list
      tokenized_sentence.extend(tokenized_word)

      # Add the same label to the new list of labels `n_subwords` times
      labels.extend([label] * n_subwords)

    # step 2: add special tokens (and corresponding labels)
    tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
    labels.insert(0, "O") # add outside label for [CLS] token
    labels.insert(-1, "O") # add outside label for [SEP] token

    # step 3: truncating/padding
    maxlen = 128

    if (len(tokenized_sentence) > maxlen):
      # truncate
      tokenized_sentence = tokenized_sentence[:maxlen]
      labels = labels[:maxlen]
    else:
      # pad
      tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
      labels = labels + ["O" for _ in range(maxlen - len(labels))]

    # step 4: obtain the attention mask
    attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

    # step 5: convert tokens to input ids
    ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)

    label_ids = [label2id[label] for label in labels]
    return {
          'ids': torch.tensor(ids, dtype=torch.long),
          'mask': torch.tensor(attn_mask, dtype=torch.long),
          'labels': torch.tensor(label_ids, dtype=torch.long)
        }

In [13]:
# RETURNS TRAIN DATALOADER AND EVALUATION DATALOADER GIVEN DATASETS
def get_dataloaders(train_data, eval_data, batch_size=8):

  train_dataset = CustomDataset(train_data["words"], train_data["ner"])
  train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  eval_dataset = CustomDataset(eval_data["words"], eval_data["ner"])
  eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

  return train_dataloader, eval_dataloader

In [14]:
# FUNCTION TO TRAIN THE MODEL

# NOTE: WE ARE NOT USING DEVELOPMENT SET AS PER ASSIGNMENT REQUIREMENT
def train(train_dataloader, model, lr=1e-5, epochs=2):
  # use the GPU
  if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

  else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

  # use device for training model
  model.to(device)

  # init optimizer
  optimizer = AdamW(model.parameters(), lr=lr)

  # start training for each epoch and bring model to train mode
  model.train()

  # train for each epoch
  for epoch in range(epochs):
    train_loss = 0
    all_predictions = []
    all_labels = []

    # train for each batch in dataloader
    for step, batch in enumerate(train_dataloader):
      b_labels = batch["labels"].to(device)
      optimizer.zero_grad()
      outputs = model(input_ids=batch['ids'].to(device), attention_mask=batch['mask'].to(device), labels=batch['labels'].to(device))
      logits = outputs.logits
      loss = outputs.loss
      train_loss += loss

      predictions = torch.argmax(logits, dim=2)
      all_predictions.append(predictions.cpu().numpy())
      all_labels.append(b_labels.cpu().numpy())

      loss.backward()
      optimizer.step()

    # Concatenate predictions and labels from all batches
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    # Calculate and print metrics for current epoch
    train_accuracy = accuracy_score(all_predictions.flatten(), all_labels.flatten())
    train_f1_macro = f1_score(all_predictions.flatten(), all_labels.flatten(), average='macro')
    train_f1_micro = f1_score(all_predictions.flatten(), all_labels.flatten(), average='micro')

    # print metrics for current epoch
    print(f"epoch: {epoch}, train_loss: {train_loss}, train_accuracy: {train_accuracy}, f1_macro: {train_f1_macro}, f1_micro: {train_f1_micro}")

  # return model to evaluate
  return model

In [15]:
# FUNCTION TO EVALUATE THE TRAINED MODEL USING EVALUATION DATASET
def evaluate(eval_dataloader, model):
  if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

  else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

  # use device for training model
  model.to(device)
  model.eval()
  all_predictions = []
  all_labels = []
  with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
      outputs = model(input_ids=batch['ids'].to(device), attention_mask=batch['mask'].to(device), labels=batch['labels'].to(device))
      logits = outputs.logits
      b_labels = batch["labels"].to(device)
      predictions = torch.argmax(logits, dim=2)
      all_predictions.append(predictions.cpu().numpy())
      all_labels.append(b_labels.cpu().numpy())
    # Concatenate predictions and labels from all batches
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    # Calculate and print metrics for evalutation
    eval_accuracy = accuracy_score(all_predictions.flatten(), all_labels.flatten())
    eval_f1_macro = f1_score(all_predictions.flatten(), all_labels.flatten(), average='macro')
    eval_f1_micro = f1_score(all_predictions.flatten(), all_labels.flatten(), average='micro')

    # print metrics for evaluation
    print(f"eval_accuracy: {eval_accuracy}, eval_f1_macro: {eval_f1_macro}, eval_f1_micro: {eval_f1_micro}")

# Training 3 fined-tuned versions

In this section, we will use all the functions above to train 3 fined-tuned versions of NER Bert Model:
1. Fine-tuned with 1,000 sentences
2. Fine-tuned with 3,000 sentences
3. Fine-tuned with 3,000 sentences and frozen embeddings

We will predict each fined-tuned model with evaluation set.

## Finetuning Bert model with 1000 sentences

In [18]:
# train model on dataset 1 with 1000 sentences on training set

# get dataloaders from train dataset 1 and evaluation dataset
train_dataloader, eval_dataloader = get_dataloaders(train_dataset_1, eval_dataset_1)

# train model
model = train(train_dataloader, model, epochs=6)

# evaluate the model
evaluate(eval_dataloader, model)

There are 1 GPU(s) available.
Device name: Tesla T4




epoch: 0, train_loss: 4.48781681060791, train_accuracy: 0.98921875, f1_macro: 0.20973515268334317, f1_micro: 0.98921875
epoch: 1, train_loss: 2.337453603744507, train_accuracy: 0.9933125, f1_macro: 0.4445414159458833, f1_micro: 0.9933125
epoch: 2, train_loss: 1.488467812538147, train_accuracy: 0.995765625, f1_macro: 0.8047583285016608, f1_micro: 0.995765625
epoch: 3, train_loss: 0.8977278470993042, train_accuracy: 0.997671875, f1_macro: 0.9012323893542589, f1_micro: 0.997671875
epoch: 4, train_loss: 0.6572772264480591, train_accuracy: 0.9984765625, f1_macro: 0.9411640863247235, f1_micro: 0.9984765625
epoch: 5, train_loss: 0.4071134030818939, train_accuracy: 0.9988984375, f1_macro: 0.9555252189599277, f1_micro: 0.9988984375
There are 1 GPU(s) available.
Device name: Tesla T4
eval_accuracy: 0.99045703125, eval_f1_macro: 0.44095608438072587, eval_f1_micro: 0.99045703125


## Finetuning Bert model with 3000 sentences

In [19]:
# train model on dataset 2 with 3000 sentences on training set

# Load pretrained Hugging Face BERT-base model for multilingual languages
tokenizer = BertTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")
model = BertForTokenClassification.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")

# get dataloaders from train dataset 2 and evaluation dataset
train_dataloader, eval_dataloader = get_dataloaders(train_dataset_2, eval_dataset_1)

# train model
model = train(train_dataloader, model, epochs=6)

# evaluate the model
evaluate(eval_dataloader, model)

There are 1 GPU(s) available.
Device name: Tesla T4




epoch: 0, train_loss: 19.628273010253906, train_accuracy: 0.9878776041666667, f1_macro: 0.19468347617135873, f1_micro: 0.9878776041666667
epoch: 1, train_loss: 7.3751091957092285, train_accuracy: 0.99284375, f1_macro: 0.6350849988299856, f1_micro: 0.99284375
epoch: 2, train_loss: 5.00184965133667, train_accuracy: 0.9947526041666667, f1_macro: 0.7572710605080594, f1_micro: 0.9947526041666667
epoch: 3, train_loss: 3.642319679260254, train_accuracy: 0.9963489583333334, f1_macro: 0.5663325381305108, f1_micro: 0.9963489583333334
epoch: 4, train_loss: 2.321362018585205, train_accuracy: 0.9979036458333334, f1_macro: 0.7282801883041966, f1_micro: 0.9979036458333334
epoch: 5, train_loss: 1.621988296508789, train_accuracy: 0.9984921875, f1_macro: 0.9313581481315093, f1_micro: 0.9984921875
There are 1 GPU(s) available.
Device name: Tesla T4
eval_accuracy: 0.991921875, eval_f1_macro: 0.45527607804333003, eval_f1_micro: 0.991921875


## Finetuning Bert model with 3000 sentences and frozen embeddings



In [20]:
# train model on dataset 2 with 3000 sentences on training set and frozen embeddings

# Load pretrained Hugging Face BERT-base model for multilingual languages
tokenizer = BertTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")
model = BertForTokenClassification.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")

# Freeze the embeddings
for param in model.base_model.embeddings.parameters():
    param.requires_grad = False

# Verify that embeddings are frozen
for param in model.base_model.embeddings.parameters():
    print(param.requires_grad)

# get dataloaders from train dataset 2 and evaluation dataset
train_dataloader, eval_dataloader = get_dataloaders(train_dataset_2, eval_dataset_1)

# train model
model = train(train_dataloader, model, epochs=6)

# evaluate the model
evaluate(eval_dataloader, model)

False
False
False
False
False
There are 1 GPU(s) available.
Device name: Tesla T4




epoch: 0, train_loss: 22.360986709594727, train_accuracy: 0.986765625, f1_macro: 0.18373141043049587, f1_micro: 0.986765625
epoch: 1, train_loss: 7.928766250610352, train_accuracy: 0.9921822916666667, f1_macro: 0.46194067171802056, f1_micro: 0.9921822916666667
epoch: 2, train_loss: 5.579566955566406, train_accuracy: 0.9943385416666667, f1_macro: 0.6053125316950325, f1_micro: 0.9943385416666667
epoch: 3, train_loss: 4.060036659240723, train_accuracy: 0.9958802083333333, f1_macro: 0.6680206646277165, f1_micro: 0.9958802083333333
epoch: 4, train_loss: 2.8042120933532715, train_accuracy: 0.9973567708333333, f1_macro: 0.7126156864467613, f1_micro: 0.9973567708333333
epoch: 5, train_loss: 2.1283164024353027, train_accuracy: 0.9980286458333333, f1_macro: 0.9211616734861318, f1_micro: 0.9980286458333333
There are 1 GPU(s) available.
Device name: Tesla T4
eval_accuracy: 0.99151953125, eval_f1_macro: 0.4442169033415507, eval_f1_micro: 0.99151953125
