### Importing libraries

In [52]:
import pandas as pd
from datasets import Dataset

### Colab mount for GPU support

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Reading train data file

In [7]:
df = pd.DataFrame(columns=["Sentence", "tag", "token"])
train = open("/content/drive/MyDrive/Untitled folder/ner_train.txt", "r")

idx = 0
sent = 0
for line in train:
    if line=="\n":
        sent = sent+1
    else:
        val = line.split("\t")
        df = pd.concat([df, pd.DataFrame({"Sentence": "Sentence:" + str(sent) , "tag": val[0], "token": val[1][:-1]}, index=[idx])], axis=0)
        idx=idx+1

df.head()

Unnamed: 0,Sentence,tag,token
0,Sentence:0,O,what
1,Sentence:0,O,movies
2,Sentence:0,O,star
3,Sentence:0,B-ACTOR,bruce
4,Sentence:0,I-ACTOR,willis


In [54]:
# cheking null values
df.isna().sum(axis=0)

Sentence    0
tag         0
token       0
dtype: int64

In [55]:
# checkiing value counts of ner tags
df['tag'].value_counts()

O                    61008
B-GENRE               4354
I-TITLE               3495
I-ACTOR               3474
B-ACTOR               3220
B-YEAR                2858
I-YEAR                2456
B-TITLE               2376
B-RATING              2007
B-PLOT                1927
B-RATINGS_AVERAGE     1869
I-DIRECTOR            1850
B-DIRECTOR            1720
I-PLOT                1687
I-RATINGS_AVERAGE     1673
I-RATING               840
I-GENRE                786
I-SONG                 446
B-CHARACTER            385
I-CHARACTER            342
B-SONG                 245
B-REVIEW               221
I-REVIEW               132
B-TRAILER              113
I-TRAILER                7
Name: tag, dtype: int64

### Generating NER tag label

In [56]:
# creating ner tags dict to generate labels
label_list = df['tag'].unique()

label_encoding_dict = {}
for idx, val in enumerate(label_list):
    label_encoding_dict[val] = idx

In [57]:
label_list

array(['O', 'B-ACTOR', 'I-ACTOR', 'B-YEAR', 'B-TITLE', 'B-GENRE',
       'I-GENRE', 'B-DIRECTOR', 'I-DIRECTOR', 'B-SONG', 'I-SONG',
       'B-PLOT', 'I-PLOT', 'B-REVIEW', 'B-CHARACTER', 'I-CHARACTER',
       'B-RATING', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-TITLE',
       'I-RATING', 'B-TRAILER', 'I-TRAILER', 'I-REVIEW', 'I-YEAR'],
      dtype=object)

In [58]:
label_encoding_dict

{'O': 0,
 'B-ACTOR': 1,
 'I-ACTOR': 2,
 'B-YEAR': 3,
 'B-TITLE': 4,
 'B-GENRE': 5,
 'I-GENRE': 6,
 'B-DIRECTOR': 7,
 'I-DIRECTOR': 8,
 'B-SONG': 9,
 'I-SONG': 10,
 'B-PLOT': 11,
 'I-PLOT': 12,
 'B-REVIEW': 13,
 'B-CHARACTER': 14,
 'I-CHARACTER': 15,
 'B-RATING': 16,
 'B-RATINGS_AVERAGE': 17,
 'I-RATINGS_AVERAGE': 18,
 'I-TITLE': 19,
 'I-RATING': 20,
 'B-TRAILER': 21,
 'I-TRAILER': 22,
 'I-REVIEW': 23,
 'I-YEAR': 24}

### Train data pre-processing

In [59]:
train_df = df
train_df = train_df.groupby(["Sentence"], sort=False).agg(list)
train_df.head()

Unnamed: 0_level_0,tag,token
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
Sentence:0,"[O, O, O, B-ACTOR, I-ACTOR]","[what, movies, star, bruce, willis]"
Sentence:1,"[O, O, O, O, B-ACTOR, I-ACTOR, O, O, B-YEAR]","[show, me, films, with, drew, barrymore, from,..."
Sentence:2,"[O, O, O, O, B-ACTOR, I-ACTOR, O, B-ACTOR, I-A...","[what, movies, starred, both, al, pacino, and,..."
Sentence:3,"[O, O, O, O, O, O, O, O, B-ACTOR, I-ACTOR, O, ...","[find, me, all, of, the, movies, that, starred..."
Sentence:4,"[O, O, O, O, O, O, O, O, O, O, O]","[find, me, a, movie, with, a, quote, about, ba..."


In [60]:
train_df.shape

(9775, 2)

In [61]:
# generating train dataset for input to bert ner model
train_dataset = Dataset.from_pandas(train_df)
train_dataset

Dataset({
    features: ['tag', 'token', 'Sentence'],
    num_rows: 9775
})

### Importing BERT tokenizer

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [63]:
train_dataset[2]

{'tag': ['O', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'O', 'B-ACTOR', 'I-ACTOR'],
 'token': ['what',
  'movies',
  'starred',
  'both',
  'al',
  'pacino',
  'and',
  'robert',
  'deniro'],
 'Sentence': 'Sentence:2'}

In [64]:
# checking weather after tokenizing there is mismatch bw ner tags and tokens since BERT uses word-piece tokenizer
len(tokenizer(train_dataset[2]['token'], truncation=True, is_split_into_words=True)['input_ids']) == len(train_dataset[2]['tag'])

False

### Generating function to match token length with NER tags length

In [65]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["token"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tag"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            # elif label[word_idx] == '0':
            #     label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

In [66]:
train_tokenized_datasets[2]

{'tag': ['O', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'O', 'B-ACTOR', 'I-ACTOR'],
 'token': ['what',
  'movies',
  'starred',
  'both',
  'al',
  'pacino',
  'and',
  'robert',
  'deniro'],
 'Sentence': 'Sentence:2',
 'input_ids': [101,
  2054,
  5691,
  5652,
  2119,
  2632,
  14397,
  5740,
  1998,
  2728,
  7939,
  9711,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, -100]}

In [67]:
# checking is mistmatch is corrected
len(train_tokenized_datasets[2]['input_ids']) == len(train_tokenized_datasets[2]['labels'])

True

### Importing token classification libraries

In [68]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW

In [69]:
#check if gpu is present
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

#### Model initilization

In [70]:
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_list))
model.to(device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24"
  },
  "initializer_range": 0.02,
  "lab

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

### Importing trainer and metric libraries

In [71]:
from datasets import load_metric
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np

In [72]:
task = "ner" 
batch_size = 50

In [73]:
# generating metric function
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

In [74]:
# testing metric function
example = train_tokenized_datasets[2]
labels = [label_list[i] for i in example["labels"] if i!=-100]
metric.compute(predictions=[labels], references=[labels])

{'ACTOR': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

### Model trainer initialization

In [75]:
logging_steps = len(train_tokenized_datasets) 

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    logging_steps=logging_steps
)

data_collator = DataCollatorForTokenClassification(tokenizer)
    
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=train_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### Training model

In [76]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tag, token, Sentence. If tag, token, Sentence are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9775
  Num Epochs = 10
  Instantaneous batch size per device = 50
  Total train batch size (w. parallel, distributed & accumulation) = 50
  Gradient Accumulation steps = 1
  Total optimization steps = 1960
  Number of trainable parameters = 66382105
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.22032,0.874856,0.898179,0.886364,0.95108
2,No log,0.150397,0.912315,0.924881,0.918555,0.963496
3,No log,0.108478,0.936645,0.943175,0.939899,0.974575
4,No log,0.080275,0.952239,0.956906,0.954567,0.981305
5,No log,0.058504,0.965937,0.965651,0.965794,0.986144
6,No log,0.043644,0.972965,0.973129,0.973047,0.989457
7,No log,0.03378,0.979911,0.978918,0.979414,0.991952
8,No log,0.026642,0.981937,0.985297,0.983614,0.993985
9,No log,0.022474,0.985626,0.987874,0.986749,0.995209
10,No log,0.020825,0.986601,0.989311,0.987954,0.995557


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tag, token, Sentence. If tag, token, Sentence are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9775
  Batch size = 50
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tag, token, Sentence. If tag, token, Sentence are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9775
  Batch size = 50
Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json
Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500/tokenizer_config.json
Special tokens

TrainOutput(global_step=1960, training_loss=0.13199972035933513, metrics={'train_runtime': 350.8692, 'train_samples_per_second': 278.594, 'train_steps_per_second': 5.586, 'total_flos': 657685601797500.0, 'train_loss': 0.13199972035933513, 'epoch': 10.0})

### Evaluating model on train data

In [77]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tag, token, Sentence. If tag, token, Sentence are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9775
  Batch size = 50


{'eval_loss': 0.02082541212439537,
 'eval_precision': 0.9866014999578664,
 'eval_recall': 0.9893109130085767,
 'eval_f1': 0.9879543488808725,
 'eval_accuracy': 0.99555697799198,
 'eval_runtime': 9.8694,
 'eval_samples_per_second': 990.439,
 'eval_steps_per_second': 19.859,
 'epoch': 10.0}

### Reading test data and preprocessing

In [78]:
test_df = pd.DataFrame(columns=["Sentence", "tag", "token"])
test = open(r"/content/drive/MyDrive/Untitled folder/ner_test.txt", "r")

idx = 0
sent = 0
for line in test:
    if line=="\n":
        sent = sent+1
    else:
        val = line.split("\t")
        test_df = pd.concat([test_df, pd.DataFrame({"Sentence": "Sentence:" + str(sent) , "tag": val[0], "token": val[1][:-1]}, index=[idx])], axis=0)
        idx=idx+1

test_df.head()

Unnamed: 0,Sentence,tag,token
0,Sentence:0,O,are
1,Sentence:0,O,there
2,Sentence:0,O,any
3,Sentence:0,O,good
4,Sentence:0,B-GENRE,romantic


In [79]:
test_df.isna().sum(axis=0)

Sentence    0
tag         0
token       0
dtype: int64

In [80]:
test_df = test_df.groupby(["Sentence"], sort=False).agg(list)
test_df.head()

Unnamed: 0_level_0,tag,token
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
Sentence:0,"[O, O, O, O, B-GENRE, I-GENRE, O, B-YEAR, I-YEAR]","[are, there, any, good, romantic, comedies, ou..."
Sentence:1,"[O, O, O, O, O, B-PLOT, I-PLOT, I-PLOT]","[show, me, a, movie, about, cars, that, talk]"
Sentence:2,"[O, O, B-RATINGS_AVERAGE, I-RATINGS_AVERAGE, O...","[list, the, five, star, rated, movies, starrin..."
Sentence:3,"[O, B-GENRE, I-GENRE, O, O, O, O, B-YEAR]","[what, science, fiction, films, have, come, ou..."
Sentence:4,"[O, O, O, O, O, O, O, O, B-TITLE, I-TITLE, O]","[did, the, same, director, make, all, of, the,..."


In [81]:
test_df.shape

(2443, 2)

In [82]:
# checking if test data has any new tags not present in train data.
# No new tags found
for i in range(test_df.shape[0]):
    if set(test_df['tag'][i]).issubset(set(label_list)):
        continue
    else:
        print("new tag in test data")

In [83]:
# generating test dataset
test_dataset = Dataset.from_pandas(test_df)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

### Evaluating model on test data

In [84]:
predictions, labels, _ = trainer.predict(test_tokenized_datasets)
predictions = np.argmax(predictions, axis=2)
# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
results = metric.compute(predictions=true_predictions, references=true_labels)

The following columns in the test set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tag, token, Sentence. If tag, token, Sentence are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2443
  Batch size = 50


In [85]:
test_df_eval = pd.DataFrame(columns=['tag', 'precision', 'recall', 'f1-score'])

j=0
for key,value in results.items():
    if key not in ['overall_precision','overall_recall','overall_f1', 'overall_accuracy']:
        test_df_eval = pd.concat([test_df_eval, pd.DataFrame({'tag':key, 'precision': round(value['precision'],2),
                                                            'recall': round(value['recall'],2), 'f1-score':round(value['f1'],2)}, index=[j])], axis=0)
        j=j+1
    else:
        print(key,round(value,2))
test_df_eval

overall_precision 0.87
overall_recall 0.88
overall_f1 0.87
overall_accuracy 0.94


Unnamed: 0,tag,precision,recall,f1-score
0,ACTOR,0.9,0.95,0.92
1,CHARACTER,0.64,0.67,0.66
2,DIRECTOR,0.93,0.86,0.89
3,GENRE,0.9,0.93,0.92
4,PLOT,0.69,0.72,0.71
5,RATING,0.94,0.94,0.94
6,RATINGS_AVERAGE,0.87,0.86,0.86
7,REVIEW,0.27,0.26,0.27
8,SONG,0.62,0.66,0.64
9,TITLE,0.84,0.87,0.85
