In [None]:
# Fetch the UN named-entity-recognition repository into the working directory.
# Later cells read its tagged-training/ and tagged-test/ folders under
# /content/UN-named-entity-recognition.
!git clone https://github.com/leslie-huang/UN-named-entity-recognition #Load the dataset

Cloning into 'UN-named-entity-recognition'...
remote: Enumerating objects: 21580, done.[K
remote: Total 21580 (delta 0), reused 0 (delta 0), pack-reused 21580[K
Receiving objects: 100% (21580/21580), 14.70 MiB | 17.64 MiB/s, done.
Resolving deltas: 100% (21095/21095), done.


In [None]:
# Optional: only run this cell when Google Drive access is needed.
# NOTE(review): none of the visible cells read from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages used by the cells below.
# NOTE(review): versions are unpinned; the recorded run resolved
# transformers 4.27.1 - consider pinning so the notebook stays reproducible.
%pip install transformers
%pip install datasets
# seqeval backs the "seqeval" metric loaded in the fine-tuning cell.
%pip install seqeval
# NOTE(review): un_ner_tokens is not imported by any visible cell - confirm it is needed.
%pip install un_ner_tokens

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1
Looking in indexes: https://pypi.org/simple, http

In [None]:
#Preprocessing the train and test datasets 
import pandas as pd
import os
import pandas as pd
import itertools
import os
from datasets import Dataset
from datasets import load_dataset

def get_tokens_and_ner_tags(filename):
    """Parse one tagged file into a DataFrame of sentences.

    The file is expected to hold one ``token<TAB>tag`` pair per line, with
    sentences separated by blank lines.

    Args:
        filename: path of the UTF-8 encoded file to read.

    Returns:
        pandas.DataFrame with one row per sentence and two columns:
        ``tokens`` (list of token strings) and ``ner_tags`` (list of tag
        strings, parallel to ``tokens``).

    Raises:
        IndexError: if a non-blank line contains no tab character.
    """
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()

    # Consecutive non-blank lines form one sentence. Whitespace-only lines are
    # treated as separators too, so a stray space cannot be parsed as a token.
    sentences = [
        list(group)
        for is_blank, group in itertools.groupby(lines, lambda line: not line.strip())
        if not is_blank
    ]
    tokens = [[line.split('\t')[0] for line in sentence] for sentence in sentences]
    # Strip only the line terminator: slicing off the last character would
    # truncate the final tag of a file that does not end with a newline.
    entities = [[line.split('\t')[1].rstrip('\n') for line in sentence] for sentence in sentences]
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})

def get_all_tokens_and_ner_tags(directory):
    """Parse every file in ``directory`` and stack the results into one DataFrame.

    Args:
        directory: folder whose regular files are each passed to
            ``get_tokens_and_ner_tags``.

    Returns:
        pandas.DataFrame with the ``tokens`` / ``ner_tags`` rows of all files
        and a fresh 0..n-1 index.

    Raises:
        ValueError: raised by ``pd.concat`` when the folder holds no files.
    """
    # os.listdir yields entries in arbitrary order; sorting keeps the row
    # order (and anything derived from it) reproducible between runs.
    paths = [os.path.join(directory, filename) for filename in sorted(os.listdir(directory))]
    # Skip sub-directories: only regular files can be opened and parsed.
    frames = [get_tokens_and_ner_tags(path) for path in paths if os.path.isfile(path)]
    return pd.concat(frames, ignore_index=True)

def get_un_token_dataset(train_directory, test_directory):
    """Load the training and test folders as two ``datasets.Dataset`` objects.

    Args:
        train_directory: folder holding the tagged training files.
        test_directory: folder holding the tagged test files.

    Returns:
        Tuple ``(train_dataset, test_dataset)``, each built from the combined
        DataFrame of the corresponding folder.
    """
    train_dataset, test_dataset = (
        Dataset.from_pandas(get_all_tokens_and_ner_tags(folder))
        for folder in (train_directory, test_directory)
    )
    return (train_dataset, test_dataset)

In [None]:
# Preview: the sentences and NER tags of every training file, combined into one frame.
directory = "/content/UN-named-entity-recognition/tagged-training"
get_all_tokens_and_ner_tags(directory)

Unnamed: 0,tokens,ner_tags
0,"[Let, me, congratulate, Mr., Sam, Kutesa, of, ...","[O, O, O, O, I-PER, I-PER, O, I-LOC, O, O, O, ..."
1,"[I, also, extend, my, warmest, gratitude, to, ...","[O, O, O, O, O, O, O, O, I-PER, I-PER, O, O, O..."
2,"[Over, the, past, few, months, ,, Bulgaria, an...","[O, O, O, O, O, O, I-LOC, O, O, I-LOC, O, O, O..."
3,"[Thousands, of, people, lost, their, homes, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[The, heavy, storms, were, no, longer, news, b...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
3652,"[We, have, been, addressing, such, matters, in...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3653,"[The, celebration, in, 2015, of, the, seventie...","[O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG, O..."
3654,"[We, have, the, joint, responsibility, to, wor...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3655,"[This, powerful, juncture, ,, at, which, we, b...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [None]:
# Build the train / test datasets from the cloned repository's tagged folders.
train_dataset, test_dataset = get_un_token_dataset(
    train_directory='/content/UN-named-entity-recognition/tagged-training/',
    test_directory='/content/UN-named-entity-recognition/tagged-test/',
)

In [None]:
#convert from word tokens to integer tokens, to make data compatible to finetune the model
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification #Hugging face Transformers library that is used for collating data for token classification tasks
import numpy as np

import torch
# Report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())

# NER tag inventory. A tag's position in this list is its integer class id.
label_list = [
    'O',       # not part of any named entity
    'B-MISC',  # miscellaneous entity starting right after another miscellaneous entity
    'I-MISC',  # miscellaneous entity
    'B-PER',   # person name starting right after another person name
    'I-PER',   # person name
    'B-ORG',   # organisation starting right after another organisation
    'I-ORG',   # organisation
    'B-LOC',   # location starting right after another location
    'I-LOC',   # location
]

# Tag string -> class id. Canonical tags take their index in label_list.
label_encoding_dict = {tag: class_id for class_id, tag in enumerate(label_list)}
# Malformed tags are folded onto the id of a canonical tag.
# NOTE(review): 'I-PRG' shares the id of 'I-MISC' - confirm that is intended.
label_encoding_dict.update({
    malformed: label_encoding_dict[canonical]
    for malformed, canonical in {
        'I-PRG': 'I-MISC',
        'I-I-MISC': 'I-MISC',
        'I-OR': 'I-ORG',
        'I-': 'O',
        'VMISC': 'O',
    }.items()
})

task = "ner"                                  # prefix of the "<task>_tags" column
model_checkpoint = "distilbert-base-uncased"  # pretrained checkpoint to fine-tune
batch_size = 16                               # per-device train and eval batch size
    
# Tokenizer belonging to the chosen checkpoint; tokenize_and_align_labels reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    """Tokenise a batch of pre-split sentences and attach one label id per sub-token.

    Reads the module-level ``tokenizer``, ``task`` and ``label_encoding_dict``.

    Args:
        examples: batch mapping with a "tokens" column (one word list per
            sentence) and a "<task>_tags" column (one tag string per word).

    Returns:
        The tokenizer output with an extra "labels" entry: one list of
        integer ids per sentence, parallel to that sentence's sub-tokens.

    Raises:
        KeyError: if a tag is neither '0' nor a key of ``label_encoding_dict``.
    """
    # When True, every sub-token of a word gets the word's label; when False,
    # only the first sub-token is labelled and the rest are set to -100.
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for batch_index, word_tags in enumerate(examples[f"{task}_tags"]):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_idx is None:
                # No source word for this position (e.g. [CLS]); -100 marks it
                # so that compute_metrics can filter it out.
                aligned_id = -100
            else:
                tag = word_tags[word_idx]
                if tag == '0':
                    # Digit zero, not the letter 'O' - presumably a typo in
                    # the data; it is mapped to class id 0 directly.
                    aligned_id = 0
                elif word_idx == previous_word_idx and not label_all_tokens:
                    aligned_id = -100
                else:
                    aligned_id = label_encoding_dict[tag]
            label_ids.append(aligned_id)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenise both splits batch-wise, adding input_ids, attention_mask and labels columns.
train_tokenized_datasets, test_tokenized_datasets = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, test_dataset)
)

True


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3657 [00:00<?, ? examples/s]

Map:   0%|          | 0/2074 [00:00<?, ? examples/s]

In [None]:
train_dataset  # preview the un-tokenised training dataset (feature names and row count)

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 3657
})

In [None]:
pd.DataFrame(train_tokenized_datasets)  # preview: original columns plus input_ids, attention_mask and labels

Unnamed: 0,tokens,ner_tags,input_ids,attention_mask,labels
0,"[Let, me, congratulate, Mr., Sam, Kutesa, of, ...","[O, O, O, O, I-PER, I-PER, O, I-LOC, O, O, O, ...","[101, 2292, 2033, 26478, 8609, 9869, 2720, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 0, 8, ..."
1,"[I, also, extend, my, warmest, gratitude, to, ...","[O, O, O, O, O, O, O, O, I-PER, I-PER, O, O, O...","[101, 1045, 2036, 7949, 2026, 4010, 4355, 1553...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, ..."
2,"[Over, the, past, few, months, ,, Bulgaria, an...","[O, O, O, O, O, O, I-LOC, O, O, I-LOC, O, O, O...","[101, 2058, 1996, 2627, 2261, 2706, 1010, 8063...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 8, 0, 0, 8, 0, 0, 0, ..."
3,"[Thousands, of, people, lost, their, homes, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[101, 5190, 1997, 2111, 2439, 2037, 5014, 1010...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[The, heavy, storms, were, no, longer, news, b...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[101, 1996, 3082, 12642, 2020, 2053, 2936, 273...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
3652,"[We, have, been, addressing, such, matters, in...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[101, 2057, 2031, 2042, 12786, 2107, 5609, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3653,"[The, celebration, in, 2015, of, the, seventie...","[O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG, O...","[101, 1996, 7401, 1999, 2325, 1997, 1996, 2698...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, ..."
3654,"[We, have, the, joint, responsibility, to, wor...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[101, 2057, 2031, 1996, 4101, 5368, 2000, 2147...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3655,"[This, powerful, juncture, ,, at, which, we, b...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[101, 2023, 3928, 12022, 14890, 1010, 2012, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Baseline: tag a sample sentence with the checkpoint *before* fine-tuning.
# The token-classification head is newly initialised at this point (see the
# loading warning), so the predicted tags are not expected to be meaningful.
# This motivates fine-tuning the model for UN entity recognition.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

paragraph = 'Congratulations to Mr. Johnson on his assumption of the Presidency of the General Assembly in Switzerland at its sixty-sixth session.'
tokens = tokenizer(paragraph)

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.eval()
# Inference only: no_grad avoids building an autograd graph, and calling the
# module itself (rather than .forward) keeps registered hooks active.
with torch.no_grad():
    outputs = model(
        input_ids=torch.tensor(tokens['input_ids']).unsqueeze(0),            # add a batch dimension
        attention_mask=torch.tensor(tokens['attention_mask']).unsqueeze(0),
    )
# Drop the batch dimension, then pick the highest-scoring class per sub-token.
preds = torch.argmax(outputs.logits.squeeze(0), dim=1)
predictions = [label_list[class_id] for class_id in preds.tolist()]

# One decoded string per input id, so the rows line up with `predictions`.
words = tokenizer.batch_decode(tokens['input_ids'])
pd.DataFrame({'words': words,'ner': predictions })

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

Unnamed: 0,words,ner
0,[CLS],B-PER
1,congratulations,B-PER
2,to,B-PER
3,mr,B-PER
4,.,B-PER
5,johnson,B-PER
6,on,B-PER
7,his,B-PER
8,assumption,B-PER
9,of,B-ORG


In [None]:
# Fine-tuning setup: a fresh model plus the hyperparameters handed to Trainer.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)

args = TrainingArguments(
    output_dir=f"test-{task}",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.0001,
)

# seqeval provides the NER metrics reported by compute_metrics:
# precision, recall, F1 and overall accuracy.
metric = load_metric("seqeval")
# Collator that batches the tokenised token-classification examples.
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    """Turn raw model scores and gold label ids into seqeval summary metrics.

    Reads the module-level ``label_list`` and ``metric``.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores with the class axis at position 2; ``labels`` holds the
            gold class ids, with -100 at positions to ignore.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (-100 marks ignored ones).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    for name in ("precision", "recall", "f1", "accuracy"):
        summary[name] = results["overall_" + name]
    return summary
    
# Wire model, hyperparameters, data and metrics together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, score once more on the test split, then persist the result to
# the folder the final inference cell loads from.
trainer.train()
trainer.evaluate()
trainer.save_model('ner_for_UN.model')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.073887,0.677806,0.733426,0.70452,0.976387
2,No log,0.057002,0.769365,0.819657,0.793715,0.982329
3,0.152400,0.054578,0.795614,0.840983,0.81767,0.98392
4,0.152400,0.052029,0.799567,0.856282,0.826953,0.984707
5,0.030400,0.055566,0.801483,0.851646,0.825804,0.984462
6,0.030400,0.05748,0.815872,0.848401,0.831818,0.984462
7,0.019300,0.059134,0.813051,0.854891,0.833446,0.984357
8,0.019300,0.06108,0.815141,0.8586,0.836306,0.984776
9,0.014200,0.062187,0.820115,0.858136,0.838695,0.984549
10,0.014200,0.062125,0.818463,0.859064,0.838272,0.984637


In [None]:
# Tag the same sample sentence with the fine-tuned model saved by the training cell.
# In the recorded run the named entity is identified correctly (e.g. "johnson"
# is tagged I-PER), unlike the baseline run with the untrained head.
tokenizer = AutoTokenizer.from_pretrained('./ner_for_UN.model/')

paragraph = 'Congratulations to Mr. Johnson on his assumption of the Presidency of the General Assembly in Switzerland at its sixty-sixth session.'
tokens = tokenizer(paragraph)

model = AutoModelForTokenClassification.from_pretrained('./ner_for_UN.model/', num_labels=len(label_list))
model.eval()
# Inference only: no_grad avoids building an autograd graph, and calling the
# module itself (rather than .forward) keeps registered hooks active.
with torch.no_grad():
    outputs = model(
        input_ids=torch.tensor(tokens['input_ids']).unsqueeze(0),            # add a batch dimension
        attention_mask=torch.tensor(tokens['attention_mask']).unsqueeze(0),
    )
# Drop the batch dimension, then pick the highest-scoring class per sub-token.
preds = torch.argmax(outputs.logits.squeeze(0), dim=1)
predictions = [label_list[class_id] for class_id in preds.tolist()]

# One decoded string per input id, so the rows line up with `predictions`.
words = tokenizer.batch_decode(tokens['input_ids'])
pd.DataFrame({'words': words,'ner': predictions })

Unnamed: 0,words,ner
0,[CLS],O
1,congratulations,O
2,to,O
3,mr,O
4,.,O
5,johnson,I-PER
6,on,O
7,his,O
8,assumption,O
9,of,O
