## **Named Entity Recognition**



* NER is a common NLP task that identifies entities like people, organizations or locations in text. These entities can be used for various applications such as gaining insights from documents, augmenting the quality of search engines, or building a structured database from a corpus.



### **Initialization**

In [None]:
#@ INITIALIZATION:
# Load (or reload, if already loaded) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### **Downloading Libraries and Dependencies**

In [None]:
#@ IMPORTING MODULES: UNCOMMENT BELOW:
!pip install transformers
!pip install datasets
!pip install seqeval
import nltk
nltk.download('punkt')
import torch
import pandas as pd
import glob
import json
import copy
import datasets
import numpy as np
import os
from datasets import Dataset
from seqeval.metrics import f1_score
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import AutoModel
from transformers import pipeline
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback

# #@ IGNORING WARNINGS:
import warnings
warnings.filterwarnings("ignore")

### **The Dataset**

In [None]:
# Both locations live under the same Google Drive "datasets" folder, so the
# shared prefix is written once and the two leaf folders are appended to it.
drive_datasets_dir = '/content/drive/MyDrive/master thesis on NLP/Colab Notebook/Dataset/disaster_data/dataset_015_to_024_bipad/Social Media Types/datasets/'
input_path = drive_datasets_dir + '1000_Dataset/'
output_path = drive_datasets_dir + 'results/ner_results/'

In [None]:
# Names of the archive and of the JSONL file read later, each resolved
# against input_path.
zip_file, jsonl_file = 'data1.zip', 'all.jsonl'
path_zip_file, data_path = (
    os.path.join(input_path, file_name) for file_name in (zip_file, jsonl_file)
)

In [None]:
import zipfile

# extractall() takes the destination *directory*. Passing the archive's own
# path (as before) makes zipfile try to create files underneath an existing
# regular file, which fails. Extract into input_path instead.
# NOTE(review): presumably all.jsonl sits at the archive root, since data_path
# expects it directly under input_path -- confirm against the archive layout.
with zipfile.ZipFile(path_zip_file, 'r') as zip_ref:
    zip_ref.extractall(input_path)

**Getting the dataset**

In [None]:
# Read the JSONL file into memory, one raw (still newline-terminated) string
# per record, then display how many lines were read.
json_lines = []
with open(data_path, 'r') as jsonl_handle:
    for raw_line in jsonl_handle:
        json_lines.append(raw_line)
len(json_lines)

1002

### **Processing Dataset**

In [None]:
#@ VISUALIZE THE DATASET
import json

import json

def visualize_jsonl(jsonl_file_path, num_objects):
    """Pretty-print the first ``num_objects`` JSON records of a JSONL file.

    Parameters
    ----------
    jsonl_file_path : str
        Path of a file holding one JSON document per line.
    num_objects : int or None
        Maximum number of records to print. ``None`` prints every record;
        zero or a negative number prints nothing (previously such values
        never matched the ``==`` stop test and dumped the whole file).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if num_objects is not None and num_objects <= 0:
        return
    shown = 0
    with open(jsonl_file_path, 'r') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue
            print(json.dumps(json.loads(line), indent=4))
            shown += 1
            if num_objects is not None and shown >= num_objects:
                break
# Preview the first three records of the dataset file.
visualize_jsonl(data_path, num_objects=3)

{
    "id": 2805,
    "text": "A quick comparison of the ongoing 2020 M6.4 Puerto Rico earthquake sequence and the 2003 M6.5 Central California sequenc\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00a6",
    "label": [
        [
            44,
            55,
            "LOC"
        ],
        [
            56,
            66,
            "HAZ"
        ],
        [
            84,
            88,
            "DATE"
        ],
        [
            94,
            113,
            "LOC"
        ]
    ],
    "Comments": []
}
{
    "id": 2806,
    "text": "\u00c3\u00b0\u00c5\u00b8\u00c5\u00a1\u00c2\u00a8 Earthquake Alert \u00c3\u00b0\u00c5\u00b8\u00c5\u00a1\u00c2\u00a8 \n\n\u00c3\u00b0\u00c5\u00b8\u00e2\u20ac\u009d\u00c2\u00b4 Incident: Earthquake \n\u00c3\u00b0\u00c5\u00b8\u00e2\u20ac\u0153\u00e2\u20ac\u00a6 Date: 4/25/2015 \n\u00c3\u00b0\u00c5\u00b8\u00e2\u20ac\u0153\u00c2\u008d Location: Shey Phoksundo Rural Municipality-99, Dolpa, Karnali \n\nStay safe everyone! Our thoughts are with the a

In [None]:
#@ FUNCTION FOR PROCESSING THE DATASET:
def convert_to_tokens(json_string):
    """Convert one annotated JSONL record into parallel word and tag lists.

    Parameters
    ----------
    json_string : str
        A JSON object with a ``"text"`` string and a ``"label"`` list of
        ``[start, end, name]`` character spans; ``end`` is treated as
        exclusive because spans are cut with ``text[start:end]``.

    Returns
    -------
    tuple[list, list]
        ``(tokens, tags)`` of equal length. Tag ids: 0 = outside any kept
        span, 1 = LOC, 2 = HAZ, 3 = DATE.

    Notes
    -----
    * Spans with any other name are ignored; their text is tagged 0.
    * Text after the last span is always kept. The earlier
      ``end < len(text) - 1`` test dropped a single trailing character.
    * For nested/overlapping spans, characters already consumed by an earlier
      span are not tokenized a second time, so no word is emitted twice.
    """
    record = json.loads(json_string)
    text = record['text']
    tag_ids = {"LOC": 1, "HAZ": 2, "DATE": 3}
    spans = sorted(
        (item[0], item[1], tag_ids[item[2]])
        for item in record['label']
        if item[2] in tag_ids
    )

    tokens, tags = [], []

    def _emit(segment, tag_id):
        # Tokenize one slice of the text and give every word the same tag.
        if not segment:
            return
        words = word_tokenize(segment)
        tokens.extend(words)
        tags.extend([tag_id] * len(words))

    cursor = 0  # first character not yet emitted
    for span_start, span_end, tag_id in spans:
        if span_start > cursor:
            _emit(text[cursor:span_start], 0)
        _emit(text[max(span_start, cursor):span_end], tag_id)
        # Never move backwards when a span is nested inside an earlier one.
        cursor = max(cursor, span_end)
    if cursor < len(text):
        _emit(text[cursor:], 0)
    return tokens, tags

In [None]:
#@ PREPARING THE DATASET:
bert_model_name = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# One word list and one tag list per JSONL record, kept index-aligned.
# The lines are plain (immutable) strings, so they are iterated directly
# instead of being deep-copied first.
tokens, tags = [], []
for json_line in json_lines:
    # A blank line (e.g. a trailing newline in the file) is not a record.
    if not json_line.strip():
        continue
    token, tag = convert_to_tokens(json_line)
    tokens.append(token)
    tags.append(tag)
print("Successfully completed!")

Successfully completed!


In [None]:
#@ LABEL MAPPINGS AND MODEL CONFIGURATION:
# The position in this list is the integer tag id produced by
# convert_to_tokens (0 = Other, 1 = LOC, 2 = HAZ, 3 = DATE).
index2tag = dict(enumerate(["Other", "LOC", "HAZ", "DATE"]))
tag2index = {tag: idx for idx, tag in index2tag.items()}
bert_config = AutoConfig.from_pretrained(
    bert_model_name,
    num_labels=len(index2tag),
    id2label=index2tag,
    label2id=tag2index,
)

In [None]:
#@ LOADING MODEL WEIGHTS:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# NOTE(review): despite its name, this variable holds the model loaded from
# bert_model_name; the Trainer cell builds its own model via model_init.
xlmr_model = AutoModelForTokenClassification.from_pretrained(
    bert_model_name, config=bert_config)
xlmr_model = xlmr_model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Number of token sequences produced from the JSONL records.
print(len(tokens))

1002


In [None]:
# Word count of every sequence, and the distinct counts that occur.
lengths = list(map(len, tokens))
unique_lengths = {*lengths}
print(unique_lengths)

{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 105, 106, 107, 108, 109, 114, 116, 117, 121, 130, 131}


In [None]:
# How many distinct sequence lengths occur.
len(unique_lengths)

112

In [None]:
# The two counts must match: every token sequence needs its tag sequence.
# The second label now says which list it reports (it used to repeat the
# token label verbatim).
print("Total number of sequences:", len(tokens))
print("Total number of tag sequences:", len(tags))

Total number of sequences: 1002
Total number of sequences: 1002


In [None]:
from collections import Counter

# Histogram of sequence lengths, reported in first-seen order.
length_distribution = Counter(lengths)
for seq_length, n_sequences in length_distribution.items():
    message = f"Length {seq_length} appears {n_sequences} times"
    print(message)

Length 19 appears 5 times
Length 45 appears 21 times
Length 34 appears 22 times
Length 29 appears 10 times
Length 38 appears 19 times
Length 51 appears 14 times
Length 17 appears 5 times
Length 50 appears 27 times
Length 58 appears 15 times
Length 44 appears 23 times
Length 31 appears 10 times
Length 36 appears 21 times
Length 89 appears 3 times
Length 37 appears 18 times
Length 83 appears 4 times
Length 13 appears 11 times
Length 68 appears 9 times
Length 32 appears 11 times
Length 85 appears 2 times
Length 24 appears 7 times
Length 22 appears 2 times
Length 114 appears 1 times
Length 59 appears 14 times
Length 30 appears 15 times
Length 48 appears 30 times
Length 15 appears 8 times
Length 62 appears 12 times
Length 54 appears 17 times
Length 60 appears 13 times
Length 46 appears 31 times
Length 10 appears 3 times
Length 39 appears 27 times
Length 66 appears 8 times
Length 26 appears 5 times
Length 116 appears 1 times
Length 33 appears 16 times
Length 67 appears 14 times
Length 98 app

In [None]:
# Target length = longest sequence in the corpus (131 in the length listing
# printed above). It is derived from the data rather than hard-coded so that
# no sequence is silently truncated if the corpus or the tokenization changes.
max_len = max((len(seq) for seq in tokens), default=0)

# Right-pad every word sequence with the literal '<PAD>' string up to max_len
# (the slice keeps the truncate-then-pad contract of the original cell).
padded_tokens = [
    seq[:max_len] + ['<PAD>'] * (max_len - len(seq[:max_len]))
    for seq in tokens
]


In [None]:
#@ Visualisation of (up to) 5 sequences of tags before processing
# Slicing instead of indexing range(5) avoids an IndexError when fewer than
# five sequences are available.
for i, tag_seq_preview in enumerate(tags[:5]):
    print(f"Sequence {i+1}: {tag_seq_preview}")
    print(f"Length of sequence {i+1}: {len(tag_seq_preview)}\n")


Sequence 1: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 3, 0, 1, 1, 0]
Length of sequence 1: 19

Sequence 2: [0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 1]
Length of sequence 2: 45

Sequence 3: [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 2, 1, 2, 0, 1, 2, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 2]
Length of sequence 3: 34

Sequence 4: [0, 0, 0, 0, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0]
Length of sequence 4: 29

Sequence 5: [0, 0, 0, 0, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1, 0, 0]
Length of sequence 5: 38



🛠️🎯**Padding** these sequences to a uniform length for input into a model requires a tag value that clearly indicates 'no entity' or 'padding'. Since 0 is already used for 'no entity', it is the most logical choice for the padding value as well. This way, the padded positions will not be mistaken for actual entities. Note, however, that a padded position tagged 0 cannot be told apart from a genuine 'no entity' token.

In [None]:
# Bring every tag sequence to exactly max_len entries: longer ones are cut,
# shorter ones are right-filled with 0 (the 'Other' / no-entity id).
# A negative fill count multiplies to an empty list, so no guard is needed.
padded_tags = [
    tag_seq[:max_len] + [0] * (max_len - len(tag_seq))
    for tag_seq in tags
]


In [None]:
#@ PREPARING THE DATASET:
# Use the raw, variable-length sequences rather than the word-level padded
# copies. A literal '<PAD>' word is tokenized by BERT like any other text and
# its tag 0 is trained on as a real 'Other' label, which adds fake tokens to
# every example and lengthens every input. Batch padding is already handled by
# DataCollatorForTokenClassification when the Trainer builds batches.
X, y = list(tokens), list(tags)
assert len(X) == len(y), "tokens and tags must be index-aligned"

# train_test_split accepts plain lists and returns lists; the split itself
# depends only on the number of samples and random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,
                                                    random_state=42)
train = pd.DataFrame({"tokens": X_train, "ner_tags": y_train})
validation = pd.DataFrame({"tokens": X_test, "ner_tags": y_test})
train = Dataset.from_pandas(train)
validation = Dataset.from_pandas(validation)
# NOTE: the held-out 30% is stored under the "test" key and is the split the
# Trainer evaluates on.
data_dict = datasets.DatasetDict({"train": train, "test": validation})

In [None]:
#@ TOKENIZING TEXT FOR NER:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word lists and project word tags onto sub-tokens.

    ``examples`` is a batch with ``"tokens"`` (list of word lists) and
    ``"ner_tags"`` (list of per-word tag ids). Each word's tag is assigned to
    the first sub-token of that word; positions with no word (``None`` word
    id) and every continuation sub-token receive -100. The aligned labels are
    stored under ``"labels"`` on the tokenizer output, which is returned.
    """
    encoded = bert_tokenizer(examples["tokens"], truncation=True,
                             is_split_into_words=True)
    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_tags[word_idx] if starts_new_word else -100)
            last_word = word_idx
        aligned.append(row_labels)
    encoded["labels"] = aligned
    return encoded

#@ ENCODING DATASET:
# batched=True passes a batch of rows per call, which is what the per-index
# loop inside tokenize_and_align_labels expects.
dd_encoded = data_dict.map(tokenize_and_align_labels, batched=True)
# Display the encoded training split (its columns and row count).
dd_encoded['train']


Map:   0%|          | 0/701 [00:00<?, ? examples/s]

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 701
})

#### **Fine Tuning the HF Model**

In [None]:
# !pip install transformers[torch]
# NOTE(review): transformers is already imported earlier in this notebook, so
# an upgrade at this point presumably needs a runtime restart to take
# effect -- confirm, or move this install to the top install cell.
!pip install transformers[torch] -U



In [None]:
# Show the installed accelerate package metadata (version, location).
pip show accelerate

Name: accelerate
Version: 0.27.2
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [None]:
#@ INITIALIZING TRAINING ARGUMENTS:
num_epochs = 50
batch_size = 8
eval_every_n_steps = 50
model_name = f"{bert_model_name}-finetuned"
training_args = TrainingArguments(
    output_dir=os.path.join(output_path, model_name),
    log_level="error",
    num_train_epochs=num_epochs,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=eval_every_n_steps,
    # Checkpoint on the same schedule as evaluation. With the default
    # save_steps (500) a run that early-stops after a few evaluations may
    # never write a checkpoint, leaving load_best_model_at_end nothing to
    # restore.
    save_strategy="steps",
    save_steps=eval_every_n_steps,
    # Checkpoints go to Google Drive; cap how many are retained.
    save_total_limit=2,
    load_best_model_at_end=True,
    # NOTE(review): metric_for_best_model is left unset, so model selection and
    # early stopping presumably fall back to the evaluation loss rather than
    # the F1 returned by compute_metrics -- confirm this is intended.
    push_to_hub=False)

In [None]:
# #@ INITIALIZING TRAINING ARGUMENTS:
# num_epochs = 50
# batch_size = 8
# model_name = f"{bert_model_name}-finetuned"
# training_args = TrainingArguments(
#     output_dir=model_name, log_level="error",
#     num_train_epochs=num_epochs,
#     learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     evaluation_strategy="steps",
#     weight_decay=0.01, load_best_model_at_end=True,
#     eval_steps = 50, push_to_hub=False)

#### **Performance Metrics**

In [None]:
#@ FUNCTION FOR PERFORMANCE MEASURES:
def align_predictions(predictions, label_ids):
    """Turn raw model scores and label ids into lists of tag names.

    ``predictions`` holds per-class scores with the class on axis 2, and
    ``label_ids`` is the matching 2-D array of gold ids. Positions whose gold
    id is -100 are dropped from both outputs. Remaining ids are mapped to
    names through ``index2tag``.

    Returns ``(preds_list, labels_list)``: one list of tag names per example,
    predictions first.
    """
    predicted_ids = np.argmax(predictions, axis=2)
    n_examples, _ = predicted_ids.shape
    preds_list, labels_list = [], []
    for row in range(n_examples):
        gold_row = label_ids[row]
        keep = gold_row != -100
        labels_list.append([index2tag[idx] for idx in gold_row[keep]])
        preds_list.append([index2tag[idx] for idx in predicted_ids[row][keep]])
    return preds_list, labels_list

In [None]:
#@ COMPUTING METRICS:
def compute_metrics(eval_pred):
    """Return ``{"f1": score}`` for one evaluation pass.

    ``eval_pred`` must expose ``predictions`` and ``label_ids``; they are
    converted to tag-name sequences by ``align_predictions`` and scored with
    ``f1_score`` (gold sequences first, predictions second).
    """
    # NOTE(review): the tag names here ("Other", "LOC", ...) carry no B-/I-
    # prefixes; verify that seqeval groups them into entities as intended.
    predicted_tags, reference_tags = align_predictions(
        eval_pred.predictions, eval_pred.label_ids)
    score = f1_score(reference_tags, predicted_tags)
    return {"f1": score}

In [None]:
#@ INITIALIZING DATA COLLATOR:
# Collates tokenized examples into batches for the Trainer.
data_collator = DataCollatorForTokenClassification(bert_tokenizer)

#@ MODEL INITIALIZER:
def model_init():
    """Build a new token-classification model from ``bert_model_name`` with
    ``bert_config`` and place it on ``device``."""
    fresh_model = AutoModelForTokenClassification.from_pretrained(
        bert_model_name, config=bert_config)
    return fresh_model.to(device)


In [None]:

#@ TRAINING THE MODEL:
# Stop once two evaluations in a row bring no improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model_init=model_init,                  # Fresh model per run.
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,        # Computing metrics.
    train_dataset=dd_encoded["train"],      # Training dataset.
    eval_dataset=dd_encoded["test"],        # Held-out split used for evaluation.
    tokenizer=bert_tokenizer,
    callbacks=[early_stopping],
)                                           # Initializing trainer.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
#@ MODEL EVALUATION:
# Persist the model currently held by the trainer and reload it from there.
# The previous hard-coded '/content/bert-base-uncased-finetuned/checkpoint-500'
# is not under the output_dir configured in TrainingArguments, and a run that
# early-stops is not guaranteed to produce a checkpoint numbered 500.
final_model_dir = os.path.join(output_path, model_name, "final")
trainer.save_model(final_model_dir)
model = AutoModelForTokenClassification.from_pretrained(final_model_dir).to("cpu")

# The pipeline only drops the label "O" by default; this notebook's
# non-entity label is "Other", so it is excluded explicitly.
nlp = pipeline('ner', model=model, tokenizer=bert_tokenizer,
               ignore_labels=["Other"])
s = """
stay safe. Gorkha earthquake in 2017 baisakh 15."""
print(nlp(s, aggregation_strategy="average"))