In [1]:
# Installing transformers library

!pip install transformers



In [2]:
# Installing PyTorch

!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Collecting torchvision
  Downloading torchvision-0.18.0-cp38-cp38-win_amd64.whl.metadata (6.6 kB)
Collecting torchaudio
  Downloading torchaudio-2.3.0-cp38-cp38-win_amd64.whl.metadata (6.4 kB)
Collecting torch
  Downloading torch-2.3.0-cp38-cp38-win_amd64.whl.metadata (26 kB)
Collecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Downloading pillow-10.3.0-cp38-cp38-win_amd64.whl.metadata (9.4 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Downloading https://download.pytorch.org/whl/mkl-2021.4.0-py2.py3-none-win_amd64.whl (228.5 MB)
     ---------------------------------------- 0.0/228.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/228.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/228.5 MB ? eta -:--:--
     -------------------------------------- 0.0/228.5 MB 262.6 kB/s eta 0:14:31
     -------------------------------------- 0.0/228.5 MB 262.6 kB/s eta

In [None]:
# Importing necessary libraries 

import pandas as pd
import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

In [None]:
# Reading csv data

df = pd.read_csv('/content/ner.csv')
df.tail()

Unnamed: 0,text,labels
47954,Opposition leader Mir Hossein Mousavi has said...,O O O B-per I-per O O O O O O O O O O O O O O ...
47955,"On Thursday , Iranian state media published a ...",O B-tim O B-gpe O O O O O O O O B-org I-org O ...
47956,"Following Iran 's disputed June 12 elections ,...",O B-geo O O B-tim I-tim O O O O O O O O O O O ...
47957,"Since then , authorities have held public tria...",O O O O O O O O O O O O O O O O O O O O O
47958,The United Nations is praising the use of mili...,O B-org I-org O O O O O O O O O O O O O O B-ti...


In [None]:
# Creating tokenizer intsance

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [None]:
# Creating Dataset class

label_all_tokens = False

def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [None]:
# splitting the data into train, test and validation
# Defining Unique labels

df = df[0:2000]

labels = [i.split() for i in df['labels'].values.tolist()]
unique_labels = set()

for lb in labels:
        [unique_labels.add(i) for i in lb if i not in unique_labels]
labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
ids_to_labels = {v: k for v, k in enumerate(unique_labels)}

df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

In [None]:
# Creating Model class

class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [None]:
# Training the model [Transfer learning]

def train_loop(model, df_train, df_val):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 5e-3
EPOCHS = 15
BATCH_SIZE = 2

model = BertModel()
train_loop(model, df_train, df_val)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Epochs: 1 | Loss:  0.530 | Accuracy:  0.869 | Val_Loss:  0.395 | Accuracy:  0.898


100%|██████████| 800/800 [02:36<00:00,  5.10it/s]


Epochs: 2 | Loss:  0.375 | Accuracy:  0.899 | Val_Loss:  0.348 | Accuracy:  0.905


100%|██████████| 800/800 [02:36<00:00,  5.10it/s]


Epochs: 3 | Loss:  0.328 | Accuracy:  0.910 | Val_Loss:  0.310 | Accuracy:  0.913


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 4 | Loss:  0.289 | Accuracy:  0.918 | Val_Loss:  0.293 | Accuracy:  0.917


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 5 | Loss:  0.254 | Accuracy:  0.928 | Val_Loss:  0.275 | Accuracy:  0.920


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 6 | Loss:  0.218 | Accuracy:  0.935 | Val_Loss:  0.253 | Accuracy:  0.926


100%|██████████| 800/800 [02:36<00:00,  5.10it/s]


Epochs: 7 | Loss:  0.195 | Accuracy:  0.940 | Val_Loss:  0.252 | Accuracy:  0.929


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 8 | Loss:  0.180 | Accuracy:  0.945 | Val_Loss:  0.244 | Accuracy:  0.929


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 9 | Loss:  0.166 | Accuracy:  0.950 | Val_Loss:  0.243 | Accuracy:  0.927


100%|██████████| 800/800 [02:37<00:00,  5.10it/s]


Epochs: 10 | Loss:  0.151 | Accuracy:  0.954 | Val_Loss:  0.235 | Accuracy:  0.931


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 11 | Loss:  0.138 | Accuracy:  0.956 | Val_Loss:  0.239 | Accuracy:  0.934


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 12 | Loss:  0.125 | Accuracy:  0.960 | Val_Loss:  0.249 | Accuracy:  0.932


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 13 | Loss:  0.116 | Accuracy:  0.962 | Val_Loss:  0.245 | Accuracy:  0.934


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 14 | Loss:  0.101 | Accuracy:  0.966 | Val_Loss:  0.255 | Accuracy:  0.936


100%|██████████| 800/800 [02:37<00:00,  5.09it/s]


Epochs: 15 | Loss:  0.095 | Accuracy:  0.969 | Val_Loss:  0.255 | Accuracy:  0.936


In [None]:
# Evaluate model

def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0

    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][test_label[i] != -100]
              label_clean = test_label[i][test_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_test += acc

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')


evaluate(model, df_test)

Test Accuracy:  0.946


In [None]:
# Predicting a sentence

def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)
            
evaluate_one_text(model, 'Bill Gates is the founder of Microsoft')

Bill Gates is the founder of Microsoft
['B-per', 'I-per', 'O', 'O', 'O', 'O', 'B-org']
