<a href="https://colab.research.google.com/github/sagarikajadon13/NLP-using-transformers/blob/main/Entity_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PROCURING DATA FROM KAGGLE

In [None]:
!pip install kaggle



In [None]:
! mkdir ~/.kaggle

In [None]:
# Copy the Kaggle API token from the current working directory into ~/.kaggle/.
# NOTE(review): kaggle.json has to be placed in the working directory first (e.g. uploaded to the session).
!cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the token file to read/write for the owner only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI; the recorded run saved it under /content.
!kaggle datasets download abhinavwalia95/entity-annotated-corpus

Downloading entity-annotated-corpus.zip to /content
 38% 10.0M/26.4M [00:00<00:00, 98.9MB/s]
100% 26.4M/26.4M [00:00<00:00, 144MB/s] 


In [None]:
!unzip /content/entity-annotated-corpus.zip

Archive:  /content/entity-annotated-corpus.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [None]:
# NOTE(review): the version is not pinned; the recorded run resolved transformers 4.17.0.
# The import cell below pulls `AdamW` from transformers — confirm that name still exists in the version installed.
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.12.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 34.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing

In [None]:
import joblib 
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GroupKFold

from tqdm import tqdm
import transformers
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

# Prefer the GPU when one is available, otherwise fall back to the CPU so the notebook still runs.
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class CONFIG:
  '''
  Hyper-parameters and shared objects read by the rest of the notebook.
  '''
  # checkpoint name; used for the tokenizer here and for the encoder in EntityModel
  BASE_MODEL= 'bert-base-uncased'
  # the tokenizer is loaded once, when this class body is executed
  TOKENIZER= BertTokenizer.from_pretrained(BASE_MODEL, do_lower_case= True)

  # every example is truncated/padded to this many token ids
  MAX_LENGTH= 128

  TRAIN_BATCHSIZE= 32
  TEST_BATCHSIZE= 8

  EPOCHS= 3
  LEARNING_RATE= 3e-5

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Read the token-level corpus extracted from the Kaggle archive.
# latin-1 is passed explicitly — presumably the file is not valid UTF-8; confirm before changing.
dataset= pd.read_csv(
    '/content/ner_dataset.csv',
    encoding="latin-1",
)

In [None]:
dataset

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


# PREPROCESS DATA

In [None]:
def preprocess_data(df):
  '''
  Turn the one-row-per-word frame into per-sentence lists with integer-encoded labels.

  df= frame with 'Sentence #', 'Word', 'POS' and 'Tag' columns; 'Sentence #' is only
      filled on the first row of each sentence. The input frame is not modified.

  returns (sentences, pos, tag, enc_pos, enc_tag)
    sentences, pos, tag= arrays holding one list per sentence (words, POS codes, tag codes)
    enc_pos, enc_tag= the fitted LabelEncoders, needed to decode predictions back to label strings
  '''
  dataset= df.copy()
  # Series.ffill() replaces fillna(method="ffill"), which newer pandas releases deprecate
  dataset['Sentence #']= dataset['Sentence #'].ffill()

  enc_pos= LabelEncoder()
  enc_tag= LabelEncoder()

  # plain column assignment replaces the string column with the integer codes
  dataset['POS']= enc_pos.fit_transform(dataset['POS'])
  dataset['Tag']= enc_tag.fit_transform(dataset['Tag'])

  # can also be seen as word-level classification for pos and tags
  # group once; all three outputs then share the same sentence order
  grouped= dataset.groupby('Sentence #')
  sentences= grouped['Word'].apply(list).values
  pos= grouped['POS'].apply(list).values
  tag= grouped['Tag'].apply(list).values

  return sentences, pos, tag, enc_pos, enc_tag

# CREATE ENTITY DATASET

In [None]:
class EntityDataset:
  '''
  Indexable dataset: item idx is one sentence converted to fixed-length BERT inputs
  plus token-level POS and tag targets (all of length CONFIG.MAX_LENGTH).
  '''
  def __init__(self, texts, pos, tags):
    '''
    texts= list of words in the sentence(split by whitespace)
    pos= list of pos for each word in the sentence
    tags= list of tags for each word in the sentence
    '''
    self.texts= texts
    self.pos= pos
    self.tags= tags

  def __len__(self):
    '''Number of sentences.'''
    return len(self.texts)

  def __getitem__(self, idx):
    '''
    Returns a dict of long tensors: 'ids', 'target_pos', 'target_tags', 'mask', 'token_type_ids'.
    '''
    tokenizer= CONFIG.TOKENIZER
    text= self.texts[idx]
    pos= self.pos[idx]
    tags= self.tags[idx]

    ids= []
    target_pos= []
    target_tags= []

    for i, word in enumerate(text):
      # tokenize word by word: a word may become several word pieces,
      # and each piece inherits the pos/tag label of the word it came from
      inputs= tokenizer.encode(
          word,
          add_special_tokens= False
      )

      input_len= len(inputs)
      ids.extend(inputs)
      target_pos.extend([pos[i]]* input_len)
      target_tags.extend([tags[i]]* input_len)

    # leave room for the two special tokens
    body_len= CONFIG.MAX_LENGTH- 2
    ids= ids[: body_len]
    target_pos= target_pos[: body_len]
    target_tags= target_tags[: body_len]

    # special-token ids come from the tokenizer instead of the literals 101/102,
    # so they stay correct if CONFIG.BASE_MODEL uses a different vocabulary
    ids= [tokenizer.cls_token_id]+ ids+ [tokenizer.sep_token_id]
    # NOTE(review): the special tokens get label 0 and mask 1; 0 is also a real class code
    target_pos= [0]+ target_pos+ [0]
    target_tags= [0]+ target_tags+ [0]

    mask= [1]* len(ids)
    token_type_ids= [0]* len(ids)

    padding_len= CONFIG.MAX_LENGTH- len(ids)

    ids= ids+ ([tokenizer.pad_token_id]* padding_len)
    target_pos= target_pos+ ([0]* padding_len)
    target_tags= target_tags+ ([0]* padding_len)
    mask= mask+ ([0]* padding_len)
    token_type_ids= token_type_ids+ ([0]* padding_len)

    return {
        'ids': torch.tensor(ids, dtype= torch.long),
        'target_pos': torch.tensor(target_pos, dtype= torch.long),
        'target_tags': torch.tensor(target_tags, dtype= torch.long),
        'mask': torch.tensor(mask, dtype= torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype= torch.long),
    }

# LOSS FUNCTION

In [None]:
# the loss only counts positions whose mask is 1;
# every other position is relabelled with the criterion's ignore_index (-100)
def loss_fn(logits, target, mask, num_labels):
  '''
  Cross-entropy over the unmasked positions.

  logits= scores, flattened here to (-1, num_labels)
  target= integer labels, one per position
  mask= 1 where the position counts towards the loss
  num_labels= size of the last dimension of logits
  '''
  criterion= nn.CrossEntropyLoss()

  flat_target= target.view(-1)
  ignored_label= torch.tensor(criterion.ignore_index).type_as(flat_target)
  keep= mask.view(-1)== 1

  labels= torch.where(keep, flat_target, ignored_label)
  return criterion(logits.view(-1, num_labels), labels)

# BERT MODEL FOR ENTITY EXTRACTION

In [None]:
class EntityModel(nn.Module):
  '''
  BERT encoder with two linear heads applied at every token position:
  one scores POS classes, the other scores entity-tag classes.
  '''
  def __init__(self, num_pos, num_tags):
    '''
    num_pos= number of POS classes (output width of the POS head)
    num_tags= number of tag classes (output width of the tag head)
    '''
    super(EntityModel, self).__init__()
    self.num_tags= num_tags
    self.num_pos= num_pos
    self.bert= BertModel.from_pretrained(CONFIG.BASE_MODEL)

    # take the encoder width from the loaded checkpoint instead of hard-coding 768,
    # so a different CONFIG.BASE_MODEL does not break the heads
    hidden_size= self.bert.config.hidden_size
    self.out_pos= nn.Linear(hidden_size, num_pos)
    self.out_tag= nn.Linear(hidden_size, num_tags)

    self.dropout1= nn.Dropout(0.3)
    self.dropout2= nn.Dropout(0.3)

  def forward(self, ids, target_pos, target_tags, mask, token_type_ids):
    '''
    ids, mask, token_type_ids= encoder inputs
    target_pos, target_tags= integer labels per position, used only for the loss

    returns (tags, pos, loss): tag logits, POS logits, and the mean of the two masked losses
    '''
    # with return_dict=False the encoder returns a tuple; the first element
    # (the sequence output) feeds both heads, the second is unused
    sequence_output, _ = self.bert(ids,
                   attention_mask= mask,
                   token_type_ids= token_type_ids,
                   output_attentions= False,
                   output_hidden_states= False,
                   return_dict = False)

    pos= self.out_pos(self.dropout1(sequence_output))
    tags= self.out_tag(self.dropout2(sequence_output))

    loss_tag= loss_fn(tags, target_tags, mask, self.num_tags)
    loss_pos= loss_fn(pos, target_pos, mask, self.num_pos)

    loss= (loss_tag+ loss_pos)/2
    return tags, pos, loss

# TRAIN AND EVAL

In [None]:
def train_fn(dataloader, model, optimizer, scheduler):
  '''
  Run one training epoch.

  Every batch is moved to `device`, passed to the model as keyword arguments,
  and followed by one optimizer step and one scheduler step.

  returns the summed batch losses divided by the number of batches
  '''
  model.train()
  running_loss= 0

  for batch in tqdm(dataloader, total= len(dataloader)):
    # move the tensors in place, key by key
    for name in list(batch):
      batch[name]= batch[name].to(device)

    optimizer.zero_grad()
    _, _, loss= model(**batch)
    loss.backward()

    optimizer.step()
    scheduler.step()

    running_loss+= loss.item()

  return running_loss/len(dataloader)

In [None]:
def eval_fn(dataloader, model):
  '''
  Run one evaluation pass with the model in eval mode and gradients disabled.

  returns the summed batch losses divided by the number of batches
  '''
  model.eval()
  running_loss= 0

  with torch.no_grad():
    for batch in tqdm(dataloader, total= len(dataloader)):
      # move the tensors in place, key by key
      for name in list(batch):
        batch[name]= batch[name].to(device)

      _, _, loss= model(**batch)
      running_loss+= loss.item()

  return running_loss/len(dataloader)

In [None]:
# Build the per-sentence word/POS/tag lists and keep the fitted encoders for decoding predictions later.
sentences, pos, tag, enc_pos, enc_tag= preprocess_data(dataset)

In [None]:
enc_pos.classes_

array(['$', ',', '.', ':', ';', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ',
       'JJR', 'JJS', 'LRB', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT',
       'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'RRB', 'TO', 'UH',
       'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
       '``'], dtype=object)

In [None]:
# Number of distinct labels per task; these become the output widths of the two model heads.
num_pos= len(enc_pos.classes_)
num_tags= len(enc_tag.classes_)

In [None]:
# Hold out 10% of the sentences for validation; the fixed random_state keeps the split repeatable.
train_sentences, test_sentences, train_pos, test_pos, train_tags, test_tags= train_test_split(
    sentences, pos, tag,
    test_size= 0.1,
    random_state= 42,
)

In [None]:
train_dataset= EntityDataset(train_sentences, train_pos, train_tags)
# reshuffle the training examples every epoch instead of replaying the same batch order
train_loader= DataLoader(train_dataset, batch_size= CONFIG.TRAIN_BATCHSIZE, shuffle= True)

# the validation loader keeps a fixed order
test_dataset= EntityDataset(test_sentences, test_pos, test_tags)
test_loader= DataLoader(test_dataset, batch_size= CONFIG.TEST_BATCHSIZE)

In [None]:
model= EntityModel(num_pos, num_tags)
model.to(device)

# Two parameter groups, chosen by parameter name:
# names containing one of the no_decay patterns get weight_decay 0, all others get 0.001.
param_optimizer= list(model.named_parameters())
no_decay= ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_parameters= [
    {
        'params': [
            param for name, param in param_optimizer
            if not any(pattern in name for pattern in no_decay)
        ],
        'weight_decay': 0.001,
    },
    {
        'params': [
            param for name, param in param_optimizer
            if any(pattern in name for pattern in no_decay)
        ],
        'weight_decay': 0,
    },
]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Pass the parameter groups built above so their per-group weight_decay values take effect;
# handing over model.parameters() left those groups unused.
optimizer= torch.optim.AdamW(optimizer_parameters, lr= CONFIG.LEARNING_RATE)
# linear schedule over all optimizer steps (batches per epoch * epochs), with no warm-up steps
scheduler= get_linear_schedule_with_warmup(optimizer, num_warmup_steps= 0, num_training_steps= len(train_loader)*CONFIG.EPOCHS)

In [None]:
# per-epoch averages, kept for inspection after training
train_loss, val_loss= [], []
best_val_loss= float('inf')

for i in range(CONFIG.EPOCHS):
  avg_train_loss= train_fn(train_loader, model, optimizer, scheduler)
  avg_val_loss= eval_fn(test_loader, model)

  train_loss.append(avg_train_loss)
  val_loss.append(avg_val_loss)

  # only an epoch that improves on the best validation loss overwrites the checkpoint
  if not avg_val_loss< best_val_loss:
    continue

  best_val_loss= avg_val_loss
  filename= 'entity_bert.pth'
  torch.save(model.state_dict(), filename)

100%|██████████| 1349/1349 [29:40<00:00,  1.32s/it]
100%|██████████| 600/600 [01:24<00:00,  7.14it/s]
100%|██████████| 1349/1349 [29:49<00:00,  1.33s/it]
100%|██████████| 600/600 [01:25<00:00,  7.03it/s]
100%|██████████| 1349/1349 [29:35<00:00,  1.32s/it]
100%|██████████| 600/600 [01:23<00:00,  7.14it/s]


In [None]:
# mean training loss over the recorded epochs (divides by their count, not a fixed 3)
sum(train_loss)/len(train_loss)

0.1327429010337419

In [None]:
# mean validation loss over the recorded epochs (divides by their count, not a fixed 3)
sum(val_loss)/len(val_loss)

0.0982855847157124

# INFERENCE

In [None]:
# Example sentence used to try out the trained model.
sentence= 'Indira Gandhi was the first female Prime Minister of India.'

In [None]:
# Encode the whole sentence in one call; further down only its length is used,
# to cut the padded predictions back to the real tokens.
tokenized_sentence= CONFIG.TOKENIZER.encode(sentence)

In [None]:
# Split into whitespace-separated words. The isinstance guard makes the cell safe to re-run:
# once `sentence` is already a list it is left unchanged instead of raising on .split().
sentence= sentence.split() if isinstance(sentence, str) else sentence

In [None]:
tokenized_sentence

[101, 28232, 12338, 2001, 1996, 2034, 2931, 3539, 2704, 1997, 2634, 1012, 102]

In [None]:
sentence

['Indira',
 'Gandhi',
 'was',
 'the',
 'first',
 'female',
 'Prime',
 'Minister',
 'of',
 'India.']

In [None]:
# Wrap the single sentence in a dataset so it gets the same tokenisation and padding as the training data.
# The all-zero POS/tag lists are placeholders: the dataset and the model's forward both expect targets.
# NOTE(review): this rebinds test_dataset/test_loader, replacing the validation split built earlier.
test_dataset= EntityDataset(texts= [sentence], pos= [[0]* len(sentence)], tags= [[0]* len(sentence)])
test_loader= DataLoader(test_dataset, batch_size= 1)

In [None]:
# Restore the best checkpoint written by the training loop. map_location places the tensors on the
# active device, so weights saved from a GPU also load on a CPU-only machine.
model.load_state_dict(torch.load('entity_bert.pth', map_location= device))

<All keys matched successfully>

In [None]:
test_dataset[0]['ids'].shape

torch.Size([128])

In [None]:
# Run the model on the single-sentence loader in eval mode with gradients disabled.
# NOTE(review): `pos` is rebound here to the POS logits, replacing the per-sentence `pos` array from preprocess_data.
model.eval()
with torch.no_grad():
    for data in test_loader:
      for k, v in data.items():
        data[k]= v.to(device)
      # the loss is computed against the placeholder targets, so it is discarded
      tags, pos, _= model(**data)

In [None]:
tags.shape

torch.Size([1, 128, 17])

In [None]:
sentence

['Indira',
 'Gandhi',
 'was',
 'the',
 'first',
 'female',
 'Prime',
 'Minister',
 'of',
 'India.']

In [None]:
# Take the highest-scoring tag class at every position, map the codes back to label strings,
# and keep only the first len(tokenized_sentence) positions (the rest is padding).
# The kept span includes the two special tokens, so the first and last labels are not word predictions.
enc_tag.inverse_transform(tags.argmax(2).cpu().numpy().reshape(-1))[: len(tokenized_sentence)]

array(['B-art', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'B-geo', 'O', 'B-art'], dtype=object)

In [None]:
# Same decoding for the POS head: arg-max per position, codes back to POS strings,
# cut to the length of the encoded sentence (special tokens included).
enc_pos.inverse_transform(pos.argmax(2).cpu().numpy().reshape(-1))[: len(tokenized_sentence)]

array(['$', 'NNP', 'NNP', 'VBD', 'DT', 'JJ', 'JJ', 'JJ', 'NN', 'IN',
       'NNP', '.', '$'], dtype=object)