In [1]:
## Convert labeled data to BILOU format
### Create dataset with one word per row

#BILOU format
#single word,entity  once we encode/tokenize the word we get sequence of tokens instead of 1 token because BERT will add extra   
#tokens like sub word,CLS,SEP   
#Can we treat this sequence of tokens of a single word as sequence classification for NER?   
#I've explored this in this notebook.    

In [2]:
import json
with open('../../data/paragraphs/10_paragraph_clean_data_annotated.jsonl', 'r') as file:
    labeled_data =  [json.loads(line) for line in file]

In [3]:
labeled_data[0]

{'id': 2970,
 'text': 'Summary Summary Companies Tesla down as Q3 deliveries miss market estimates U S factory activity slowest in 2 5 years in Sept ISM Credit Suisse Citi cut 2022 year end target for S P 500 Indexes up Dow 2 66 S P 500 2 59 Nasdaq 2 27 Oct 3 Reuters Wall Street is three major indexes rallied to close over 2 on Monday as U S Treasury yields tumbled on weaker than expected manufacturing data increasing the appeal of stocks at the start of the year is final quarter.The U S stock market has suffered three quarterly declines in a row in a tumultuous year marked by interest rate hikes to tame historically high inflation and concerns about a slowing economy.The U S yield markets are pulling back that is been a positive and that connotes a more risk on environment said Art Hogan chief market strategist at B Riley Wealth in Boston.Register now for FREE unlimited access to Reuters com Register Further supporting rate sensitive growth stocks the benchmark U S 10 year Treasury yi

In [4]:
#Convert this data to BILOU format using Spacy
import spacy
from spacy.training import offsets_to_biluo_tags
nlp = spacy.load("en_core_web_lg")


In [5]:
import pandas as pd


In [6]:
entities = []
final_data = pd.DataFrame(columns = ['word','ner_tag'])
for row in labeled_data:
    doc = nlp(row['text'])
    entities = row['label']
     
    iob_tags = offsets_to_biluo_tags(doc, entities)
    for i in range(len(iob_tags)):
        new_row = pd.Series({'word': doc[i].text, 'ner_tag': iob_tags[i]}) 
        
        final_data = pd.concat([final_data, new_row.to_frame().T], ignore_index=True)



In [7]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3613 entries, 0 to 3612
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   word     3613 non-null   object
 1   ner_tag  3613 non-null   object
dtypes: object(2)
memory usage: 56.6+ KB


In [8]:
final_data.ner_tag.unique()
'''
The string "-" is used where the entity offsets don’t align with the tokenization in the Doc object. 
The training algorithm will view these as missing values. 
O denotes a non-entity token. 
B denotes the beginning of a multi-token entity, 
I the inside of an entity of three or more tokens,
and L the end of an entity of two or more tokens. 
U denotes a single-token entity.

'''

'\nThe string "-" is used where the entity offsets don’t align with the tokenization in the Doc object. \nThe training algorithm will view these as missing values. \nO denotes a non-entity token. \nB denotes the beginning of a multi-token entity, \nI the inside of an entity of three or more tokens,\nand L the end of an entity of two or more tokens. \nU denotes a single-token entity.\n\n'

In [9]:
#We can consider - as non entity.
final_data['ner_tag'].replace(['-'],
                        ["O"], inplace=True)

In [10]:
final_data.to_csv('../../data/paragraphs/BIOUL_format_data_words.csv',index=False)

In [11]:
import os
import random
import numpy as np
import torch

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"]="1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'


torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [58]:
df = pd.read_csv('../../data/paragraphs/BIOUL_format_data_words.csv')

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3613 entries, 0 to 3612
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   word     3613 non-null   object
 1   ner_tag  3613 non-null   object
dtypes: object(2)
memory usage: 56.6+ KB


In [60]:
df.rename(columns = {'word':'text', 'ner_tag':'label'}, inplace = True)

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3613 entries, 0 to 3612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3613 non-null   object
 1   label   3613 non-null   object
dtypes: object(2)
memory usage: 56.6+ KB


In [62]:
df.label.value_counts()

O                  3378
B-PERSON_CUSTOM      36
L-PERSON_CUSTOM      36
L-ORG_CUSTOM         24
B-ORG_CUSTOM         24
U-ORG_CUSTOM         20
I-ORG_CUSTOM         17
B-TIME_CUSTOM        14
L-TIME_CUSTOM        14
L-PLACE_CUSTOM       12
B-PLACE_CUSTOM       12
U-TIME_CUSTOM         7
U-PLACE_CUSTOM        6
I-PERSON_CUSTOM       5
I-TIME_CUSTOM         4
I-PLACE_CUSTOM        1
B-MONEY_CUSTOM        1
I-MONEY_CUSTOM        1
L-MONEY_CUSTOM        1
Name: label, dtype: int64

In [63]:
# replacing values
df['label'].replace(['O', 'B-TIME_CUSTOM', 'I-TIME_CUSTOM', 'L-TIME_CUSTOM','U-TIME_CUSTOM',
       'B-PLACE_CUSTOM','I-PLACE_CUSTOM', 'L-PLACE_CUSTOM', 'U-PLACE_CUSTOM',
       'B-PERSON_CUSTOM','I-PERSON_CUSTOM', 'L-PERSON_CUSTOM', 'U-PERSON_CUSTOM',
       'B-ORG_CUSTOM','I-ORG_CUSTOM', 'L-ORG_CUSTOM', 'U-ORG_CUSTOM',
       'B-MONEY_CUSTOM','I-MONEY_CUSTOM', 'L-MONEY_CUSTOM', 'U-MONEY_CUSTOM'],
                        [0, 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], inplace=True)

In [64]:
df.label.value_counts()

0     3378
9       36
11      36
15      24
13      24
16      20
14      17
1       14
3       14
7       12
5       12
4        7
8        6
10       5
2        4
6        1
17       1
18       1
19       1
Name: label, dtype: int64

In [65]:
ner_tags = ['O', 'B-TIME_CUSTOM', 'I-TIME_CUSTOM', 'L-TIME_CUSTOM','U-TIME_CUSTOM',
       'B-PLACE_CUSTOM','I-PLACE_CUSTOM', 'L-PLACE_CUSTOM', 'U-PLACE_CUSTOM',
       'B-PERSON_CUSTOM','I-PERSON_CUSTOM', 'L-PERSON_CUSTOM', 'U-PERSON_CUSTOM',
       'B-ORG_CUSTOM','I-ORG_CUSTOM', 'L-ORG_CUSTOM', 'U-ORG_CUSTOM',
       'B-MONEY_CUSTOM','I-MONEY_CUSTOM', 'L-MONEY_CUSTOM', 'U-MONEY_CUSTOM']
id_to_label = dict(enumerate(ner_tags))
label_to_id = {v:k for k, v in id_to_label.items()}

In [68]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
#The AutoTokenizer.from_pretrained method takes in the name of the model to build the appropriate tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
#tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

#We can  ask the model to return all hidden states and all attention weights if we need them:output_hidden_states=True, output_attentions=True
#But in this we don't need them
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=21)
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [69]:
#Tokenization: This will return input ids:list of numbers,these numberrs are fetched from pretrained vocab
tokenized_text =tokenizer(list(df["text"]),padding=True,truncation=True,max_length=768)

In [70]:
tokenized_text.keys()

dict_keys(['input_ids', 'attention_mask'])

In [71]:
input_ids = torch.tensor(tokenized_text['input_ids'])

In [72]:
#Dataset preparation
from torch.utils.data import Dataset, TensorDataset,DataLoader
from sklearn.model_selection import train_test_split

BATCH_SIZE = 16
X = input_ids
print(X.shape)
print(X[0])


torch.Size([3613, 7])
tensor([  101, 12654,   102,     0,     0,     0,     0])


In [73]:
SEQUENCE_LENGTH = len(X[0])

In [74]:
type(df.label[0])

numpy.int64

In [75]:
df['label'].values

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [76]:
type(df['label'].values)

numpy.ndarray

In [78]:

y = torch.from_numpy(df['label'].values)



In [79]:
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
train_data = TensorDataset(X, y)
train_loader = DataLoader(train_data,batch_size=BATCH_SIZE, shuffle=True)

In [80]:
#Model training
NUM_EPOCHS = 1
LEARNING_RATE = 0.01
optimizer =torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE) 
loss_fn = torch.nn.CrossEntropyLoss()
for i in range(NUM_EPOCHS):
  model.train()
  for X_batch,y_batch in train_loader:
    output = model(X_batch,labels=y_batch)
    loss = loss_fn(output.logits,y_batch)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

In [81]:
model.save_pretrained("../models/custom_ner_dl")

In [82]:
saved_model = AutoModelForSequenceClassification.from_pretrained("../models/custom_ner_dl")

In [83]:
#Inference Code
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

tokenized_text =tokenizer(["America"],padding=True,truncation=True,max_length=512,return_tensors='pt').to(device)
input_ids = torch.tensor(tokenized_text['input_ids']).to(device)


with torch.no_grad():
    outputs = saved_model(input_ids)

print(input_ids)
print(outputs.logits)
predicted_class_id = output.logits.argmax(dim= -1)
print(predicted_class_id)
print(id_to_label[predicted_class_id[0].item()])

tensor([[ 101, 2637,  102]])
tensor([[  3.3842,  -3.9077,  -9.7438,  -3.7688,  -4.7408,  -5.5589, -12.0809,
          -5.6331,  -7.7037,  -3.2073,  -8.1401,  -2.2882, -14.0165,  -3.5293,
          -3.2150,  -3.1409,  -3.4714, -10.1137, -12.3099, -11.5177, -14.1804]])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
O


  input_ids = torch.tensor(tokenized_text['input_ids']).to(device)
