# Loading the required Libraries

In [8]:
from transformers import AutoTokenizer
from simpletransformers.ner import NERModel

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score


In [10]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [11]:
data = pd.read_csv("ner_datasetreference.csv", encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [6]:
data.shape

(1048575, 4)

In [7]:
data["Tag"].unique()  #These manu unique tags are there

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [8]:
data.count()

Sentence #      47959
Word          1048575
POS           1048575
Tag           1048575
dtype: int64

In [12]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 17


O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [13]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('geo', 45058), ('org', 36927), ('per', 34241), ('tim', 26861), ('gpe', 16068), ('art', 699), ('eve', 561), ('nat', 252)]


In [14]:
entities_to_remove = ["B-art", "I-art", "B-eve", "I-eve", "B-nat", "I-nat"]
data = data[~data.Tag.isin(entities_to_remove)]
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [15]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
data = data.fillna(method='ffill')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [15]:
data.shape

(1047063, 4)

In [16]:
label2id = {k: v for v, k in enumerate(data.Tag.unique())}
id2label = {v: k for v, k in enumerate(data.Tag.unique())}
label2id

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'I-per': 8,
 'I-gpe': 9,
 'I-tim': 10}

In [16]:
data=data[['Word','Tag']].copy()

In [18]:
# Convert this into list of tuples having words and tags 
normalized_data=[]
issue_data=[]
for i in range(len(data)):
  try:
    normalized_data.append((data['Word'][i],data['Tag'][i],[],0,0))
  except:
    pass




In [19]:
len(normalized_data)

1045551

In [20]:
normalized_data[:5]

[('Thousands', 'O', [], 0, 0),
 ('of', 'O', [], 0, 0),
 ('demonstrators', 'O', [], 0, 0),
 ('have', 'O', [], 0, 0),
 ('marched', 'O', [], 0, 0)]

In [24]:
mtype='xlmroberta'
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base',use_fast=True)
model = NERModel(mtype,"xlm-roberta-base",use_cuda=False)



Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

In [25]:
batch_deets=[]

In [1]:
import re
def preprocess(word_deets):
    processed_word_deets=[]
    idx=0
    while (idx<len(word_deets)):
        w_txt,w_tag,w_coods,w_blk,w_pgno=word_deets[idx]
        w_txt=re.sub(r'[\u200c\u200d`.]','',w_txt)
        processed_word_deets.append((w_txt,w_tag,w_coods,w_blk,w_pgno))
        idx+=idx+1
    return processed_word_deets



In [27]:
len(normalized_data)

1045551

In [None]:
p=preprocess(normalized_data)

In [None]:
tags=[data[1] for data in p]

In [None]:
len(tags)

960144

In [None]:
#Performing Tokenization of the dataset
txt=" ".join([data[0] for data in p])
assert len(txt.split())==len(p),f"clean text does not match with processed data"

# Tokenization as per the below explained Logic

In [None]:
encoded_txt=tokenizer(txt,truncation=False)
word_ids=encoded_txt.word_ids()
encoded_txt['input_ids']=encoded_txt['input_ids'][1:-1]
word_ids=word_ids[1:-1]
assert len(encoded_txt['input_ids'])==len(word_ids),f"words ids done match tokens"
encoded_txt['word_ids']=word_ids

In [None]:
t=encoded_txt

In [None]:
t.keys()

dict_keys(['input_ids', 'attention_mask', 'word_ids'])

In [None]:
def gen_splits(seq, size, overlap):
  splits=[]
  if len(seq)>size:
    for i in range(0,len(seq)-overlap,size-overlap):
      splits.append(seq[i:i+size])
  else:
    splits.append(seq)
  return splits

def chunkize(encoded_txt):
  #Account for the '<s>' and '</s>' tokens being added
  max_length=448
  chunks={}
  #overlap=max_length*overlap_fraction
  overlap=270
  chunks['input_ids']=gen_splits(encoded_txt['input_ids'],max_length,overlap)
  chunks['word_ids']=gen_splits(encoded_txt['word_ids'],max_length,overlap)
  #assert len(chunks['input_ids'])==len(chunks['word_ids']),f"chunk size mismatch"
  return chunks


In [None]:
max_length: int = 450
overlap_fraction: float=0.6
c=chunkize(t)

In [None]:
len(t)

3

# Explaination of how this logic works

In [23]:

txt="Deepak is playing cricket"
encoded_txt=tokenizer(txt,truncation=False)
encoded_txt

{'input_ids': [0, 62723, 344, 83, 75169, 13625, 27853, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
tokens = tokenizer.tokenize(txt)
tokens

['▁Deep', 'ak', '▁is', '▁playing', '▁cri', 'cket']

In [25]:
word_ids=encoded_txt.word_ids()
word_ids

[None, 0, 0, 1, 2, 3, 3, None]

In [None]:
def gen_splits(seq, size, overlap):
  splits=[]
  if len(seq)>size:
    for i in range(0,len(seq)-overlap,size-overlap):
      splits.append(seq[i:i+size])
  else:
    splits.append(seq)
  return splits

# Below Function will be used to break any length sentence into chunks which will not cross 450 tokens

In [None]:
def chunkize(encoded_txt):
  #Account for the '<s>' and '</s>' tokens being added
  max_length=448
  chunks={}
  #overlap=max_length*overlap_fraction
  overlap=270
  chunks['input_ids']=gen_splits(encoded_txt['input_ids'],max_length,overlap)
  chunks['word_ids']=gen_splits(encoded_txt['word_ids'],max_length,overlap)
  assert len(chunks['input_ids'])==len(chunks['word_ids']),f"chunk size mismatch"
  return chunks


In [None]:
max_length: int = 450
overlap_fraction: float=0.6
c=chunkize(t)

In [None]:
len(c)

2

In [None]:
len(c['input_ids'][1:])

6929

In [None]:
X=[(tokenizer.decode(t,clean_up_tokenization_spaces=False).split(),sorted(set(w))) for (t,w) in zip(c['input_ids'],c['word_ids'])]

In [None]:
len(X)

6930

In [None]:
y=[[tags[id] for id in sorted(set(w_ids))] for w_ids in c['word_ids']]

In [None]:
len(y)

6930

In [None]:
from collections import defaultdict

In [None]:
batch_df=defaultdict(list)
idx=0
last_sentence_id=0
if 'sentence_id' in batch_df and len(batch_df['sentence_id']>1):
  last_sentece_id=max(batch_df['sentence_id']+1)
for i, ((word,ids),lbl) in enumerate(zip(X,y)):
  for w,id,l in zip(word,ids,lbl):
    batch_df['sentence_id'].append(last_sentence_id+i)
    batch_df['words'].append(w)
    batch_df['word_ids'].append(id)
    batch_df['labels'].append(l)
    batch_df['bboxes'].append(p[id][2])
    batch_df['block'].append(p[id][3])
    batch_df['pgno'].append(p[id][4])
    batch_df['file'].append(id)


In [None]:
inf_df,batch_deets=pd.DataFrame(batch_df),batch_deets

In [None]:
inf_df.head()

Unnamed: 0,sentence_id,words,word_ids,labels,bboxes,block,pgno,file
0,0,Thousands,0,O,[],0,0,0
1,0,of,1,O,[],0,0,1
2,0,demonstrators,2,O,[],0,0,2
3,0,have,3,O,[],0,0,3
4,0,marched,4,O,[],0,0,4


In [None]:
inf_df.to_csv("Final_processed_Dataframe.csv")

In [7]:
import pandas as pd

In [8]:
inf_df=pd.read_csv("Final_processed_Dataframe.csv")

In [9]:
train_data=inf_df[["sentence_id", "words", "labels"]]

In [None]:
label2id.keys()

dict_keys(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'I-per', 'I-gpe', 'I-tim'])

In [18]:
from simpletransformers.ner import NERModel, NERArgs

mtype='xlmroberta'
model_args = NERArgs()
model_args.num_train_epochs=20
model_args.labels_list = ['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'I-per', 'I-gpe', 'I-tim']
model_args.overwrite_output_dir = True
model = NERModel(mtype,"xlm-roberta-base",args=model_args,use_cuda=True)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-st

In [12]:

train_d = train_data.sample(frac = 0.9)


eval_data = train_data.drop(train_d.index)

# Model Training

### Model has been trained in colab as my laptop does not have GPU
### Trained Model File has been uplpaded

In [25]:
model.train_model(train_d)

  0%|          | 0/2 [00:00<?, ?it/s]



Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Running Epoch 0 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 1 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 2 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 3 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 4 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 5 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 6 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 7 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 8 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

Running Epoch 9 of 20:   0%|          | 0/867 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

# Inference can be done by loading the model as follows

In [1]:
from simpletransformers.ner import NERModel, NERArgs

mtype='xlmroberta'
model_args = NERArgs()
# model_args.num_train_epochs=20
model_args.labels_list = ['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'I-per', 'I-gpe', 'I-tim']
# model_args.overwrite_output_dir = True
model = NERModel(mtype,r"C:\Users\datas\OneDrive\Desktop\OLD_IITJ_Docs\IIT Sem2\AML\Project\submission\Trained_XLM_model\checkpoint-7803-epoch-9",args=model_args,use_cuda=False)

In [18]:
result, model_outputs, wrong_preds = model.eval_model(eval_data)

  0%|          | 0/5 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/867 [00:00<?, ?it/s]

# With Only 10 Epochs the accuracy and precision is ~80%

In [20]:
result

{'eval_loss': 0.18858808251538217,
 'precision': 0.7951590265088319,
 'recall': 0.7954750172203677,
 'f1_score': 0.7953169904777041}

In [None]:
predictions, raw_outputs = model.predict(["I am travelling to India  and I work for Google", "I am a Microsoft employee"])

  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
predictions

[[{'I': 'O'},
  {'am': 'O'},
  {'travelling': 'O'},
  {'to': 'O'},
  {'India': 'B-geo'},
  {'and': 'O'},
  {'I': 'O'},
  {'work': 'O'},
  {'for': 'O'},
  {'Google': 'B-org'}],
 [{'I': 'O'},
  {'am': 'O'},
  {'a': 'O'},
  {'Microsoft': 'B-org'},
  {'employee': 'O'}]]

In [21]:
p,r=model.predict(["""
India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[25] is a country in South Asia. 
on, initially in varying forms of isolation as hunter-gatherers, has made the region highly diverse,
second only to Africa in human genetic diversity.Settled life emerged on the 
subcontinent in the western margins of the Indus river basin 9,000 years ago, evolving gradually into
the Indus Valley Civilisation of the third millennium BCE.[30] By 1200 BCE, an archaic form of Sanskrit, 
an Indo-European language, had diffused into India from the northwest.[31][32] Its evidence today
is found in the hymns of the Rigveda. Preserved by a resolutely vigilant oral tradition, 
the Rigveda records the dawning of Hinduism in India. The Dravidian languages of India were 
supplanted in the northern and western regions.[34] By 400 BCE, stratification and exclusion by 
caste had emerged within Hinduism,[35] and Buddhism and Jainism had arisen, proclaiming social orders 
unlinked to heredity.[36] Early political consolidations gave rise to the loose-knit Maurya and Gupta Empires based
in the Ganges Basin.[37] Their collective era was suffused with wide-ranging creativity,[38] but also marked by 
the declining status of women,[39] and the incorporation of untouchability into an organised system of belief.[g][40] 
In South India, the Middle kingdoms exported Dravidian-languages scripts and  cultures to the kingdoms of Southeast Asia

"""])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]