<a href="https://colab.research.google.com/github/aniruddhapal/NLP-Named-Entity-Recognition-/blob/main/Named_Entity_Recognition_v1_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from numpy.random import seed
from itertools import chain
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Embedding,Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D,Bidirectional
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8' and
# newer releases no longer accept the bare 'seaborn' name, so use whichever
# name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Load the token-level NER table from a relative path and preview the first rows.
# NOTE(review): 'unicode_escape' is an unusual codec for a CSV file — confirm it
# matches the file's real encoding.
data=pd.read_csv('../input/ner-dataset/ner_datasetreference.csv',encoding='unicode_escape')
data.head()

In [None]:
# The part-of-speech column is not used anywhere below, so remove it in place.
data.drop(columns=['POS'], inplace=True)

In [None]:
# Preview the first 15 rows of the table.
data.head(15)

In [None]:
# Forward-fill missing values so each empty cell takes the value from the row above.
# DataFrame.ffill() is the supported spelling; the `method=` argument of
# fillna() is deprecated in recent pandas releases.
data = data.ffill()

In [None]:
# Preview the first 35 rows to inspect the forward-filled values.
data.head(35)

In [None]:
# Vocabulary: every distinct value of the 'Word' column plus a padding token.
# Sorted so that the word order (and therefore every word index derived from it)
# is the same in every interpreter session; plain set iteration order for
# strings changes with hash randomisation.
words = sorted(set(data['Word'].values))
words.append("ENDPAD")
words[-1]

In [None]:
# Number of token rows in the 'Word' column (repeated words are counted each time;
# this is not the size of the unique vocabulary).
num_words = data['Word'].shape[0]
print("Total number of words", num_words)

In [None]:
# Distinct entity tags and how many there are.
# Sorted so the tag order (and every tag index derived from it) is identical
# across interpreter sessions instead of following set iteration order.
tags = sorted(set(data["Tag"].values))
num_tags = len(tags)
print("List of tags: " + ', '.join(tags))
print(f"Total Number of tags {num_tags}")

In [None]:
# Preview the first rows of the table again before grouping it into sentences.
data.head()

In [None]:
# Groups the token table into complete sentences with a tag attached to each word.
class Get_sentence(object):
    """Turn a token-level table into per-sentence lists of (word, tag) tuples.

    Args:
        data: DataFrame with at least the columns 'Sentence #', 'Word' and 'Tag'.

    Attributes:
        n_sent: Set to 1 on construction; not read anywhere inside this class.
        data: The DataFrame that was passed in.
        grouped: Series indexed by 'Sentence #'; each value is that sentence's
            list of (word, tag) tuples.
        sentences: Plain list of those per-sentence lists, in the order of
            ``grouped`` (groupby sorts the group keys by default).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data

        def agg_func(s):
            # Pair each word of the group with the tag on the same row.
            return list(zip(s['Word'].tolist(), s['Tag'].tolist()))

        # Select only the two columns the aggregator reads. Applying over the
        # whole frame (grouping column included) raises a deprecation warning
        # on recent pandas versions.
        self.grouped = self.data.groupby('Sentence #')[['Word', 'Tag']].apply(agg_func)
        self.sentences = list(self.grouped)

In [None]:
# Build the per-sentence (word, tag) lists from the token table.
getter=Get_sentence(data)
# Despite the singular name, this is the list of all sentences.
sentence=getter.sentences

In [None]:
# Inspect the first sentence as a list of (word, tag) tuples.
sentence[0]

In [None]:
### Data Visualization

In [None]:
# Histogram of the number of tokens per sentence.
plt.figure(figsize=(14, 7))
plt.hist(list(map(len, sentence)), bins=50)
plt.xlabel("Length of Sentences")
plt.show()

In [None]:
# Most of our sentences have a length of around 20 words; the longest sentence is around 63 words.



In [None]:
# Horizontal bar chart of tag frequencies, leaving out the 'O' tag.
plt.figure(figsize=(14, 7))
(
    data.loc[data['Tag'] != 'O', 'Tag']
    .value_counts()
    .plot
    .barh()
);

In [None]:
## We can understand from the bar chart above that B-geo Tags are over-represented. 
## I-nat and I-gpe Tags are negligible and almost non-existent. Model will have difficulty in classifying these 2 tags.

In [None]:
# Data Preparation

In [None]:
# Lookup tables from token / tag to integer id.
# Word ids start at 1 (0 is left unassigned); tag ids start at 0.
word_idx = {word: position for position, word in enumerate(words, start=1)}
tag_idx = dict(zip(tags, range(len(tags))))

In [None]:
# Spot check: show the integer id assigned to one word
# (raises KeyError if 'Dniester' is not a key of word_idx).
word_idx['Dniester']

In [None]:
# Encode every sentence as integer ids and pad it at the end to a fixed length of 50.
# NOTE: pad_sequences truncates from the front by default (truncating='pre'),
# so sentences longer than max_len lose their first tokens.
max_len = 50
X = [[word_idx[word] for word, _ in s] for s in sentence]
# Pad with the id of the dedicated "ENDPAD" token rather than a count-based
# value such as num_words-1: the ENDPAD id is a real vocabulary entry, so
# padded positions decode back to "ENDPAD" through `words` / `word_idx`.
X = pad_sequences(maxlen=max_len, sequences=X, padding='post', value=word_idx["ENDPAD"])
y = [[tag_idx[tag] for _, tag in s] for s in sentence]
# Padded positions are labelled with the 'O' tag.
y = pad_sequences(maxlen=max_len, sequences=y, padding='post', value=tag_idx['O'])


In [None]:
# Padded word ids of the first sentence.
X[0]

In [None]:
# Padded tag ids of the first sentence.
y[0]

In [None]:
# One-hot encode the tag ids of each padded sentence (one array per sentence).
y = list(map(lambda tag_ids: to_categorical(tag_ids, num_classes=num_tags), y))

In [None]:
# Number of tag classes, used as num_classes for the one-hot encoding.
num_tags

In [None]:
# One-hot encoded tags of the first sentence.
y[0]

In [None]:
# Hold out 10% of the sentences as a test set; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

In [None]:
# Building the model

In [None]:
# Sequence tagger: embedding -> spatial dropout -> bidirectional LSTM -> per-token softmax.
input_word = Input(shape=(max_len,))
# Size the embedding table from the largest id actually present in X (word ids
# and the padding value), so it always covers every id fed to the model
# without depending on a separately maintained count.
vocab_size = int(X.max()) + 1
x = Embedding(input_dim=vocab_size, output_dim=max_len, input_length=max_len)(input_word)
x = SpatialDropout1D(0.1)(x)
x = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(x)
# One softmax over the tag set for every position in the sequence.
out = TimeDistributed(Dense(num_tags, activation='softmax'))(x)
model = Model(input_word, out)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Draw a diagram of the model's layers, including tensor shapes.
plot_model(model,show_shapes=True)

In [None]:
# Fit on the training split for 3 epochs, holding back 20% of it for validation.
model.fit(
    x=X_train,
    y=np.array(y_train),
    batch_size=64,
    epochs=3,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Model Evaluation

In [None]:
# Loss and accuracy on the held-out test split.
model.evaluate(x=X_test, y=np.array(y_test))

In [None]:
# Model prediction on one randomly chosen sentence from the test set
rand_sent = np.random.randint(0, X_test.shape[0])
p = model.predict(np.array([X_test[rand_sent]]))
p = np.argmax(p, axis=-1)

# Decode the one-hot labels of the selected sentence only, instead of
# converting and arg-maxing the whole test set to read a single row.
y_true = np.argmax(y_test[rand_sent], axis=-1)

# Reverse lookup built from word_idx. Any id without a vocabulary entry
# (for example a padding value) is printed as "ENDPAD" rather than raising
# IndexError from positional list access.
idx_word = {i: w for w, i in word_idx.items()}

print("{:20}{:20}\t{}\n".format("Word", "True", "Pred"))
print("-" * 55)

for w, t, pred in zip(X_test[rand_sent], y_true, p[0]):
    print("{:20}{:20}\t{}".format(idx_word.get(int(w), "ENDPAD"), tags[t], tags[pred]))