In [41]:
import pandas as pd
# Load the token-level NER corpus: one row per word with its POS tag and NER tag.
# As the preview below shows, 'Sentence #' is filled only on the first word of
# each sentence; the remaining rows of that sentence are NaN.
# NOTE(review): 'unicode_escape' is used to get past non-UTF-8 bytes in the CSV —
# confirm this is the intended decoding for this file.
data=pd.read_csv('ner_dataset.csv',encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [42]:
from itertools import chain
def get_dict_map(data,token_or_tag):
    """Build forward and reverse index mappings for the words or the tags.

    Indices are assigned in order of first appearance in the column, so the
    mapping is identical on every run (iterating a ``set`` of strings is not:
    its order changes with Python's per-process hash seed).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Word' and a 'Tag' column.
    token_or_tag : str
        'token' maps the 'Word' column; any other value maps the 'Tag' column.

    Returns
    -------
    tuple(dict, dict)
        ``(tok2idx, idx2tok)``: value -> index and index -> value.
    """
    column='Word' if token_or_tag=='token' else 'Tag'
    # dict.fromkeys de-duplicates like set() but keeps first-seen order.
    vocab=list(dict.fromkeys(data[column].to_list()))
    idx2tok=dict(enumerate(vocab))
    tok2idx={tok:idx for idx,tok in idx2tok.items()}
    return tok2idx,idx2tok
# Vocabulary lookups used by the rest of the notebook:
# word <-> integer index, and NER tag <-> integer index.
token2idx,idx2token=get_dict_map(data,'token')
tag2idx,idx2tag=get_dict_map(data,'tag')

In [45]:
# Attach the integer index of every word and tag to the token-level frame.
data['Word_idx']=data['Word'].map(token2idx)
data['Tag_idx']=data['Tag'].map(tag2idx)
# 'Sentence #' is only present on the first token of each sentence; forward-fill
# so every row carries its sentence id. (.ffill() replaces the deprecated
# fillna(method='ffill').)
data_fillna=data.ffill(axis=0)
# Collapse to one row per sentence, each selected column becoming a list.
# The columns must be selected with a list: a bare tuple of column names is
# rejected by pandas 2.x.
data_group=data_fillna.groupby(
['Sentence #'],as_index=False
)[['Word','POS','Tag','Word_idx','Tag_idx']].agg(list)

In [46]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def get_pad_train_test_val(data_group,data):
    """Pad the sentences, one-hot encode the tags and split into train/val/test.

    Parameters
    ----------
    data_group : pandas.DataFrame
        One row per sentence with list-valued 'Word_idx' and 'Tag_idx' columns.
    data : pandas.DataFrame
        Token-level frame; only its 'Word' column is read, to size the vocabulary.

    Returns
    -------
    tuple
        ``(train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags)``.
        Token splits are int32 arrays padded to the longest sentence; tag splits
        are lists of one-hot arrays, one per sentence.

    Notes
    -----
    Reads the notebook-level ``tag2idx`` mapping for the number of tags and for
    the index of the "O" tag, which is used to pad the tag sequences.
    """
    n_token=len(set(data['Word'].to_list()))
    n_tags=len(tag2idx)

    tokens=data_group['Word_idx'].tolist()
    maxlen=max(len(s) for s in tokens)
    # Pad with n_token, one past the last vocabulary index (0..n_token-1), so
    # padding never shares an index with a real word. The embedding layer
    # must therefore have input_dim >= n_token+1.
    pad_tokens=pad_sequences(tokens,maxlen=maxlen,dtype='int32',padding='post',value=n_token)

    tags=data_group['Tag_idx'].tolist()
    pad_tags=pad_sequences(tags,maxlen=maxlen,dtype='int32',padding='post',value=tag2idx["O"])
    pad_tags=[to_categorical(i,num_classes=n_tags) for i in pad_tags]

    # 90/10 split for test, then 75/25 of the remainder for train/validation.
    tokens_,test_tokens,tags_,test_tags=train_test_split(pad_tokens,pad_tags,test_size=0.1,train_size=0.9,random_state=2020)
    train_tokens,val_tokens,train_tags,val_tags=train_test_split(tokens_,tags_,test_size=0.25,train_size=0.75,random_state=2020)

    print(
        'train_tokens length:',len(train_tokens),
        '\ntrain_tags length:',len(train_tags),
        '\ntest_tokens length:',len(test_tokens),
        '\ntest_tags:',len(test_tags),
        '\nval_tokens:',len(val_tokens),
        '\nval_tags:',len(val_tags),
    )

    return train_tokens,val_tokens,test_tokens,train_tags,val_tags,test_tags

train_tokens,val_tokens,test_tokens,train_tags,val_tags,test_tags=get_pad_train_test_val(data_group,data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [47]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [48]:
# Model hyper-parameters derived from the data.
# Embedding vocabulary size: number of distinct words plus one extra index.
input_dim=len(set(data['Word'].to_list()))+1
# Width of the embedding vectors; also reused as the LSTM unit count.
output_dim=64
# Length of the longest sentence, in tokens.
input_length=max(map(len,data_group['Word_idx'].tolist()))
# Number of distinct NER tags.
n_tags=len(tag2idx)

In [49]:
def get_bilstm_lstm_model():
    """Build and compile the Embedding -> BiLSTM -> LSTM -> per-token Dense tagger.

    Reads the notebook-level ``input_dim``, ``output_dim``, ``input_length`` and
    ``n_tags``. Prints the model summary as a side effect.

    Returns
    -------
    tensorflow.keras.Sequential
        Model compiled with categorical cross-entropy, the default 'adam'
        optimizer and an accuracy metric.
    """
    model=Sequential()
    model.add(Embedding(input_dim=input_dim,output_dim=output_dim,input_length=input_length))
    model.add(Bidirectional(LSTM(units=output_dim,return_sequences=True,dropout=0.2,recurrent_dropout=0.2),merge_mode='concat'))
    model.add(LSTM(units=output_dim,return_sequences=True,dropout=0.5,recurrent_dropout=0.5))
    # softmax, not relu: categorical_crossentropy expects each token's output to
    # be a probability distribution over the tags.
    model.add(TimeDistributed(Dense(n_tags,activation="softmax")))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [50]:
def train_model(X,y,model,epochs=25,batch_size=1000,validation_split=0.2):
    """Fit ``model`` one epoch at a time and collect the training loss per epoch.

    Parameters
    ----------
    X, y
        Training inputs and targets, passed straight through to ``model.fit``.
    model
        Object with a Keras-style ``fit`` that returns a history whose
        ``history['loss']`` is a list of per-epoch losses.
    epochs : int, default 25
        Number of single-epoch ``fit`` calls to make.
    batch_size : int, default 1000
        Batch size forwarded to ``fit``.
    validation_split : float, default 0.2
        Fraction of ``X``/``y`` that ``fit`` holds out for validation.

    Returns
    -------
    list
        Training loss recorded after each epoch, in order.
    """
    loss=[]
    for _ in range(epochs):
        hist=model.fit(X,y,batch_size=batch_size,verbose=1,epochs=1,validation_split=validation_split)
        loss.append(hist.history['loss'][0])
    return loss

In [51]:
# Collects one column of per-epoch training losses per model variant.
results=pd.DataFrame()
model_bilstm_lstm=get_bilstm_lstm_model()
# Draws the layer graph; needs the optional pydot package (see the message in
# the output below when it is missing).
plot_model(model_bilstm_lstm)
# train_tags is a list of per-sentence one-hot arrays, so stack it into a single
# array before fitting. train_model holds out part of the training data for
# validation through its own validation_split.
results['with_add_lstm']=train_model(train_tokens,np.array(train_tags),model_bilstm_lstm)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 104, 64)           2251456   
                                                                 
 bidirectional_1 (Bidirectio  (None, 104, 128)         66048     
 nal)                                                            
                                                                 
 lstm_3 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed_1 (TimeDis  (None, 104, 17)          1105      
 tributed)                                                       
                                                                 
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________
You must install pydot (`pip install pydot`)

In [64]:
import spacy
from spacy import displacy
# Baseline comparison: run spaCy's pretrained small English pipeline on a sample
# text and render the entities it recognises inline in the notebook.
nlp=spacy.load('en_core_web_sm')
text=nlp("Hi, My name is Samruddh Kamath\n. I am from Mumbai. I study at Narsee Monjee Institute of Management Studies. \n I love to eat burgers at McDonalds. I aspire to be like Nithin Kamath.")
displacy.render(text,style='ent',jupyter=True)