In [None]:
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, Model, Input
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
from keras.utils import to_categorical
from numpy.random import seed
from itertools import chain
from spacy import displacy
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow
import spacy
# Pin the global NumPy and TensorFlow random generators for this session.
np.random.seed(1)
tensorflow.random.set_seed(2)


In [None]:
# Load the token-level corpus (one row per word) and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer='ner_dataset.csv',
    encoding='unicode_escape',
)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


# Data Preparation for Neural Networks

In [None]:
def get_dict_map(data, token_or_tag):
    """Build forward and reverse id lookups for one column of ``data``.

    Args:
        data: DataFrame with a ``'Word'`` and a ``'Tag'`` column.
        token_or_tag: ``'token'`` indexes the ``'Word'`` column; any other
            value indexes the ``'Tag'`` column.

    Returns:
        A ``(tok2idx, idx2tok)`` tuple of dicts mapping every distinct value
        to an integer id in ``0..n-1`` and back. Ids follow the order in
        which values first appear in the column.
    """
    column = 'Word' if token_or_tag == 'token' else 'Tag'

    # dict.fromkeys de-duplicates exactly like set() but keeps first-occurrence
    # order. Iterating a set of strings can yield a different order in every
    # interpreter session (hash randomisation), which would silently change
    # the ids between runs even though the random seeds are fixed.
    vocab = list(dict.fromkeys(data[column].to_list()))

    idx2tok = dict(enumerate(vocab))
    tok2idx = {tok: idx for idx, tok in idx2tok.items()}
    return tok2idx, idx2tok
# Word <-> id and tag <-> id lookup tables used by the following cells.
token2idx, idx2token = get_dict_map(data=data, token_or_tag='token')
tag2idx, idx2tag = get_dict_map(data=data, token_or_tag='tag')

In [None]:
# Encode every word and tag as its integer id (adds two columns to `data`).
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

# 'Sentence #' is only populated on the first row of each sentence, so carry
# the last seen value forward (this applies to every column, as before).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
data_fillna = data.ffill(axis=0)

# Collect each sentence's rows into one list per column.
# The columns must be selected with a list: indexing a groupby with a bare
# tuple of names was deprecated and raises in pandas >= 2.0.
data_group = data_fillna.groupby(
    ['Sentence #'], as_index=False
)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(list)

  data_group = data_fillna.groupby(


In [None]:
def get_pad_train_test_val(data_group, data):
    """Pad the encoded sentences and split them into train/validation/test sets.

    Relies on the notebook-level ``tag2idx`` mapping for the number of tags
    and for the id of the ``"O"`` tag, which is used to pad the labels.

    Args:
        data_group: DataFrame with one row per sentence whose ``'Word_idx'``
            and ``'Tag_idx'`` columns hold lists of integer ids.
        data: Token-level DataFrame; its ``'Word'`` column defines the
            vocabulary size.

    Returns:
        ``(train_tokens, val_tokens, test_tokens, train_tags, val_tags,
        test_tags)``. Token sets hold padded id sequences; tag sets hold one
        one-hot matrix of shape ``(maxlen, n_tags)`` per sentence.
    """
    # Real word ids are 0..n_token-1, so n_token is the first free id. Using it
    # for padding keeps padding distinct from every real word; the previous
    # value (n_token - 1) reused the id of an actual vocabulary entry. The
    # Embedding layer is sized with input_dim = n_token + 1, so this id is valid.
    n_token = len(set(data['Word'].to_list()))
    pad_token_idx = n_token

    # Pad tokens (X var) to the length of the longest sentence
    tokens = data_group['Word_idx'].tolist()
    maxlen = max(len(s) for s in tokens)
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=pad_token_idx)

    # Pad tags (y var) with the "O" tag and convert them to one-hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    # Hold out 10% for testing, then 25% of the remainder for validation
    # (67.5% / 22.5% / 10% overall).
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020
    )
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020
    )

    # The second line used to repeat the train_tokens label and value.
    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 6728 
train_tokens length: 6728 
test_tokens length: 997 
test_tags: 997 
val_tokens: 2243 
val_tags: 2243


# Training Neural Network

In [None]:
# Settings read by the model-building function below.
n_tags = len(tag2idx)                                           # one output unit per tag
output_dim = 64                                                 # embedding size, also used as LSTM units
input_dim = len(set(data['Word'].to_list())) + 1                # vocabulary size plus one extra id
input_length = max(map(len, data_group['Word_idx'].tolist()))   # longest sentence, in tokens

In [None]:
def get_bilstm_lstm_model():
    """Build and compile the Embedding -> BiLSTM -> LSTM -> per-token classifier.

    Reads the notebook-level ``input_dim``, ``output_dim``, ``input_length``
    and ``n_tags`` settings.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    model = Sequential()

    # Embedding layer: token ids -> dense vectors
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Bidirectional LSTM; forward and backward outputs are concatenated
    model.add(Bidirectional(
        LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        merge_mode='concat',
    ))

    # Second, unidirectional LSTM that still emits one vector per time step
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Per-token classifier. softmax (not relu) is required here:
    # categorical_crossentropy expects each time step's output to be a
    # probability distribution over the tags, and relu can emit all-zero
    # vectors that make the loss degenerate.
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [None]:
def train_model(X, y, model, epochs=25, batch_size=1000, validation_split=0.2):
    """Fit ``model`` on ``(X, y)`` and return the training loss of every epoch.

    Args:
        X: Model inputs, passed straight to ``model.fit``.
        y: Targets, passed straight to ``model.fit``.
        model: Object exposing a Keras-style ``fit`` method.
        epochs: Number of epochs to train for (default 25, as before).
        batch_size: Samples per gradient update (default 1000, as before).
        validation_split: Fraction of the data held out by ``fit`` for
            validation (default 0.2, as before).

    Returns:
        A list with one training-loss value per epoch.
    """
    # A single fit() call runs all epochs. With verbose=1 Keras already reports
    # per-epoch progress, and the returned History records the loss of each
    # epoch, so no outer Python loop or second progress bar is needed.
    hist = model.fit(
        X, y,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split,
        verbose=1,
    )
    # .get() keeps the "no epochs -> empty list" behaviour of the old loop.
    return list(hist.history.get('loss', []))

In [None]:
# Frame that collects one loss curve per trained architecture.
results = pd.DataFrame()

# Build the network and render its architecture diagram.
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)

# Train and store the per-epoch training loss under this model's label.
results['with_add_lstm'] = train_model(
    X=train_tokens,
    y=np.array(train_tags),
    model=model_bilstm_lstm,
)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 70, 64)            1897728   
                                                                 
 bidirectional_2 (Bidirecti  (None, 70, 128)           66048     
 onal)                                                           
                                                                 
 lstm_5 (LSTM)               (None, 70, 64)            49408     
                                                                 
 time_distributed_2 (TimeDi  (None, 70, 17)            1105      
 stributed)                                                      
                                                                 
Total params: 2014289 (7.68 MB)
Trainable params: 2014289 (7.68 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


  0%|          | 0/25 [00:00<?, ?it/s]



  4%|▍         | 1/25 [00:41<16:34, 41.44s/it]



  8%|▊         | 2/25 [01:04<11:50, 30.89s/it]



 12%|█▏        | 3/25 [01:29<10:19, 28.16s/it]



 16%|█▌        | 4/25 [01:53<09:12, 26.30s/it]



 20%|██        | 5/25 [02:34<10:32, 31.63s/it]



 24%|██▍       | 6/25 [03:15<11:01, 34.84s/it]



 28%|██▊       | 7/25 [03:56<11:03, 36.87s/it]



 32%|███▏      | 8/25 [04:37<10:49, 38.21s/it]



 36%|███▌      | 9/25 [05:18<10:25, 39.10s/it]



 40%|████      | 10/25 [05:59<09:55, 39.72s/it]



 44%|████▍     | 11/25 [06:40<09:21, 40.14s/it]



 48%|████▊     | 12/25 [07:21<08:45, 40.42s/it]



 52%|█████▏    | 13/25 [07:45<07:02, 35.21s/it]



 56%|█████▌    | 14/25 [08:26<06:46, 36.99s/it]



 60%|██████    | 15/25 [09:07<06:22, 38.22s/it]



 64%|██████▍   | 16/25 [09:48<05:51, 39.08s/it]



 68%|██████▊   | 17/25 [10:29<05:17, 39.68s/it]



 72%|███████▏  | 18/25 [10:52<04:02, 34.65s/it]



 76%|███████▌  | 19/25 [11:18<03:11, 31.95s/it]



 80%|████████  | 20/25 [11:59<02:53, 34.69s/it]



 84%|████████▍ | 21/25 [12:22<02:05, 31.33s/it]



 88%|████████▊ | 22/25 [12:43<01:24, 28.11s/it]



 92%|█████████▏| 23/25 [13:03<00:51, 25.86s/it]



 96%|█████████▌| 24/25 [13:24<00:24, 24.35s/it]



100%|██████████| 25/25 [13:45<00:00, 33.01s/it]


# Testing the model

In [None]:
# Run spaCy's 'en_core_web_sm' pipeline on a sample text and render the
# entities it finds inline in the notebook.
nlp = spacy.load('en_core_web_sm')
text = nlp(
    'Hi, My name is sid \n'
    ' I am from United Kingdom \n'
    ' I want to work with Google \n'
    ' Steve Jobs is My Inspiration, seriously!'
)
displacy.render(text, style='ent', jupyter=True)