In [82]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [83]:
df = pd.read_csv('./ner_dataset.csv', encoding='latin1')
df = df.fillna(method='ffill')
words = list(set(df['Word'].values))
words.append('endpad')
tags = list(set(df['Tag'].values))
len(words), len(tags)

(35179, 17)

In [84]:
agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
grouped = df.groupby("Sentence #").apply(agg_func)
sentences = [s for s in grouped]
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [85]:
max_len = max([len(sentence) for sentence in sentences])
max_len

104

In [86]:
wordtoidx = {word:idx for idx,word in enumerate(words)}
tagtoidx = {tag:idx for idx,tag in enumerate(tags)}

In [48]:
X = [[wordtoidx[word[0]] for word in sentence] for sentence in sentences]
Y = [[tagtoidx[word[2]] for word in sentence] for sentence in sentences]

In [26]:
X = pad_sequences(sequences=X, maxlen=max_len, padding='post', value=len(words)-1)
Y = pad_sequences(sequences=Y, maxlen=max_len, padding='post', value=tagtoidx["O"])

In [36]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1)
len(X_train), len(Y_train)
X_test[2]

array([18138, 21441, 30502, 23536, 27650, 18086, 22918, 24158, 32529,
       21745, 22640, 21974, 29234, 22639,  2642, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
       35178, 35178, 35178, 35178, 35178], dtype=int32)

In [29]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

In [31]:
# creating the model
input_data = Input(shape=(max_len,))
model = Embedding(input_dim=len(words), output_dim=50, input_length=max_len)(input_data)
model = SpatialDropout1D(rate=0.1)(model)
model = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(units=len(tags), activation='softmax'))(model)
model = Model(input_data, out)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 104)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 104, 50)           1758950   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 104, 50)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 104, 256)          183296    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 104, 17)           4369      
Total params: 1,946,615
Trainable params: 1,946,615
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.compile(optimizer='adam', metrics=['accuracy'], loss='sparse_categorical_crossentropy')

In [33]:
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    validation_data=(X_test,Y_test),
    epochs=3,
    verbose=1
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7ff0c0996b50>

In [34]:
model.save('./')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./assets


In [77]:
model.evaluate(X_test,Y_test)



[0.022176483646035194, 0.993327796459198]

In [124]:
# Preprocessing User Input
def word_to_idx(words_predict):
    words_idx = np.full((max_len,), len(words)-1)
    i = 0
    for w in words_predict:
        words_idx[i] = wordtoidx[w]
        i += 1
    return words_idx

input_sentence = "I want to fly in an Airbus. I am planning a trip to London"
words_predict = list(set(word_tokenize(input_sentence)))
x_predict = word_to_idx(words_predict)
p = model.predict(np.array(x_predict))
p = np.argmax(p, axis=-1)
for i in range(len(p)):
    print("{} - {}".format(words[x_predict[i]], tags[p[i][0]]))

in - O
am - O
trip - O
want - O
I - I-tim
an - O
fly - O
Airbus - I-org
a - O
to - O
London - I-geo
. - O
planning - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
endpad - O
e