In [6]:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.utils import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split

In [7]:
data=pd.read_csv("ner_dataset.csv", encoding="latin1")
data=data.fillna(method="ffill")
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [8]:
print("Unique words in corpus:",data['Word'].nunique())
print("Unique tags in corpus:",data['Tag'].nunique())

Unique words in corpus: 35178
Unique tags in corpus: 17


In [9]:
words=list(set(data["Word"].values))
words.append("ENDPAD")
num_words=len(words)

In [10]:
tags=list(set(data["Tag"].values))
num_tags=len(tags)

In [11]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent=1
        self.data=data
        self.empty=False
        agg_func=lambda s:[(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped=self.data.groupby("Sentence #").apply(agg_func)
        self.sentences=[s for s in self.grouped]
    
    def get_next(self):
        try:
            s=self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent+=1
            return s
        except:
            return None

In [12]:
getter=SentenceGetter(data)
sentences=getter.sentences

In [13]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [14]:
word2idx={w:i + 1 for i, w in enumerate(words)}
tag2idx={t:i for i, t in enumerate(tags)}

In [15]:
word2idx

{'among': 1,
 'Dukeness': 2,
 'northern-based': 3,
 'Doaba': 4,
 'squares': 5,
 'Hospitals': 6,
 'upcoming': 7,
 'lending': 8,
 'drained': 9,
 'deploys': 10,
 'bullet': 11,
 'internally': 12,
 'ACC': 13,
 'Theo': 14,
 'Ehsanul': 15,
 'atrocity': 16,
 'vivanews.com': 17,
 'Boss': 18,
 'Aleem': 19,
 'aging': 20,
 'Creator': 21,
 'expand': 22,
 'Works': 23,
 "Sh'ite": 24,
 'columnist': 25,
 'coma': 26,
 'Stallone': 27,
 'civilian-to-civilian': 28,
 'Prudhoe': 29,
 'museum': 30,
 'Libyan': 31,
 'closures': 32,
 'human': 33,
 'richly': 34,
 'NIGHTINGALE': 35,
 'Itno': 36,
 'pill': 37,
 'Raad': 38,
 'liberal': 39,
 'conflict-resolution': 40,
 'Nielsen': 41,
 'dioxin': 42,
 'lawlessness': 43,
 'thirteenth': 44,
 'horticulture': 45,
 'double-digit': 46,
 'non-export': 47,
 'Community': 48,
 'nails': 49,
 'Eighteen': 50,
 'coincided': 51,
 'U.S.-proposed': 52,
 'BIN': 53,
 'payback': 54,
 'herders': 55,
 'practical': 56,
 'undeclared': 57,
 'Envious': 58,
 'AKAEV': 59,
 'Ben-Eliezer': 60,
 'tak

In [16]:

max_len=50
X=[[word2idx[w[0]] for w in s] for s in sentences]
X=pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)
y=[[tag2idx[w[2]] for w in s] for s in sentences]
y=pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [17]:

x_train, x_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state=1)

In [18]:
input_word=Input(shape=(max_len,))
model=Embedding(input_dim=num_words, output_dim=50, input_length=max_len)(input_word)
model=SpatialDropout1D(0.1)(model)
model=Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out=TimeDistributed(Dense(num_tags, activation="softmax"))(model)
model=Model(input_word, out)
model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 50)            1758950   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 50)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 50, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 50, 17)           3417      
 ibuted)                                                         
                                                             

In [19]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [20]:
chkpt=ModelCheckpoint("model_weights.h5", monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')
early_stopping=EarlyStopping(monitor='val_accuracy', min_delta=0, patience=1, verbose=0, mode='max', baseline=None, restore_best_weights=False)
history=model.fit(x=x_train, y=y_train, validation_data=(x_test,y_test), batch_size=32, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
model.evaluate(x_test, y_test)



[2.8306338787078857, 0.014075062237679958]

In [26]:
i=np.random.randint(0, x_test.shape[0]) 
p=model.predict(np.array([x_test[i]]))
p=np.argmax(p, axis=-1)
y_true = y_test[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for w, true, pred in zip(x_test[i], y_true, p[0]):
    print("{:15}{}\t{}".format(words[w-1], tags[true], tags[pred]))

Word           True 	 Pred

------------------------------
Afghan         B-gpe	I-geo
officials      O	I-geo
say            O	I-geo
at             O	I-per
least          O	I-per
29             O	B-tim
suspected      O	I-per
Taleban        B-org	I-per
insurgents     O	I-per
have           O	I-per
been           O	I-per
killed         O	I-per
in             O	I-per
clashes        O	B-geo
with           O	B-geo
NATO           B-org	B-geo
and            O	I-per
Afghan         B-gpe	I-per
forces         O	I-per
in             O	I-per
two            O	I-per
southern       O	I-per
provinces      O	I-eve
.              O	I-eve
Ibos           O	I-per
Ibos           O	I-per
Ibos           O	I-per
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos           O	I-geo
Ibos 