In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [3]:
from google.colab import files
uploaded = files.upload()

Saving ner_dataset.csv to ner_dataset.csv


In [2]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [3]:
print("Unique words in corpus:", data['Word'].nunique())
print("Unique tags in corpus:", data['Tag'].nunique())

Unique words in corpus: 35178
Unique tags in corpus: 17


In [4]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
num_words = len(words)
num_words

35179

In [5]:
tags = list(set(data["Tag"].values))
num_tags = len(tags)
num_tags

17

In [6]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [7]:
getter = SentenceGetter(data)
sentences = getter.sentences

In [8]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [9]:
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [10]:
word2idx

{'engages': 1,
 'Benishangul': 2,
 'magnet': 3,
 'Guajira': 4,
 'Dinesh': 5,
 '163': 6,
 'slashing': 7,
 'editions': 8,
 'residential': 9,
 'Common': 10,
 'Crisis': 11,
 'Quentin': 12,
 'Karzai': 13,
 'Bashar': 14,
 'Oliveira': 15,
 'Armenia': 16,
 'aides': 17,
 'bulldozers': 18,
 'Jabouri': 19,
 'Ipanema': 20,
 'stove': 21,
 'Fugitive': 22,
 'supersedes': 23,
 'impasse': 24,
 'Polish-born': 25,
 'designating': 26,
 'wood-processing': 27,
 'willingly': 28,
 'community-owned': 29,
 '3,00,000': 30,
 '295': 31,
 'anybody': 32,
 'Sunnis': 33,
 'township': 34,
 'Cauldron': 35,
 'conqueror': 36,
 'assessing': 37,
 'champion': 38,
 'reprisals': 39,
 'Clumsy': 40,
 'one-meter-tall': 41,
 'Stiff': 42,
 'rethink': 43,
 'reaching': 44,
 'Nyongwa': 45,
 'collapsing': 46,
 'Taksim': 47,
 'slake': 48,
 'Maskhadov': 49,
 '1,26,976': 50,
 'Armory': 51,
 'hopes': 52,
 'Rush': 53,
 'precaution': 54,
 'clandestine': 55,
 '4.7': 56,
 'heretics': 57,
 'higher-level': 58,
 'Ocean': 59,
 'Booking': 60,
 'rol

In [11]:

X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=50, sequences=X, padding="post", value=num_words-1)

y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=50, sequences=y, padding="post", value=tag2idx["O"])

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
input_word = Input(shape=(50,))
model = Embedding(input_dim=num_words, output_dim=50, input_length=50)(input_word)
model = SpatialDropout1D(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(num_tags, activation="softmax"))(model)
model = Model(input_word, out)
model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 50)            1758950   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 50)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 50, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 50, 17)           3417      
 ibuted)                                                         
                                                             

In [14]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])

In [15]:
chkpt = ModelCheckpoint("model_weights.h5", monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')

early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=1, verbose=0, mode='max', baseline=None, restore_best_weights=False)

history = model.fit(x_train,y_train,validation_data=(x_test,y_test),batch_size=32,epochs=1,verbose=1)



In [16]:
model.evaluate(x_test, y_test)



[0.06481640785932541, 0.9812281131744385]

In [17]:
i = np.random.randint(0, x_test.shape[0]) 
p = model.predict(np.array([x_test[i]]))
p = np.argmax(p, axis=-1)
y_true = y_test[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for w, true, pred in zip(x_test[i], y_true, p[0]):
    print("{:15}{}\t{}".format(words[w-1], tags[true], tags[pred]))

Word           True 	 Pred

------------------------------
The            O	O
statement      O	O
said           O	O
Ethiopia       B-geo	B-geo
had            O	O
long           O	O
observed       O	O
what           O	O
it             O	O
called         O	O
Qatar          B-org	B-org
's             O	O
"              O	O
hostile        O	O
behavior       O	O
"              O	O
and            O	O
had            O	O
been           O	O
patient        O	O
before         O	O
taking         O	O
Monday         B-tim	B-tim
's             O	O
measure        O	O
.              O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recalled       O	O
recal