In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, Model, Input
from keras.layers import GRU, Activation, Flatten, Dense

Using TensorFlow backend.


In [2]:
# Load the token-level NER corpus; the encoding is given explicitly.
csv_path = "entity-annotated-corpus/ner_dataset.csv"
df = pd.read_csv(csv_path, encoding='unicode_escape')

In [3]:
# Preview up to the first 25 rows of the raw token table.
df.head(25)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [4]:
# List the column labels as loaded, before any renaming.
df.columns

Index(['Sentence #', 'Word', 'POS', 'Tag'], dtype='object')

In [5]:
# Give the sentence column an identifier-friendly name.
df.rename(columns={'Sentence #': 'Sentence_ID'}, inplace=True)

In [6]:
# Strip every non-digit character so "Sentence: 1" becomes "1".
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the `df['Sentence_ID']` selection is chained in-place mutation, which
# warns on recent pandas and does not update `df` under Copy-on-Write.
df['Sentence_ID'] = df['Sentence_ID'].replace(to_replace=r'\D', value=r'', regex=True)

In [7]:
# Propagate each sentence ID down to the following rows that have none,
# so every token row is labelled with the sentence it belongs to.
filled_ids = df['Sentence_ID'].ffill()
df['Sentence_ID'] = filled_ids

In [8]:
#training_df = df.iloc[:786878]
#test_df = df.iloc[786878:]

#test_set = df['2017':].iloc[:,1:2].values

In [9]:
# Last sentence in training DF must be 35970
#training_df.iloc[-1:]

In [10]:
# Last sentence in testing DF must be 47959
#test_df.iloc[-1:]

In [11]:
class SentenceGetter(object):
    """Group a token-level frame into per-sentence lists of (word, tag) pairs.

    Attributes (all set in ``__init__``):
        dataset: the frame passed in; it must provide the columns
            "Sentence_ID", "Word" and "Tag".
        grouped: result of the group-by, holding one list of
            ``(word, tag)`` tuples per sentence ID.
        sentences: the same lists as a plain Python list, ordered by the
            first appearance of each sentence ID in ``dataset``.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        # sort=False keeps the sentences in the order they occur in the data.
        # The default sorted group-by would order string IDs lexicographically
        # ("1", "10", "100", ...), scrambling the corpus before any
        # position-based train/test split.
        by_sentence = self.dataset.groupby("Sentence_ID", sort=False)[["Word", "Tag"]]
        # Pair each word with its tag, one list per sentence.
        self.grouped = by_sentence.apply(
            lambda s: list(zip(s["Word"].tolist(), s["Tag"].tolist()))
        )
        self.sentences = list(self.grouped)

In [12]:
# Build the per-sentence (word, tag) lists from the token table.
getter = SentenceGetter(df)
sentences = getter.sentences

In [13]:
# Inspect the first grouped sentence as a sanity check.
print(sentences[0])

[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


In [14]:
# Number of tokens in the longest sentence of the corpus.
maxlen = max(len(sentence) for sentence in sentences)
print('Maximum sequence length:', maxlen)

Maximum sequence length: 104


In [15]:
# Vocabulary of distinct words, in order of first appearance.
# unique() is used instead of list(set(...)) because set iteration order for
# strings changes between interpreter sessions (hash randomisation), which
# would make the word -> index mapping differ on every kernel restart.
words = df["Word"].unique().tolist()
# Extra token used as the padding symbol; it is deliberately appended last.
words.append("ENDPAD")
len(words)

35179

In [16]:
# Distinct tag labels, in order of first appearance.
# unique() gives a stable order across kernel sessions, unlike list(set(...)),
# so the tag -> index mapping (and the meaning of each output unit) is
# reproducible.
tags = df["Tag"].unique().tolist()
len(tags)

17

In [17]:
# Lookup tables mapping each word / tag string to its position in the
# corresponding vocabulary list.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

In [18]:
# Encode every sentence as the list of vocabulary indices of its words.
X = [[word2idx[word] for word, _tag in sentence] for sentence in sentences]

In [19]:
# Encode every sentence as the list of tag indices of its tokens.
y = [[tag2idx[tag] for _word, tag in sentence] for sentence in sentences]

In [20]:
# Right-pad every sequence to a fixed length of 140 time steps.
# The word padding index is looked up from the vocabulary instead of being
# hard-coded, so it stays correct if the vocabulary size ever changes.
X = pad_sequences(maxlen=140, sequences=X, padding="post", value=word2idx["ENDPAD"])
# Padded label positions are filled with the index of the "O" tag.
y = pad_sequences(maxlen=140, sequences=y, padding="post", value=tag2idx["O"])

In [21]:
# Confirm features and labels were padded to the same shape.
X.shape, y.shape

((47959, 140), (47959, 140))

In [22]:
# One-hot encode the tag indices of every sentence. The number of classes is
# derived from the tag vocabulary rather than hard-coded, so it cannot drift
# out of sync with the labels actually present in the data.
y = [to_categorical(i, num_classes=len(tag2idx)) for i in y]

In [23]:
# y is now a Python list with one one-hot matrix per sentence; check its length.
len(y)

47959

#### Train/test split: X_train, X_test, y_train, y_test

In [24]:
# Positional hold-out split: the first 35970 sentences are used for training,
# everything after that is kept for evaluation.
split_at = 35970
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

In [25]:
# The two partitions together must account for every sentence.
print(len(X_train) + len(X_test))
print(len(y_train) + len(y_test))

47959
47959


In [26]:
# Append a trailing axis of size 1 so each time step carries a single feature.
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

In [27]:
# Confirm the trailing feature axis was added.
X_train.shape

(35970, 140, 1)

In [28]:
# Sequence tagger: a GRU that emits one hidden state per time step, followed by
# a softmax layer scoring the 17 tag classes at every step.
# NOTE(review): the GRU is fed the raw vocabulary index as a single numeric
# feature per step; an embedding layer in front is probably intended - confirm.
model = Sequential([
    GRU(50, input_shape=(140, 1), return_sequences=True),
    Dense(17, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [29]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 140, 50)           7800      
_________________________________________________________________
activation_1 (Activation)    (None, 140, 50)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 140, 17)           867       
Total params: 8,667
Trainable params: 8,667
Non-trainable params: 0
_________________________________________________________________


In [30]:
%time
model.fit(X_train, np.array(y_train), batch_size = 32, epochs = 1, verbose = 2)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 271 µs
Epoch 1/1


KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out sentences. y_test is a Python list of per-sentence
# one-hot matrices; it is stacked into a single array (as done for y_train when
# fitting) because Keras would otherwise read the list as one target per output.
# Padded positions are labelled "O" and are included in the reported accuracy.
scores = model.evaluate(X_test, np.array(y_test), verbose=2)
print("Accuracy: %.2f%%" % (scores[1]*100))