In [1]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, Model, Input
from keras.layers import GRU, Activation, Dense, Embedding, Dropout

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using TensorFlow backend.


In [2]:
import os
os.environ['KAGGLE_USERNAME'] = "XXXXXXXXX"
os.environ['KAGGLE_KEY'] = "XXXXXXX"
!kaggle datasets download abhinavwalia95/entity-annotated-corpus

entity-annotated-corpus.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
!unzip -o entity-annotated-corpus.zip

Archive:  entity-annotated-corpus.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [0]:
df = pd.read_csv("ner_dataset.csv", encoding = 'unicode_escape')

In [5]:
df.columns

Index(['Sentence #', 'Word', 'POS', 'Tag'], dtype='object')

In [0]:
df.rename({'Sentence #':'Sentence_ID'}, axis=1, inplace=True)

In [0]:
df['Sentence_ID'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')

In [0]:
# Fix missing Sentence IDs with forward fill.
df['Sentence_ID'] = df['Sentence_ID'].ffill()

In [0]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.dataset = dataset
        agg_func = lambda s: [(w, t) for w,t in zip(s["Word"].values.tolist(),
                                                        s["Tag"].values.tolist())]
        self.grouped = self.dataset.groupby("Sentence_ID").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [0]:
getter = SentenceGetter(df)
sentences = getter.sentences

In [11]:
maxlen = max([len(s) for s in sentences])
print ('Maximum sequence length:', maxlen)

Maximum sequence length: 104


In [12]:
words = list(set(df["Word"].values))
words.append("ENDPAD")
len(words)

35179

In [13]:
tags = list(set(df["Tag"].values))
len(tags)

17

In [0]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [0]:
X = [[word2idx[w[0]] for w in s] for s in sentences]

In [0]:
y = [[tag2idx[w[1]] for w in s] for s in sentences]

In [0]:
X = pad_sequences(maxlen=140, sequences=X, padding="post", value = 35178)
y = pad_sequences(maxlen=140, sequences=y, padding="post", value = tag2idx["O"])

In [18]:
X.shape, y.shape

((47959, 140), (47959, 140))

In [0]:
y = [to_categorical(i, num_classes=17) for i in y]

#### X_train, X_test, y_train, y_test

In [0]:
X_train = X[:35970]
X_test = X[35970:]
y_train = y[:35970]
y_test = y[35970:]

In [21]:
print(X_train.shape[0] + X_test.shape[0])
print(len(y_train) + len(y_test))

47959
47959


In [0]:
from keras import backend as K

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_score(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [23]:
vocabulary_size = len(words)
embedding_dim = 100  

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_shape=(140,)))
model.add(GRU(50, return_sequences = True))
model.add(Dropout(0.2))
model.add(GRU(50, return_sequences = True))
model.add(Dropout(0.2))
model.add(Dense(17, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1_score])

W0616 19:10:37.890957 140628765251456 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0616 19:10:37.906188 140628765251456 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0616 19:10:37.909161 140628765251456 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0616 19:10:38.115173 140628765251456 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0616 19:10:38.125272 

In [24]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 140, 100)          3517900   
_________________________________________________________________
gru_1 (GRU)                  (None, 140, 50)           22650     
_________________________________________________________________
dropout_1 (Dropout)          (None, 140, 50)           0         
_________________________________________________________________
gru_2 (GRU)                  (None, 140, 50)           15150     
_________________________________________________________________
dropout_2 (Dropout)          (None, 140, 50)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 140, 17)           867       
Total params: 3,556,567
Trainable params: 3,556,567
Non-trainable params: 0
_________________________________________________________________


In [25]:
%time
model.fit(X_train, np.array(y_train), batch_size = 32, epochs = 1, validation_data=(X_test, np.array(y_test)))

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.06 µs


W0616 19:10:38.914417 140628765251456 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 35970 samples, validate on 11989 samples
Epoch 1/1


<keras.callbacks.History at 0x7fe669317240>

In [26]:
scores = model.evaluate(X_test, np.array(y_test))
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 99.26%


In [0]:
#from sklearn.metrics import confusion_matrix
#predictions = model.predict(X_test)
#y_pred = np.argmax(predictions, axis=-1)
#matrix = confusion_matrix(np.array(y_test).argmax(axis=1), y_pred)
#matrix

## Predictions

In [45]:
i = 4
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
print("{:14} ({:5}): {}".format("Word", "True", "Pred"))
for w,pred in zip(X_test[i],p[0]):
  word = words[int(w)]
  tag = tags[pred]
  print(word, tag)

Word           (True ): Pred
Police O
in O
Zimbabwe B-geo
have O
arrested O
a O
nephew O
of O
President B-per
Robert I-per
Mugabe I-per
on O
suspicion O
of O
smuggling O
30 O
tons O
of O
scarce O
flour O
to O
neighboring O
Mozambique B-geo
. O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O
ENDPAD O


In [29]:
sentences[35974]

[('Police', 'O'),
 ('in', 'O'),
 ('Zimbabwe', 'B-geo'),
 ('have', 'O'),
 ('arrested', 'O'),
 ('a', 'O'),
 ('nephew', 'O'),
 ('of', 'O'),
 ('President', 'B-per'),
 ('Robert', 'I-per'),
 ('Mugabe', 'I-per'),
 ('on', 'O'),
 ('suspicion', 'O'),
 ('of', 'O'),
 ('smuggling', 'O'),
 ('30', 'O'),
 ('tons', 'O'),
 ('of', 'O'),
 ('scarce', 'O'),
 ('flour', 'O'),
 ('to', 'O'),
 ('neighboring', 'O'),
 ('Mozambique', 'B-geo'),
 ('.', 'O')]

In [0]:
test_array = ["The programmers from Barcelona might write a sentence without a spell checker.",
              "The programmers from Barchelona cannot write a sentence without a spell checker.",
              "Jack London went to Parris.",
              "Jack London went to Paris.",
              "We never though Microsoft would become such a big company.",
              "We never though Microsof would become such a big company.",
              "The president of U.S.A though they could win the war",
              "The president of the United States of America though they could win the war",
              "The king of Saudi Arabia wanted total control.",
              "Robin does not want to go to Saudi Arabia."]

In [39]:
testt = [word2idx[w] for w in word_tokenize(test_array[])]
padded = pad_sequences(maxlen=140, sequences=[testt], padding="post", value = 35178)
p = model.predict(padded)
p = np.argmax(p, axis=-1)
print("{:14} ({:5}): {}".format("Word", "True", "Pred"))
for w,pred in zip(np.array(testt),p[0]):
  word = words[int(w)]
  tag = tags[pred]
  print(word, tag)

Word           (True ): Pred
Robin B-per
does O
not O
want O
to O
go O
to O
Saudi B-org
Arabia I-geo
. O


In [56]:
df.query('Word == "Microsoft"')

Unnamed: 0,Sentence_ID,Word,POS,Tag
62485,2828,Microsoft,NNP,B-org
62577,2831,Microsoft,NNP,B-org
98283,4471,Microsoft,NNP,B-org
214596,9833,Microsoft,NNP,B-org
218799,10021,Microsoft,NNP,B-org
220664,10103,Microsoft,NNP,B-org
238391,10917,Microsoft,NNP,B-org
294123,13471,Microsoft,NNP,B-org
294153,13472,Microsoft,NNP,B-org
294192,13474,Microsoft,NNP,B-org
