In [1]:
from math import nan
from future.utils import iteritems
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
#from utils import get_data, generate_output, guess_human, seed_sequence, get_embeddings, find_closest
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
data = pd.read_csv("ner_dataset.csv", encoding= 'unicode_escape') 
data.head(672)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
667,,Muslim,NNP,B-org
668,,Brotherhood,NNP,I-org
669,,as,IN,O
670,,parts,NNS,O


In [3]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["Word"].values.tolist(),
                                                        s["Tag"].values.tolist())]
        self.grouped = self.dataset.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
          
getter = SentenceGetter(data)
sentences = getter.sentences

In [4]:
words = list(set(data["Word"].values))
nrofWords = len(words)

ner_tags = []
for tag in set(data["Tag"].values):
    if tag is nan or isinstance(tag, float):
        ner_tags.append('unk')
    else:
        ner_tags.append(tag)
nrofTags = len(ner_tags)
print(ner_tags)

['B-eve', 'B-per', 'B-nat', 'I-tim', 'I-eve', 'I-org', 'B-art', 'I-per', 'B-gpe', 'I-gpe', 'O', 'B-geo', 'I-geo', 'I-art', 'B-org', 'I-nat', 'B-tim']


In [5]:
index_word = {w: i for i, w in enumerate(words)}
index_tag = {t: i for i, t in enumerate(ner_tags)}
idx2tag = {v: k for k, v in iteritems(index_tag)}

In [7]:
maxlen = max([len(s) for s in sentences])
word_embedding_size = 17

X = [[index_word[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=maxlen, sequences=X, padding="post",value=nrofWords - 1)

y = [[index_tag[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=index_tag["O"])
y = [to_categorical(i, num_classes=nrofTags) for i in y]

# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Simple RNN model

In [19]:
input = Input(shape=(maxlen,))

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=nrofWords+1, output_dim=32, input_length=maxlen),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(32, return_sequences=True)),
    tf.keras.layers.Dense(nrofTags, activation='tanh'),
    tf.keras.layers.Dropout(0.5),
])



In [20]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

model.summary()

history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=30,
                    validation_split=0.5, verbose=1)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 32)             1125728   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1, 64)             4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1, 17)             1105      
_________________________________________________________________
dropout_2 (Dropout)          (None, 1, 17)             0         
Total params: 1,130,993
Trainable params: 1,130,993
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30

In [21]:
test_pred = model.predict(X_test, verbose=1)

idx2tag = {i: w for w, i in index_tag.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)



In [22]:
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))
print(classification_report(test_labels, pred_labels))

F1-score: 73.2%
           precision    recall  f1-score   support

      org       0.44      0.56      0.50       547
      per       0.77      0.79      0.78       828
      geo       0.76      0.82      0.79       678
      gpe       0.95      0.89      0.92       605
      tim       0.00      0.00      0.00        97
      eve       0.00      0.00      0.00         5
      art       0.00      0.00      0.00         6
      nat       0.00      0.00      0.00         3

micro avg       0.72      0.74      0.73      2769
macro avg       0.71      0.74      0.72      2769



# LSTM model

In [12]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=nrofWords+1, output_dim=word_embedding_size, input_length=maxlen),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Dense(nrofTags, activation='tanh'),
    tf.keras.layers.Dropout(0.5),
])

In [13]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [14]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 17)             598043    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1, 128)            41984     
_________________________________________________________________
dense_1 (Dense)              (None, 1, 17)             2193      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 17)             0         
Total params: 642,220
Trainable params: 642,220
Non-trainable params: 0
_________________________________________________________________


In [15]:
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=30,
                    validation_split=0.5, verbose=1)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [16]:
test_pred = model.predict(X_test, verbose=1)

idx2tag = {i: w for w, i in index_tag.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)
print(pred_labels)


[['O'], ['O'], ['O'], ['B-geo'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['B-per'], ['B-per'], ['O'], ['B-gpe'], ['B-gpe'], ['O'], ['B-per'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['B-per'], ['B-geo'], ['B-geo'], ['O'], ['B-gpe'], ['O'], ['B-per'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['B-org'], ['O'], ['O'], ['O'], ['B-per'], ['B-per'], ['O'], ['O'], ['O'], ['O'], ['B-gpe'], ['O'], ['B-geo'], ['O'], ['B-gpe'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['B-gpe'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['O'], ['B-geo'], ['O'], ['O'], ['O'], ['O'], ['O'], ['B-per'], ['B-geo'], ['O'], ['O'], ['O'], ['O'], ['B-org'], ['O'], ['B-org'], ['O'], ['O'], ['B-gpe'], ['O'], ['O'], ['O'], ['O'], ['B-org'], ['B-gpe'], ['O'], ['O'], ['O'], ['O'], ['B-per'], ['B-per'], ['O'], ['B-geo'], ['B-gpe'], ['O'], ['O'], ['O'], ['O'], ['B-geo'], ['O'], ['O'], 

In [17]:
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

F1-score: 70.1%


In [18]:
print(classification_report(test_labels, pred_labels))

           precision    recall  f1-score   support

      org       0.51      0.39      0.44       547
      per       0.71      0.82      0.76       828
      geo       0.69      0.84      0.76       678
      gpe       0.73      0.89      0.80       605
      tim       0.00      0.00      0.00        97
      eve       0.00      0.00      0.00         5
      art       0.00      0.00      0.00         6
      nat       0.00      0.00      0.00         3

micro avg       0.68      0.72      0.70      2769
macro avg       0.64      0.72      0.68      2769

