In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split
import nltk, pandas as pd
import numpy as np

# for file paths
from google.colab import userdata


# tf.debugging.set_log_device_placement(enabled=True)
# setup GPU
# Use the GPU name reported by TensorFlow, falling back to the CPU device
# string whenever that name is empty/falsy.
device_name = tf.test.gpu_device_name() or '/cpu:0'
device_name
# tf.device(device_name)

In [None]:
# Tokenizer data needed by nltk.word_tokenize in the prediction cell.
# Older NLTK releases read the 'punkt' resource; recent releases read
# 'punkt_tab' instead. nltk.download reports an unknown resource and returns
# False rather than raising, so requesting both is harmless on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [35]:
# Sentence Getter Class
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) triples.

    ``data`` must provide the columns "Sentence #", "Word", "POS" and "Tag";
    rows sharing a "Sentence #" value form one sentence.

    Attributes:
        n_sent: 1-based number of the sentence ``get_next`` will return next.
        data: the DataFrame passed to the constructor.
        empty: initialised to False; not read or updated by this class.
        grouped: Series keyed by "Sentence #" holding each sentence's triples.
        sentences: list of every sentence, in the iteration order of ``grouped``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _to_triples(frame):
        """Return the rows of one sentence as a list of (word, POS, tag) tuples."""
        return list(zip(frame["Word"].values.tolist(),
                        frame["POS"].values.tolist(),
                        frame["Tag"].values.tolist()))

    def get_next(self):
        """Return the sentence labelled "Sentence: <n_sent>" and advance the counter.

        Returns None (without advancing) when no sentence carries that label.
        Only the missing-key case is handled; any other error propagates
        instead of being silently turned into None.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            return None
        self.n_sent += 1
        return s

In [None]:
# Load Dataset
# The CSV path is read from the Colab secret named 'dataset'.
data = pd.read_csv(userdata.get('dataset'), encoding="latin1")
# Forward-fill missing cells from the row above. DataFrame.ffill() replaces
# fillna(method="ffill"), whose `method` argument is deprecated in recent pandas.
# NOTE(review): presumably "Sentence #" is only populated on the first row of
# each sentence, which is what makes the forward-fill necessary - confirm
# against the CSV.
data = data.ffill()
data.head(5)

In [37]:
# Pre-process

total_w = len(list(data['Word'].values))

# Build the vocabularies in sorted order. Iterating a set of strings yields a
# different order in every Python process (string hash randomisation), so an
# unsorted vocabulary would assign different indices after each kernel restart
# and make freshly built X/y disagree with a model saved in an earlier session.
words = sorted(set(data["Word"].values))
words.append("ENDPAD")
num_words = len(words)
tags = sorted(set(data["Tag"].values))
num_tags = len(tags)

getter = SentenceGetter(data)
sentences = getter.sentences

# Word indices are 1-based (words[i] <-> i + 1); tag indices are 0-based.
# NOTE(review): "ENDPAD" therefore maps to num_words, while the padding value
# used for X below is num_words - 1, which is also the index of the last real
# word. The cell that decodes a test sentence relies on exactly this
# convention, so it is kept here; a dedicated padding index would need both
# places changed together.
word2idx = {word: position + 1 for position, word in enumerate(words)}
tag2idx = {tag: position for position, tag in enumerate(tags)}

max_len = 50

# Each sentence is a list of (word, POS, tag) triples: element 0 is the word,
# element 2 the tag. Sequences are padded at the end up to max_len.
X = [[word2idx[triple[0]] for triple in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words - 1)

# Padded label positions are tagged as "O".
y = [[tag2idx[triple[2]] for triple in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [38]:
# Splitting dataset

# 80/20 split with a fixed seed so the same rows land in each split every run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# For GPU, creating constants
# Compare the device *name*: tf.device(...) returns a context-manager object,
# which never equals a string, so testing it against '/device:GPU:0' was
# always False and this branch could never run.
if device_name == '/device:GPU:0':
  x_train = tf.constant(x_train)
  x_test = tf.constant(x_test)
  y_train = tf.constant(y_train)
  y_test = tf.constant(y_test)

In [None]:
# Create Model
# Pipeline: word indices -> 50-d embedding -> spatial dropout -> bidirectional
# LSTM (one output per timestep) -> softmax over the tag set at every timestep.

input_word = Input(shape=(max_len,))

embedded = Embedding(input_dim=num_words, output_dim=50, input_length=max_len)(input_word)
dropped = SpatialDropout1D(0.1)(embedded)
encoded = Bidirectional(LSTM(units=100, return_sequences=True))(dropped)
out = TimeDistributed(Dense(num_tags, activation="softmax"))(encoded)

model = Model(inputs=input_word, outputs=out, name="NER_Model")

# Labels are integer tag indices, hence the sparse categorical loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model.summary()

In [None]:
# Training Model

# Keep only the weights of the epoch with the lowest validation loss.
# Keras 3 raises ValueError for a weights-only checkpoint whose path does not
# end in ".weights.h5"; this name is accepted by older tf.keras releases too.
chkpt = ModelCheckpoint("model.weights.h5", monitor='val_loss', verbose=1, save_best_only=True,
                        save_weights_only=True, mode='min')
# Stop once validation accuracy has failed to improve for 2 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=2, verbose=0,
                                mode='max', baseline=None, restore_best_weights=False)

callbacks = [chkpt, early_stopping]

batch_size = 256
epochs = 100

# reduce batch_size and epochs for cpu
# Compare the device *name*: tf.device(...) returns a context-manager object
# that is never equal to a string, so the original test was always True and
# the reduced CPU settings were applied even when a GPU was available.
if device_name != '/device:GPU:0':
  batch_size = 64
  epochs = 10

# NOTE(review): the test split is also used as validation data here, so the
# early-stopping/checkpoint decisions have already seen the data that the
# evaluation cell reports on.
model.fit(x=x_train, y=y_train, validation_data=(x_test, y_test), shuffle=True,
          batch_size=batch_size, epochs=epochs, callbacks=callbacks, verbose=1)


In [None]:
# Evaluate model
# Reports the compiled loss and accuracy on the held-out split.
model.evaluate(x=x_test, y=y_test)

In [108]:
# Save Model
# The destination path is read from a Colab secret chosen by the device check.
# NOTE(review): tf.device(...) returns a context-manager object, so comparing
# it with a string never matches and the 'model_cpu' path is selected on every
# machine. The "Load model" cell performs the same comparison, which keeps the
# two in agreement; switch both to compare `device_name` together, not one
# at a time.
save_key = 'model_gpu' if tf.device(device_name) == '/device:GPU:0' else 'model_cpu'
model.save(userdata.get(save_key))

In [None]:
# Load model
import tensorflow as tf
from tensorflow.keras import models

# Cleared first so a failed load does not leave a previously built model behind.
model = None

# The source path is read from a Colab secret chosen by the device check.
# NOTE(review): tf.device(...) returns a context-manager object, so comparing
# it with a string never matches and the 'model_cpu' path is selected on every
# machine. The "Save Model" cell performs the same comparison, which keeps the
# two in agreement; switch both to compare `device_name` together, not one
# at a time.
load_key = 'model_gpu' if tf.device(device_name) == '/device:GPU:0' else 'model_cpu'
model = models.load_model(userdata.get(load_key))

model.summary()

In [None]:
# save data
# Persist the state needed at inference time: vocabulary index, sequence
# length and tag list. The three values are placed in an explicit object
# array because letting np.save infer an array from this ragged tuple raises
# ValueError on NumPy >= 1.24. The file still unpacks into the same three
# values when read back with allow_pickle=True.
inference_state = np.empty(3, dtype=object)
inference_state[0] = word2idx
inference_state[1] = max_len
inference_state[2] = tags
np.save(userdata.get('data'), inference_state)

In [152]:
# load data
# The file holds pickled Python objects, so allow_pickle=True is required.
# Unpickling can execute arbitrary code: only load files written by this
# notebook's own "save data" cell.
restored_state = np.load(userdata.get('data'), allow_pickle=True)
word2idx, max_len, tags = restored_state

In [None]:
# Load sentence from test data set
import re

index = 9000

# Turn the padded index row back into words. Word indices are 1-based
# (index i + 1 <-> words[i]); an index whose words[x] entry is 'ENDPAD' is the
# padding value and is skipped.
tokens = []
for x in x_test[index]:
    if not words[x] == 'ENDPAD':
        tokens.append(words[x - 1] + ' ')

# Detokenise: drop the space before punctuation, and before an apostrophe
# when the previous token still ends with a space. Start at 1 because the
# first token has no predecessor - tokens[0 - 1] would wrap around and strip
# the space from the *last* token instead.
for j in range(1, len(tokens)):
    word = tokens[j]
    if re.findall(r'^[&().,?]', word) or (re.findall(r"^[']", word) and tokens[j - 1][-1] == ' '):
        tokens[j - 1] = tokens[j - 1][:-1]
sentence = ''.join(tokens)
sentence

In [None]:
# preprocess input sentence

# sentence = "The International Committee of the Red Cross helped organize the initiative, which is the first of its kind. "

# Tokenise only while `sentence` is still raw text: the token list is stored
# back into `sentence`, so without this guard re-running the cell would pass a
# list to word_tokenize and fail.
if isinstance(sentence, str):
    sentence = nltk.word_tokenize(sentence)

# Pad with the index the training data was padded with (num_words - 1, i.e.
# len(word2idx) - 1) rather than 0, which never occurs in the training data.
# NOTE(review): out-of-vocabulary words are still mapped to index 0, which is
# likewise never seen in training; a dedicated unknown-word index would be
# needed to handle them properly.
pad_index = len(word2idx) - 1
x_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in sentence]],
                       padding="post", value=pad_index, maxlen=max_len)


# Predicting
p = model.predict(x_sent)
p = np.argmax(p, axis=-1)

# pad_sequences truncates over-long input from the front by default, so the
# predictions belong to the last max_len tokens; pair them accordingly.
kept_tokens = sentence[-max_len:]
data = [[w, tags[tag_index]] for w, tag_index in zip(kept_tokens, p[0])]

for (w, t) in data:
  print("{:15}: {:5}".format(w, t))

In [None]:
# Post process predictions

# grouping words e.g. B-gpe and next immediate I-gpe will be combined as one B-gpe
def group_words(l):
    """Merge every 'I-' tagged token into the group that precedes it.

    Args:
        l: sequence of [word, tag] items (lists or tuples).

    Returns:
        A new list of lists. An item whose tag contains 'I-' has its word
        appended, space-separated, to the word of the preceding group, which
        keeps its own tag; every other item starts a new group. The first
        item always starts a group, whatever its tag. An empty input gives an
        empty list.

    The input items are copied, never modified, so calling this repeatedly on
    the same predictions gives the same result each time.
    """
    group = []
    for item in l:
        if group and 'I-' in item[1]:
            group[-1][0] += ' ' + item[0]
        else:
            # Copy so the in-place merge above cannot touch the caller's data.
            group.append(list(item))
    return group

# Display label for each entity type, i.e. the part of the tag after its
# "B-"/"I-" prefix.
entity_labels = {
  'nat': 'Nat',
  'org': 'Organization',
  'art': 'Art',
  'tim': 'Time',
  'geo': 'Location',
  'eve': 'Event',
  'gpe': 'Geo-Political',
  'per': 'Person',
}

# Build the display list: plain strings for untagged words, (text, label)
# tuples for entities. Looking the label up by entity type means a group that
# starts with an 'I-' tag is still labelled, and a tag with no known label
# falls back to plain text instead of being dropped from the sentence.
processed_data = []
for word, tag in group_words(data):
  word = word + ' '
  label = entity_labels.get(tag.split('-', 1)[-1])
  if label is None:
    processed_data.append(word)
  else:
    processed_data.append((word, label))

# display final results
processed_data