In [1]:
# IMPORT DATA
import pandas as pd
import numpy as np

path = 'C:/Users/test/Desktop/GL 2018.09.29/'
data = pd.read_csv(path + 'ner_dataset.csv', encoding='latin1')
data = data.fillna(method="ffill") # Deal with N/A

In [2]:
tags = list(set(data["POS"].values)) # Read POS values

In [3]:
tags # List of possible POS values

['VBG',
 'RBS',
 'DT',
 'MD',
 'PDT',
 'RP',
 '$',
 '``',
 'PRP',
 'RRB',
 'VBN',
 'VBP',
 'NN',
 ':',
 'CC',
 'NNPS',
 'UH',
 'IN',
 'RBR',
 'VBD',
 'NNP',
 'JJS',
 'WP',
 'RB',
 'WDT',
 'EX',
 ';',
 'LRB',
 'NNS',
 'VB',
 'POS',
 'PRP$',
 ',',
 'JJR',
 'WP$',
 'WRB',
 'FW',
 'JJ',
 'VBZ',
 'TO',
 '.',
 'CD']

In [4]:
words = list(set(data["Word"].values))
words.append("DUMMY") # Add a dummy word to pad sentences.

In [5]:
# Code to read sentences

class ReadSentences(object): 
    
    def __init__(self, data):
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [6]:
sentences = ReadSentences(data).sentences # Read all sentences

In [7]:
# Convert words and tags into numbers
word2id = {w: i for i, w in enumerate(words)}
tag2id = {t: i for i, t in enumerate(tags)}

In [8]:
# Prepare input and output data

from keras.preprocessing.sequence import pad_sequences
max_len = 50
X = [[word2id[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=len(words)-1)
y = [[tag2id[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2id["."])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
# Convert output to one-hot bit

from keras.utils import to_categorical
y = [to_categorical(i, num_classes=len(tags)) for i in y]

In [21]:
y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

In [22]:
# Training and test split by sentences

from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20)

In [23]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [24]:
input = Input(shape=(max_len,)) # Input layer
model = Embedding(input_dim=len(words), output_dim=50, input_length=max_len)(input) # Word embedding layer
model = Dropout(0.1)(model) # Dropout
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model) # Bi-directional LSTM layer
out = TimeDistributed(Dense(len(tags), activation="softmax"))(model)  # softmax output layer

In [25]:
model = Model(input, out) # Complete model

In [26]:
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]) # Compile with an optimizer

In [27]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=3, validation_split=0.1, verbose=1) # Train

Train on 34530 samples, validate on 3837 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
# Demo test on one sample. See how it is mostly correct, but not 100%

i = 1213 # Some test sentence sample
p = model.predict(np.array([X_te[i]])) # Predict on it
p = np.argmax(p, axis=-1) # Map softmax back to a POS index
for w, pred in zip(X_te[i], p[0]): # for every word in the sentence
    print("{:20} -- {}".format(words[w], tags[pred])) # Print word and tag

Imports              -- NNS
of                   -- IN
the                  -- DT
types                -- NNS
of                   -- IN
watches              -- NNS
that                 -- IN
now                  -- RB
will                 -- MD
be                   -- VB
eligible             -- JJ
for                  -- IN
duty-free            -- JJ
treatment            -- NN
totaled              -- VBD
about                -- IN
$                    -- $
37.3                 -- CD
million              -- CD
in                   -- IN
1988                 -- CD
,                    -- ,
a                    -- DT
relatively           -- RB
small                -- JJ
share                -- NN
of                   -- IN
the                  -- DT
$                    -- $
1.5                  -- CD
billion              -- CD
in                   -- IN
U.S.                 -- NNP
watch                -- NN
imports              -- NNS
that                 -- WDT
year                 -- 

In [33]:
import nltk
from nltk import word_tokenize

sentence = nltk.word_tokenize('That was a nice jump')
X_Samp = pad_sequences(maxlen=max_len, sequences=[[word2id[word] for word in sentence]], padding="post", value=len(words)-1)

In [34]:
p = model.predict(np.array([X_Samp[0]])) # Predict on it
p = np.argmax(p, axis=-1) # Map softmax back to a POS index
for w, pred in zip(X_Samp[0], p[0]): # for every word in the sentence
    print("{:20} -- {}".format(words[w], tags[pred])) # Print word and tag

That                 -- DT
was                  -- VBD
a                    -- DT
nice                 -- JJ
jump                 -- NN
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY 