In [3]:
import numpy as np

In [169]:
from keras import backend as K
from keras.preprocessing import sequence
from keras.models import Model, load_model
from keras.regularizers import l2
from keras.layers.wrappers import TimeDistributed
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam, SGD

### Generate Example Data

Create 1000 sequences of length 20 randomly alternating between a 1 and a 2.

In [34]:
ex_data = np.random.randint(low=1, high=3., size=(1000,30))

In [35]:
ex_data[0,:]

array([1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2,
       2, 1, 2, 1, 1, 1, 1])

For every sequence, label it in one of three classes based on the number of consecutive 1's

In [36]:
def label_sequence(sequence):
    """
    Read through each sequence and label it class [0,1,0] if the sequence has two consecutive 1's, class [0,0,1]
    if it has three consecutive 1's, and class [0,0,1] otherwise.
    """
    labels = list()
    previous_i = 0
    previous_two_i = 0
    for i in sequence:
        if i == 1 and previous_i == 1 and previous_two_i == 1:
            labels.append(np.array([0,0,1]))
        elif i == 1 and previous_i == 1:
            labels.append(np.array([0,1,0]))
        else:
            labels.append(np.array([1,0,0]))
        previous_two_i = previous_i
        previous_i = i
    return labels

In [37]:
ex_data_labels = np.array([label_sequence(ex_data[i,:]) for i in range(ex_data.shape[0])])

In [38]:
ex_data_labels[0,:,:]

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1]])

### Toy Example Model 

In [39]:
MAX_LEN = ex_data.shape[1]
HIDDEN_UNITS = 4
N_SYMBOLS = 2
EMBED_SIZE = 1
NUM_LABELS = 3

In [40]:
def get_model():
    input_data = Input(shape=(MAX_LEN, ))
    x = Embedding(input_dim=N_SYMBOLS, output_dim=EMBED_SIZE, input_length=MAX_LEN)(input_data)
    x = LSTM(units=HIDDEN_UNITS, return_sequences=True)(x)
    output = TimeDistributed(Dense(NUM_LABELS, activation='softmax'))(x)
    
    model = Model(inputs=[input_data], outputs=[output])
    
    opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    
    return model

In [41]:
model = get_model()

In [42]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 1)             2         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 4)             96        
_________________________________________________________________
time_distributed_2 (TimeDist (None, 30, 3)             15        
Total params: 113
Trainable params: 113
Non-trainable params: 0
_________________________________________________________________


In [44]:
model.fit(x=ex_data, y=ex_data_labels, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7fec6045f9e8>

In [45]:
ex_data[0:2,:]

array([[1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2,
        2, 2, 1, 2, 1, 1, 1, 1],
       [1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 2, 1, 1, 2, 2]])

In [46]:
model.predict([ex_data[0:2,:]])

array([[[  6.79882169e-01,   2.39737630e-01,   8.03801790e-02],
        [  1.17834374e-01,   7.39907563e-01,   1.42258123e-01],
        [  2.69769393e-02,   1.24302827e-01,   8.48720193e-01],
        [  1.60899758e-02,   3.16763073e-02,   9.52233732e-01],
        [  9.91118491e-01,   5.62047411e-04,   8.31940956e-03],
        [  9.99995470e-01,   2.50739595e-06,   2.05002630e-06],
        [  9.99987721e-01,   1.08899376e-05,   1.42583451e-06],
        [  9.99981046e-01,   1.70224448e-05,   1.87679780e-06],
        [  9.92020309e-01,   7.97460414e-03,   5.12205770e-06],
        [  9.99899983e-01,   8.01646529e-05,   1.97743266e-05],
        [  9.99993443e-01,   4.92478739e-06,   1.71398028e-06],
        [  9.96875405e-01,   3.12279840e-03,   1.77940819e-06],
        [  9.99870539e-01,   1.13919363e-04,   1.54884456e-05],
        [  9.99992728e-01,   5.32445802e-06,   1.85208705e-06],
        [  9.96808112e-01,   3.19010811e-03,   1.82513270e-06],
        [  9.99872088e-01,   1.12369969e

### Text Data Example

Data here: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data

In [47]:
import pandas as pd

In [61]:
df = pd.read_csv('../data/entity-annotated-corpus/ner_dataset.csv', encoding='latin-1')

In [62]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [63]:
df.shape

(1048575, 4)

In [64]:
df['word_lower'] = df['Word'].apply(lambda x: x.lower())

In [66]:
word_vocab = list(set(df.word_lower.tolist()))

In [69]:
hash_to_word = {i:w for i,w, in enumerate(word_vocab)}
word_to_hash = {w:i for i,w in hash_to_word.items()}

In [71]:
df['word_hash'] = df['word_lower'].apply(lambda x: word_to_hash[x])

In [74]:
pos_label_dict = {'NN':np.array([1,0,0,0,0,0,0,0,0,0]),
                  'NNP':np.array([0,1,0,0,0,0,0,0,0,0]),
                  'IN':np.array([0,0,1,0,0,0,0,0,0,0]),
                  'DT':np.array([0,0,0,1,0,0,0,0,0,0]),
                  'JJ':np.array([0,0,0,0,1,0,0,0,0,0]),
                  'NNS':np.array([0,0,0,0,0,1,0,0,0,0]),
                  '.':np.array([0,0,0,0,0,0,1,0,0,0]),
                  ',':np.array([0,0,0,0,0,0,1,0,0,0]),
                  '``':np.array([0,0,0,0,0,0,1,0,0,0]),
                  '$':np.array([0,0,0,0,0,0,1,0,0,0]),
                  ':':np.array([0,0,0,0,0,0,1,0,0,0]),
                  ';':np.array([0,0,0,0,0,0,1,0,0,0]),
                  'VBD':np.array([0,0,0,0,0,0,0,1,0,0]),
                  'VBN':np.array([0,0,0,0,0,0,0,0,1,0])}

In [75]:
df['pos_label'] = df.POS.apply(lambda x: pos_label_dict.get(x,np.array([0,0,0,0,0,0,0,0,0,1])))

In [78]:
df.index.max()

1048574

In [127]:
TRAIN_SAMPLE_SIZE = 20000
VAL_SAMPLE_SIZE = 1500
TEST_SAMPLE_SIZE = 500
SEQUENCE_LENGTH = 50
VAL_START_INDEX = 850000
TEST_INDEX_START = 900000
N_CLASSES = 10

def generate_train_val_test(df):
    X_train = list()
    y_train = list()
    X_val = list()
    y_val = list()
    
    for train_n in range(TRAIN_SAMPLE_SIZE):
        random_start = np.random.randint(low=0, high=VAL_START_INDEX-SEQUENCE_LENGTH, size=1)[0]
        X_train.append(np.array(df.loc[random_start:random_start+SEQUENCE_LENGTH-1]['word_hash'].tolist()))
        y_train.append(np.vstack(df.loc[random_start:random_start+SEQUENCE_LENGTH-1]['pos_label'].tolist()))
        
    for val_n in range(VAL_SAMPLE_SIZE):
        random_start = np.random.randint(low=VAL_START_INDEX, high=TEST_INDEX_START-SEQUENCE_LENGTH, size=1)[0]
        X_val.append(np.array(df.loc[random_start:random_start+SEQUENCE_LENGTH-1]['word_hash'].tolist()))
        y_val.append(np.vstack(df.loc[random_start:random_start+SEQUENCE_LENGTH-1]['pos_label'].tolist()))
    
    return np.vstack(X_train), np.reshape(np.vstack(y_train), (TRAIN_SAMPLE_SIZE,SEQUENCE_LENGTH,N_CLASSES)), np.vstack(X_val), np.reshape(np.vstack(y_val), (VAL_SAMPLE_SIZE,SEQUENCE_LENGTH,N_CLASSES)),

In [128]:
X_train, y_train, X_val, y_val = generate_train_val_test(df)

In [129]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(20000, 50)
(20000, 50, 10)
(1500, 50)
(1500, 50, 10)


### Train a Model

In [168]:
MAX_LEN = X_train.shape[1]
HIDDEN_UNITS = 32
N_SYMBOLS = len(word_to_hash)
EMBED_SIZE = 64
NUM_LABELS = 10

In [170]:
def get_model():
    input_data = Input(shape=(MAX_LEN, ))
    x = Embedding(input_dim=N_SYMBOLS, output_dim=EMBED_SIZE, input_length=MAX_LEN)(input_data)
    x = Dropout(0.5)(x)
    x = Bidirectional(LSTM(units=HIDDEN_UNITS, return_sequences=True, kernel_regularizer=l2(l=0.001)))(x)
    output = TimeDistributed(Dense(NUM_LABELS, activation='softmax'))(x)
    
    model = Model(inputs=[input_data], outputs=[output])
    
    opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    
    return model

In [171]:
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 50, 64)            2036288   
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 64)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 50, 64)            24832     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 50, 10)            650       
Total params: 2,061,770
Trainable params: 2,061,770
Non-trainable params: 0
_________________________________________________________________


In [172]:
EPOCHS = 10
FILE_PATH = "models/ner_model_weights.hdf5"
checkpoint = ModelCheckpoint(FILE_PATH, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=15)
callbacks_list = [checkpoint, early]

In [173]:
model.fit(x=X_train, y=y_train, epochs=EPOCHS, validation_data=[X_val, y_val], callbacks=callbacks_list)

Train on 20000 samples, validate on 1500 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.17977, saving model to models/ner_model_weights.hdf5
Epoch 2/10
Epoch 00002: val_loss improved from 0.17977 to 0.13381, saving model to models/ner_model_weights.hdf5
Epoch 3/10
Epoch 00003: val_loss improved from 0.13381 to 0.12205, saving model to models/ner_model_weights.hdf5
Epoch 4/10
Epoch 00004: val_loss improved from 0.12205 to 0.11871, saving model to models/ner_model_weights.hdf5
Epoch 5/10
Epoch 00005: val_loss improved from 0.11871 to 0.11786, saving model to models/ner_model_weights.hdf5
Epoch 6/10
Epoch 00006: val_loss improved from 0.11786 to 0.11736, saving model to models/ner_model_weights.hdf5
Epoch 7/10
Epoch 00007: val_loss improved from 0.11736 to 0.11633, saving model to models/ner_model_weights.hdf5
Epoch 8/10
Epoch 00008: val_loss did not improve
Epoch 9/10
Epoch 00009: val_loss did not improve
Epoch 10/10
Epoch 00010: val_loss did not improve


<keras.callbacks.History at 0x7febdeaa2860>

In [174]:
model = load_model(FILE_PATH)

In [175]:
def plain_text_to_sequence(text_string):
    text_split = text_string.replace("."," .").split()
    text_hash = [word_to_hash.get(w.lower(),0) for w in text_split]
    text_pad = sequence.pad_sequences([text_hash], maxlen=50)
    return text_pad

label_pos_dict = {0:'NN',
                  1:'NNP',
                  2:'IN',
                  3:'DT',
                  4:'JJ',
                  5:'NNS',
                  6:'PUNCT',
                  7:'VBD',
                  8:'VBN',
                  9:'OTHER'}

In [176]:
plain_text_to_sequence("The dog ran across the street.")

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0, 12343, 14984,
        28149, 26507, 12343, 10504, 10663]], dtype=int32)

In [179]:
y_hat = np.argmax(model.predict(plain_text_to_sequence("The dog ran across the street."))[0,:,:], axis=1)

In [180]:
[label_pos_dict.get(x,"NONE") for x in y_hat]

['JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'JJ',
 'VBD',
 'DT',
 'NN',
 'VBD',
 'IN',
 'DT',
 'NN',
 'PUNCT']