In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('data/ner_dataset.csv', encoding='latin1')
data = data.fillna(method='ffill')

In [2]:
class SentenceGetter:
    def __init__(self, data):
        self.pos = 1
        self.data = data
        self.empty = False
        
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s.Word.tolist(),
                                                          s.POS.tolist(),
                                                          s.Tag.tolist())]
        
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def reset(self):
        self.pos = 1
        
    def get_next(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.pos)]
            self.pos += 1
            return s
        except:
            return None

In [3]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [4]:
words = set(data.Word.tolist())
n_words = len(words)

tags = set(data.Tag.tolist())
n_tags = len(tags)

print('words: {n_words} tags: {n_tags}'.format(n_words=n_words, n_tags=n_tags))

words: 35178 tags: 17


In [5]:
getter = SentenceGetter(data)

In [6]:
sentences = getter.sentences
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [76]:
max_len = 75
max_len_char = 10

In [77]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

In [78]:
chars = set([c for w in words for c in w])
n_chars = len(chars)
char2idx = {c:i+2 for i, c in enumerate(chars)}
char2idx['PAD'] = 0
char2idx['UNK'] = 1

In [79]:
X_char = []
for sentence in sentences:
    sent_sequence = []
    for i in range(max_len):
        char_sequence = []
        for j in range(max_len_char):
            try:
                char_sequence.append(char2idx[sentence[i][0][j]])
            except:
                char_sequence.append(char2idx['PAD'])
        sent_sequence.append(char_sequence)   
    X_char.append(np.array(sent_sequence))

In [80]:
X_char[0]

array([[44, 60, 40, 80, 14, 88, 56, 71, 14,  0],
       [40, 85,  0,  0,  0,  0,  0,  0,  0,  0],
       [71, 27, 76, 40, 56, 14, 67, 92, 88, 67],
       [60, 88, 95, 27,  0,  0,  0,  0,  0,  0],
       [76, 88, 92,  3, 60, 27, 71,  0,  0,  0],
       [67, 60, 92, 40, 80, 34, 60,  0,  0,  0],
       [51, 40, 56, 71, 40, 56,  0,  0,  0,  0],
       [67, 40,  0,  0,  0,  0,  0,  0,  0,  0],
       [84, 92, 40, 67, 27, 14, 67,  0,  0,  0],
       [67, 60, 27,  0,  0,  0,  0,  0,  0,  0],
       [ 7, 88, 92,  0,  0,  0,  0,  0,  0,  0],
       [43, 56,  0,  0,  0,  0,  0,  0,  0,  0],
       [24, 92, 88, 53,  0,  0,  0,  0,  0,  0],
       [88, 56, 71,  0,  0,  0,  0,  0,  0,  0],
       [71, 27, 76, 88, 56, 71,  0,  0,  0,  0],
       [67, 60, 27,  0,  0,  0,  0,  0,  0,  0],
       [ 7, 43, 67, 60, 71, 92, 88,  7, 88, 86],
       [40, 85,  0,  0,  0,  0,  0,  0,  0,  0],
       [35, 92, 43, 67, 43, 14, 60,  0,  0,  0],
       [67, 92, 40, 40, 84, 14,  0,  0,  0,  0],
       [85, 92, 40, 

In [81]:
from keras.preprocessing.sequence import pad_sequences

In [142]:
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, 
                  sequences=X, 
                  padding='post', 
                  value=word2idx['PAD'], 
                  truncating='post')

y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, 
                  sequences=y, 
                  value=tag2idx['PAD'], 
                  padding='post',
                  truncating='post')

In [143]:
y[0]

array([ 2,  2,  2,  2,  2,  2, 14,  2,  2,  2,  2,  2, 14,  2,  2,  2,  2,
        2, 11,  2,  2,  2,  2,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [145]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags+1) for i in y]

In [146]:
y[0]

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [147]:
from sklearn.model_selection import train_test_split

In [148]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.1, 
                                                    random_state=2018)

X_char_train, X_char_test, y_char_train, y_char_test = train_test_split(X_char,
                                                                       y,
                                                                       test_size=0.1,
                                                                       random_state=2018)

In [149]:
type(X_train)

numpy.ndarray

In [225]:
X_char_test[0]

array([[44, 60, 27, 47,  0,  0,  0,  0,  0,  0],
       [14, 88, 47,  0,  0,  0,  0,  0,  0,  0],
       [67, 60, 27,  0,  0,  0,  0,  0,  0,  0],
       [71, 40,  3, 80, 76, 27, 56, 67, 14,  0],
       [ 7, 27, 92, 27,  0,  0,  0,  0,  0,  0],
       [85, 40, 80, 56, 71,  0,  0,  0,  0,  0],
       [43, 56,  0,  0,  0,  0,  0,  0,  0,  0],
       [67, 60, 27,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 40, 76, 84, 80, 67, 27, 92,  0,  0],
       [40, 85,  0,  0,  0,  0,  0,  0,  0,  0],
       [50, 72, 61, 33,  0,  0,  0,  0,  0,  0],
       [86, 27, 88, 71, 27, 92,  0,  0,  0,  0],
       [61, 88, 80, 86,  0,  0,  0,  0,  0,  0],
       [61, 27, 47, 27, 14,  0,  0,  0,  0,  0],
       [71, 80, 92, 43, 56, 34,  0,  0,  0,  0],
       [88,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 92, 40, 14, 14, 42,  4, 40, 92, 71],
       [92, 88, 43, 71,  0,  0,  0,  0,  0,  0],
       [43, 56,  0,  0,  0,  0,  0,  0,  0,  0],
       [55,  3, 80, 88, 71, 40, 92,  0,  0,  0],
       [86, 88, 14, 

In [151]:
np.array(y_train)[0]

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [152]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, concatenate
from keras_contrib.layers import CRF
import keras_metrics

In [227]:
# word_in = Input(shape=(max_len,), 
#                name='word_in')
# word_emb = Embedding(input_dim=n_words+2, 
#                      output_dim=20, 
#                      input_length=max_len,
#                      mask_zero=True, 
#                      name='word_embedding')(word_in)

# char_in = Input(shape=(max_len, 
#                        max_len_char,),
#                name='char_in')
# char_emb = TimeDistributed(Embedding(input_dim=n_chars+2,
#                                      output_dim=10,
#                                      input_length=max_len_char,
#                                      mask_zero=True,
#                                      name='char_embedding'))(char_in)

# char_encoder = TimeDistributed(LSTM(units=20, 
#                                     return_sequences=False,
#                                     recurrent_dropout=0.5,
#                                     name='char_encoder'))(char_emb)

# merged_input = concatenate([word_emb, char_encoder])
# model = Bidirectional(LSTM(units=50, 
#                            return_sequences=True, 
#                            recurrent_dropout=0.6,
#                            name='main_lstm'))(merged_input)

# out = TimeDistributed(Dense(n_tags+1, activation='softmax', name='output'))(model)

#model = Model(word_in, out)

word_input = Input(shape=(max_len,), 
              name='word_input')

word_embedding = Embedding(input_dim=n_words+2,
                      output_dim=50, 
                      input_length=max_len, 
                      name='word_embedding')(word_input)

word_dropout = Dropout(0.1, name='word_dropout')(word_embedding)

char_input = Input(shape=(max_len, 
                       max_len_char,),
               name='char_input')
char_emb = TimeDistributed(Embedding(input_dim=n_chars+2,
                                     output_dim=10,
                                     input_length=max_len_char,
                                     mask_zero=True,
                                     name='char_embedding'))(char_input)

char_encoder = TimeDistributed(LSTM(units=20, 
                                    return_sequences=False,
                                    recurrent_dropout=0.5,
                                    name='char_encoder'))(char_emb)

#dropout = Dropout(0.1, name='dropout')(word_embedding)

merged_input = concatenate([word_dropout, char_encoder])

blstm = Bidirectional(LSTM(units=100, 
                           return_sequences=True, 
                           recurrent_dropout=0.1),
                      name='BiLSTM')(merged_input)

output = TimeDistributed(Dense(n_tags+1, 
                               activation='softmax'), 
                        name='output')(blstm)
model = Model(inputs=[word_input, char_input], outputs=output)

In [228]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc"])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input (InputLayer)         (None, 75)           0                                            
__________________________________________________________________________________________________
char_input (InputLayer)         (None, 75, 10)       0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 75, 50)       1759000     word_input[0][0]                 
__________________________________________________________________________________________________
time_distributed_40 (TimeDistri (None, 75, 10, 10)   1000        char_input[0][0]                 
__________________________________________________________________________________________________
word_dropo

In [229]:
from keras.utils import plot_model
plot_model(model, to_file='model.png')

In [230]:
type(X_char_train)

list

In [231]:
history = model.fit([X_train, np.array(X_char_train)], 
                    np.array(y_train),
                    batch_size=64, 
                    epochs=5, 
                    validation_split=0.1, 
                    verbose=1)

# history = model.fit([X_train,
#                      np.array(X_char_train).reshape((len(X_char_train), max_len, max_len_char))],
#                     np.array(y_train).reshape(len(y_train), max_len, 1),
#                     batch_size=32, epochs=10, validation_split=0.1, verbose=1)

Train on 38846 samples, validate on 4317 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
np.array(X_char_train).shape

(43163, 75, 10)