In [1]:
import pandas as pd
import numpy as np
import tensorflow
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [2]:
data=pd.read_csv("dataset/ner.csv",encoding= 'unicode_escape')
data.head(5)

Unnamed: 0.1,Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Sentence: 1,Thousands,NNS,O
1,1,,of,IN,O
2,2,,demonstrators,NNS,O
3,3,,have,VBP,O
4,4,,marched,VBN,O


In [3]:
from itertools import chain
def make_dict_map(data, tokentag):
    token_to_idx = {}
    idx_to_token = {}  
    if tokentag == 'token':
        voc = list(set(data['Word'].to_list()))
    else:
        voc = list(set(data['Tag'].to_list()))
    idx_to_token = {idx:tok for  idx, tok in enumerate(voc)}
    token_to_idx = {tok:idx for  idx, tok in enumerate(voc)}
    return token_to_idx , idx_to_token

In [4]:
token_to_idx, idx_to_token = make_dict_map(data, 'token')
tag_to_idx, idx_to_tag = make_dict_map(data, 'tag')
data['Word_idx'] = data['Word'].map(token_to_idx)
data['Tag_idx'] = data['Tag'].map(tag_to_idx)
data_fillna = data.fillna(method='ffill', axis=0)

In [5]:
data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))


In [7]:
def get_train_test_val(data_group, datas):
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    ntoken = len(list(set(datas['Word'].to_list())))
    ntag = len(list(set(datas['Tag'].to_list())))    
    padtokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= ntoken - 1)
    tags = data_group['Tag_idx'].tolist()
    padtags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag_to_idx["O"])
    ntags = len(tag_to_idx)
    padtags = [to_categorical(i, num_classes=ntags) for i in padtags]
    tokens, testtokens, tags, testtags = train_test_split(padtokens, padtags, test_size=0.1, train_size=0.9, random_state=2020)
    traintokens, valtokens, traintags, valtags = train_test_split(tokens,tags,test_size = 0.25,train_size =0.75, random_state=2020)
    print(
        'length of train tokens :', len(train_tokens),
        '\nlength of train tags   :', len(train_tags),
        '\nlength of test tokens  :', len(test_tokens),
        '\nlength of test tags    :', len(test_tags),
        '\nlength of val tokens   :', len(val_tokens),
        '\nlength of val tags     :', len(val_tags),
    )
    
    return traintokens, testtokens, valtokens, traintags,testtags,valtags

In [9]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
ntags = len(tag_to_idx)
ntags

17

In [15]:
def get_bilstmlstm():
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    model.add(TimeDistributed(Dense(1, activation="relu")))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [11]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [19]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstmlstm()
plot_model(model_bilstm_lstm)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 104, 64)           2251456   
                                                                 
 bidirectional_5 (Bidirectio  (None, 104, 128)         66048     
 nal)                                                            
                                                                 
 lstm_11 (LSTM)              (None, 104, 64)           49408     
                                                                 
 time_distributed_3 (TimeDis  (None, 104, 1)           65        
 tributed)                                                       
                                                                 
Total params: 2,366,977
Trainable params: 2,366,977
Non-trainable params: 0
_________________________________________________________________
('You must install pydot (`pip install pydot