https://www.depends-on-the-definition.com/named-entity-recognition-with-residual-lstm-and-elmo/

In [1]:
import numpy as np
import os
import pandas as pd
import re

In [2]:
data_dir = r'Data/'

In [3]:
file = np.loadtxt(fname=data_dir+'nertrnweather.txt',encoding='utf-8',dtype=str)
file

array([['अरे', 'O'],
       ['सारथी', 'O'],
       ['जम्मू', 'U-location'],
       ...,
       ['क्या', 'O'],
       ['है', 'O'],
       ['?', 'O']], dtype='<U14')

In [4]:
df = pd.DataFrame(columns=['Sentence #','Word','Tag'])

In [5]:
i=1
data = list()
with open(data_dir+'nertrnweather.txt',encoding='utf-8') as f:
    for line in f.readlines():
        if line=='\n':
            i+=1
        else:
            data=line.split(" ")
            df=df.append({"Sentence #":f"Sentence {i}","Word":data[0],"Tag":re.sub("\n","",data[1])},ignore_index=True)
df

Unnamed: 0,Sentence #,Word,Tag
0,Sentence 1,अरे,O
1,Sentence 1,सारथी,O
2,Sentence 1,जम्मू,U-location
3,Sentence 1,कैसा,O
4,Sentence 1,मौसम,O
...,...,...,...
1403,Sentence 203,की,O
1404,Sentence 203,स्थिति,O
1405,Sentence 203,क्या,O
1406,Sentence 203,है,O


In [6]:
words = list(set(df["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

282

In [50]:
tags = list(set(df["Tag"].values))
n_tags = len(tags); n_tags
tags

['B-date',
 'O',
 'B-weather_type',
 'L-date',
 'L-weather_type',
 'B-location',
 'U-date',
 'L-location',
 'U-location',
 'U-weather_type']

In [8]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except Exception as e:
            print(e)
            return None

In [9]:
getter = SentenceGetter(df)

In [10]:
sent = getter.get_next()
print(sent)

[('अरे', 'O'), ('सारथी', 'O'), ('जम्मू', 'U-location'), ('कैसा', 'O'), ('मौसम', 'O'), ('है', 'O')]


In [11]:
sentences = getter.sentences

In [12]:
max_len = 16
tag2idx = {t: i for i, t in enumerate(tags)}

In [13]:
tag2idx["U-location"]

8

In [14]:
X = [[w[0] for w in s] for s in sentences]

In [15]:
new_X = []
for seq in X:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("__PAD__")
    new_X.append(new_seq)
X = new_X

In [16]:
print(len(X))

203


In [17]:
y = [[tag2idx[w[1]] for w in s] for s in sentences]

In [18]:
from keras.preprocessing.sequence import pad_sequences
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [19]:
y[1]

array([8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.14, random_state=2018)

In [22]:
batch_size = 29

In [23]:
from allennlp.modules.elmo import Elmo,batch_to_ids

In [24]:
datadir = os.path.join('hi', 'elmo')
options_file = os.path.join(datadir, 'hi-d512-options.json')
weight_file = os.path.join(datadir, 'hi-d512-elmo.hdf5')

elmo = Elmo(options_file,weight_file,1,dropout=0)

In [25]:
embeds = elmo(batch_to_ids(X))

In [26]:
embedding = embeds["elmo_representations"]

In [27]:
embedding[0].shape

torch.Size([203, 16, 1024])

In [28]:
import torch
import tensorflow as tf

embedding1 = []
for t in embedding:
    np_tensor = t.detach().numpy()
    embedding1 += [np_tensor.tolist()]

embedding = embedding1


In [29]:
print(len(embedding[0]))

203


In [30]:
y.shape

(203, 16)

In [31]:
X_tr, X_te, y_tr, y_te = train_test_split(embedding[0], y, test_size=0.14, random_state=2018)

In [32]:
# import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
from keras import backend as K
# import keras.backend.tensorflow_backend as K

In [33]:
sess = tf.Session()
# K.set_session(sess)
tf.compat.v1.keras.backend.set_session(sess);

In [34]:
tf.disable_eager_execution()
# tf.enable_eager_execution()

In [35]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

In [36]:
input_text = Input(shape=(max_len,1024), dtype=tf.float32)
# embedding = Lambda(ElmoEmbedding, output_shape=(None, 1024))(input_text)
# embedding = embedding_layer(input_text)
# embedding = ElmoEmbeddingLayer()(input_text)
# print(embedding)
# embedding = Lambda(ElmoEmbedding,output_shape=(None, 1024))(input_text)
# embedding = Lambda(ElmoEmbedding,output_shape=(None, 1024))
# x = Bidirectional(LSTM(units=512, return_sequences=True,
#                        recurrent_dropout=0.2, dropout=0.2))(embedding)
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(input_text)
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

In [37]:
# input_text = Input(shape=(max_len,), dtype=tf.string)
model = Model(input_text, out)

In [38]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [39]:
y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
y_te = y_te.reshape(y_te.shape[0], y_te.shape[1], 1)

In [40]:
print(np.array(X_te).shape)
print(np.array(X_tr).shape)

(29, 16, 1024)
(174, 16, 1024)


In [41]:
history = model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_te), y_te),
                    batch_size=batch_size, epochs=5, verbose=1)
# history = model.fit(np.array(embedding), y_tr, validation_data=(np.array(X_te), y_te),
#                     batch_size=batch_size, epochs=5, verbose=1)

Train on 174 samples, validate on 29 samples
Epoch 1/5



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [44]:
# model.save("ner_elmo_bilstm.h5")
# model.save('ner_elmo_bilstm.h5', overwrite=True,  save_format='tf', include_optimizer=True,)
# model.save_weights('ner_elmo_bilstm_weights.h5')
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")

In [45]:
np.array(X_te[i:i+batch_size]).shape

(14, 16, 1024)

In [46]:
i=0
p = model.predict(np.array(X_te[i:i+batch_size]))
p = np.argmax(p, axis=-1)

In [47]:
p.shape

(29, 16)

In [49]:
for X_te1, y_te1,p1 in zip(X_te,y_te.tolist(),p):
    for i,e in enumerate(embedding[0]):
        if e==X_te1:
            print("Sentence ", i)
            sentence = ""
#             c+=1
            for w, true, pred in zip(X[i], y_te1, p1):
                if w != "__PAD__":
                    sentence += " {} ({}) {}".format(w,tags[pred],pred)
#                 if w != "__PAD__" and tags[pred] == tags[true[0]]:
#                     print("{} : {} ({})".format(w,tags[pred], tags[true[0]]))
            print(sentence)

Sentence  120
 सिडनी (U-location) 8 का (O) 1 मौसम (O) 1
Sentence  72
 किधर (O) 1 ज्यादा (O) 1 ठंडी (U-weather_type) 9 होगी (O) 1 कसोल (U-location) 8 या (O) 1 मनाली (U-location) 8
Sentence  152
 आज (U-date) 6 गर्मी (U-weather_type) 9 ज्यादा (O) 1 है (O) 1 क्या (O) 1
Sentence  84
 कहा (O) 1 मौसम (O) 1 अच्छा (O) 1 है (O) 1 बेंगलोरे (U-location) 8 या (O) 1 गुड़गांव (U-location) 8
Sentence  66
 जौनपुर (U-location) 8 कितना (O) 1 ज्यादा (O) 1 गर्म (U-weather_type) 9 है (O) 1 अभी (U-date) 6 सूरत (U-location) 8 से (O) 1
Sentence  178
 क्या (O) 1 मैं (O) 1 घर (O) 1 के (O) 1 रास्ते (O) 1 पर (O) 1 भीगने (O) 1 वाला (O) 1 हूं (O) 1
Sentence  5
 आज (U-date) 6 बरसात (U-weather_type) 9 की (O) 1 संभावना (O) 1 है (O) 1 क्या (O) 1
Sentence  165
 और (O) 1 कितना (O) 1 गर्मी (U-weather_type) 9 बढ़ेगा (O) 1 और (O) 1 यहाँ (O) 1
Sentence  2
 और (O) 1 कितनी (O) 1 बारिश (U-weather_type) 9 होगी (O) 1 यहाँ (O) 1
Sentence  37
 सर्दी (U-weather_type) 9 कब (O) 1 से (O) 1 चालू (O) 1 होगी (O) 1 इस (B-date) 0 बार (L-date) 

In [None]:
i = 0
p = model.predict(np.array(X_tr[i:i+batch_size]))
p = np.argmax(p, axis=-1)

In [None]:
for X_te1, y_te1,p1 in zip(X_tr,y_tr.tolist(),p):
    for i,e in enumerate(embedding[0]):
        if e==X_te1:
            print("Sentence ", i)
            sentence = ""
#             c+=1
            for w, true, pred in zip(X[i], y_te1, p1):
                if w != "__PAD__":
                    sentence += " {} ({})".format(w,tags[pred])
#                 if w != "__PAD__" and tags[pred] == tags[true[0]]:
#                     print("{} : {} ({})".format(w,tags[pred], tags[true[0]]))
            print(sentence)

In [None]:
df = pd.DataFrame(columns=['Sentence #','Word','Tag'])

In [None]:
i=1
data = list()
with open(data_dir+'nervalweather.txt',encoding='utf-8') as f:
    for line in f.readlines():
        if line=='\n':
            i+=1
        else:
            data=line.split(" ")
            df=df.append({"Sentence #":f"Sentence {i}","Word":data[0],"Tag":re.sub("\n","",data[1])},ignore_index=True)
df

In [None]:
words = list(set(df["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

In [None]:
getter = SentenceGetter(df)

In [None]:
sentences = getter.sentences

In [None]:
max_len = 16
tag2idx = {t: i for i, t in enumerate(tags)}

In [None]:
X_test = [[w[0] for w in s] for s in sentences]

In [None]:
new_X = []
for seq in X_test:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("__PAD__")
    new_X.append(new_seq)
X_test = new_X

In [None]:
y_test = [[tag2idx[w[1]] for w in s] for s in sentences]

In [None]:
from keras.preprocessing.sequence import pad_sequences
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2idx["O"])

In [None]:
print(len(X_test))
X_test += X_test[0:8]
print(len(X_test))

#making size = batch_size

In [None]:
embeds = elmo(batch_to_ids(X_test))

In [None]:
embedding = embeds["elmo_representations"]
# embedding

In [None]:
import torch
import tensorflow as tf

embedding1 = []
for t in embedding:
    np_tensor = t.detach().numpy()
    embedding1 += [np_tensor.tolist()]

embedding = embedding1

In [None]:
i = 0

p = model.predict(np.array(embedding[0]))
p = np.argmax(p, axis=-1)

In [None]:
for X_te1, y_te1,p1 in zip(embedding[0],y_test.tolist(),p):
    for i,e in enumerate(embedding[0]):
        if e==X_te1:
            print("Sentence ", i)
            sentence = ""
#             c+=1
            for w, true, pred in zip(X_test[i], y_te1, p1):
                if w != "__PAD__":
                    sentence += " {} ({})".format(w,tags[pred])
#                 if w != "__PAD__" and tags[pred] == tags[true[0]]:
#                     print("{} : {} ({})".format(w,tags[pred], tags[true[0]]))
            print(sentence)