https://www.depends-on-the-definition.com/named-entity-recognition-with-residual-lstm-and-elmo/

In [1]:
import numpy as np
import os
import pandas as pd
import re

In [2]:
data_dir = r'Data/'

In [3]:
file = np.loadtxt(fname=data_dir+'nertrnweather.txt',encoding='utf-8',dtype=str)
file

array([['अरे', 'O'],
       ['सारथी', 'O'],
       ['जम्मू', 'U-location'],
       ...,
       ['क्या', 'O'],
       ['है', 'O'],
       ['?', 'O']], dtype='<U14')

In [4]:
df = pd.DataFrame(columns=['Sentence #','Word','Tag'])

In [5]:
i=1
data = list()
with open(data_dir+'nertrnweather.txt',encoding='utf-8') as f:
    for line in f.readlines():
        if line=='\n':
            i+=1
        else:
            data=line.split(" ")
            df=df.append({"Sentence #":f"Sentence {i}","Word":data[0],"Tag":re.sub("\n","",data[1])},ignore_index=True)
df

Unnamed: 0,Sentence #,Word,Tag
0,Sentence 1,अरे,O
1,Sentence 1,सारथी,O
2,Sentence 1,जम्मू,U-location
3,Sentence 1,कैसा,O
4,Sentence 1,मौसम,O
...,...,...,...
1403,Sentence 203,की,O
1404,Sentence 203,स्थिति,O
1405,Sentence 203,क्या,O
1406,Sentence 203,है,O


In [6]:
words = list(set(df["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

282

In [7]:
tags = list(set(df["Tag"].values))
n_tags = len(tags); n_tags

10

In [8]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except Exception as e:
            print(e)
            return None

In [9]:
getter = SentenceGetter(df)

In [10]:
sent = getter.get_next()
print(sent)

[('अरे', 'O'), ('सारथी', 'O'), ('जम्मू', 'U-location'), ('कैसा', 'O'), ('मौसम', 'O'), ('है', 'O')]


In [11]:
sentences = getter.sentences

In [12]:
max_len = 16
tag2idx = {t: i for i, t in enumerate(tags)}

In [13]:
tag2idx["U-location"]

9

In [14]:
X = [[w[0] for w in s] for s in sentences]

In [15]:
new_X = []
for seq in X:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("__PAD__")
    new_X.append(new_seq)
X = new_X

In [16]:
print(len(X))

203


In [17]:
y = [[tag2idx[w[1]] for w in s] for s in sentences]

In [18]:
from keras.preprocessing.sequence import pad_sequences
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [19]:
y[1]

array([9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6])

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.14, random_state=2018)

In [22]:
batch_size = 29

In [23]:
from allennlp.modules.elmo import Elmo,batch_to_ids

In [24]:
datadir = os.path.join('hi', 'elmo')
options_file = os.path.join(datadir, 'hi-d512-options.json')
weight_file = os.path.join(datadir, 'hi-d512-elmo.hdf5')

elmo = Elmo(options_file,weight_file,1,dropout=0)

In [29]:
embeds = elmo(batch_to_ids(X))

In [30]:
embedding = embeds["elmo_representations"]

In [31]:
embedding[0].shape

torch.Size([203, 16, 1024])

In [32]:
import torch
import tensorflow as tf

embedding1 = []
for t in embedding:
    np_tensor = t.detach().numpy()
    embedding1 += [np_tensor.tolist()]

embedding = embedding1


In [33]:
print(len(embedding[0]))

203


In [34]:
y.shape

(203, 16)

In [35]:
X_tr, X_te, y_tr, y_te = train_test_split(embedding[0], y, test_size=0.14, random_state=2018)

In [36]:
# import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
from keras import backend as K
# import keras.backend.tensorflow_backend as K

In [37]:
sess = tf.Session()
# K.set_session(sess)
tf.compat.v1.keras.backend.set_session(sess);

In [38]:
tf.disable_eager_execution()
# tf.enable_eager_execution()

In [39]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

In [40]:
input_text = Input(shape=(max_len,1024), dtype=tf.float32)
# embedding = Lambda(ElmoEmbedding, output_shape=(None, 1024))(input_text)
# embedding = embedding_layer(input_text)
# embedding = ElmoEmbeddingLayer()(input_text)
# print(embedding)
# embedding = Lambda(ElmoEmbedding,output_shape=(None, 1024))(input_text)
# embedding = Lambda(ElmoEmbedding,output_shape=(None, 1024))
# x = Bidirectional(LSTM(units=512, return_sequences=True,
#                        recurrent_dropout=0.2, dropout=0.2))(embedding)
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(input_text)
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

In [41]:
# input_text = Input(shape=(max_len,), dtype=tf.string)
model = Model(input_text, out)

In [42]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [43]:
y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
y_te = y_te.reshape(y_te.shape[0], y_te.shape[1], 1)

In [44]:
print(np.array(X_te).shape)
print(np.array(X_tr).shape)

(29, 16, 1024)
(174, 16, 1024)


In [45]:
history = model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_te), y_te),
                    batch_size=batch_size, epochs=5, verbose=1)
# history = model.fit(np.array(embedding), y_tr, validation_data=(np.array(X_te), y_te),
#                     batch_size=batch_size, epochs=5, verbose=1)

Train on 174 samples, validate on 29 samples
Epoch 1/5



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [46]:
model.save("ner_elmo_bilstm.h5")

In [47]:
np.array(X_te[i:i+batch_size]).shape

(14, 16, 1024)

In [48]:
i=0
p = model.predict(np.array(X_te[i:i+batch_size]))
p = np.argmax(p, axis=-1)

In [49]:
p.shape

(29, 16)

In [50]:
for X_te1, y_te1,p1 in zip(X_te,y_te.tolist(),p):
    for i,e in enumerate(embedding[0]):
        if e==X_te1:
            print("Sentence ", i)
            sentence = ""
#             c+=1
            for w, true, pred in zip(X[i], y_te1, p1):
                if w != "__PAD__":
                    sentence += " {} ({})".format(w,tags[pred])
#                 if w != "__PAD__" and tags[pred] == tags[true[0]]:
#                     print("{} : {} ({})".format(w,tags[pred], tags[true[0]]))
            print(sentence)

Sentence  120
 सिडनी (U-location) का (O) मौसम (O)
Sentence  72
 किधर (O) ज्यादा (O) ठंडी (U-weather_type) होगी (O) कसोल (U-location) या (O) मनाली (U-location)
Sentence  152
 आज (U-date) गर्मी (U-weather_type) ज्यादा (O) है (O) क्या (O)
Sentence  84
 कहा (O) मौसम (O) अच्छा (O) है (O) बेंगलोरे (U-location) या (O) गुड़गांव (U-location)
Sentence  66
 जौनपुर (U-location) कितना (O) ज्यादा (O) गर्म (U-weather_type) है (O) अभी (U-date) सूरत (U-location) से (O)
Sentence  178
 क्या (O) मैं (O) घर (O) के (O) रास्ते (O) पर (O) भीगने (O) वाला (O) हूं (O)
Sentence  5
 आज (U-date) बरसात (U-weather_type) की (O) संभावना (O) है (O) क्या (O)
Sentence  165
 और (O) कितना (O) गर्मी (U-weather_type) बढ़ेगा (O) और (O) यहाँ (O)
Sentence  2
 और (O) कितनी (O) बारिश (U-weather_type) होगी (O) यहाँ (O)
Sentence  37
 सर्दी (U-weather_type) कब (O) से (O) चालू (O) होगी (O) इस (B-date) बार (L-date)
Sentence  130
 आज (U-date) मौसम (O) का (O) क्या (O) हाल (O) है (O)
Sentence  126
 आज (U-date) के (O) लिए (O) मौसम (O) रिपोर्

In [51]:
i = 0
p = model.predict(np.array(X_tr[i:i+batch_size]))
p = np.argmax(p, axis=-1)

In [52]:
for X_te1, y_te1,p1 in zip(X_tr,y_tr.tolist(),p):
    for i,e in enumerate(embedding[0]):
        if e==X_te1:
            print("Sentence ", i)
            sentence = ""
#             c+=1
            for w, true, pred in zip(X[i], y_te1, p1):
                if w != "__PAD__":
                    sentence += " {} ({})".format(w,tags[pred])
#                 if w != "__PAD__" and tags[pred] == tags[true[0]]:
#                     print("{} : {} ({})".format(w,tags[pred], tags[true[0]]))
            print(sentence)

Sentence  201
 मुंबई (U-location) में (O) बारिश (U-weather_type) होगी (O) क्या (O) आज (U-date)
Sentence  168
 क्या (O) कोटा (U-location) में (O) लू (U-weather_type) चल (O) रही (O) है (O)
Sentence  56
 कल (U-date) मौसम (O) कैसा (O) रहने (O) वाला (O) है (O)
Sentence  27
 क्या (O) बर्फ (U-weather_type) बरी (O) चालू (O) है (O) मनाली (U-location) में (O)
Sentence  113
 सबसे (O) ठंडी (U-weather_type) जगह (O) कौनसी (O) है (O) दुनिया (U-location) में (O)
Sentence  63
 बैंगलोर (U-location) ठंडा (U-weather_type) है (O) क्या (O) गुड़गांव (U-location) से (O)
Sentence  79
 पटना (U-location) कितना (O) डिग्री (O) ज्यादा (O) ठंडा (U-weather_type) है (O) सूरत (U-location) से (O)
Sentence  15
 क्या (O) बेंगलुरु (U-location) में (O) बारिश (U-weather_type) हो (O) रही (O) है (O)
Sentence  21
 क्या (O) बैंगलोर (U-location) में (O) हिमपात (U-weather_type) हो (O) रहा (O) है (O)
Sentence  7
 क्या (O) आज (U-date) जोरदार (O) बारिश (U-weather_type) होने (O) वाली (O) है (O) नागपुर (U-location) में (O)
Sentence  78

In [53]:
df = pd.DataFrame(columns=['Sentence #','Word','Tag'])

In [54]:
i=1
data = list()
with open(data_dir+'nervalweather.txt',encoding='utf-8') as f:
    for line in f.readlines():
        if line=='\n':
            i+=1
        else:
            data=line.split(" ")
            df=df.append({"Sentence #":f"Sentence {i}","Word":data[0],"Tag":re.sub("\n","",data[1])},ignore_index=True)
df

Unnamed: 0,Sentence #,Word,Tag
0,Sentence 1,आज,U-date
1,Sentence 1,मौसम,O
2,Sentence 1,कैसा,O
3,Sentence 1,है,O
4,Sentence 2,क्या,O
...,...,...,...
125,Sentence 21,रेणुकूट,U-location
126,Sentence 21,में,O
127,Sentence 21,मौसम,O
128,Sentence 21,कैसा,O


In [55]:
words = list(set(df["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

57

In [56]:
getter = SentenceGetter(df)

In [57]:
sentences = getter.sentences

In [58]:
max_len = 16
tag2idx = {t: i for i, t in enumerate(tags)}

In [59]:
X_test = [[w[0] for w in s] for s in sentences]

In [60]:
new_X = []
for seq in X_test:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("__PAD__")
    new_X.append(new_seq)
X_test = new_X

In [61]:
y_test = [[tag2idx[w[1]] for w in s] for s in sentences]

In [62]:
from keras.preprocessing.sequence import pad_sequences
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2idx["O"])

In [63]:
print(len(X_test))
X_test += X_test[0:8]
print(len(X_test))

#making size = batch_size

21
29


In [64]:
embeds = elmo(batch_to_ids(X_test))

In [65]:
embedding = embeds["elmo_representations"]
# embedding

In [66]:
import torch
import tensorflow as tf

embedding1 = []
for t in embedding:
    np_tensor = t.detach().numpy()
    embedding1 += [np_tensor.tolist()]

embedding = embedding1

In [67]:
i = 0

p = model.predict(np.array(embedding[0]))
p = np.argmax(p, axis=-1)

In [68]:
for X_te1, y_te1,p1 in zip(embedding[0],y_test.tolist(),p):
    for i,e in enumerate(embedding[0]):
        if e==X_te1:
            print("Sentence ", i)
            sentence = ""
#             c+=1
            for w, true, pred in zip(X_test[i], y_te1, p1):
                if w != "__PAD__":
                    sentence += " {} ({})".format(w,tags[pred])
#                 if w != "__PAD__" and tags[pred] == tags[true[0]]:
#                     print("{} : {} ({})".format(w,tags[pred], tags[true[0]]))
            print(sentence)

Sentence  0
 आज (U-date) मौसम (O) कैसा (O) है (O)
Sentence  1
 आज (U-date) ज्यादा (O) ठंड (U-weather_type) है (O) क्या (O)
Sentence  2
 मनाली (U-location) में (O) कितनी (O) ठंडी (U-weather_type) है (O)
Sentence  3
 कोटा (U-location) ठंडा (U-weather_type) है (O) क्या (O) दिल्ली (U-location) से (O)
Sentence  4
 कोटा (U-location) में (O) ज्यादा (O) ठंडी (U-weather_type) है (O) क्या (O)
Sentence  5
 पठानकोट (U-location) और (O) दिल्ली (U-location) में (O) कहा (O) ज्यादा (O) गर्मी (U-weather_type) होगी (O)
Sentence  6
 दुनिया (U-location) की (O) सबसे (O) नमी (U-weather_type) वाली (O) जगह (O)
Sentence  7
 बनारस (U-location) और (O) गाजीपुर (U-location) में (O) कहां (O) ज्यादा (O) गर्मी (U-weather_type) पड़ेगा (O)
Sentence  8
 बनारस (U-location) और (O) गाजीपुर (U-location) में (O) कहां (O) ज्यादा (O) ठंड (U-weather_type) पड़ेगा (O)
Sentence  9
 बनारस (U-location) और (O) गाजीपुर (U-location) में (O) कहाँ (O) ज्यादा (O) जाड़ा (O) पड़ेगा (O)
Sentence  10
 इस (B-date) साल (L-date) जाड़ा (O) पड़ेगा (O) 