### This notebook predicts the Named Entities for a selected string from the Climate-Change dataset

In [1]:
#Importing libraries
from tensorflow import keras
import pandas as pd
import numpy as np
import json
import itertools
from keras.preprocessing.sequence import pad_sequences

#Custom library
from predifinedfunctions import characterSplit, addDummy, predictedDisplay, predictTags, padding, createMatrices, getCasing
# For prediction validation
import spacy
from spacy import displacy

In [2]:
#Importing NER Recognition models

#BILSTM
BiLSTM_Final_model = keras.models.load_model('TrainedModels/BiLSTM_Final_model.h5')
with open('word2index.json', 'r') as fp:
    word2index_1 = json.load(fp)    
with open('tag2index.json', 'r') as fp:
    tag2index_1 = json.load(fp)

#BILSTM-Strict-Clean
BiLSTM_Final_model_strict_clean = keras.models.load_model('TrainedModels/BiLSTM_Final_model_Strict_Clean.h5')
with open('word2index_strict_cleaned.json', 'r') as fp:
    word2index_2 = json.load(fp)    
with open('tag2index_strict_cleaned.json', 'r') as fp:
    tag2index_2 = json.load(fp)

#Bi-LSTM_CNN_GloVe    
BiLSTM_CNN_model_glove = keras.models.load_model('TrainedModels/Full_Trained_model_glove.h5')
with open('idx2Word_Glove.json', 'r') as fp:
    idx2Word_Glove = json.load(fp)    
with open('idx2Label_Glove.json', 'r') as fp:
    idx2Label_Glove = json.load(fp)

#Bi-LSTM_CNN_nonGloVe  
BiLSTM_CNN_model_non_glove = keras.models.load_model('TrainedModels/Full_Trained_model_non_glove.h5')
with open('idx2Word_nonGlove.json', 'r') as fp:
    idx2Word_nonGlove = json.load(fp)    
with open('idx2Label_nonGlove.json', 'r') as fp:
    idx2Label_nonGlove = json.load(fp)
    
#Importing additional dependencies    
with open('case2Idx.json', 'r') as fp:
    case2Idx = json.load(fp)    
with open('char2Idx.json', 'r') as fp:
    char2Idx = json.load(fp)
    

In [3]:
#Readig the Climate-Change Dataset and selecting top 10k tweets
ClimateChange_DF = pd.read_csv("Cleaned_English_tweets.csv")
ClimateChange_DF = ClimateChange_DF[["CleanedTweets"]].iloc[:10000].copy()
ClimateChange_DF.head()
ClimateChange_DF.shape

(10000, 1)

In [4]:
# Creating a list of word index with padding for the first two models
ClimateChange_DF["1Word"] = ClimateChange_DF["CleanedTweets"].apply(lambda x: str(x).split())
ClimateChange_DF["2Word"] = ClimateChange_DF["CleanedTweets"].apply(lambda x: str(x).lower().split())
ClimateChange_DF['1Word_Index'] = ClimateChange_DF['1Word'].apply(lambda x: [word2index_1[s] if s in word2index_1 else word2index_1['UNKNOWN_WORD'] for s in x])
ClimateChange_DF['2Word_Index'] = ClimateChange_DF['2Word'].apply(lambda x: [word2index_2[s] if s in word2index_2 else word2index_2['UNKNOWN_WORD'] for s in x])
ClimateChange_DF["1Padded_Word_Index"] = ClimateChange_DF["1Word_Index"].apply(lambda x: x + [word2index_1["PADDING"]] * (40 - len(x)) if (len(x) <=40) else x[:40])
ClimateChange_DF["2Padded_Word_Index"] = ClimateChange_DF["2Word_Index"].apply(lambda x: x + [word2index_2["PADDING"]] * (40 - len(x)) if (len(x) <=40) else x[:40])
ClimateChange_DF["3Word"] = ClimateChange_DF["CleanedTweets"].apply(lambda x: [[i] for i in str(x).split()])

In [5]:
# Preview the first five rows with the newly added columns
ClimateChange_DF.head(n=5)

Unnamed: 0,CleanedTweets,1Word,2Word,1Word_Index,2Word_Index,1Padded_Word_Index,2Padded_Word_Index,3Word
0,News Trends Data Americans are less concerned ...,"[News, Trends, Data, Americans, are, less, con...","[news, trends, data, americans, are, less, con...","[26341, 32355, 1041, 4030, 23216, 10517, 25944...","[2165, 325, 9098, 19174, 14213, 11148, 14962, ...","[26341, 32355, 1041, 4030, 23216, 10517, 25944...","[2165, 325, 9098, 19174, 14213, 11148, 14962, ...","[[News], [Trends], [Data], [Americans], [are],..."
1,Do you realize that civil war is the devastati...,"[Do, you, realize, that, civil, war, is, the, ...","[do, you, realize, that, civil, war, is, the, ...","[18091, 14666, 20342, 26653, 2796, 32980, 6563...","[11989, 23784, 6400, 8775, 1376, 15117, 8950, ...","[18091, 14666, 20342, 26653, 2796, 32980, 6563...","[11989, 23784, 6400, 8775, 1376, 15117, 8950, ...","[[Do], [you], [realize], [that], [civil], [war..."
2,Having anxiety over the weather something they...,"[Having, anxiety, over, the, weather, somethin...","[having, anxiety, over, the, weather, somethin...","[214, 19769, 29935, 8235, 21182, 2958, 35049, ...","[5249, 16605, 15306, 31082, 10820, 26137, 2720...","[214, 19769, 29935, 8235, 21182, 2958, 35049, ...","[5249, 16605, 15306, 31082, 10820, 26137, 2720...","[[Having], [anxiety], [over], [the], [weather]..."
3,In the last few years I've noticed that studen...,"[In, the, last, few, years, I've, noticed, tha...","[in, the, last, few, years, i've, noticed, tha...","[15224, 8235, 8856, 29571, 31255, 0, 12021, 26...","[18193, 31082, 27602, 3214, 21234, 0, 13170, 8...","[15224, 8235, 8856, 29571, 31255, 0, 12021, 26...","[18193, 31082, 27602, 3214, 21234, 0, 13170, 8...","[[In], [the], [last], [few], [years], [I've], ..."
4,FULL INTERVIEW BTS ARMY BTSonGMA NEWS EXCLUSIV...,"[FULL, INTERVIEW, BTS, ARMY, BTSonGMA, NEWS, E...","[full, interview, bts, army, btsongma, news, e...","[0, 0, 0, 0, 0, 0, 0, 30322, 21483, 16752, 149...","[2986, 12730, 0, 24604, 0, 2165, 2281, 29986, ...","[0, 0, 0, 0, 0, 0, 0, 30322, 21483, 16752, 149...","[2986, 12730, 0, 24604, 0, 2165, 2281, 29986, ...","[[FULL], [INTERVIEW], [BTS], [ARMY], [BTSonGMA..."


In [6]:
# Plain Python list of the nested token lists; this feeds characterSplit and,
# through it, both BiLSTM-CNN models (GloVe and non-GloVe).
model_input = ClimateChange_DF["3Word"].to_list()

In [7]:
# Split the tokens into characters, then add dummy tags as a preprocessing step
# (both helpers come from predifinedfunctions).
predicting_sentence = addDummy(characterSplit(model_input))

In [8]:
# Inverse lookup dictionaries for tags and words

# tag index -> tag name, for the two BiLSTM models
index2Tag_1 = dict(zip(tag2index_1.values(), tag2index_1.keys()))
index2Tag_2 = dict(zip(tag2index_2.values(), tag2index_2.keys()))

# word -> integer index, for the two BiLSTM-CNN models; the keys loaded from
# JSON are strings, hence the int() conversion.
word2Idx_Glove = dict(zip(idx2Word_Glove.values(), map(int, idx2Word_Glove.keys())))
word2Idx_nonGlove = dict(zip(idx2Word_nonGlove.values(), map(int, idx2Word_nonGlove.keys())))

In [9]:
# Assemble the input of each of the four models

def _cnn_input(word_index, pad_length):
    """Build the matrices for `predicting_sentence` with `word_index` and pad them to `pad_length`."""
    matrices = createMatrices(predicting_sentence, word_index, case2Idx, char2Idx)
    return padding(matrices, pad_length)


# The BiLSTM models take the padded word-index columns as they are.
model_1_input = ClimateChange_DF["1Padded_Word_Index"]
model_2_input = ClimateChange_DF["2Padded_Word_Index"]

# NOTE(review): 52 and 28 are presumably the lengths each CNN model was trained with - confirm.
model_3_input = _cnn_input(word2Idx_Glove, 52)
model_4_input = _cnn_input(word2Idx_nonGlove, 28)

In [10]:
# Predict NER tags with all 4 models for one randomly chosen tweet and display
# every word next to the tag each model assigned to it.

# Draw the row position from the frame's actual length instead of a hard-coded
# 10000, so a dataset with fewer rows can never yield an out-of-range index
# (randint's upper bound is exclusive).
index = np.random.randint(0, len(ClimateChange_DF))
print(index)

# Model prediction
model1_pred, model2_pred, model3_pred, model4_pred = predictTags(
    index,
    BiLSTM_Final_model, model_1_input,
    BiLSTM_Final_model_strict_clean, model_2_input,
    BiLSTM_CNN_model_glove, model_3_input,
    BiLSTM_CNN_model_non_glove, model_4_input,
)

# Predicted entity display
predictedDisplay(
    ClimateChange_DF, index,
    model1_pred, index2Tag_1,
    model2_pred, index2Tag_2,
    model3_pred, idx2Label_Glove,
    model4_pred, idx2Label_nonGlove,
)

# Validate against an external NLP library (spaCy) and render its entities.
# `index` is a row position, so fetch the tweet positionally rather than by label.
nlp = spacy.load('en_core_web_sm')
text = nlp(ClimateChange_DF["CleanedTweets"].iloc[index])
displacy.render(text, style='ent', jupyter=True)

1685
ACTUAL_WORD         BiLSTM         BiLSTM_SC      BiLSTM_CONN_GLOVE   BiLSTM_CONN_NON_GLOVE
----------------------------------------------------------------------------------------------------
FULL                B-org           O               O                   O
INTERVIEW           I-org           O               O                   O
BTS                 I-per           B-org           O                   O
ARMY                I-per           I-org           I-org               O
BTSonGMA            I-per           I-org           I-org               O
NEWS                I-per           O               I-org               O
EXCLUSIVE           I-per           O               O                   O
sits                O               O               O                   O
down                O               O               O                   O
with                O               O               O                   O
pop                 O               O               O         

In [139]:
# Show the raw cleaned text of the tweet selected by `index`
ClimateChange_DF["CleanedTweets"][index]

' '