## Preparing the data for the BiLSTM CNN Model 

In [1]:
#Importing dependencies for EDA
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import json

import spacy
from spacy import displacy

from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate

from tensorflow.keras.utils import Progbar
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform

#Importing the below block to display all outputs 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [54]:
#Reading the training file
df = pd.read_csv("ner_datasetreference.csv", encoding='latin')
df = df.fillna(method='ffill')
df["Sentence #"] = df["Sentence #"].apply(lambda s: s[9:]).astype("int32")

In [55]:
#Checking for empty rows
df[df['Word'] == '']

Unnamed: 0,Sentence #,Word,POS,Tag


In [56]:
#Creating a dictionary to replace these latin characters 
replacement_dict = {'ë':'e','ü':'u',"\xa0":' ', 'é':'e', '\x93':' ','\x91':' ','\x97':' ','\x85':' ','\x94':' ','ö':'o' ,'°':' ', 
                   '\x92':' ','\x96':' '}

def cleanunicode(uncleanstring):
    return re.sub(r'[\xa0|é|ë|ü|\x93|\x91|\x97|\x85|\x94|ö|°|\x92|\x96]', lambda m: replacement_dict.get(m.group()), uncleanstring)

df['Word'] = df['Word'].apply(lambda x : cleanunicode(x))

In [57]:
#Checking for empty rows
df[df['Word'] == '']

Unnamed: 0,Sentence #,Word,POS,Tag


In [58]:
# Word and tag is combined in a list for each sentence another list is consuructed
df['combined']= df[['Word','Tag']].values.tolist()
df = df.groupby(['Sentence #'])['combined'].agg(lambda x: list(x))

In [59]:
#Looking at the number of sentences
trainSentences = df.tolist()
len(trainSentences)

47959

In [8]:
#Splitting the data into train test split of 80:20 ration
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(trainSentences, test_size=0.2, random_state=123)

In [9]:
#Looking the the length of samples sent for training and test
len(X_train)
len(X_test)

38367

9592

In [10]:
# Displaying the first sentence
X_train[0]

[['No', 'O'],
 ['official', 'O'],
 ['announcement', 'O'],
 ['has', 'O'],
 ['come', 'O'],
 ['from', 'O'],
 ['the', 'O'],
 ['Iraqi', 'B-gpe'],
 ['Special', 'O'],
 ['Tribunal', 'O'],
 ['in', 'O'],
 ['charge', 'O'],
 ['of', 'O'],
 ['the', 'O'],
 ['trials', 'O'],
 [',', 'O'],
 ['but', 'O'],
 ['officials', 'O'],
 ['close', 'O'],
 ['to', 'O'],
 ['the', 'O'],
 ['case', 'O'],
 ['said', 'O'],
 ['Friday', 'B-tim'],
 ['that', 'O'],
 ['Saddam', 'B-per'],
 ['Hussein', 'I-per'],
 ['will', 'O'],
 ['be', 'O'],
 ['tried', 'O'],
 ['for', 'O'],
 ['the', 'O'],
 ['1982', 'B-tim'],
 ['killing', 'O'],
 ['of', 'O'],
 ['dozens', 'O'],
 ['of', 'O'],
 ['residents', 'O'],
 ['of', 'O'],
 ['the', 'O'],
 ['town', 'O'],
 ['of', 'O'],
 ['Dujail', 'B-geo'],
 ['.', 'O']]

In [11]:
#This function creates a list of characters in a word and appends that list to the training set as shown below
def characterSplit(Sentences):
    for i,sentence in enumerate(Sentences):
        for j,data in enumerate(sentence):
            chars = [c for c in data[0]]
            Sentences[i][j] = [data[0],chars,data[1]]
    return Sentences

X_train = characterSplit(X_train)
X_test = characterSplit(X_test)

In [12]:
# Displaying the first sentence after character split
X_train[0]

[['No', ['N', 'o'], 'O'],
 ['official', ['o', 'f', 'f', 'i', 'c', 'i', 'a', 'l'], 'O'],
 ['announcement',
  ['a', 'n', 'n', 'o', 'u', 'n', 'c', 'e', 'm', 'e', 'n', 't'],
  'O'],
 ['has', ['h', 'a', 's'], 'O'],
 ['come', ['c', 'o', 'm', 'e'], 'O'],
 ['from', ['f', 'r', 'o', 'm'], 'O'],
 ['the', ['t', 'h', 'e'], 'O'],
 ['Iraqi', ['I', 'r', 'a', 'q', 'i'], 'B-gpe'],
 ['Special', ['S', 'p', 'e', 'c', 'i', 'a', 'l'], 'O'],
 ['Tribunal', ['T', 'r', 'i', 'b', 'u', 'n', 'a', 'l'], 'O'],
 ['in', ['i', 'n'], 'O'],
 ['charge', ['c', 'h', 'a', 'r', 'g', 'e'], 'O'],
 ['of', ['o', 'f'], 'O'],
 ['the', ['t', 'h', 'e'], 'O'],
 ['trials', ['t', 'r', 'i', 'a', 'l', 's'], 'O'],
 [',', [','], 'O'],
 ['but', ['b', 'u', 't'], 'O'],
 ['officials', ['o', 'f', 'f', 'i', 'c', 'i', 'a', 'l', 's'], 'O'],
 ['close', ['c', 'l', 'o', 's', 'e'], 'O'],
 ['to', ['t', 'o'], 'O'],
 ['the', ['t', 'h', 'e'], 'O'],
 ['case', ['c', 'a', 's', 'e'], 'O'],
 ['said', ['s', 'a', 'i', 'd'], 'O'],
 ['Friday', ['F', 'r', 'i', 'd', '

In [15]:
#This loops create a set of unique tags and a dictionary of unique words
labelSet = set()
words = {}

for dataset in [X_train, X_test]:
    for sentence in dataset:
        for token,char,label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

In [16]:
labelSet

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O'}

In [17]:
print("No of unique words in the dataset: ", len(words)) 

No of unique words in the dataset:  31802


In [18]:
# Each label is given a unique index 
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)

In [20]:
#Creating unique dictionary of NER tags and saving to json for future use
label2Idx

idx2Label = {v: k for k, v in label2Idx.items()}

with open('idx2Label_Glove.json', 'w') as fp:
    json.dump(idx2Label, fp)

{'B-geo': 0,
 'B-nat': 1,
 'I-per': 2,
 'I-geo': 3,
 'B-eve': 4,
 'B-art': 5,
 'B-per': 6,
 'B-gpe': 7,
 'O': 8,
 'I-eve': 9,
 'I-org': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-org': 13,
 'I-nat': 14,
 'I-art': 15,
 'B-tim': 16}

In [21]:
# Creating a custom case loop for additional layer, added aditional cases apart from the  
case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

In [22]:
#Displaying the cae embedded dictionary and one hot encoded vector
case2Idx
caseEmbeddings

{'numeric': 0,
 'allLower': 1,
 'allUpper': 2,
 'initialUpper': 3,
 'other': 4,
 'mainly_numeric': 5,
 'contains_digit': 6,
 'PADDING_TOKEN': 7}

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [23]:
# Using the Glove 100D word embedding to fetch the word vectors
word2Idx = {}
wordEmbeddings = []

fEmbeddings = open("embeddings/glove.6B.100d.txt", encoding="utf-8")

In [24]:
#Below for loop loops through all the unique words from the dataset and checks if the word is in the glove vocab 
# if yes fetches the embedded vector if no UNKNOWNN_TOKEN will be used later to replace them
for line in fEmbeddings:
    split = line.strip().split(" ")
    word = split[0]
   
    if len(word2Idx) == 0: #Add padding+unknown
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector vor 'PADDING' word
        wordEmbeddings.append(vector)
        
        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:  #words has all the unique words from the input dataset
        vector = np.array([float(num) for num in split[1:]]) #Because the array is list of strings it converts into float
        wordEmbeddings.append(vector) 
        word2Idx[split[0]] = len(word2Idx)

In [25]:
deletedwords = {k:v for k,v in words.items() if k not in word2Idx}

In [26]:
deletedwords

{'06-mar': True,
 '07-jun': True,
 'british-sponsored': True,
 'ping-kun': True,
 'ghangzhou': True,
 'esquisabel': True,
 'urtuzaga': True,
 'brain-wasting': True,
 'oil-and-gas': True,
 "al-madai'ni": True,
 'below-normal': True,
 'abu-hafs': True,
 'non-electrical': True,
 'kentung': True,
 'natama': True,
 'high-seas': True,
 '8,00,000': True,
 'british-dutch': True,
 'mientes': True,
 'dejarte': True,
 'al-muasher': True,
 'non-jordanians': True,
 'mladjen': True,
 'kenjic': True,
 'ethiopian-born': True,
 'al-bahlul': True,
 'el-maan': True,
 'froce': True,
 'israel-syria': True,
 'hamas-led': True,
 'supamongkhon': True,
 '07-feb': True,
 'arcega': True,
 'uncolonized': True,
 'al-ghad': True,
 'corruption-related': True,
 'recently-repaired': True,
 'dong-young': True,
 'six-nation': True,
 'swear-in': True,
 '60-thousand': True,
 'al-adwa': True,
 'african-union': True,
 '745.71': True,
 '65.32': True,
 'shakiso': True,
 'admhaiyah': True,
 'tachileik': True,
 '7,00,000': True

In [28]:
#Saving the word to index for future prediction use
word2Idx
idx2Word = {v: k for k, v in word2Idx.items()}
with open('idx2Word_Glove.json', 'w') as fp:
    json.dump(idx2Word, fp)

{'PADDING_TOKEN': 0,
 'UNKNOWN_TOKEN': 1,
 'the': 2,
 ',': 3,
 '.': 4,
 'of': 5,
 'to': 6,
 'and': 7,
 'in': 8,
 'a': 9,
 '"': 10,
 "'s": 11,
 'for': 12,
 '-': 13,
 'that': 14,
 'on': 15,
 'is': 16,
 'was': 17,
 'said': 18,
 'with': 19,
 'he': 20,
 'as': 21,
 'it': 22,
 'by': 23,
 'at': 24,
 '(': 25,
 ')': 26,
 'from': 27,
 'his': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you':

In [29]:
#Converting the list of list to array for model weights
wordEmbeddings = np.array(wordEmbeddings)
wordEmbeddings

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.12729585,  0.03252421, -0.20221775, ..., -0.0215988 ,
        -0.00363683,  0.20118529],
       [-0.038194  , -0.24487   ,  0.72812   , ..., -0.1459    ,
         0.8278    ,  0.27062   ],
       ...,
       [ 0.051214  ,  0.46039   ,  0.26446   , ...,  0.90017   ,
        -0.019423  , -0.27108   ],
       [ 0.089657  , -0.084513  , -0.056271  , ...,  0.65772   ,
        -0.24955   , -0.23732   ],
       [-0.013493  , -0.25268   , -0.5281    , ..., -0.10441   ,
        -0.47526   , -0.56902   ]])

In [30]:
#Creating a character index dictionary of all possible english characters
char2Idx = {"PADDING":0, "UNKNOWN":1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)
len(char2Idx)
char2Idx

95

{'PADDING': 0,
 'UNKNOWN': 1,
 ' ': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 'a': 13,
 'b': 14,
 'c': 15,
 'd': 16,
 'e': 17,
 'f': 18,
 'g': 19,
 'h': 20,
 'i': 21,
 'j': 22,
 'k': 23,
 'l': 24,
 'm': 25,
 'n': 26,
 'o': 27,
 'p': 28,
 'q': 29,
 'r': 30,
 's': 31,
 't': 32,
 'u': 33,
 'v': 34,
 'w': 35,
 'x': 36,
 'y': 37,
 'z': 38,
 'A': 39,
 'B': 40,
 'C': 41,
 'D': 42,
 'E': 43,
 'F': 44,
 'G': 45,
 'H': 46,
 'I': 47,
 'J': 48,
 'K': 49,
 'L': 50,
 'M': 51,
 'N': 52,
 'O': 53,
 'P': 54,
 'Q': 55,
 'R': 56,
 'S': 57,
 'T': 58,
 'U': 59,
 'V': 60,
 'W': 61,
 'X': 62,
 'Y': 63,
 'Z': 64,
 '.': 65,
 ',': 66,
 '-': 67,
 '_': 68,
 '(': 69,
 ')': 70,
 '[': 71,
 ']': 72,
 '{': 73,
 '}': 74,
 '!': 75,
 '?': 76,
 ':': 77,
 ';': 78,
 '#': 79,
 "'": 80,
 '"': 81,
 '/': 82,
 '\\': 83,
 '%': 84,
 '$': 85,
 '`': 86,
 '&': 87,
 '=': 88,
 '*': 89,
 '+': 90,
 '@': 91,
 '^': 92,
 '~': 93,
 '|': 94}

In [31]:
#The below function checks for a unique case of a word given from the predefined case rules and applies the specified case 
def getCasing(word, caseLookup):   
    casing = 'other'
    
    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1
            
    digitFraction = numDigits / float(len(word))
    
    if word.isdigit(): #Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower case
        casing = 'allLower'
    elif word.isupper(): #All upper case
        casing = 'allUpper'
    elif word[0].isupper(): #is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
    
   
    return caseLookup[casing]

In [32]:
#This function takes the document and replaces the actual word/tag/case/character into its respective index
def createMatrices(document, word2Idx, label2Idx, case2Idx,char2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
        
    dataset = []
    
    wordCount = 0
    unknownWordCount = 0
    
    for document in document:
        wordIndices = []    
        caseIndices = []
        charIndices = []
        labelIndices = []
        
        for word,char,label in document:  
            wordCount += 1
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
            charIdx = []
            for x in char:
                charIdx.append(char2Idx[x])
            #Get the label and map to int            
            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word, case2Idx))
            charIndices.append(charIdx)
            labelIndices.append(label2Idx[label])
           
        dataset.append([wordIndices, caseIndices, charIndices, labelIndices]) 
        
    return dataset

In [33]:
#This function only adds padding to the character list of length 52 which represents the longest word
def padding(document):
    maxlen = 52 # using 52 which represents the longest word
    for sentence in document:
        char = sentence[2]
        for x in char:
            maxlen = max(maxlen,len(x))
    for i,sentence in enumerate(document):
        document[i][2] = pad_sequences(document[i][2],52,padding='post')
    return document

In [34]:
#Above three functions are called on the train and test data
train_set = padding(createMatrices(X_train,word2Idx,  label2Idx, case2Idx,char2Idx))
test_set = padding(createMatrices(X_test, word2Idx, label2Idx, case2Idx,char2Idx))

In [35]:
#This function O/P the dataset and also o/p a list which contain the no of batches and its size
def createBatches(data):
    l = []
    for i in data:
        l.append(len(i[0]))
    l = set(l)
    batches = []
    batch_len = []
    z = 0
    for i in l:
        for batch in data:
            if len(batch[0]) == i:
                batches.append(batch)
                z += 1
        batch_len.append(z)
    return batches,batch_len

In [None]:
trainSentences = characterSplit(trainSentences)
full_train_data = padding(createMatrices(trainSentences,word2Idx,  label2Idx, case2Idx,char2Idx))
full_train_data_batch,full_train_data_batch_len = createBatches(full_train_data)

In [36]:
#Using the above function to get the train data and a train batch length to pass into model batch size which splits the 
# Train_batch/test_batch while fitting the model
train_batch,train_batch_len = createBatches(train_set)
test_batch,test_batch_len = createBatches(test_set)

In [37]:
#Looking at how many batches the data is split into
len(train_set)
len(train_batch_len)

38367

71

In [38]:
#THis function sorts the dataset by the length of sentences and then saves the length of each batch
def iterate_minibatches(dataset,batch_len): 
    start = 0
    for i in batch_len:
        tokens = []
        caseing = []
        char = []
        labels = []
        data = dataset[start:i]
        start = i
        for dt in data:
            t,c,ch,l = dt
            l = np.expand_dims(l,-1)
            tokens.append(t)
            caseing.append(c)
            char.append(ch)
            labels.append(l)
        yield np.asarray(labels),np.asarray(tokens),np.asarray(caseing),np.asarray(char)

In [39]:
# Hyper parameters used base on the paper minor changes as 
# the dataset i used to train is very small compared to the dataset they used for training

# EPOCHS = 50               # paper: 80
# DROPOUT = 0.5             # paper: 0.68
# DROPOUT_RECURRENT = 0.25  # not specified in paper, 0.25 recommended
# LSTM_STATE_SIZE = 200     # paper: 275
# CONV_SIZE = 3             # paper: 3
# LEARNING_RATE = 0.0105    # paper 0.0105
# OPTIMIZER = Nadam()       # paper uses SGD(lr=self.learning_rate), Nadam() recommended for smaller dataset

In [40]:
#Model layers 

#Word Embedding part
words_input = Input(shape=(None,),dtype='int32',name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],  weights=[wordEmbeddings], trainable=False)(words_input)

#Case embedding part of each word
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input)

#Character embedding using 1D CNN
character_input=Input(shape=(None,52,),name='char_input')
embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
dropout= Dropout(0.5)(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)

#Concatnating the embedded layer o/p together
output = concatenate([words, casing, char])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)

#Specifying input and o/p of the model
model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])

#Compliling the model and looking at model summary
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 char_input (InputLayer)        [(None, None, 52)]   0           []                               
                                                                                                  
 char_embedding (TimeDistribute  (None, None, 52, 30  2850       ['char_input[0][0]']             
 d)                             )                                                                 
                                                                                                  
 dropout (Dropout)              (None, None, 52, 30  0           ['char_embedding[0][0]']         
                                )                                                                 
                                                                                              

In [41]:
%%time
#Training the model on batches for 50 epochs 
epochs = 50
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(train_batch_len))
    for i,batch in enumerate(iterate_minibatches(train_batch,train_batch_len)):
        labels, tokens, casing, char = batch       
        model.train_on_batch([tokens, casing, char], labels)
        a.update(i)
    a.update(i+1)
    print(' ')

Epoch 0/50
 
Epoch 1/50
 
Epoch 2/50
 
Epoch 3/50
 
Epoch 4/50
 
Epoch 5/50
 
Epoch 6/50
 
Epoch 7/50
 
Epoch 8/50
 
Epoch 9/50
 
Epoch 10/50
 
Epoch 11/50
 
Epoch 12/50
 
Epoch 13/50
 
Epoch 14/50
 
Epoch 15/50
 
Epoch 16/50
 
Epoch 17/50
 
Epoch 18/50
 
Epoch 19/50
 
Epoch 20/50
 
Epoch 21/50
 
Epoch 22/50
 
Epoch 23/50
 
Epoch 24/50
 
Epoch 25/50
 
Epoch 26/50
 
Epoch 27/50
 
Epoch 28/50
 
Epoch 29/50
 
Epoch 30/50
 
Epoch 31/50
 
Epoch 32/50
 
Epoch 33/50
 
Epoch 34/50
 
Epoch 35/50
 
Epoch 36/50
 
Epoch 37/50
 
Epoch 38/50
 
Epoch 39/50
 
Epoch 40/50
 
Epoch 41/50
 
Epoch 42/50
 
Epoch 43/50
 
Epoch 44/50
 
Epoch 45/50
 
Epoch 46/50
 
Epoch 47/50
 
Epoch 48/50
 
Epoch 49/50
 
Wall time: 2h 23min 42s


In [105]:
#This function allows to pass each sentence in the test data and fetch the predicted o/p
def tag_dataset(dataset):
    sentences = []
    correctLabels = []
    predLabels = []
    b = Progbar(len(dataset))
    for i,data in enumerate(dataset):    
        tokens, casing,char, labels = data
        sentences.append(tokens)
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1) #Predict the classes 
        correctLabels.append(labels)
        predLabels.append(pred)
        b.update(i)
    b.update(i+1)
    return predLabels, correctLabels, sentences

In [106]:
#Calling the above function
predLabels, correctLabels, sentences = tag_dataset(test_batch) 



In [129]:
#Returns the labels of index
def pred2label(pred,diction):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(diction[str(p)])
        out.append(out_i)
    return out

pred_labels = pred2label(predLabels,idx2Label)
true_labels = pred2label(correctLabels,idx2Label)
actual_words = pred2label(sentences,idx2Word)

In [132]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

print("F1-score: {:.1%}".format(f1_score(true_labels, pred_labels)))
print("precision_score: {:.1%}".format(precision_score(true_labels, pred_labels)))
print("recall_score: {:.1%}".format(recall_score(true_labels, pred_labels)))

F1-score: 75.5%
precision_score: 74.1%
recall_score: 77.0%


In [135]:
#Creating a flat list to look at Classification report
test_pred = [ j for i in pred_labels for j in i]
true_lab = [ j for i in true_labels for j in i]

['O',
 'B-tim',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'I-per',
 'B-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'B-geo',
 'O',
 'B-tim',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'B-geo',
 'I-per',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-org',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'O',
 'B-org',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 '

['O',
 'B-tim',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'I-per',
 'B-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-eve',
 'O',
 'B-geo',
 'O',
 'B-geo',
 'O',
 'B-tim',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-org',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'O',
 'O',
 'B-org',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-org',
 '

In [136]:
from sklearn.metrics import classification_report
print(classification_report(test_pred, true_lab))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00         1
       B-eve       0.22      0.76      0.34        17
       B-geo       0.96      0.73      0.83      9917
       B-gpe       0.91      0.94      0.92      3075
       B-nat       0.24      0.25      0.24        40
       B-org       0.52      0.79      0.63      2629
       B-per       0.68      0.91      0.78      2513
       B-tim       0.84      0.91      0.87      3707
       I-art       0.04      0.67      0.07         3
       I-eve       0.02      1.00      0.04         1
       I-geo       0.82      0.58      0.68      2153
       I-gpe       0.34      1.00      0.51        15
       I-nat       0.00      0.00      0.00         0
       I-org       0.57      0.74      0.65      2526
       I-per       0.84      0.86      0.85      3311
       I-tim       0.61      0.87      0.72       931
           O       0.99      0.99      0.99    179236

    accuracy              

In [49]:
print("{:15}{:5}\t{}\n".format("Word", "True", "Pred"))
print("-"*30)

i = np.random.randint(0, len(actual_words))

for (w, t, pred) in zip(actual_words[i], true_labels[i], pred_labels[i]):
    print("{:15}{}\t{}".format(w, t, pred))

Word           True 	Pred

------------------------------
british        B-gpe	B-gpe
voters         O	O
are            O	O
casting        O	O
ballots        O	O
thursday       B-tim	B-tim
in             O	O
local          O	O
and            O	O
regional       O	O
elections      O	O
widely         O	O
seen           O	O
as             O	O
a              O	O
referendum     O	O
on             O	O
prime          B-per	B-per
minister       I-per	O
tony           I-per	B-per
blair          I-per	I-per
's             O	O
decade         O	O
in             O	O
office         O	O
.              O	O


In [50]:
model.save("HalfTrainedModels/Half_Trained_model_glove.h5")

In [51]:
#Training on full dataset
#Model layers 

#Word Embedding part
words_input_full = Input(shape=(None,),dtype='int32',name='words_input')
words_full = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],  weights=[wordEmbeddings], trainable=False)(words_input_full)

#Case embedding part of each word
casing_input_full = Input(shape=(None,), dtype='int32', name='casing_input')
casing_full = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input_full)

#Character embedding using 1D CNN
character_input_full = Input(shape=(None,52,),name='char_input')
embed_char_out_full = TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input_full)
dropout_full = Dropout(0.5)(embed_char_out_full)
conv1d_out_full = TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout_full)
maxpool_out_full =TimeDistributed(MaxPooling1D(52))(conv1d_out_full)
char_full = TimeDistributed(Flatten())(maxpool_out_full)
char_full = Dropout(0.5)(char_full)

#Concatnating the embedded layer o/p together
output_full = concatenate([words_full, casing_full, char_full])
output_full = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output_full)
output_full = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output_full)

#Specifying input and o/p of the model
Full_Trained_model_glove = Model(inputs=[words_input_full, casing_input_full, character_input_full], outputs=[output_full])

#Compliling the model and looking at model summary
Full_Trained_model_glove.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
Full_Trained_model_glove.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 char_input (InputLayer)        [(None, None, 52)]   0           []                               
                                                                                                  
 char_embedding (TimeDistribute  (None, None, 52, 30  2850       ['char_input[0][0]']             
 d)                             )                                                                 
                                                                                                  
 dropout_4 (Dropout)            (None, None, 52, 30  0           ['char_embedding[0][0]']         
                                )                                                                 
                                                                                            

In [61]:
trainSentences = characterSplit(trainSentences)
full_train_data = padding(createMatrices(trainSentences ,word2Idx,  label2Idx, case2Idx, char2Idx))
full_train_data_batch,full_train_data_batch_len = createBatches(full_train_data)

In [62]:
%%time
#Training the model on batches for 50 epochs 
epochs = 50
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(full_train_data_batch_len))
    for i,batch in enumerate(iterate_minibatches(full_train_data_batch,full_train_data_batch_len)):
        labels, tokens, casing, char = batch       
        Full_Trained_model_glove.train_on_batch([tokens, casing, char], labels)
        a.update(i)
    a.update(i+1)
    print(' ')

Epoch 0/50
 
Epoch 1/50
 
Epoch 2/50
 
Epoch 3/50
 
Epoch 4/50
 
Epoch 5/50
 
Epoch 6/50
 
Epoch 7/50
 
Epoch 8/50
 
Epoch 9/50
 
Epoch 10/50
 
Epoch 11/50
 
Epoch 12/50
 
Epoch 13/50
 
Epoch 14/50
 
Epoch 15/50
 
Epoch 16/50
 
Epoch 17/50
 
Epoch 18/50
 
Epoch 19/50
 
Epoch 20/50
 
Epoch 21/50
 
Epoch 22/50
 
Epoch 23/50
 
Epoch 24/50
 
Epoch 25/50
 
Epoch 26/50
 
Epoch 27/50
 
Epoch 28/50
 
Epoch 29/50
 
Epoch 30/50
 
Epoch 31/50
 
Epoch 32/50
 
Epoch 33/50
 
Epoch 34/50
 
Epoch 35/50
 
Epoch 36/50
 
Epoch 37/50
 
Epoch 38/50
 
Epoch 39/50
 
Epoch 40/50
 
Epoch 41/50
 
Epoch 42/50
 
Epoch 43/50
 
Epoch 44/50
 
Epoch 45/50
 
Epoch 46/50
 
Epoch 47/50
 
Epoch 48/50
 
Epoch 49/50
 
Wall time: 3h 41min 3s


In [63]:
Full_Trained_model_glove.save("TrainedModels/Full_Trained_model_glove.h5")

### Predicting on climate change data

In [64]:
from tensorflow import keras
LoadedModel = keras.models.load_model("TrainedModels/Full_Trained_model_glove.h5")

#Reading the dictionaries back 
with open('idx2Word_Glove.json', 'r') as fp:
    idx2Word = json.load(fp)
    
with open('idx2Label_Glove.json', 'r') as fp:
    idx2Label = json.load(fp)

In [92]:
#Readig the cleaned dataset
ClimateChange_DF = pd.read_csv("Cleaned_English_tweets.csv")
#keeping only the tweets
ClimateChange_DF = ClimateChange_DF[["CleanedTweets"]]
ClimateChange_DF.head()
ClimateChange_DF.shape

Unnamed: 0,CleanedTweets
0,News Trends Data Americans are less concerned ...
1,Do you realize that civil war is the devastati...
2,Having anxiety over the weather something they...
3,In the last few years I've noticed that studen...
4,FULL INTERVIEW BTS ARMY BTSonGMA NEWS EXCLUSIV...


(1557690, 1)

In [93]:
#selecting first 1000 tweets for ease of processing in next steps
ClimateChange_DF_Sliced = ClimateChange_DF.iloc[:10000].copy()

In [94]:
ClimateChange_DF_Sliced["Word"] = ClimateChange_DF_Sliced["CleanedTweets"].apply(lambda x: [[i] for i in str(x).split()])

In [95]:
ClimateChange_DF_Sliced.head()

Unnamed: 0,CleanedTweets,Word
0,News Trends Data Americans are less concerned ...,"[[News], [Trends], [Data], [Americans], [are],..."
1,Do you realize that civil war is the devastati...,"[[Do], [you], [realize], [that], [civil], [war..."
2,Having anxiety over the weather something they...,"[[Having], [anxiety], [over], [the], [weather]..."
3,In the last few years I've noticed that studen...,"[[In], [the], [last], [few], [years], [I've], ..."
4,FULL INTERVIEW BTS ARMY BTSonGMA NEWS EXCLUSIV...,"[[FULL], [INTERVIEW], [BTS], [ARMY], [BTSonGMA..."


In [96]:
climate_change = ClimateChange_DF_Sliced.Word.tolist()

In [97]:
#Splits the word into list of chatracters
def FcharacterSplit(Sentences):
    for i,sentence in enumerate(Sentences):
        for j,data in enumerate(sentence):
            chars = [c for c in data[0]]
            Sentences[i][j] = [data[0],chars]
    return Sentences

predicting_sentence = FcharacterSplit(climate_change)

In [98]:
#Addiing a dummy tag to process easilly
for i in predicting_sentence:
    for j in i:
        j.append('O')

In [99]:
climateChange_pred_data = padding(createMatrices(predicting_sentence, word2Idx, label2Idx, case2Idx,char2Idx))

In [100]:
i = np.random.randint(0, len(climateChange_pred_data))
print(i)
for j, data in enumerate(climateChange_pred_data[i:i+1]):
    tokens, casing, char, labels = data
    token = np.asarray([tokens])     
    casing = np.asarray([casing])
    char = np.asarray([char])
    pred = LoadedModel.predict([token, casing,char], verbose=False)[0]   
    pred = pred.argmax(axis=-1) #Predict the classes 

8902


In [101]:
#Returns the labels of index
def pred2label(pred,diction):
    out = []
    for pred_i in pred:
        out.append(diction[str(pred_i)])
    return out

pred_labels = pred2label(pred,idx2Label)
actual_words = pred2label(tokens,idx2Word)

In [102]:
print("{:15}{:15}{}".format("Original_Word", "Passed_Word", "Tag_Predicted"))
print("-"*30)

for (o, w, pred) in zip(ClimateChange_DF_Sliced.CleanedTweets[i].split(), actual_words, pred_labels):
    print("{:15}{:15}{}".format(o, w, pred))

#Using Spacy to cross check for entities
nlp = spacy.load('en_core_web_sm')
text = nlp(ClimateChange_DF_Sliced.CleanedTweets[i])
displacy.render(text, style = 'ent', jupyter=True)

Original_Word  Passed_Word    Tag_Predicted
------------------------------
FULL           full           O
INTERVIEW      interview      O
BTS            UNKNOWN_TOKEN  O
ARMY           army           I-org
BTSonGMA       UNKNOWN_TOKEN  I-org
NEWS           news           I-org
EXCLUSIVE      exclusive      O
sits           sits           O
down           down           O
with           with           O
pop            pop            O
superstars     UNKNOWN_TOKEN  O
and            and            O
South          south          O
Korean         korean         B-gpe
Pres           UNKNOWN_TOKEN  O
Moon           moon           B-per
Jaein          UNKNOWN_TOKEN  I-geo
as             as             O
they           they           O
speak          speak          O
on             on             O
tackling       tackling       O
tough          tough          O
issues         issues         O
from           from           O
COVID          UNKNOWN_TOKEN  B-geo
to             to             O
c