In [89]:
#Importing dependencies for EDA
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import json

import spacy
from spacy import displacy

# from validation import compute_f1
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
# from prepro import readfile,createBatches,createMatrices,iterate_minibatches,addCharInformatioin,padding
from tensorflow.keras.utils import Progbar
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform

#Importing the below block to display all outputs 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [57]:
#Reading the training file
df = pd.read_csv("/content/drive/MyDrive/NLP/ner_datasetreference.csv", encoding='latin')
df = df.fillna(method='ffill')
df["Sentence #"] = df["Sentence #"].apply(lambda s: s[9:]).astype("int32")
df["Word"] = df["Word"].apply(lambda x: re.sub(r'http\S+', ' ',x))

In [58]:
#Creating a dictionary to replace these latin characters 
replacement_dict = {'ë':'e','ü':'u',"\xa0":' ', 'é':'e', '\x93':' ','\x91':' ','\x97':' ','\x85':' ','\x94':' ','ö':'o' ,'°':' ', 
                   '\x92':' ','\x96':' '}

def cleanunicode(uncleanstring):
    return re.sub(r'[\xa0|é|ë|ü|\x93|\x91|\x97|\x85|\x94|ö|°|\x92|\x96]', lambda m: replacement_dict.get(m.group()), uncleanstring)

df['Word'] = df['Word'].apply(lambda x : cleanunicode(x))

In [59]:
# Creating a list of all unique words and unique tags
all_words = list(set(df["Word"].values))
all_tags = list(set(df["Tag"].values))
print("Number of unique words: {}".format(len(all_words)))
print("Number of unique tags : {}".format(len(all_tags)))

Number of unique words: 35162
Number of unique tags : 17


In [60]:
# Creating a dictionary of of unique words and assigning a unique index number similarly to NER tags as well 
word2Idx = {word: idx + 2 for idx, word in enumerate(all_words)}
word2Idx["UNKNOWN_WORD"]=0
word2Idx["PADDING"]= 1

print(dict(list(word2Idx.items())[0:5])) ## Dislaying 5 samples


{'24th': 2, '1991': 3, 'dispenser': 4, 'season': 5, 'lifeline': 6}


In [61]:
idx2Word = {v: k for k, v in word2Idx.items()}

with open('/content/drive/MyDrive/NLP/idx2Word_nonGlove.json', 'w') as fp:
    json.dump(idx2Word, fp)

In [62]:
len(word2Idx)

35164

In [63]:
# Word and tag is combined in a list for each sentence another list is consuructed
df['combined']= df[['Word','Tag']].values.tolist()
df = df.groupby(['Sentence #'])['combined'].agg(lambda x: list(x))

In [64]:
df.head()

Sentence #
1    [[Thousands, O], [of, O], [demonstrators, O], ...
2    [[Families, O], [of, O], [soldiers, O], [kille...
3    [[They, O], [marched, O], [from, O], [the, O],...
4    [[Police, O], [put, O], [the, O], [number, O],...
5    [[The, O], [protest, O], [comes, O], [on, O], ...
Name: combined, dtype: object

In [65]:
trainSentences = df.tolist()

In [14]:
len(trainSentences)

47959

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(trainSentences, test_size=0.2, random_state=123)

In [16]:
len(X_train)
len(X_test)

38367

9592

In [17]:
X_train[0]

[['No', 'O'],
 ['official', 'O'],
 ['announcement', 'O'],
 ['has', 'O'],
 ['come', 'O'],
 ['from', 'O'],
 ['the', 'O'],
 ['Iraqi', 'B-gpe'],
 ['Special', 'O'],
 ['Tribunal', 'O'],
 ['in', 'O'],
 ['charge', 'O'],
 ['of', 'O'],
 ['the', 'O'],
 ['trials', 'O'],
 [',', 'O'],
 ['but', 'O'],
 ['officials', 'O'],
 ['close', 'O'],
 ['to', 'O'],
 ['the', 'O'],
 ['case', 'O'],
 ['said', 'O'],
 ['Friday', 'B-tim'],
 ['that', 'O'],
 ['Saddam', 'B-per'],
 ['Hussein', 'I-per'],
 ['will', 'O'],
 ['be', 'O'],
 ['tried', 'O'],
 ['for', 'O'],
 ['the', 'O'],
 ['1982', 'B-tim'],
 ['killing', 'O'],
 ['of', 'O'],
 ['dozens', 'O'],
 ['of', 'O'],
 ['residents', 'O'],
 ['of', 'O'],
 ['the', 'O'],
 ['town', 'O'],
 ['of', 'O'],
 ['Dujail', 'B-geo'],
 ['.', 'O']]

In [18]:
#This function creates a list of characters in a word
def characterSplit(Sentences):
    for i,sentence in enumerate(Sentences):
        for j,data in enumerate(sentence):
            chars = [c for c in data[0]]
            Sentences[i][j] = [data[0],chars,data[1]]
    return Sentences

In [19]:
X_train = characterSplit(X_train)
X_test = characterSplit(X_test)

In [20]:
X_train[0]

[['No', ['N', 'o'], 'O'],
 ['official', ['o', 'f', 'f', 'i', 'c', 'i', 'a', 'l'], 'O'],
 ['announcement',
  ['a', 'n', 'n', 'o', 'u', 'n', 'c', 'e', 'm', 'e', 'n', 't'],
  'O'],
 ['has', ['h', 'a', 's'], 'O'],
 ['come', ['c', 'o', 'm', 'e'], 'O'],
 ['from', ['f', 'r', 'o', 'm'], 'O'],
 ['the', ['t', 'h', 'e'], 'O'],
 ['Iraqi', ['I', 'r', 'a', 'q', 'i'], 'B-gpe'],
 ['Special', ['S', 'p', 'e', 'c', 'i', 'a', 'l'], 'O'],
 ['Tribunal', ['T', 'r', 'i', 'b', 'u', 'n', 'a', 'l'], 'O'],
 ['in', ['i', 'n'], 'O'],
 ['charge', ['c', 'h', 'a', 'r', 'g', 'e'], 'O'],
 ['of', ['o', 'f'], 'O'],
 ['the', ['t', 'h', 'e'], 'O'],
 ['trials', ['t', 'r', 'i', 'a', 'l', 's'], 'O'],
 [',', [','], 'O'],
 ['but', ['b', 'u', 't'], 'O'],
 ['officials', ['o', 'f', 'f', 'i', 'c', 'i', 'a', 'l', 's'], 'O'],
 ['close', ['c', 'l', 'o', 's', 'e'], 'O'],
 ['to', ['t', 'o'], 'O'],
 ['the', ['t', 'h', 'e'], 'O'],
 ['case', ['c', 'a', 's', 'e'], 'O'],
 ['said', ['s', 'a', 'i', 'd'], 'O'],
 ['Friday', ['F', 'r', 'i', 'd', '

In [21]:
#This loops create a set of unique tags and a dictionary of unique words
labelSet = set()
words = {}

for dataset in [X_train, X_test]:
    for sentence in dataset:
        for token,char,label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

In [22]:
len(labelSet)
len(words)

17

31801

In [23]:
# :: Create a mapping for the labels ::
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)

In [24]:
idx2Label = {v: k for k, v in label2Idx.items()}

with open('/content/drive/MyDrive/NLP/idx2Label_nonGlove.json', 'w') as fp:
    json.dump(idx2Label, fp)

In [25]:
# :: Hard coded case lookup ::
case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

In [26]:
case2Idx
caseEmbeddings

{'PADDING_TOKEN': 7,
 'allLower': 1,
 'allUpper': 2,
 'contains_digit': 6,
 'initialUpper': 3,
 'mainly_numeric': 5,
 'numeric': 0,
 'other': 4}

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [27]:
#simarlrly creating a character index dictionary of all possible english characters
char2Idx = {"PADDING":0, "UNKNOWN":1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)
len(char2Idx)
char2Idx

95

{' ': 2,
 '!': 75,
 '"': 81,
 '#': 79,
 '$': 85,
 '%': 84,
 '&': 87,
 "'": 80,
 '(': 69,
 ')': 70,
 '*': 89,
 '+': 90,
 ',': 66,
 '-': 67,
 '.': 65,
 '/': 82,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 ':': 77,
 ';': 78,
 '=': 88,
 '?': 76,
 '@': 91,
 'A': 39,
 'B': 40,
 'C': 41,
 'D': 42,
 'E': 43,
 'F': 44,
 'G': 45,
 'H': 46,
 'I': 47,
 'J': 48,
 'K': 49,
 'L': 50,
 'M': 51,
 'N': 52,
 'O': 53,
 'P': 54,
 'PADDING': 0,
 'Q': 55,
 'R': 56,
 'S': 57,
 'T': 58,
 'U': 59,
 'UNKNOWN': 1,
 'V': 60,
 'W': 61,
 'X': 62,
 'Y': 63,
 'Z': 64,
 '[': 71,
 '\\': 83,
 ']': 72,
 '^': 92,
 '_': 68,
 '`': 86,
 'a': 13,
 'b': 14,
 'c': 15,
 'd': 16,
 'e': 17,
 'f': 18,
 'g': 19,
 'h': 20,
 'i': 21,
 'j': 22,
 'k': 23,
 'l': 24,
 'm': 25,
 'n': 26,
 'o': 27,
 'p': 28,
 'q': 29,
 'r': 30,
 's': 31,
 't': 32,
 'u': 33,
 'v': 34,
 'w': 35,
 'x': 36,
 'y': 37,
 'z': 38,
 '{': 73,
 '|': 94,
 '}': 74,
 '~': 93}

In [28]:
with open('/content/drive/MyDrive/NLP/char2Idx_nonGlove.json', 'w') as fp:
    json.dump(char2Idx, fp)

In [29]:
def getCasing(word, caseLookup):   
    casing = 'other'
    
    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1
            
    digitFraction = numDigits / float(len(word))
    
    if word.isdigit(): #Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower case
        casing = 'allLower'
    elif word.isupper(): #All upper case
        casing = 'allUpper'
    elif word[0].isupper(): #is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
    
   
    return caseLookup[casing]

In [30]:
def createMatrices(document, word2Idx, label2Idx, case2Idx,char2Idx):
    unknownIdx = word2Idx['UNKNOWN_WORD']
    paddingIdx = word2Idx['PADDING']    
        
    dataset = []
    
    wordCount = 0
    unknownWordCount = 0
    
    for document in document:
        wordIndices = []    
        caseIndices = []
        charIndices = []
        labelIndices = []
        
        for word,char,label in document:  
            wordCount += 1
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
            charIdx = []
            for x in char:
                charIdx.append(char2Idx[x])
            #Get the label and map to int            
            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word, case2Idx))
            charIndices.append(charIdx)
            labelIndices.append(label2Idx[label])
           
        dataset.append([wordIndices, caseIndices, charIndices, labelIndices]) 
        
    return dataset

In [31]:
key_lenth = [len(i) for i,k in word2Idx.items()]
keys = {len(i):i for i,k in word2Idx.items()}
keys[max(key_lenth)]

'cricketer-turned-politician'

In [32]:
#Padding the characters to length of 28
def padding(document):
    maxlen = 28
    for sentence in document:
        char = sentence[2]
        for x in char:
            maxlen = max(maxlen,len(x))
    for i,sentence in enumerate(document):
        document[i][2] = pad_sequences(document[i][2],28,padding='post')
    return document

In [33]:
#Above three functions are called on the train and test data
train_set = padding(createMatrices(X_train,word2Idx,  label2Idx, case2Idx,char2Idx))
test_set = padding(createMatrices(X_test, word2Idx, label2Idx, case2Idx,char2Idx))

In [34]:
#This function O/P the dataset and also o/p a list which contain the no of batches which has the same length words in i/p
def createBatches(data):
    l = []
    for i in data:
        l.append(len(i[0]))
    l = set(l)
    batches = []
    batch_len = []
    z = 0
    for i in l:
        for batch in data:
            if len(batch[0]) == i:
                batches.append(batch)
                z += 1
        batch_len.append(z)
    return batches,batch_len

In [35]:
train_batch,train_batch_len = createBatches(train_set)
test_batch,test_batch_len = createBatches(test_set)

In [36]:
len(train_set)
len(train_batch_len)

38367

71

In [37]:
train_set[0]

[[32011,
  271,
  17732,
  6843,
  26720,
  12370,
  2107,
  29641,
  24636,
  14498,
  9609,
  8485,
  9758,
  2107,
  17562,
  13941,
  25034,
  22211,
  2847,
  34825,
  2107,
  11216,
  14902,
  15740,
  12767,
  26022,
  3298,
  12034,
  32325,
  11392,
  20627,
  2107,
  8089,
  14727,
  9758,
  31615,
  9758,
  12005,
  9758,
  2107,
  1172,
  9758,
  31857,
  9827],
 [3,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  3,
  3,
  1,
  1,
  1,
  1,
  1,
  4,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  1,
  3,
  3,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  4],
 array([[52, 27,  0, ...,  0,  0,  0],
        [27, 18, 18, ...,  0,  0,  0],
        [13, 26, 26, ...,  0,  0,  0],
        ...,
        [27, 18,  0, ...,  0,  0,  0],
        [42, 33, 22, ...,  0,  0,  0],
        [65,  0,  0, ...,  0,  0,  0]], dtype=int32),
 [7,
  7,
  7,
  7,
  7,
  7,
  7,
  0,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  6,
  7,
  5,
  1,
  7,


In [38]:
def iterate_minibatches(dataset,batch_len): 
    start = 0
    for i in batch_len:
        tokens = []
        caseing = []
        char = []
        labels = []
        data = dataset[start:i]
        start = i
        for dt in data:
            t,c,ch,l = dt
            l = np.expand_dims(l,-1)
            tokens.append(t)
            caseing.append(c)
            char.append(ch)
            labels.append(l)
        yield np.asarray(labels),np.asarray(tokens),np.asarray(caseing),np.asarray(char)

In [39]:
# Hyper parameters used base on the paper minor changes as 
# the dataset i used to train is very small compared to the dataset they used for training

# EPOCHS = 50               # paper: 80
# DROPOUT = 0.5             # paper: 0.68
# DROPOUT_RECURRENT = 0.25  # not specified in paper, 0.25 recommended
# LSTM_STATE_SIZE = 200     # paper: 275
# CONV_SIZE = 3             # paper: 3
# LEARNING_RATE = 0.0105    # paper 0.0105
# OPTIMIZER = Nadam()       # paper uses SGD(lr=self.learning_rate), Nadam() recommended for smaller dataset

In [40]:
#Word Embedding
words_input = Input(shape=(None,),dtype='int32',name='words_input')
words = Embedding(input_dim=len(word2Idx), output_dim=20)(words_input) #,  weights=[wordEmbeddings], trainable=False

#Case embedding of each words
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input)

#Character embedding using 1D CNN
character_input=Input(shape=(None,28,),name='char_input')
embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
dropout= Dropout(0.5)(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(28))(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)

#Concatnating the embedded layer o/p together
output = concatenate([words, casing, char])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)

model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])

model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 char_input (InputLayer)        [(None, None, 28)]   0           []                               
                                                                                                  
 char_embedding (TimeDistribute  (None, None, 28, 30  2850       ['char_input[0][0]']             
 d)                             )                                                                 
                                                                                                  
 dropout (Dropout)              (None, None, 28, 30  0           ['char_embedding[0][0]']         
                                )                                                                 
                                                                                              

In [41]:
%%time
#Training the model on batches
epochs = 50
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(train_batch_len))
    for i,batch in enumerate(iterate_minibatches(train_batch,train_batch_len)):
        labels, tokens, casing,char = batch       
        history = model.train_on_batch([tokens, casing, char], labels,return_dict=True,reset_metrics=False)
        a.update(i)
    a.update(i+1)
    print(' ')

Epoch 0/50
 
Epoch 1/50
 
Epoch 2/50
 
Epoch 3/50
 
Epoch 4/50
 
Epoch 5/50
 
Epoch 6/50
 
Epoch 7/50
 
Epoch 8/50
 
Epoch 9/50
 
Epoch 10/50
 
Epoch 11/50
 
Epoch 12/50
 
Epoch 13/50
 
Epoch 14/50
 
Epoch 15/50
 
Epoch 16/50
 
Epoch 17/50
 
Epoch 18/50
 
Epoch 19/50
 
Epoch 20/50
 
Epoch 21/50
 
Epoch 22/50
 
Epoch 23/50
 
Epoch 24/50
 
Epoch 25/50
 
Epoch 26/50
 
Epoch 27/50
 
Epoch 28/50
 
Epoch 29/50
 
Epoch 30/50
 
Epoch 31/50
 
Epoch 32/50
 
Epoch 33/50
 
Epoch 34/50
 
Epoch 35/50
 
Epoch 36/50
 
Epoch 37/50
 
Epoch 38/50
 
Epoch 39/50
 
Epoch 40/50
 
Epoch 41/50
 
Epoch 42/50
 
Epoch 43/50
 
Epoch 44/50
 
Epoch 45/50
 
Epoch 46/50
 
Epoch 47/50
 
Epoch 48/50
 
Epoch 49/50
 
CPU times: user 2h 5min 48s, sys: 24min 55s, total: 2h 30min 43s
Wall time: 1h 27min 2s


In [42]:
history

{'loss': 0.1432882696390152}

In [None]:
model.save("BiLSTM_CNN_nonGolVe/model.h5")

In [43]:
def predict_dataset(dataset):
    sentences = []
    correctLabels = []
    predLabels = []
    b = Progbar(len(dataset))
    for i,data in enumerate(dataset):    
        tokens, casing,char, labels = data
        sentences.append(tokens)
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1) #Predict the classes 
        correctLabels.append(labels)
        predLabels.append(pred)
        b.update(i)
    b.update(i+1)
    return predLabels, correctLabels, sentences

In [44]:
predLabels, correctLabels, sentences = predict_dataset(test_batch) 



In [92]:
#Returns the labels of index
def pred2label(pred,diction):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(str(diction[p]))
        out.append(out_i)
    return out

pred_labels = pred2label(predLabels,idx2Label)
true_labels = pred2label(correctLabels,idx2Label)
actual_words = pred2label(sentences,idx2Word)

In [93]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

print("F1-score: {:.1%}".format(f1_score(true_labels, pred_labels)))
print("precision_score: {:.1%}".format(precision_score(true_labels, pred_labels)))
print("recall_score: {:.1%}".format(recall_score(true_labels, pred_labels)))

F1-score: 79.3%
precision_score: 79.5%
recall_score: 79.1%


In [94]:
#Creating a flat list to look at Classification report
test_pred = [ j for i in pred_labels for j in i]
true_lab = [ j for i in true_labels for j in i]

In [95]:
from sklearn.metrics import classification_report
print(classification_report(test_pred, true_lab))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00         5
       B-eve       0.15      0.60      0.24        15
       B-geo       0.90      0.83      0.86      8162
       B-gpe       0.93      0.95      0.94      3143
       B-nat       0.00      0.00      0.00         1
       B-org       0.65      0.79      0.72      3279
       B-per       0.76      0.85      0.81      3009
       B-tim       0.88      0.90      0.89      3957
       I-art       0.00      0.00      0.00        10
       I-eve       0.02      0.33      0.04         3
       I-geo       0.79      0.77      0.78      1533
       I-gpe       0.43      0.90      0.58        21
       I-nat       0.00      0.00      0.00         0
       I-org       0.68      0.82      0.74      2715
       I-per       0.76      0.89      0.82      2914
       I-tim       0.71      0.81      0.76      1181
           O       0.99      0.98      0.99    180127

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
print("{:15}{:5}\t{}\n".format("Word", "True", "Pred"))
print("-"*30)

i = np.random.randint(0, len(actual_words))

for (w, t, pred) in zip(actual_words[i], true_labels[i], pred_labels[i]):
    print("{:15}{}\t{}".format(w, t, pred))

Word           True 	Pred

------------------------------
Austrian       B-gpe	B-gpe
officials      O	O
Wednesday      B-tim	B-tim
reported       O	O
the            O	O
first          O	O
European       B-org	B-org
Union          I-org	I-org
case           O	O
of             O	O
the            O	O
deadly         O	O
disease        O	O
in             O	O
poultry        O	O
.              O	O


In [53]:
model.save("/content/drive/MyDrive/NLP/HalfTrainedModels/Half_Trained_model_non_glove.h5")

In [68]:
#Training on full dataset
#Model layers 

#Word Embedding part
words_input_full = Input(shape=(None,),dtype='int32',name='words_input')
words_full = Embedding(input_dim=len(word2Idx), output_dim=20)(words_input_full) # weights=[wordEmbeddings], trainable=False)(words_input_full)

#Case embedding part of each word
casing_input_full = Input(shape=(None,), dtype='int32', name='casing_input')
casing_full = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input_full)

#Character embedding using 1D CNN
character_input_full = Input(shape=(None,28,),name='char_input')
embed_char_out_full = TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input_full)
dropout_full = Dropout(0.5)(embed_char_out_full)
conv1d_out_full = TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout_full)
maxpool_out_full =TimeDistributed(MaxPooling1D(28))(conv1d_out_full)
char_full = TimeDistributed(Flatten())(maxpool_out_full)
char_full = Dropout(0.5)(char_full)

#Concatnating the embedded layer o/p together
output_full = concatenate([words_full, casing_full, char_full])
output_full = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output_full)
output_full = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output_full)

#Specifying input and o/p of the model
Full_Trained_model_non_glove = Model(inputs=[words_input_full, casing_input_full, character_input_full], outputs=[output_full])

#Compliling the model and looking at model summary
Full_Trained_model_non_glove.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
Full_Trained_model_non_glove.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 char_input (InputLayer)        [(None, None, 28)]   0           []                               
                                                                                                  
 char_embedding (TimeDistribute  (None, None, 28, 30  2850       ['char_input[0][0]']             
 d)                             )                                                                 
                                                                                                  
 dropout_4 (Dropout)            (None, None, 28, 30  0           ['char_embedding[0][0]']         
                                )                                                                 
                                                                                            

In [66]:
trainSentences = characterSplit(trainSentences)
full_train_data = padding(createMatrices(trainSentences ,word2Idx,  label2Idx, case2Idx, char2Idx))
full_train_data_batch,full_train_data_batch_len = createBatches(full_train_data)

In [69]:
%%time
#Training the model on full data with batches and for 50 epochs 
epochs = 50
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(full_train_data_batch_len))
    for i,batch in enumerate(iterate_minibatches(full_train_data_batch,full_train_data_batch_len)):
        labels, tokens, casing, char = batch       
        Full_Trained_model_non_glove.train_on_batch([tokens, casing, char], labels)
        a.update(i)
    a.update(i+1)
    print(' ')

Epoch 0/50
 
Epoch 1/50
 
Epoch 2/50
 
Epoch 3/50
 
Epoch 4/50
 
Epoch 5/50
 
Epoch 6/50
 
Epoch 7/50
 
Epoch 8/50
 
Epoch 9/50
 
Epoch 10/50
 
Epoch 11/50
 
Epoch 12/50
 
Epoch 13/50
 
Epoch 14/50
 
Epoch 15/50
 
Epoch 16/50
 
Epoch 17/50
 
Epoch 18/50
 
Epoch 19/50
 
Epoch 20/50
 
Epoch 21/50
 
Epoch 22/50
 
Epoch 23/50
 
Epoch 24/50
 
Epoch 25/50
 
Epoch 26/50
 
Epoch 27/50
 
Epoch 28/50
 
Epoch 29/50
 
Epoch 30/50
 
Epoch 31/50
 
Epoch 32/50
 
Epoch 33/50
 
Epoch 34/50
 
Epoch 35/50
 
Epoch 36/50
 
Epoch 37/50
 
Epoch 38/50
 
Epoch 39/50
 
Epoch 40/50
 
Epoch 41/50
 
Epoch 42/50
 
Epoch 43/50
 
Epoch 44/50
 
Epoch 45/50
 
Epoch 46/50
 
Epoch 47/50
 
Epoch 48/50
 
Epoch 49/50
 
CPU times: user 2h 38min 16s, sys: 31min 58s, total: 3h 10min 14s
Wall time: 1h 50min 37s


In [71]:
Full_Trained_model_non_glove.save("/content/drive/MyDrive/NLP/TrainedModels/Full_Trained_model_non_glove.h5")

In [None]:
from tensorflow import keras
LoadedModel = keras.models.load_model("TrainedModels/Full_Trained_model_non_glove.h5")

#Reading the dictionaries back 
with open('idx2Word_Glove.json', 'r') as fp:
    idx2Word = json.load(fp)
    
with open('idx2Label_Glove.json', 'r') as fp:
    idx2Label = json.load(fp)

In [72]:
#Readig the cleaned dataset
ClimateChange_DF = pd.read_csv("/content/drive/MyDrive/NLP_Files/NLP/TaskDataset/Cleaned_English_tweets.csv")
#keeping only the tweets
ClimateChange_DF = ClimateChange_DF[["CleanedTweets"]]
ClimateChange_DF.head()
ClimateChange_DF.shape

Unnamed: 0,CleanedTweets
0,News Trends Data Americans are less concerned ...
1,Do you realize that civil war is the devastati...
2,Having anxiety over the weather something they...
3,In the last few years I've noticed that studen...
4,FULL INTERVIEW BTS ARMY BTSonGMA NEWS EXCLUSIV...


(1557690, 1)

In [73]:
#selecting first 1000 tweets for ease of processing in next steps
ClimateChange_DF_Sliced = ClimateChange_DF.iloc[:10000].copy()

In [75]:
ClimateChange_DF_Sliced["Word"] = ClimateChange_DF_Sliced["CleanedTweets"].apply(lambda x: [[i] for i in str(x).split()])
ClimateChange_DF_Sliced.head()

Unnamed: 0,CleanedTweets,Word
0,News Trends Data Americans are less concerned ...,"[[News], [Trends], [Data], [Americans], [are],..."
1,Do you realize that civil war is the devastati...,"[[Do], [you], [realize], [that], [civil], [war..."
2,Having anxiety over the weather something they...,"[[Having], [anxiety], [over], [the], [weather]..."
3,In the last few years I've noticed that studen...,"[[In], [the], [last], [few], [years], [I've], ..."
4,FULL INTERVIEW BTS ARMY BTSonGMA NEWS EXCLUSIV...,"[[FULL], [INTERVIEW], [BTS], [ARMY], [BTSonGMA..."


In [76]:
climate_change = ClimateChange_DF_Sliced.Word.tolist()

In [77]:
#Splits the word into list of chatracters
def FcharacterSplit(Sentences):
    for i,sentence in enumerate(Sentences):
        for j,data in enumerate(sentence):
            chars = [c for c in data[0]]
            Sentences[i][j] = [data[0],chars]
    return Sentences

predicting_sentence = FcharacterSplit(climate_change)

In [78]:
#Addiing a dummy tag to process easilly
for i in predicting_sentence:
    for j in i:
        j.append('O')

In [79]:
climateChange_pred_data = padding(createMatrices(predicting_sentence, word2Idx, label2Idx, case2Idx,char2Idx))

In [82]:
i = np.random.randint(0, len(climateChange_pred_data))
print(i)
for j, data in enumerate(climateChange_pred_data[i:i+1]):
    tokens, casing, char, labels = data
    token = np.asarray([tokens])     
    casing = np.asarray([casing])
    char = np.asarray([char])
    pred = Full_Trained_model_non_glove.predict([token, casing,char], verbose=False)[0]   
    pred = pred.argmax(axis=-1) #Predict the classes 

9562


In [87]:
#Returns the labels of index
def pred2label(pred,diction):
    out = []
    for pred_i in pred:
        out.append(diction[pred_i])
    return out

pred_labels = pred2label(pred,idx2Label)
actual_words = pred2label(tokens,idx2Word)

In [90]:
print("{:15}{:15}{}".format("Original_Word", "Passed_Word", "Tag_Predicted"))
print("-"*30)

for (o, w, pred) in zip(ClimateChange_DF_Sliced.CleanedTweets[i].split(), actual_words, pred_labels):
    print("{:15}{:15}{}".format(o, w, pred))

#Using Spacy to cross check for entities
nlp = spacy.load('en_core_web_sm')
text = nlp(ClimateChange_DF_Sliced.CleanedTweets[i])
displacy.render(text, style = 'ent', jupyter=True)

Original_Word  Passed_Word    Tag_Predicted
------------------------------
The            The            O
problem        problem        O
with           with           O
Australia      Australia      B-geo
not            not            O
wanting        wanting        O
to             to             O
be             be             O
seen           seen           O
as             as             O
being          being          O
out            out            O
of             of             O
step           step           O
with           with           O
likeminded     UNKNOWN_WORD   O
democracies    democracies    O
on             on             O
climate        climate        O
change         change         O
i              UNKNOWN_WORD   O
