In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h2>Import Libraries</h2>

In [None]:
import io
import re
import os
import nltk
import time
import math
import scipy
import string
import zipfile
import operator
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as pt
from collections import defaultdict
from gensim.models import KeyedVectors, Word2Vec, fasttext
import warnings
import zipfile
warnings.filterwarnings('ignore')

<h2>Importing Data</h2>

In [None]:
# Read the competition's train and test CSVs from the Kaggle input mount.
quora_train = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")
quora_test = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/test.csv")
# Show the first training row as a quick look at the available columns.
quora_train.head(1)

<h2>Embedding Function</h2>

In [None]:
file_path = "/kaggle/input/quora-insincere-questions-classification/embeddings.zip"

def Embeddings(file_path,file):
    '''
    Load one pre-trained embedding from the embeddings zip archive.

    parameter : file_path = path to the zip archive holding the embedding files,
                file = which embedding to load: "glove", "word2vec",
                       "paragram" or "fasttext"
    return : for "glove", "paragram" and "fasttext" a dict mapping each word
             to a float32 numpy vector; for "word2vec" whatever
             KeyedVectors.load_word2vec_format returns
    raises : ValueError if `file` is not one of the four supported names
    '''
    def read_text_vectors(raw, encoding, skip_header=False):
        # Parse "word v1 v2 ..." lines straight from the archive member, so
        # nothing is extracted to disk and the handle is always closed.
        vectors = dict()
        with io.TextIOWrapper(raw, encoding=encoding) as f:
            for line_no, line in enumerate(f):
                # split(' ') (not split()) keeps the behaviour the glove file needs
                values = line.rstrip("\n").split(' ')
                # .vec files start with a "<word count> <dimension>" header line;
                # it is not a word vector and must not end up in the dictionary
                if skip_header and line_no == 0 and len(values) == 2:
                    continue
                vectors[values[0]] = np.asarray(values[1:], 'float32')
        return vectors

    with zipfile.ZipFile(file_path,'r') as zf:
        if file == "glove":
            return read_text_vectors(zf.open("glove.840B.300d/glove.840B.300d.txt"), "utf-8")

        elif file == "word2vec":
            # NOTE(review): an open zip member (not a path) is handed to gensim,
            # exactly as before - confirm the installed gensim accepts a file object.
            with zf.open("GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin") as f:
                return KeyedVectors.load_word2vec_format(f,binary=True)

        elif file == "paragram":
            return read_text_vectors(zf.open("paragram_300_sl999/paragram_300_sl999.txt"), "latin")

        elif file == "fasttext":
            return read_text_vectors(zf.open("wiki-news-300d-1M/wiki-news-300d-1M.vec"), "latin",
                                     skip_header=True)

    # Previously an unknown name fell through and returned None, which only
    # failed much later when the result was used as a dictionary.
    raise ValueError("Unknown embedding {!r}; expected 'glove', 'word2vec', 'paragram' or 'fasttext'".format(file))

In [None]:
# Load the Paragram vectors; the coverage, OOV-casing and embedding-matrix steps below all read this object.
paragramModel = Embeddings(file_path,file="paragram")

<h2>Text Preprocessing</h2>

In [None]:
def str_lowercase_text(text):
    """Return a new Series with every value converted to ``str`` and lower-cased.

    parameter : text = pandas Series of question texts (any value type; each
                value goes through ``str()`` first, so e.g. None becomes "none")
    return : new Series with the same index and name; the input is not modified
    """
    # Element-wise map instead of writing through ``text.values``: that buffer
    # is read-only under pandas Copy-on-Write, so the old in-place loop is not
    # reliable across pandas versions.
    return text.map(lambda value: str(value).lower())
# Create the working column `question_text_paragram` from the raw `question_text` of both frames;
# every later preprocessing step rewrites this column.
quora_train['question_text_paragram'] = str_lowercase_text(quora_train['question_text'])
quora_test['question_text_paragram'] = str_lowercase_text(quora_test['question_text'])
print("str_lowercase_text : Done")

#dictionary of special characters and their literal meaning to replace in the vocabulary
#The table is applied by substring replacement to text that has already been lower-cased, so:
# - Greek capitals are listed together with their lowercase forms (the capitals alone never match);
# - "cosecx" comes before "secx", otherwise the shorter key would split the longer one first.
replacingWords = dict({"√":" sqrt ","π":" pi ","α":" alpha ","θ":" theta ","∞":" infinity ",
"∝":" proportional to ","sinx":" sin x ","cosx":" cos x ", "tanx":" tan x ","cotx":" cot x ", 
"cosecx":" cosec x ", "secx":" sec x ", "£":" pound ", "β":" beta ", "σ": " sigma ", "∆":" delta ",
"μ":" mu ",'∫': " integration ", "ρ":" rho ", "λ":" lambda ","∩":" intersection ","Δ":" delta ","δ":" delta ", 
"φ":" phi ", "℃":" centigrade ","≠":" does not equal to ","Ω":" omega ","ω":" omega ","∑":" summation ",
"∪":" union ", "ψ":" psi ", "Γ":" gamma ","γ":" gamma ","⇒":" implies ","∈":" is an element of ", 
"≡":" is congruent to ", "≈":" is approximately equal to ", "~":" is distributed as ",
"≅":" is isomorphic to ","⩽":" is less than or equal to ","≥":" is greater than or equal to ",
"⇐":" is implied by ","⇔":" is equivalent to ", "∉":" is not an element of ","∅" : " empty set ",
"∛":" cube root ","÷":" division ","㏒":" log ","∇":" del ","⊆":" is a subset of ","±":" plus–minus ",
"⊂":" is a proper subset of ","€":" euro ","㏑":" ln ","₹":" rupee ","∀":" for all ",
"⅓":" one by three ","½":" one by two ","¼":" one by four "})

def special_chars(text,symbols):
    """Replace every key of `symbols` that occurs in `text` with its mapped value.

    parameter : text = a single question string,
                symbols = dict mapping a symbol/substring to its replacement text
    return : the string after all replacements; after each replacement, pairs
             of whitespace characters are squeezed to a single space
    """
    # Longest keys first: replacement is by substring, so a short key that is
    # contained in a longer one (e.g. "secx" inside "cosecx") would otherwise
    # break the longer key before it is ever tried. sorted() is stable, so
    # keys of equal length keep the dictionary's own order.
    for p in sorted(symbols, key=len, reverse=True):
        if p in text:
            text = text.replace(p, symbols[p])
            # raw string: "\s" in a normal literal is an invalid escape sequence
            text = re.sub(r"\s{2}"," ",text)
    return text
# Spell out the symbols listed in `replacingWords` as words, row by row, in both frames.
quora_train['question_text_paragram'] = quora_train['question_text_paragram'].apply(lambda x: special_chars(x,replacingWords))
quora_test['question_text_paragram'] = quora_test['question_text_paragram'].apply(lambda x: special_chars(x,replacingWords))
print("special_chars : Done")

#characters to be removed
characters = "＝ా̫̾̀ͅ⚧ਿ∖⁡્⬇☉ూिାੁ͔☺͛ॢി「̷̊̆﻿َ«̰︡？◦✏ូ‬͒ِ℅„〖ௌ•­‐̗∧̯িֿ̔〗்“·้″∂͚̑ी∴ు́̕♡¯❓̦̓ை₊ுं”‌☁×ొ、್⌚​—’̶̋̐⎝ैು¸̞͑（⋅ृ′͋‘়͊➡†ী️ா̥：̻ू∗＾\
→´្ाাோ✌。⊥̵̛¬̒ാ–»！∨・❤̝ంి̮⎞」͆।☝่̙˚̬͌͘¦്，̴̂˜ੀ│ីৃ⃗᠌¡ੰ̧♀✔̈́̓）⊨✅￼⎛ិ↓्̉ुॣ；ّุ∡̭∘－؟△⋯ॉॄ✓∠̲̺®‏♭̱̍ു్̹̌̚͜ौ⁻ె⟨͈́⎠ँ™͝ំಿ¢̿ં͠↑،ើ̪ਂ₩̄̎‑̢ಾ¾₱̃ো︠ਾ≱͖ः¨⁠ાி͡ে਼ៃ̣͂》♏̜\
̜§͕‪《͎≤̇〇／ْ₦̼¥▾…ះี̖̘͇̽͐ា‛♨‰़̤̳̅⧽−ೋ̀▒ٌู̩⅔♣̡ั∼͉¿͗°☹⟩্©¶ो˂＞े◌‎̸ీ⧼＄̟̈⦁"

#characters to be retained: `spacing_of_chars` pads each of these with spaces
#so that it stands apart from the neighbouring words
character_list = ['^', ')', '@', ',', '$', '+', '/', '?', '"', ';', '[', '%', '*', ']', "'", '>', '|', '=', '<', '.',
                  '&', '`', '\\', '#', '}', '-', '!', ':', '{', '(', '_']

#removing punctuation
def remove_punct(text,punToBeRemove):
    """Delete every character of `punToBeRemove` from each string and strip the ends.

    parameter : text = pandas Series of strings,
                punToBeRemove = iterable of single characters to delete
    return : new Series (same index) with the characters removed and
             leading/trailing whitespace stripped; the input is not modified
    """
    # Mapping code point -> None makes str.translate drop the character.
    translate_table = {ord(char): None for char in punToBeRemove}
    # Return a new Series instead of writing through ``text.values``: the old
    # loop silently edited the caller's data and fails when that buffer is
    # read-only (pandas Copy-on-Write).
    return text.map(lambda val: val.translate(translate_table).strip())
       
# Drop every character listed in `characters` from the working column of both frames.
quora_train['question_text_paragram'] = remove_punct(quora_train['question_text_paragram'], characters)
quora_test['question_text_paragram'] = remove_punct(quora_test['question_text_paragram'], characters)   
print("remove_punct : Done")

#Source: - https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
contractions = {"'aight": 'alright', "ain't": 'am not', "amn't": 'am not', "aren't": 'are not', "can't": 'can not',
"'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "daren't": 
'dare not', "daresn't": 'dare not', "dasn't": 'dare not', "didn't": 'did not', "doesn't": 'does not', 
"don't": 'do not', 'dunno': "don't know", "d'ye": 'do you', "e'er": 'ever', "everybody's": 'everybody is', 
"everyone's": 'everyone is', 'finna': 'fixing to', "g'day": 'good day', 'gimme': 'give me', "giv'n": 'given', 
'gonna': 'going to', "gon't": 'go not', 'gotta': 'got to', "hadn't": 'had not', "had've": 'had have', 
"hasn't": 'has not', "haven't": 'have not', "he'd": 'he had', "he'll": 'he will', "he's": 'he is', 
"he've": 'he have', "how'd": 'how did', 'howdy': 'how do you do', "how'll": 'how will', "how're": 'how are', 
"how's": 'how is', "I'd": 'I had', "I'd've": 'I would have', "I'll": 'I will', "I'm": 'I am', 
"I'm'a": 'I am about to', "I'm'o": 'I am going to', 'innit': 'is it not', "I've": 'I have', "isn't": 'is not', 
"it'd ": 'it would', "it'll": 'it will', "it's ": 'it is', 'iunno': "I don't know", "let's": 'let us', 
"ma'am": 'madam', "mayn't": 'may not', "may've": 'may have', 'methinks': 'me thinks', "mightn't": 'might not', 
"might've": 'might have', "mustn't": 'must not', "mustn't've": 'must not have', "must've": 'must have', 
"needn't": 'need not', 'nal': 'and all', "ne'er": 'never', "o'clock": 'of the clock', "o'er": 'over',
"ol'": 'old', "oughtn't": 'ought not', "'s": 'is', "shalln't": 'shall not', "shan't": 'shall not', 
"she'd": 'she would', "she'll": 'she will', "she's": 'she is', "should've": 'should have', 
"shouldn't": 'should not', "shouldn't've": 'should not have', "somebody's": 'somebody has', 
"someone's": 'someone has', "something's": 'something has', "so're": 'so are', "that'll": 'that will', 
"that're": 'that are', "that's": 'that is', "that'd": 'that would', "there'd": 'there would', 
"there'll": 'there will', "there're": 'there are', "there's": 'there is', "these're": 'these are', 
"they've": 'they have', "this's": 'this is', "those're": 'those are', "those've": 'those have', "'tis": 'it is', 
"to've": 'to have', "'twas": 'it was', 'wanna': 'want to', "wasn't": 'was not', "we'd": 'we would', 
"we'd've": 'we would have', "we'll": 'we will', "we're": 'we are', "we've": 'we have', "weren't": 'were not', 
"what'd": 'what did', "what'll": 'what will', "what're": 'what are', "what's": 'what does', "what've": 'what have',
"when's": 'when is', "where'd": 'where did', "where'll": 'where will', "where're": 'where are',
"where's": 'where is',"where've": 'where have', "which'd": 'which would', "which'll": 'which will', 
"which're": 'which are',"which's": 'which is', "which've": 'which have', "who'd": 'who would',
"who'd've": 'who would have', "who'll": 'who will', "who're": 'who are', "who'ves": 'who is', "who'": 'who have',
"why'd": 'why did', "why're": 'why are', "why's": 'why does', "willn't": 'will not', "won't": 'will not',
'wonnot': 'will not', "would've": 'would have', "wouldn't": 'would not', "wouldn't've": 'would not have',
"y'all": 'you all', "y'all'd've": 'you all would have', "y'all'd'n've": 'you all would not have',
"y'all're": 'you all are', "cause":"because","have't":"have not","cann't":"can not","ain't":"am not",
"you'd": 'you would', "you'll": 'you will', "you're": 'you are', "you've": 'you have', 'cannot': 'can not', 
'wont': 'will not', "You'": 'Am not', "Ain'": 'Am not', "Amn'": 'Am not', "Aren'": 'Are not',
"Can'": 'Because', "Could'": 'Could have', "Couldn'": 'Could not have', "Daren'": 'Dare not', 
"Daresn'": 'Dare not', "Dasn'": 'Dare not', "Didn'": 'Did not', "Doesn'": 'Does not', "Don'": "Don't know", 
"D'": 'Do you', "E'": 'Ever', "Everybody'": 'Everybody is', "Everyone'": 'Fixing to', "G'": 'Give me', 
"Giv'": 'Going to', "Gon'": 'Got to', "Hadn'": 'Had not', "Had'": 'Had have', "Hasn'": 'Has not', 
"Haven'": 'Have not', "He'": 'He have', "How'": 'How is', "I'": 'I have', "Isn'": 'Is not', "It'": "I don't know", 
"Let'": 'Let us', "Ma'": 'Madam', "Mayn'": 'May not', "May'": 'Me thinks', "Mightn'": 'Might not', 
"Might'": 'Might have', "Mustn'": 'Must not have', "Must'": 'Must have', "Needn'": 'And all', "Ne'": 'Never',
"O'": 'Old', "Oughtn'": 'Is', "Shalln'": 'Shall not', "Shan'": 'Shall not', "She'": 'She is', 
"Should'": 'Should have', "Shouldn'": 'Should not have', "Somebody'": 'Somebody has', "Someone'": 'Someone has', 
"Something'": 'Something has', "So'": 'So are', "That'": 'That would', "There'": 'There is',
"They'": 'They have', "This'": 'This is', "Those'": 'It is', "To'": 'Want to', "Wasn'": 'Was not',
"Weren'": 'Were not', "What'": 'What have', "When'": 'When is', "Where'": 'Where have', "Which'": 'Which have', 
"Who'": 'Who have', "Why'": 'Why does', "Willn'": 'Will not', "Won'": 'Will not', "Would'": 'Would have',
"Wouldn'": 'Would not have', "Y'": 'You all are',"What's":"What is","What're":"What are","what's":"what is",
"what're":"what are", "Who're":"Who are", "your're":"you are","you're":"you are", "You're":"You are",
"We're":"We are", "These'": 'These have', "we're":"we are","Why're":"Why are","How're":"How are ",
"how're ":"how are ","they're ":"they are ", "befo're":"before","'re ":" are ",'don"t ':"do not", 
"Won't ":"Will not ","could't":"could not", "would't":"would not", "We'": 'We have',"Hasn't":"Has not",
"n't":"not", 'who"s':"who is"}

#function to remove contractions
def decontraction(text,contractions):
    """Expand English contractions in every string of a Series.

    Each whitespace-separated word that is a key of `contractions` is replaced
    by its value; afterwards a fixed list of generic suffix rules ('s, n't,
    're, 'd, 'll, 've, 'm and their '' / " spellings) is applied.

    parameter : text = pandas Series of strings,
                contractions = dict mapping a contraction to its expansion
    return : new Series (same index) with contractions expanded, whitespace
             runs collapsed to one space and the ends stripped; the input is
             not modified
    """
    # Applied in this order. The rules that consume a trailing space put one
    # back: the previous replacements (" would", " will", " have", " am")
    # dropped it and glued the next word on ("john'd go" -> "john wouldgo").
    generic_rules = (
        ("'s", " "), ("''s", " "), ('"s', " "),
        ("n't", " not "), ("n''t", " not "), ('n"t', " not "),
        ("'re ", " are "),
        ("'d ", " would "), ("''d ", " would "), ('"d ', " would "),
        ("'ll ", " will "), ("''ll ", " will "), ('"ll ', " will "),
        ("'ve ", " have "), ("''ve ", " have "), ('"ve ', " have "),
        ("'m ", " am "), ("''m ", " am "), ('"m ', " am "),
    )

    def expand(val):
        # whole-word lookup first
        val = ' '.join(contractions[word] if word in contractions else word
                       for word in val.split())
        #generic one
        for suffix, replacement in generic_rules:
            val = val.replace(suffix, replacement)
        # collapse whatever spacing the replacements introduced
        return re.sub(r"\s+", " ", val).strip()

    # New Series rather than in-place writes through ``text.values``.
    return text.map(expand)

def spacing_of_chars(text,characters_list):
    """Put a space on both sides of every listed character that occurs in `text`.

    parameter : text = a single question string,
                characters_list = iterable of characters to isolate
    return : the string with each matched character surrounded by spaces; once
             any character has matched, every whitespace run is collapsed to a
             single space. A string containing none of the characters is
             returned unchanged.
    """
    for char in characters_list:
        if char in text:
            text = text.replace(char," "+char+" ")
            # raw string: "\s" in a normal literal is an invalid escape sequence
            text = re.sub(r"\s+"," ",text)
    return text

# Isolate the retained punctuation in `character_list` with surrounding spaces, row by row, in both frames.
quora_train['question_text_paragram'] = quora_train['question_text_paragram'].apply(lambda x: spacing_of_chars(x,character_list))
quora_test['question_text_paragram'] = quora_test['question_text_paragram'].apply(lambda x: spacing_of_chars(x,character_list))
print("spacing_of_chars : Done")


#checking coverage for words present in question_text and in embedding_matrix
def coverage(vocab, embeddings_index,print_statement=False):
    """Report which vocabulary words are missing from an embedding.

    parameter : vocab = mapping word -> number of occurrences,
                embeddings_index = embedding lookup supporting ``word in ...``,
                print_statement = when True, print the share of distinct words
                                  and the share of all word occurrences covered
    return : list of (word, count) tuples for the words NOT in the embedding,
             ordered by count, highest first
    """
    #Initializing values
    known_distinct = 0      # distinct words found in the embedding
    unknown_words = dict()  # missing word -> occurrences
    knownWordsVal = 0       # occurrences covered by the embedding
    unknownWordsVal = 0     # occurrences not covered
    #iterating words
    for word, count in vocab.items():
        # Membership test instead of fetching the vector inside a bare
        # ``except:``, which stored every vector and hid unrelated errors.
        if word in embeddings_index:
            known_distinct += 1
            knownWordsVal += count
        else:
            unknown_words[word] = count
            unknownWordsVal += count

    if print_statement == True:
        # Guard the ratios so an empty vocabulary prints 0.00% rather than
        # raising ZeroDivisionError.
        total_distinct = len(vocab)
        total_occurrences = knownWordsVal + unknownWordsVal
        print('Found {:.2%} of words in the embedding of the question text vocab'
           .format(known_distinct / total_distinct if total_distinct else 0.0))
        print('Found {:.2%} of the words in the question text vocab'.format(
            knownWordsVal / total_occurrences if total_occurrences else 0.0))
    # ascending stable sort, then reversed: highest count first
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    return unknown_words


#function to generate vocabulary of question text
def question_text_vocab(text):
    """Count how often each whitespace-separated token occurs across a Series.

    parameter : text = pandas Series of strings
    return : defaultdict(int) mapping token -> number of occurrences, in
             first-seen order
    """
    token_counts = defaultdict(int)
    for sentence in text:
        for tok in sentence.split():
            token_counts[tok] = token_counts[tok] + 1
    return token_counts


#question text vocab: token -> frequency, built separately for train and test
TextVocabTrain = question_text_vocab(quora_train['question_text_paragram'])
TextVocabTest = question_text_vocab(quora_test['question_text_paragram'])
print("question_text_vocab : Done")


#coverage function: (word, count) lists of the tokens missing from the embedding.
#Despite the "Glove" in the names, the embedding checked here is paragramModel.
OOVGloveTrain = coverage(TextVocabTrain, paragramModel)
OOVGloveTest = coverage(TextVocabTest, paragramModel)
print("coverage : Done")

#checking for the words that are present in the embeddings with lowercase or as title
def check_oov_vocab(vocab, embeddings):
    """Find out-of-vocabulary words that the embedding knows under another casing.

    parameter : vocab = iterable of entries whose first element is the word
                        (the (word, count) tuples produced by `coverage`),
                embeddings = embedding lookup supporting ``word in ...``
    return : mapping original word -> re-cased word; Title-case words are
             tried in lowercase and lowercase words in Title case
    """
    recased = defaultdict()
    for entry in vocab:
        token = entry[0]
        if token.istitle():
            candidate = token.lower()
        elif token.islower():
            candidate = token.title()
        else:
            # mixed or caseless tokens are left alone
            continue
        if candidate in embeddings:
            recased[token] = candidate
    return recased

# Casing corrections for the out-of-vocabulary words of each frame.
WordDictTrain = check_oov_vocab(OOVGloveTrain,paragramModel)
WordDictTest = check_oov_vocab(OOVGloveTest,paragramModel)
print("check_oov_vocab : Done")

def replace_words(text,set_words):
    """Replace whole whitespace-separated words according to a lookup table.

    parameter : text = pandas Series of strings,
                set_words = dict mapping a word to its replacement
    return : new Series (same index) where every word that is a key of
             `set_words` is swapped for its value and words are re-joined with
             single spaces; the input is not modified
    """
    def swap(val):
        # direct lookup; `word.replace(word, x)` was just a detour to `x`
        return ' '.join(set_words[word] if word in set_words else word
                        for word in val.split())

    # New Series rather than in-place writes through ``text.values``, which
    # edited the caller's data and is read-only under pandas Copy-on-Write.
    return text.map(swap)

# Apply the casing corrections.
# NOTE(review): train and test each use their own dictionary, so the same word can be
# re-cased in one frame and not in the other - confirm this is intended.
quora_train['question_text_paragram'] = replace_words(quora_train['question_text_paragram'], WordDictTrain)
quora_test['question_text_paragram'] = replace_words(quora_test['question_text_paragram'], WordDictTest)
print("replace_words : Done")

# Hand-made corrections (misspellings, accents, glued words, abbreviations) applied
# by `replace_words` as whole-word replacements in a single pass - so a value is never
# looked up again and must already be spelt correctly.
replace_word = dict({"quorans":"quora", "brexit":"britain exit", "cryptocurrencies":"cryptocurrency", 
"blockchain":"blockchain", "demonetisation":"demonetization", "pokémon":"pokemon",
"qoura":"quora", "fiancé":"fiance","cryptocurrency":"cryptocurrency", "x²":"x square", 
"quoras":"quora","whst":"what", "²":"square", "Demonetization":"demonetization", 
"são":"sao","genderfluid":"Gender fluid", "howcan":"How can", 
"undergraduation":"under graduation", "whydo":"why do", "à":"a","chapterwise":"chapter wise",
"fiancée":"fiance", "wouldwin":"would win", 
"nanodegree":"nano degree", "blockchains":"blockchain", 
"clichés":"cliche", "erdoğan":"erdogan", "beyoncé":"beyonce", "fullform":"full form",
"atatürk":"ataturk", "Whyis":"Why is","amfrom":"am from", "2k17":"2017", 
"demonitization":"demonetization", "cliché":"cliche", "montréal":"montreal", 
"thé":"the", "am17":"am 17", "willhappen":"will happen","³":"cube", "whatapp":"whatsapp", 
"ε":"epsilon", "whatsaap":"whatsapp",'Σ':"summation","quorians":"quora users",
"cryptocurreny":"cryptocurrency", "mastuburation":"masturbation","Whatre":"What are", 
"whatdo":"what do","δ":"delta","oversmart":"over smart","¹":"one","baahubali":"baahubali", 
"note4":"note 4", "gdpr":"general data protection regulation", "bnbr":"' be nice , be respectful '", 
"uceed":"undergraduate common entrance examination for design","bhakts":"bhakts", 
"iiest":"indian institutes of engineering science and technology","bhakths":"bhakts",
"upwork":"Upwork","unacademy":"Unacademy","squeare":"square","srmjeee":"srmjee",
"demonitisation":"demonetization",
"cos2x":"cos 2x","padmavat":"padmaavat", "flipcart":"flipkart",
"havegot":"have got","2k18":"2018","a²":"a square","whydoes":"why does","sina":"sin a",
"class9":"class 9"})

#replacing words with the correct ones from the hand-made `replace_word` table
quora_train['question_text_paragram'] = replace_words(quora_train['question_text_paragram'], replace_word)
quora_test['question_text_paragram'] = replace_words(quora_test['question_text_paragram'], replace_word)
print("replace_words : Done")

<h2>Tensorflow Dependencies</h2>

In [None]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.backend import clear_session, maximum
from tensorflow.keras.callbacks import EarlyStopping 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras import Model, initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense, Input, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding
from tensorflow.keras.layers import Concatenate, LSTM, Activation, GRU, Reshape, Lambda, Multiply, Bidirectional, Maximum
from tensorflow.keras.preprocessing.text import Tokenizer

<h2>Splitting of data</h2>

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label column from the remaining columns.
y = quora_train['target']
X = quora_train.drop(columns = ['target'])

# Stratified 80/20 train/validation split. random_state is fixed so that a
# "Restart & Run All" yields the same split (and comparable scores) every time.
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)
print("The shape of train,cv & test dataset before conversion into vector")
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(quora_test.shape)

<h2>Tokenization and Sequence Padding</h2>

In [None]:
#define values
# Sequence length handed to `TokenizationPadding` (used as `maxlen` for padding) and to the model input.
maxlength = 75
# Width of one embedding vector; used as the column count of the embedding matrix.
embedding_dim = 300

In [None]:
def TokenizationPadding(data, maxlen): #pass data in list [X_train[""],X_cv[""],X_test[""]]
    """Fit a Keras Tokenizer on the first text collection and encode all of them.

    parameter : data = list of text collections; element 0 is the one the
                       tokenizer is fitted on,
                maxlen = length passed to pad_sequences (padding='post')
    return : (list of padded sequence arrays in the same order as `data`,
              vocab_size = len(word_index) + 1,
              the fitted Tokenizer)
    """
    # NOTE(review): Tokenizer() is created with its defaults - confirm that the
    # default filtering/lower-casing does not discard the punctuation tokens
    # that the preprocessing above isolates on purpose.
    tokens = Tokenizer()
    tokens.fit_on_texts(data[0])

    #vocab_size
    vocab_size = len(tokens.word_index) + 1

    #tokenizing and padding every collection with the same fitted tokenizer
    encoded = [tokens.texts_to_sequences(texts) for texts in data]
    seq_padding = [pad_sequences(seq, maxlen=maxlen, padding='post') for seq in encoded]

    return seq_padding, vocab_size, tokens

In [None]:
# Order matters: element 0 (the training split) is what `TokenizationPadding` fits the tokenizer on.
data = [X_train["question_text_paragram"], X_cv["question_text_paragram"], quora_test["question_text_paragram"]]

In [None]:
# Encode and pad all three collections; results come back in the same order as `data`.
seq_padding, vocab_size, tokenizer = TokenizationPadding(data, maxlength)
Xtrain, Xcv, Xtest = seq_padding[0], seq_padding[1], seq_padding[2]

<h2>Embedding Matrix</h2>

In [None]:
def embedding_matrix(vocab_size,model,dim,tokenizer):
    """Build the weight matrix for an Embedding layer from a word -> vector lookup.

    parameter : vocab_size = number of rows (tokenizer vocabulary size + 1),
                model = embedding lookup supporting ``word in model`` and
                        ``model[word]`` (a plain dict works),
                dim = length of one embedding vector,
                tokenizer = fitted tokenizer exposing `word_index`
                            (word -> row number)
    return : numpy array of shape (vocab_size, dim); rows of words missing
             from `model` stay all-zero
    """
    emb_matrix = np.zeros((vocab_size,dim))
    for word, row in tokenizer.word_index.items():
        # Direct membership test: no copy of every key into a set, and no
        # dependency on a `.keys()` method the lookup object may not have.
        if word in model:
            emb_matrix[row] = model[word]

    print('The shape of emdedding matrix is: ',emb_matrix.shape)
    return emb_matrix

In [None]:
# Weight matrix for the model's Embedding layer, filled from the Paragram vectors.
embedding_matrix_paragram = embedding_matrix(vocab_size, paragramModel, embedding_dim, tokenizer)

<h2>Converting output to categorical data</h2>

In [None]:
from tensorflow.keras.utils import to_categorical
# Convert the target of each split with to_categorical using 2 classes (one column per class).
ytrain = to_categorical(y_train, 2)
ycv = to_categorical(y_cv, 2)

<h3>Callbacks</h3>

In [None]:
#Callback function
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, f1_score
from tensorflow.keras.callbacks import Callback

class accuracy_value(Callback):
    """Keras callback that prints the validation F1 score after every epoch.

    parameter : training_data = (X, y) pair of the training split (stored only),
                validation_data = (X, y) pair the F1 score is computed on
    Collected per-epoch scores are kept in `self.f1_scores`.
    """

    def __init__(self,training_data,validation_data):
        # Let the Keras base class set up its own state before adding ours.
        super().__init__()
        self.X_train = training_data[0]
        self.y_train = training_data[1]
        self.X_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_train_begin(self, logs=None):
        # Fresh history for every fit() call. `logs` defaults to None instead of
        # a shared mutable {}; it is not used here.
        self.f1_scores = []
        self.precisions = []   # reserved; nothing is appended to it in this class
        self.recalls = []      # reserved; nothing is appended to it in this class

    def on_epoch_end(self, epoch, logs=None):
        #F1 Score
        # Predictions are rounded to 0/1 before scoring; average=None gives one
        # score per output column.
        y_predicted = np.asarray(self.model.predict(self.X_val)).round()
        f1_val = f1_score(self.y_val,y_predicted,average=None)
        self.f1_scores.append(f1_val)

        print(" - f1 score : {}".format(np.round(f1_val,4)))

# Callback instance passed to model.fit; it scores the padded validation split (Xcv, ycv) each epoch.
f1Score = accuracy_value(training_data=(Xtrain, ytrain), validation_data=(Xcv, ycv))

In [None]:
# Early stopping on validation loss.
# NOTE(review): patience=0 presumably ends training at the first epoch whose val_loss
# does not improve - confirm that such an aggressive setting is intended.
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto')

<h2>LSTM with Bahdanau Attention Model</h2>

In [None]:
class Attention(tf.keras.layers.Layer):
    '''
    Layer that calculates attention scores with the Bahdanau (additive/concat)
    mechanism and returns the weighted sum of the LSTM outputs.

    att_units : width of the two Dense projections used inside the score function
    **kwargs : forwarded to tf.keras.layers.Layer (e.g. name=...)
    '''
    def __init__(self, att_units, **kwargs):
        # Forward standard layer arguments so the layer can be named/configured
        # like any built-in layer.
        super(Attention, self).__init__(**kwargs)
        #number of attention units to be provided
        self.att_units = att_units
        
        # Initialize variables needed for Concat score function here
        self.W1=tf.keras.layers.Dense(att_units)
        self.W2=tf.keras.layers.Dense(att_units)
        self.V=tf.keras.layers.Dense(1)

  
    def call(self,lstm_output, hidden_state):
        
        """
        hidden state shape == (batch_size, hidden size)
        lstm output shape == (batch_size, max_length, hidden size)

        returns (context_vector, attention_weights)
        """
        
        #state_with_time_axis shape == (batch_size, 1, hidden size)
        #we are doing this to broadcast addition along the time axis to calculate the score
        state_with_time_axis = tf.expand_dims(hidden_state, 1)
        
        #score shape == (batch_size, max_length, 1)
        #we get 1 at the last axis because we are applying score to self.V
        #the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.W1(state_with_time_axis) + self.W2(lstm_output)))

        # attention_weights shape == (batch_size, max_length, 1)
        # softmax over axis 1 (the time axis) so the weights of one sequence sum to 1
        attention_weights=tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights*lstm_output
        context_vector = tf.reduce_sum(context_vector, axis=1)  

        return context_vector, attention_weights

    def get_config(self):
        # Expose the constructor argument so a model containing this layer can
        # be saved and re-created from its config.
        config = super(Attention, self).get_config()
        config.update({"att_units": self.att_units})
        return config

In [None]:
clear_session()
#input
inputs = Input(shape=(maxlength,), dtype='int32', name='Input_Text')
#embedding layer: frozen, initialised from the pre-built matrix.
#The width comes from embedding_dim (it was a hard-coded 300) so it cannot drift
#away from the matrix built with that same constant.
Embedding_Layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix_paragram], input_length=maxlength, trainable=False)(inputs)
#bidirectional lstm with cell and hidden state 
lstm_output, fw_state_h, fw_state_c, bw_state_h, bw_state_c = Bidirectional(LSTM(64, return_sequences=True, return_state=True))(Embedding_Layer)
#forward and backward hidden states joined into one query vector for the attention layer
state_h = Concatenate()([fw_state_h, bw_state_h])
#getting context vector from attention model
context_vector, attention_weights = Attention(10)(lstm_output, state_h)
#dense layer
dense = Dense(64, activation='relu', name= "Dense_Layer")(context_vector)
#NOTE(review): two independent sigmoid outputs trained with binary_crossentropy on the
#two-column targets; the threshold search and test prediction below read column 1.
output = Dense(2, activation='sigmoid', name= "Output_Layer")(dense)
model = Model(inputs,output)

#compiling model
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"]) #compile the model
#fitting the model
model.fit(Xtrain, ytrain, batch_size=512, verbose=1, epochs=10,validation_data=(Xcv,ycv), shuffle=True, callbacks=[f1Score, earlyStopping])

In [None]:
# threshold -> F1 of output column 1 on the validation split
threshold = dict()
ypred = model.predict(Xcv, batch_size=512,verbose=1)
from sklearn import metrics
for thresh in np.arange(0.1, 0.501, 0.05):
    thresh = np.round(thresh, 2)
    # average=None returns one F1 per output column; index 1 is the second column.
    # Computed once per threshold (it used to be evaluated twice: print + store).
    f1_at_thresh = round(list(metrics.f1_score(ycv, (ypred>thresh).astype(int), average=None))[1],3)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))
    threshold[thresh] = f1_at_thresh

# threshold with the highest stored F1 (the first one wins on ties)
best_threshold = max(threshold, key=threshold.get)
print("\nThe best threshold is:",best_threshold)

#printing classification report using the best threshold
ypredicted = (ypred>best_threshold).astype(int)
print("Classification Report:\n",metrics.classification_report(ycv,ypredicted))

In [None]:
#predicting test data
#predicting test data
# Look the best validation threshold up once instead of once per test row.
test_cutoff = max(threshold, key=threshold.get)
# Kept under its own name so the validation predictions in `ypred` are not overwritten.
ypred_test = model.predict(Xtest, batch_size=512,verbose=1)
# Column 1 of the output compared against the cutoff, for all rows at once.
ypredict = (ypred_test[:, 1] > test_cutoff).astype(int)

#creating dataframe
df_test = pd.DataFrame({"qid":quora_test["qid"].values})
df_test['prediction'] = ypredict
print("Quora Test Output:\n",df_test['prediction'].value_counts())

In [None]:
# Write the qid/prediction table to submission.csv in the current directory, without the index column.
df_test.to_csv('submission.csv', index=False)