# Embedding + Bidirectional GRU

In [2]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


#### Reading data

In [3]:
# Directory holding the Quora CSV files. The original notebook hard-coded this
# absolute path (and assigned it twice); it is now assigned once and can be
# overridden through the QUORA_DATA_DIR environment variable so the notebook
# runs on other machines. The default is the original location.
dir_data = os.environ.get(
    'QUORA_DATA_DIR',
    'D:/Data_Master/Natural Language Processing/Project1/quora/',
)
# os.path.join works whether or not dir_data ends with a path separator.
train_df = pd.read_csv(os.path.join(dir_data, 'train.csv'))
test_df = pd.read_csv(os.path.join(dir_data, 'test.csv'))

#### Preprocessing

Contractions

In [4]:
# Token -> replacement lookup table passed as `mapping` to clean_contractions.
# Keys are English contractions (e.g. "can't"), British spellings (e.g. 'colour')
# and misspellings / fused words (e.g. 'howdo'); values are the replacement text.
# Lookups are exact and case-sensitive, which is why both "I'd" and "i'd" appear.
# NOTE(review): the key 'youtu ' ends with a space; clean_contractions looks up
# tokens produced by text.split(" "), which never contain a space, so this entry
# can never match as written -- confirm the intent.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def clean_contractions(text, mapping):
    """Normalise apostrophes in `text`, then replace tokens found in `mapping`.

    Parameters
    ----------
    text : str
        The sentence to clean.
    mapping : dict
        Exact (case-sensitive) token -> replacement lookup.

    Returns
    -------
    str
        `text` with the typographic apostrophe variants turned into a plain
        "'" and every single-space-delimited token that is a key of `mapping`
        replaced by its value. Tokens are re-joined with single spaces, so
        runs of spaces in the input are preserved.
    """
    # Map every apostrophe look-alike to the ASCII apostrophe in one pass.
    apostrophe_table = str.maketrans({"’": "'", "‘": "'", "´": "'", "`": "'"})
    normalised = text.translate(apostrophe_table)
    # Splitting on a literal " " (not on arbitrary whitespace) keeps empty
    # tokens, so the original spacing survives the join.
    return ' '.join(mapping.get(token, token) for token in normalised.split(" "))

# Display the number of entries in the lookup table as the cell output.
len(contraction_mapping)

162

In [5]:
# Expand contractions / fix known misspellings in both frames. This runs before
# lowercasing, so the case-sensitive keys of contraction_mapping still match.
train_df['question_text'] = train_df['question_text'].apply(clean_contractions, args=(contraction_mapping,))
test_df['question_text'] = test_df['question_text'].apply(clean_contractions, args=(contraction_mapping,))

Lowercasing

In [6]:
# Lowercase the question text of each frame.
# Bug fix: the test column was previously rebuilt from train_df['question_text'],
# which overwrote every test question with the index-aligned training question.
# Each frame is now lowercased from its own column.
train_df['question_text'] = train_df['question_text'].str.lower()
test_df['question_text'] = test_df['question_text'].str.lower()

Punctuation

In [7]:
# Despite its name, punct_mapping is a plain string: each character in it is
# padded with spaces by clean_special_chars. Characters may appear more than
# once and the string contains space characters; this is preserved as-is.
# Fix: the original literal contained '\×', an invalid escape sequence that
# Python keeps as a backslash followed by '×' but warns about; it is now
# written '\\×', which yields the same runtime string.
punct_mapping = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
punct_mapping += '©^®` <→°€™› ♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√'
# Character -> replacement table applied by clean_special_chars before padding.
# Fix: the key '“' was listed twice with the same value; the duplicate is removed
# (the resulting dict is unchanged).
# NOTE(review): "£" maps to "e", the same as "€" -- looks like a copy-paste; confirm.
puncts = {"‘": "'", "´": "'", "°": "", "€": "e", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '…': ' '}

def clean_special_chars(text, punct, mapping):
    """Replace special characters, then surround punctuation with spaces.

    Parameters
    ----------
    text : str
        The sentence to clean.
    punct : iterable of str
        Characters to pad; each occurrence becomes ``' <char> '``. The
        characters are processed in order, one full pass per item, so an item
        that is listed twice is padded twice.
    mapping : dict
        Substring -> replacement pairs, applied sequentially in the dict's
        iteration order before any padding happens.

    Returns
    -------
    str
        The cleaned text.
    """
    cleaned = text
    for special, replacement in mapping.items():
        cleaned = cleaned.replace(special, replacement)

    for symbol in punct:
        cleaned = cleaned.replace(symbol, ' ' + symbol + ' ')

    return cleaned

In [8]:
# Replace the special characters listed in `puncts` and pad every character of
# `punct_mapping` with spaces, for both frames.
train_df['question_text'] = train_df['question_text'].apply(clean_special_chars, args=(punct_mapping, puncts))
test_df['question_text'] = test_df['question_text'].apply(clean_special_chars, args=(punct_mapping, puncts))

#### Tokenizing the words for the subsequent embedding layer

In [9]:
# Hold out 20% of the labelled data for validation, stratified on the target.
# Note: this rebinds train_df to the training split only.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['target'],
    random_state=123,
)

embed_size = 300      # embedding dimension (used by the Embedding layer)
max_features = 50000  # passed as num_words to the Tokenizer and as the Embedding input size
maxlen = 100          # every sequence is padded / truncated to this many tokens


def _question_texts(frame):
    """Return the question column as an array, with missing values filled by "_na_"."""
    return frame["question_text"].fillna("_na_").values


## Tokenize the sentences: the vocabulary is fitted on the training split only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(_question_texts(train_df)))


def _encode(frame):
    """Convert a frame's questions to padded integer sequences of length maxlen."""
    sequences = tokenizer.texts_to_sequences(_question_texts(frame))
    return pad_sequences(sequences, maxlen=maxlen)


train_X, val_X, test_X = (_encode(frame) for frame in (train_df, val_df, test_df))

## Target
train_y = train_df['target'].values
val_y = val_df['target'].values

## Shuffling data: features and labels are permuted with the same indices.
## The training permutation is drawn before the validation one.
np.random.seed(42)

trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))

train_X, train_y = train_X[trn_idx], train_y[trn_idx]
val_X, val_y = val_X[val_idx], val_y[val_idx]

#### Defining a basic model

In [10]:
# Network: token ids -> embedding -> bidirectional GRU -> max over time
# -> small dense layer -> dropout -> sigmoid output.
inp = Input(shape=(maxlen,))
# No pretrained weights are passed to the Embedding layer here.
x = Embedding(max_features, embed_size)(inp)
# return_sequences=True keeps one output per time step so it can be max-pooled.
x = Bidirectional(GRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fix: summary() prints the table itself and returns None, so wrapping it in
# print() emitted a stray "None" after the table.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140160    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)  

#### Training

In [11]:
## Fit for 2 epochs in batches of 512, evaluating on the held-out validation
## split after each epoch.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1044897 samples, validate on 261225 samples
Epoch 1/2


 104448/1044897 [=>............................] - ETA: 1:35:40 - loss: 0.6997 - acc: 0.29 - ETA: 1:07:16 - loss: 0.6862 - acc: 0.57 - ETA: 58:12 - loss: 0.6741 - acc: 0.6986 - ETA: 52:43 - loss: 0.6622 - acc: 0.76 - ETA: 50:04 - loss: 0.6520 - acc: 0.79 - ETA: 47:49 - loss: 0.6409 - acc: 0.81 - ETA: 46:31 - loss: 0.6299 - acc: 0.83 - ETA: 45:33 - loss: 0.6188 - acc: 0.84 - ETA: 44:33 - loss: 0.6078 - acc: 0.85 - ETA: 43:53 - loss: 0.5953 - acc: 0.86 - ETA: 43:16 - loss: 0.5831 - acc: 0.87 - ETA: 42:55 - loss: 0.5701 - acc: 0.87 - ETA: 42:35 - loss: 0.5579 - acc: 0.88 - ETA: 42:19 - loss: 0.5454 - acc: 0.88 - ETA: 42:02 - loss: 0.5314 - acc: 0.88 - ETA: 41:53 - loss: 0.5180 - acc: 0.89 - ETA: 41:41 - loss: 0.5048 - acc: 0.89 - ETA: 41:37 - loss: 0.4925 - acc: 0.89 - ETA: 41:33 - loss: 0.4793 - acc: 0.89 - ETA: 41:26 - loss: 0.4676 - acc: 0.90 - ETA: 41:24 - loss: 0.4563 - acc: 0.90 - ETA: 41:13 - loss: 0.4470 - acc: 0.90 - ETA: 41:06 - loss: 0.4378 - acc: 0.90 - ETA: 41:01 - loss: 0.42

 208896/1044897 [====>.........................] - ETA: 38:50 - loss: 0.2039 - acc: 0.93 - ETA: 38:49 - loss: 0.2035 - acc: 0.93 - ETA: 38:47 - loss: 0.2031 - acc: 0.93 - ETA: 38:46 - loss: 0.2028 - acc: 0.93 - ETA: 38:44 - loss: 0.2025 - acc: 0.93 - ETA: 38:43 - loss: 0.2022 - acc: 0.93 - ETA: 38:42 - loss: 0.2018 - acc: 0.93 - ETA: 38:41 - loss: 0.2014 - acc: 0.93 - ETA: 38:40 - loss: 0.2010 - acc: 0.93 - ETA: 38:39 - loss: 0.2008 - acc: 0.93 - ETA: 38:38 - loss: 0.2005 - acc: 0.93 - ETA: 38:37 - loss: 0.2001 - acc: 0.93 - ETA: 38:36 - loss: 0.1998 - acc: 0.93 - ETA: 38:35 - loss: 0.1996 - acc: 0.93 - ETA: 38:33 - loss: 0.1993 - acc: 0.93 - ETA: 38:32 - loss: 0.1990 - acc: 0.93 - ETA: 38:31 - loss: 0.1987 - acc: 0.93 - ETA: 38:30 - loss: 0.1985 - acc: 0.93 - ETA: 38:29 - loss: 0.1981 - acc: 0.93 - ETA: 38:28 - loss: 0.1977 - acc: 0.93 - ETA: 38:27 - loss: 0.1974 - acc: 0.93 - ETA: 38:26 - loss: 0.1970 - acc: 0.93 - ETA: 38:25 - loss: 0.1967 - acc: 0.93 - ETA: 38:24 - loss: 0.1965 - a

















Epoch 2/2


 104448/1044897 [=>............................] - ETA: 49:36 - loss: 0.1228 - acc: 0.95 - ETA: 46:36 - loss: 0.1150 - acc: 0.95 - ETA: 46:10 - loss: 0.1173 - acc: 0.94 - ETA: 45:32 - loss: 0.1090 - acc: 0.95 - ETA: 45:23 - loss: 0.1073 - acc: 0.95 - ETA: 45:02 - loss: 0.1014 - acc: 0.95 - ETA: 44:50 - loss: 0.0966 - acc: 0.96 - ETA: 44:49 - loss: 0.0967 - acc: 0.96 - ETA: 44:32 - loss: 0.0968 - acc: 0.95 - ETA: 44:20 - loss: 0.0975 - acc: 0.95 - ETA: 44:13 - loss: 0.1002 - acc: 0.95 - ETA: 44:12 - loss: 0.0982 - acc: 0.95 - ETA: 44:14 - loss: 0.0982 - acc: 0.96 - ETA: 44:16 - loss: 0.0978 - acc: 0.96 - ETA: 44:13 - loss: 0.0983 - acc: 0.96 - ETA: 44:12 - loss: 0.0981 - acc: 0.96 - ETA: 44:05 - loss: 0.0970 - acc: 0.96 - ETA: 44:00 - loss: 0.0965 - acc: 0.96 - ETA: 44:00 - loss: 0.0972 - acc: 0.96 - ETA: 43:55 - loss: 0.0975 - acc: 0.96 - ETA: 43:50 - loss: 0.0967 - acc: 0.96 - ETA: 43:45 - loss: 0.0969 - acc: 0.96 - ETA: 43:42 - loss: 0.0968 - acc: 0.96 - ETA: 43:42 - loss: 0.0964 - a

 208896/1044897 [====>.........................] - ETA: 39:57 - loss: 0.0981 - acc: 0.96 - ETA: 39:56 - loss: 0.0982 - acc: 0.96 - ETA: 39:54 - loss: 0.0982 - acc: 0.96 - ETA: 39:53 - loss: 0.0981 - acc: 0.96 - ETA: 39:53 - loss: 0.0981 - acc: 0.96 - ETA: 39:51 - loss: 0.0980 - acc: 0.96 - ETA: 39:50 - loss: 0.0978 - acc: 0.96 - ETA: 39:48 - loss: 0.0979 - acc: 0.96 - ETA: 39:47 - loss: 0.0979 - acc: 0.96 - ETA: 39:45 - loss: 0.0977 - acc: 0.96 - ETA: 39:43 - loss: 0.0976 - acc: 0.96 - ETA: 39:42 - loss: 0.0976 - acc: 0.96 - ETA: 39:41 - loss: 0.0976 - acc: 0.96 - ETA: 39:39 - loss: 0.0977 - acc: 0.96 - ETA: 39:38 - loss: 0.0976 - acc: 0.96 - ETA: 39:36 - loss: 0.0976 - acc: 0.96 - ETA: 39:36 - loss: 0.0975 - acc: 0.96 - ETA: 39:35 - loss: 0.0975 - acc: 0.96 - ETA: 39:33 - loss: 0.0974 - acc: 0.96 - ETA: 39:31 - loss: 0.0975 - acc: 0.96 - ETA: 39:30 - loss: 0.0975 - acc: 0.96 - ETA: 39:28 - loss: 0.0976 - acc: 0.96 - ETA: 39:27 - loss: 0.0977 - acc: 0.96 - ETA: 39:26 - loss: 0.0978 - a

















<keras.callbacks.History at 0x1fb03ae8c50>

In [12]:
# Write the model's weights to an HDF5 file in the current working directory.
model.save_weights("model_Embeddings+GRU.h5")

#### Validation F1 score at different thresholds

In [13]:
# Sigmoid outputs of the model for the validation split; these are thresholded
# in the next cell to choose a decision threshold.
pred_val_y = model.predict([val_X], batch_size=1024, verbose=1)



In [14]:
# Scan decision thresholds 0.10, 0.11, ..., 0.50 and keep the one with the best
# validation F1 (the first such threshold on ties).
# Fix: f1_final used to start at 0, so `threshold` was never assigned (NameError
# below) when no candidate reached F1 > 0. Starting below any attainable F1
# guarantees the first candidate initialises `threshold`.
f1_final = -1.0
for thresh in np.arange(0.1, 0.501, 0.01):
    # Round away floating-point noise from arange (e.g. 0.34 rather than 0.34000000000000002).
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    if f1 > f1_final:
        f1_final = f1
        threshold = thresh
# Reuse the stored best score instead of recomputing it for the message.
print("F1 score at threshold {0} is {1}".format(threshold, f1_final))

F1 score at threshold 0.34 is 0.6480105190944431


# Predicting on the test set

In [None]:
# Score the test questions, then binarise the scores with the threshold chosen
# on the validation split (1 where the score is strictly above it, else 0).
test_scores = model.predict([test_X], batch_size=1024, verbose=1)
pred_test_y = (test_scores > threshold).astype(int)

In [None]:
# Build the submission table: one row per test question id with its 0/1 prediction.
submission_df = pd.DataFrame({"qid":test_df["qid"].values})
# pred_test_y is the thresholded model output; presumably shape (n_test, 1)
# since the final layer is Dense(1) -- TODO confirm.
submission_df['prediction'] = pred_test_y
# index=False: write only the columns, without the row index.
submission_df.to_csv("submission.csv", index=False)