# Embedding + Bidirectional GRU

In [1]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


#### Reading data

In [2]:
dir_data = 'D:/Data_Master/Natural Language Processing/Project1/quora/'
train_df = pd.read_csv(dir_data+'train.csv')
dir_data = 'D:/Data_Master/Natural Language Processing/Project1/quora/'
test_df = pd.read_csv(dir_data+'test.csv')

#### Tokenizing the words for posterior embedding

In [3]:
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['target'], random_state=123)

## some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

#### Defining a basic model

In [4]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(GRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140160    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)  

#### Training

In [5]:
## Train the model 
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1044897 samples, validate on 261225 samples
Epoch 1/2


 104448/1044897 [=>............................] - ETA: 1:42:17 - loss: 0.6837 - acc: 0.83 - ETA: 1:11:34 - loss: 0.6676 - acc: 0.88 - ETA: 1:01:03 - loss: 0.6523 - acc: 0.90 - ETA: 54:57 - loss: 0.6360 - acc: 0.9180 - ETA: 51:59 - loss: 0.6200 - acc: 0.92 - ETA: 49:33 - loss: 0.6054 - acc: 0.92 - ETA: 48:16 - loss: 0.5883 - acc: 0.92 - ETA: 46:53 - loss: 0.5704 - acc: 0.92 - ETA: 46:03 - loss: 0.5541 - acc: 0.92 - ETA: 45:38 - loss: 0.5363 - acc: 0.92 - ETA: 44:52 - loss: 0.5179 - acc: 0.93 - ETA: 44:25 - loss: 0.5021 - acc: 0.93 - ETA: 43:52 - loss: 0.4866 - acc: 0.93 - ETA: 43:33 - loss: 0.4697 - acc: 0.93 - ETA: 43:06 - loss: 0.4560 - acc: 0.93 - ETA: 42:56 - loss: 0.4447 - acc: 0.93 - ETA: 42:44 - loss: 0.4339 - acc: 0.93 - ETA: 42:37 - loss: 0.4220 - acc: 0.93 - ETA: 42:23 - loss: 0.4112 - acc: 0.93 - ETA: 42:13 - loss: 0.4039 - acc: 0.93 - ETA: 42:05 - loss: 0.3919 - acc: 0.93 - ETA: 41:55 - loss: 0.3893 - acc: 0.93 - ETA: 41:49 - loss: 0.3837 - acc: 0.93 - ETA: 41:45 - loss: 0.

 208896/1044897 [====>.........................] - ETA: 38:48 - loss: 0.1838 - acc: 0.94 - ETA: 38:48 - loss: 0.1836 - acc: 0.94 - ETA: 38:47 - loss: 0.1833 - acc: 0.94 - ETA: 38:46 - loss: 0.1830 - acc: 0.94 - ETA: 38:45 - loss: 0.1826 - acc: 0.94 - ETA: 38:43 - loss: 0.1822 - acc: 0.94 - ETA: 38:42 - loss: 0.1819 - acc: 0.94 - ETA: 38:41 - loss: 0.1818 - acc: 0.94 - ETA: 38:40 - loss: 0.1813 - acc: 0.94 - ETA: 38:39 - loss: 0.1811 - acc: 0.94 - ETA: 38:38 - loss: 0.1809 - acc: 0.94 - ETA: 38:37 - loss: 0.1807 - acc: 0.94 - ETA: 38:36 - loss: 0.1803 - acc: 0.94 - ETA: 38:35 - loss: 0.1800 - acc: 0.94 - ETA: 38:33 - loss: 0.1797 - acc: 0.94 - ETA: 38:32 - loss: 0.1795 - acc: 0.94 - ETA: 38:31 - loss: 0.1790 - acc: 0.94 - ETA: 38:30 - loss: 0.1786 - acc: 0.94 - ETA: 38:29 - loss: 0.1787 - acc: 0.94 - ETA: 38:28 - loss: 0.1785 - acc: 0.94 - ETA: 38:28 - loss: 0.1782 - acc: 0.94 - ETA: 38:27 - loss: 0.1779 - acc: 0.94 - ETA: 38:30 - loss: 0.1777 - acc: 0.94 - ETA: 38:29 - loss: 0.1774 - a

















Epoch 2/2


 104448/1044897 [=>............................] - ETA: 45:03 - loss: 0.0671 - acc: 0.97 - ETA: 43:26 - loss: 0.0759 - acc: 0.97 - ETA: 43:37 - loss: 0.0884 - acc: 0.96 - ETA: 43:09 - loss: 0.0875 - acc: 0.96 - ETA: 43:15 - loss: 0.0912 - acc: 0.96 - ETA: 43:06 - loss: 0.0913 - acc: 0.96 - ETA: 43:01 - loss: 0.0952 - acc: 0.96 - ETA: 43:16 - loss: 0.1001 - acc: 0.96 - ETA: 43:24 - loss: 0.0987 - acc: 0.96 - ETA: 43:23 - loss: 0.0982 - acc: 0.96 - ETA: 43:21 - loss: 0.0986 - acc: 0.96 - ETA: 43:22 - loss: 0.0984 - acc: 0.96 - ETA: 43:18 - loss: 0.0990 - acc: 0.96 - ETA: 43:21 - loss: 0.0990 - acc: 0.96 - ETA: 43:19 - loss: 0.1003 - acc: 0.96 - ETA: 43:21 - loss: 0.0989 - acc: 0.96 - ETA: 43:18 - loss: 0.0985 - acc: 0.96 - ETA: 43:19 - loss: 0.0982 - acc: 0.96 - ETA: 43:23 - loss: 0.0991 - acc: 0.96 - ETA: 43:22 - loss: 0.0971 - acc: 0.96 - ETA: 43:25 - loss: 0.0970 - acc: 0.96 - ETA: 43:25 - loss: 0.0971 - acc: 0.96 - ETA: 43:26 - loss: 0.0983 - acc: 0.96 - ETA: 43:24 - loss: 0.0985 - a

 208896/1044897 [====>.........................] - ETA: 39:25 - loss: 0.0951 - acc: 0.96 - ETA: 39:24 - loss: 0.0950 - acc: 0.96 - ETA: 39:23 - loss: 0.0952 - acc: 0.96 - ETA: 39:21 - loss: 0.0951 - acc: 0.96 - ETA: 39:20 - loss: 0.0953 - acc: 0.96 - ETA: 39:18 - loss: 0.0953 - acc: 0.96 - ETA: 39:17 - loss: 0.0954 - acc: 0.96 - ETA: 39:15 - loss: 0.0955 - acc: 0.96 - ETA: 39:14 - loss: 0.0955 - acc: 0.96 - ETA: 39:12 - loss: 0.0956 - acc: 0.96 - ETA: 39:11 - loss: 0.0956 - acc: 0.96 - ETA: 39:10 - loss: 0.0957 - acc: 0.96 - ETA: 39:09 - loss: 0.0958 - acc: 0.96 - ETA: 39:08 - loss: 0.0959 - acc: 0.96 - ETA: 39:07 - loss: 0.0958 - acc: 0.96 - ETA: 39:06 - loss: 0.0959 - acc: 0.96 - ETA: 39:05 - loss: 0.0958 - acc: 0.96 - ETA: 39:04 - loss: 0.0959 - acc: 0.96 - ETA: 39:02 - loss: 0.0959 - acc: 0.96 - ETA: 39:01 - loss: 0.0958 - acc: 0.96 - ETA: 39:00 - loss: 0.0958 - acc: 0.96 - ETA: 38:59 - loss: 0.0959 - acc: 0.96 - ETA: 38:57 - loss: 0.0959 - acc: 0.96 - ETA: 38:55 - loss: 0.0959 - a

















<keras.callbacks.History at 0x1b6b5205ba8>

#### Fscore for validation with different thresholds

In [7]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

F1 score at threshold 0.1 is 0.590161102232972
F1 score at threshold 0.11 is 0.5974129353233831
F1 score at threshold 0.12 is 0.6045291043594326
F1 score at threshold 0.13 is 0.6102077999953874
F1 score at threshold 0.14 is 0.6157608823460244
F1 score at threshold 0.15 is 0.621109672469189
F1 score at threshold 0.16 is 0.6256590509666081
F1 score at threshold 0.17 is 0.6286652242035365
F1 score at threshold 0.18 is 0.6325520307132755
F1 score at threshold 0.19 is 0.6356195314705053
F1 score at threshold 0.2 is 0.6375852909005678
F1 score at threshold 0.21 is 0.6398476553201619
F1 score at threshold 0.22 is 0.6420614682592941
F1 score at threshold 0.23 is 0.6437807965589524
F1 score at threshold 0.24 is 0.6450027609055771
F1 score at threshold 0.25 is 0.6457371498922746
F1 score at threshold 0.26 is 0.6458203280177062
F1 score at threshold 0.27 is 0.6471180106367688
F1 score at threshold 0.28 is 0.6469594594594595
F1 score at threshold 0.29 is 0.6469338369828247
F1 score at threshold 0.

In [23]:
threshold = 0.27
print("F1 score at threshold {0} is {1}".format(threshold, metrics.f1_score(val_y, (pred_noemb_val_y>threshold).astype(int))))

F1 score at threshold 0.27 is 0.6471180106367688
