# Embedding + Bidirectional GRU

In [None]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

#### Reading data

In [2]:
# Folder that holds train.csv and test.csv; adjust this path for your machine.
dir_data = 'D:/Data_Master/Natural Language Processing/Project1/quora/'

# Both CSV files live in the same folder, so the path is defined only once.
train_df = pd.read_csv(os.path.join(dir_data, 'train.csv'))
test_df = pd.read_csv(os.path.join(dir_data, 'test.csv'))

# Report the loaded shapes (these are the lines shown in the cell output).
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


#### Splitting the data and tokenizing the text for the embedding layer

In [3]:
## Hold out 20% of the labelled rows for validation, stratified on the target.
## NOTE: train_df is overwritten by its training portion, so re-running this
## cell without reloading the data would split the already-split frame again.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['target'],
    random_state=123,
)

## Configuration values
embed_size = 300      # dimensionality of each word vector
max_features = 50000  # vocabulary size cap (number of rows in the embedding matrix)
maxlen = 100          # number of tokens kept per question

## Fit the vocabulary on the training questions only; missing text becomes "_na_"
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_df["question_text"].fillna("_na_").values))

## Same pipeline for every split: fill missing text -> token ids -> pad to maxlen
train_X, val_X, test_X = (
    pad_sequences(
        tokenizer.texts_to_sequences(frame["question_text"].fillna("_na_").values),
        maxlen=maxlen,
    )
    for frame in (train_df, val_df, test_df)
)

## Labels for the two labelled splits
train_y, val_y = train_df['target'].values, val_df['target'].values

#### Defining a basic model

In [4]:
# Input: one padded sequence of `maxlen` token ids per question.
inp = Input(shape=(maxlen,))
# Embedding layer with max_features rows of embed_size dimensions; no
# pretrained weights are passed, so it is learned during training.
x = Embedding(max_features, embed_size)(inp)
# Bidirectional GRU: 64 units per direction, returning the full sequence
# so the pooling layer below can reduce over the time axis.
x = Bidirectional(GRU(64, return_sequences=True))(x)
# Max over time collapses the sequence to one vector per question.
x = GlobalMaxPool1D()(x)
# Small dense head with light dropout.
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: one probability per question for the binary target.
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it
# in print() would emit a stray "None" after the table.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140160    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)  

#### Training

In [6]:
## Fit for two passes over the training split, monitoring the held-out validation split
model.fit(
    x=train_X,
    y=train_y,
    batch_size=1024,
    epochs=2,
    validation_data=(val_X, val_y),
)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2
  39936/1044897 [>.............................] - ETA: 45:57 - loss: 0.1186 - acc: 0.96 - ETA: 44:13 - loss: 0.1412 - acc: 0.94 - ETA: 44:07 - loss: 0.1507 - acc: 0.94 - ETA: 43:55 - loss: 0.1527 - acc: 0.94 - ETA: 43:39 - loss: 0.1480 - acc: 0.94 - ETA: 43:33 - loss: 0.1465 - acc: 0.94 - ETA: 43:21 - loss: 0.1466 - acc: 0.94 - ETA: 43:28 - loss: 0.1432 - acc: 0.94 - ETA: 43:29 - loss: 0.1427 - acc: 0.94 - ETA: 43:42 - loss: 0.1435 - acc: 0.94 - ETA: 43:42 - loss: 0.1424 - acc: 0.94 - ETA: 43:46 - loss: 0.1410 - acc: 0.94 - ETA: 43:48 - loss: 0.1396 - acc: 0.94 - ETA: 43:46 - loss: 0.1398 - acc: 0.94 - ETA: 43:50 - loss: 0.1404 - acc: 0.94 - ETA: 43:48 - loss: 0.1415 - acc: 0.94 - ETA: 43:49 - loss: 0.1406 - acc: 0.94 - ETA: 43:44 - loss: 0.1389 - acc: 0.94 - ETA: 43:44 - loss: 0.1374 - acc: 0.94 - ETA: 43:46 - loss: 0.1363 - acc: 0.94 - ETA: 43:46 - loss: 0.1367 - acc: 0.94 - ETA: 43:49 - loss: 0.1359 - acc: 0.94 - ETA: 4

KeyboardInterrupt: 

#### F1 score on the validation set at different thresholds

In [7]:
## Model outputs for the validation questions
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

## Sweep the decision threshold from 0.10 to 0.50 in steps of 0.01.
## Rounding removes floating-point noise from arange so thresholds print cleanly.
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_noemb_val_y > thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))



KeyboardInterrupt: 