#### Description
In this evaluation a Sequentail Bidirectional GRU model is being evaluated, compared to its non-Sequential pair (all_embeddings_with_non_sequential_BiGRU_model.pynb). Glove and FastText Embeddings were used.

All read file variables (like train.csv, embeddings files etc) are set to relative path, which means just drag the competition files in to the program's folder and it reads in them automatically. 
Furthermore, I turned off the support for AMD Radeon GPUs, turn it on if necessary (for quicker computing on Radeon machines).

In [1]:
# support for AMD Radeon GPU - if you run this on AMD Radeon GPU computer, then use it

# import plaidml.keras
# plaidml.keras.install_backend()
# import os
# os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"


# keras layers, tokenizer, model, sequential etc.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, SimpleRNN, RNN, LSTM, GRU, Embedding, Dropout, Activation, Flatten, Conv1D, Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras import initializers, regularizers, constraints, optimizers, layers
import tensorflow as tf
# time for idle the system after deleting models and embedding to test in one notebook
import time
# linear algebra
import numpy as np 
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd 
# monitor loading time where it is supported
from tqdm import tqdm
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import gc

random_seed = 63445
lsize = 128

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [21]:
# train_df = pd.read_csv("/Users/sneakysneak/Downloads/quora_dataset/train.csv")
# test_df = pd.read_csv("/Users/sneakysneak/Downloads/quora_dataset/test.csv")]
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:
## split the data frame to train and val
train_data_frame, value_data_frame = train_test_split(train_data_frame, test_size=0.1, random_state=2018)

## configuration values 
# 300 
embedding_size = 300 # the size of each word vector
# 50000
max_features = 50000 # the size of unique words in use - the number of rows in the embedding vector
# 100 
max_length_question = 100 # the size of the number of words in each question

## first fill all missing values up
train_X = train_data_frame["question_text"].fillna("_na_").values
val_X = value_data_frame["question_text"].fillna("_na_").values
test_X = test_data_frame["question_text"].fillna("_na_").values

## tokenize with Keras
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## sentence padding
train_X = pad_sequences(train_X, maxlen=max_length_question)
val_X = pad_sequences(val_X, maxlen=max_length_question)
test_X = pad_sequences(test_X, maxlen=max_length_question)

## save the target values as train_y and val_y
train_y = train_data_frame['target'].values
val_y = value_data_frame['target'].values

#### Glove embeddings
#####  In the for loop with the  help of the  tqdm library (a command line “display interface” which displays time and bar when something is loading). Values splits by space. Word set to 0 values, counting from 0. Coefs is a numpy array conversion of values variable with the float32 data type.

In [4]:
# Load embeddings glove text file
embeddings_index = {}
# glove_emb = open('/Users/sneakysneak/Downloads/quora_dataset/glove.840B.300d/glove.840B.300d.txt')
glove_emb = open('glove.840B.300d.txt')


for line in tqdm(glove_emb):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
glove_emb.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [04:29, 8159.25it/s] 

Found 2196016 word vectors.





##### In this function creating a numpy array filled with 300 zeros. Text variable’s length split to 30. Embeds variable iterates through on the empty_emb 300 zeros and adds them in 30 minus the number of embeds. So, it’s “filling” them up and returns the numpy array with that.

In [5]:
# Convert values to embeddings
def text_to_array(text):
    empyt_emb = np.zeros(300)
    text = text[:-1].split()[:30]
    embeds = [embeddings_index.get(x, empyt_emb) for x in text]
    embeds+= [empyt_emb] * (30 - len(embeds))
    return np.array(embeds)

##### In this new variable val_vects populates this new array with the np.array about text_to_array method from above, with the train_df dummy variable which is the train.csv. Applying tqdm loading screen and the “qhestom text” tab ln the csv file and iterates through on 3000 at once. Val_y creates an array from the “target” tab of the csv, 3000 at once.

In [6]:
val_vects = np.array([text_to_array(train_data_frame) for train_data_frame in tqdm(value_data_frame["question_text"][:3000])])
val_y = np.array(value_data_frame["target"][:3000])

100%|██████████| 3000/3000 [00:00<00:00, 3586.68it/s]


##### Define batch_size variable. Define batch_gen fucntion with the train_df variable, which is the train.csv file. This function creates batches from the train.csv file.

In [7]:
batch_size = 128

def batch_gen(train_data_frame):
    n_batches = math.ceil(len(train_data_frame) / batch_size)
    while True: 
        train_data_frame = train_data_frame.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            texts = train_data_frame.iloc[i*batch_size:(i+1)*batch_size, 1]
            text_arr = np.array([text_to_array(text) for text in texts])
            yield text_arr, np.array(train_data_frame["target"][i*batch_size:(i+1)*batch_size])

#### Bidirectional GRU model

In [8]:
model = Sequential()
model.add(Bidirectional(GRU(64, return_sequences=True, recurrent_dropout=0.5),
                      input_shape=(30,300))) 
model.add(Bidirectional(GRU(64)))
model.add(Dense(8))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

INFO:plaidml:Opening device "metal_amd_radeon_pro_560.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 30, 128)           140160    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               74112     
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 1032      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 215,313
Trainable params: 215,313
Non-trainable params: 0
_________________________________________________________________


##### The mg is the variable instantiated from the batch_gen function above, the dummy variable in it is the train_df, which is the train.csv dataset. 
##### The .fit_generator inbuilt keras function, which trains the model on the dataset batch-by-batch. The mg comes first, after the number of epochs, and the step_per_epoch, which is the total number of steps in batches of samples before declaring one epoch is finished and between the next epoch starting.
##### The validation_data is an immutable sequence , a tuple, val_vects, val_y, so the vector texts and the target values, 0 or 1.

In [9]:
mg = batch_gen(train_data_frame)
model.fit_generator(mg, epochs=1,
                    steps_per_epoch=1000,
                    validation_data=(val_vects, val_y),
                    verbose=True)

Epoch 1/1


INFO:plaidml:Analyzing Ops: 3263 of 9555 operations complete
INFO:plaidml:Analyzing Ops: 7601 of 9555 operations complete




INFO:plaidml:Analyzing Ops: 3161 of 3904 operations complete




<keras.callbacks.History at 0x1a44008f28>

##### Create a dictionary variable as thresholds. Using the inbuilt keras .predict function with val_vects, defined batch size and verbose is 1. The for loop iterates through a numpy array trying to determine the best f1_score and its threshold value. In the last 3 lines, prints out the best threshold value, which is 0.33 in this case.

In [10]:
thresholds = []

pred_glove_val_y = model.predict([val_vects], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (pred_glove_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    
thresholds.sort(key=lambda x: x[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

INFO:plaidml:Analyzing Ops: 2469 of 3870 operations complete




INFO:plaidml:Analyzing Ops: 3355 of 3870 operations complete


F1 score at threshold 0.1 is 0.5119047619047619
F1 score at threshold 0.11 is 0.5198776758409785
F1 score at threshold 0.12 is 0.524031007751938
F1 score at threshold 0.13 is 0.5399361022364217
F1 score at threshold 0.14 is 0.5463414634146341
F1 score at threshold 0.15 is 0.5535420098846787
F1 score at threshold 0.16 is 0.5536912751677852
F1 score at threshold 0.17 is 0.5559322033898305
F1 score at threshold 0.18 is 0.5626072041166381
F1 score at threshold 0.19 is 0.5709281961471103
F1 score at threshold 0.2 is 0.5759717314487632
F1 score at threshold 0.21 is 0.5785714285714286
F1 score at threshold 0.22 is 0.5858951175406872
F1 score at threshold 0.23 is 0.5919117647058824
F1 score at threshold 0.24 is 0.5955056179775281
F1 score at threshold 0.25 is 0.6011342155009453
F1 score at threshold 0.26 is 0.6068702290076337
F1 score at threshold 0.27 is 0.6150870406189555
F1 score at threshold 0.28 is 0.6186770428015563
F1 score at threshold 0.29 is 0.615686274509804
F1 score at threshold 0.

##### In here, the model is deleted and gc is the garbage collector collects the leftover of the remaining model, and the program idle for 10 seconds.

In [11]:
del model, embeddings_index#, embedding_matrix, inp, all_embs
gc.collect()
time.sleep(10)

#### FastText

In [12]:
# Load embeddings glove text file
embeddings_index = {}
# fasttext_emb = open('/Users/sneakysneak/Downloads/quora_dataset/wiki-news-300d-1M/wiki-news-300d-1M.vec')
fasttext_emb = open('wiki-news-300d-1M.vec')


for line in tqdm(fasttext_emb):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
fasttext_emb.close()

print('Found %s word vectors.' % len(embeddings_index))

999995it [01:53, 8828.33it/s]

Found 999995 word vectors.





In [13]:
# Convert values to embeddings
def text_to_array(text):
    empyt_emb = np.zeros(300)
    text = text[:-1].split()[:30]
    embeds = [embeddings_index.get(x, empyt_emb) for x in text]
    embeds+= [empyt_emb] * (30 - len(embeds))
    return np.array(embeds)

In [14]:
val_vects = np.array([text_to_array(train_data_frame) for train_data_frame in tqdm(value_data_frame["question_text"][:3000])])
val_y = np.array(value_data_frame["target"][:3000])

100%|██████████| 3000/3000 [00:00<00:00, 9162.23it/s]


In [15]:
batch_size = 128

def batch_gen(train_data_frame):
    n_batches = math.ceil(len(train_data_frame) / batch_size)
    while True: 
        train_data_frame = train_data_frame.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            texts = train_data_frame.iloc[i*batch_size:(i+1)*batch_size, 1]
            text_arr = np.array([text_to_array(text) for text in texts])
            yield text_arr, np.array(train_data_frame["target"][i*batch_size:(i+1)*batch_size])

In [16]:
model = Sequential()
model.add(Bidirectional(GRU(64, return_sequences=True, recurrent_dropout=0.5),
                      input_shape=(30,300))) 
model.add(Bidirectional(GRU(64)))
model.add(Dense(8))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_3 (Bidirection (None, 30, 128)           140160    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               74112     
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 1032      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 215,313
Trainable params: 215,313
Non-trainable params: 0
_________________________________________________________________


In [17]:
mg = batch_gen(train_data_frame)
model.fit_generator(mg, epochs=1,
                    steps_per_epoch=1000,
                    validation_data=(val_vects, val_y),
                    verbose=True)

Epoch 1/1


<keras.callbacks.History at 0x1ab0135128>

In [18]:
thresholds = []

pred_fasttext_val_y = model.predict([val_vects], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (pred_fasttext_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    
thresholds.sort(key=lambda x: x[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.6156787762906311
F1 score at threshold 0.11 is 0.6186770428015563
F1 score at threshold 0.12 is 0.6239999999999999
F1 score at threshold 0.13 is 0.6262626262626263
F1 score at threshold 0.14 is 0.6335403726708075
F1 score at threshold 0.15 is 0.6276150627615062
F1 score at threshold 0.16 is 0.6367521367521368
F1 score at threshold 0.17 is 0.6336206896551724
F1 score at threshold 0.18 is 0.6373626373626374
F1 score at threshold 0.19 is 0.6410835214446954
F1 score at threshold 0.2 is 0.6376146788990826
F1 score at threshold 0.21 is 0.6448598130841121
F1 score at threshold 0.22 is 0.6365795724465558
F1 score at threshold 0.23 is 0.6426858513189448
F1 score at threshold 0.24 is 0.6390243902439023
F1 score at threshold 0.25 is 0.6403940886699507
F1 score at threshold 0.26 is 0.63681592039801
F1 score at threshold 0.27 is 0.6397984886649875
F1 score at threshold 0.28 is 0.6288659793814433
F1 score at threshold 0.29 is 0.6337662337662339
F1 score at threshold 0.

In [19]:
del model, embeddings_index#, embedding_matrix, inp, all_embs
gc.collect()
time.sleep(10)