#### Description
In this comparison a Simple Recurrent Neural Network, an LSTM, a GRU and lastly a Bidirectional LSTM model are evaluated, all of them on GloVe embeddings.

All input paths (train.csv, the embeddings file, etc.) are relative, so just drop the competition files into the notebook's folder and they are read automatically.
Furthermore, I turned off the support for AMD Radeon GPUs; turn it on if necessary (for quicker computing on Radeon machines).

In [1]:
# support for AMD Radeon GPU - if you run this on AMD Radeon GPU computer, then use it

# import plaidml.keras
# plaidml.keras.install_backend()
# import os
# os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"


# keras layers, tokenizer, model, sequential etc.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, SimpleRNN, RNN, LSTM, GRU, Embedding, Dropout, Activation, Flatten, Conv1D, Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras import initializers, regularizers, constraints, optimizers, layers
import tensorflow as tf
# time for idle the system after deleting models and embedding to test in one notebook
import time
# linear algebra
import numpy as np 
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd 
# monitor loading time where it is supported
from tqdm import tqdm
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import gc

# Seed and layer-size constants.
# NOTE(review): neither name is referenced by the cells visible in this notebook
# (the train/validation split passes its own random_state and the layers use literal sizes) —
# confirm whether they are still needed.
random_seed = 63445
lsize = 128

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [26]:
# Read the competition files from the notebook's working directory (relative paths).
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv("test.csv")

# Report (rows, columns) for both frames.
for caption, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(caption, frame.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:
## split the data frame to train and val
# The split reads from train_df (the frame loaded from train.csv) instead of reassigning
# its own input, so this cell runs on a fresh kernel and can be re-run without
# splitting an already-split frame again.
train_data_frame, value_data_frame = train_test_split(train_df, test_size=0.1, random_state=2018)

## configuration values
embedding_size = 300 # the size of each word vector
max_features = 50000 # the size of unique words in use - the number of rows in the embedding vector
max_length_question = 100 # the size of the number of words in each question

## first fill all missing values up
train_X = train_data_frame["question_text"].fillna("_na_").values
val_X = value_data_frame["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

## tokenize with Keras (the vocabulary is fitted on the training split only)
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## sentence padding: every sequence is padded/truncated to max_length_question tokens
train_X = pad_sequences(train_X, maxlen=max_length_question)
val_X = pad_sequences(val_X, maxlen=max_length_question)
test_X = pad_sequences(test_X, maxlen=max_length_question)

## save the target values as train_y and val_y
train_y = train_data_frame['target'].values
val_y = value_data_frame['target'].values

##### The loop below is wrapped in the tqdm library (a command-line progress display that shows a bar and timing while something is loading). Each line is split on spaces: the word is the first field (index 0), and coefs is the remaining fields converted to a numpy array with the float32 data type.

In [4]:
# Load the GloVe text file into a {word: vector} dictionary.
embeddings_index = {}

# The context manager closes the file even if a line fails to parse, and the explicit
# encoding keeps the read independent of the platform's default codec
# (GloVe vectors are distributed as UTF-8 text).
with open('glove.840B.300d.txt', encoding='utf-8') as glove_emb:
    for line in tqdm(glove_emb):
        # Each line is "<word> <v1> <v2> ...", separated by single spaces.
        values = line.split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        # A word that appears on more than one line keeps its last vector.
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

2196017it [05:08, 7123.52it/s]

Found 2196016 word vectors.





##### This function creates a numpy array of 300 zeros that stands in for unknown words and padding. The text (minus its last character) is split into words and cut to the first 30. Each word is looked up in the embeddings, falling back to the zero vector, and the list is then padded with zero vectors (30 minus the number of embeddings) so the returned numpy array always has 30 rows.

In [5]:
# Convert values to embeddings
def text_to_array(text, max_len=30, embed_dim=300, embeddings=None):
    """Turn a question string into a fixed-size (max_len, embed_dim) float32 array.

    Parameters
    ----------
    text : str
        Question text. Its last character is dropped before splitting on whitespace.
    max_len : int, default 30
        Number of rows in the result; longer texts are truncated, shorter ones are
        padded with zero vectors at the end.
    embed_dim : int, default 300
        Length of each word vector (and of the zero vector used for unknown words).
    embeddings : mapping of str -> array, optional
        Word-vector lookup table. Defaults to the notebook-level ``embeddings_index``.

    Returns
    -------
    numpy.ndarray
        Array of shape (max_len, embed_dim) and dtype float32.
    """
    if embeddings is None:
        embeddings = embeddings_index
    # float32 zeros match the dtype of the loaded vectors, so the result dtype no longer
    # depends on whether a text happened to need padding or contain unknown words.
    empty_emb = np.zeros(embed_dim, dtype='float32')
    # NOTE(review): text[:-1] removes the final character whatever it is (presumably meant
    # to strip a trailing "?") — confirm this is intended for texts that do not end in "?".
    tokens = text[:-1].split()[:max_len]
    embeds = [embeddings.get(token, empty_emb) for token in tokens]
    embeds += [empty_emb] * (max_len - len(embeds))
    return np.array(embeds, dtype='float32')

##### The new variable val_vects is a numpy array built by applying the text_to_array function from above to the first 3000 entries of the "question_text" column of the validation data frame (the validation split of train.csv), with a tqdm progress bar. val_y is an array built from the first 3000 entries of the "target" column.


In [6]:
# Embed the first 3000 validation questions (tqdm shows progress) and take the matching labels.
val_questions = value_data_frame["question_text"][:3000]
val_vects = np.array([text_to_array(question) for question in tqdm(val_questions)])
val_y = np.array(value_data_frame["target"][:3000])

100%|██████████| 3000/3000 [00:00<00:00, 5564.62it/s]


##### Define the batch_size variable and the batch_gen function. The function takes the training data frame (the training split of train.csv) and yields shuffled batches from it.

In [7]:
batch_size = 128

def batch_gen(train_data_frame):
    """Yield (embedded_questions, targets) batches forever, reshuffling on every pass.

    Parameters
    ----------
    train_data_frame : pandas.DataFrame
        Must contain a "question_text" column and a "target" column.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        The embedded questions of one batch (one text_to_array result per row) and the
        matching target values. The last batch of a pass may hold fewer than
        ``batch_size`` rows.

    Raises
    ------
    ValueError
        If the frame is empty; without this check the loop would spin forever
        without ever yielding.
    """
    if len(train_data_frame) == 0:
        raise ValueError("batch_gen needs a non-empty data frame")
    n_batches = math.ceil(len(train_data_frame) / batch_size)
    while True:
        train_data_frame = train_data_frame.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            # One positional slice of rows; both columns are then picked by name so the
            # generator does not depend on the column order of the frame.
            batch = train_data_frame.iloc[i*batch_size:(i+1)*batch_size]
            text_arr = np.array([text_to_array(text) for text in batch["question_text"]])
            yield text_arr, np.array(batch["target"])

####  LSTM


##### Define an LSTM model in Keras. Firstly, define a Sequential model, which is a linear stack of layers. In this example, the .add() method is used. Two LSTM layers are added; the first layer (only the first, the rest do automatic shape inference) must have an input shape argument. In the first layer the input shape is (*, 30, 300), i.e. (batch_size, 30, 300). The output of the lstm_1 layer is (None, 30, 64); the first dimension is None because the batch size is unknown at this point.
##### The Dense layer infers its input shape from the layer before it. Its activation argument is the activation function to apply, in this case "sigmoid".
##### As mentioned above, the input shape is inferred automatically for the remaining layers; in each layer's output, the layer's first argument (64) becomes the last dimension. For the Dense layer the output is (batch_size, 1).

In [8]:
# Stacked LSTM classifier: two 64-unit LSTM layers and a single sigmoid output unit.
model = Sequential()
lstm_stack = (
    # input shape (*, 30, 300), i.e. (batch_size, 30, 300); the full sequence is
    # returned so the second LSTM receives one vector per time step
    LSTM(64, return_sequences=True, input_shape=(30, 300)),
    LSTM(64),
    Dense(1, activation="sigmoid"),
)
for layer in lstm_stack:
    model.add(layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

INFO:plaidml:Opening device "metal_amd_radeon_pro_560.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 30, 64)            93440     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 126,529
Trainable params: 126,529
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Inspect the column dtypes of the training split.
train_data_frame.dtypes

qid              object
question_text    object
target            int64
dtype: object

##### mg is the generator returned by the batch_gen function above, called with the training data frame (the training split of train.csv).
##### .fit_generator is a built-in Keras method that trains the model on the dataset batch by batch. The generator mg comes first, then the number of epochs, and then steps_per_epoch, which is the total number of batches to draw before one epoch is declared finished and the next one starts.
##### validation_data is a tuple (an immutable sequence) of val_vects and val_y, i.e. the embedded texts and the target values, 0 or 1.

In [10]:
# Build a fresh batch generator over the training split and train the LSTM model for
# one epoch of 1000 batches, validating on (val_vects, val_y).
mg = batch_gen(train_data_frame)
model.fit_generator(mg, epochs=1,
                    steps_per_epoch=1000,
                    validation_data=(val_vects, val_y),
                    verbose=True)

Epoch 1/1


INFO:plaidml:Analyzing Ops: 2593 of 5988 operations complete
INFO:plaidml:Analyzing Ops: 5582 of 5988 operations complete




INFO:plaidml:Analyzing Ops: 1875 of 2569 operations complete




<keras.callbacks.History at 0x1ab1d2d518>

##### Create a list named thresholds. Call the built-in Keras .predict method on val_vects with a batch size of 1024 and verbose set to 1. The for loop iterates over a numpy range of candidate thresholds and records the F1 score for each one. The last three lines sort the results, pick the threshold with the best F1 score and print it, which is 0.33 in this case.

In [11]:
# Score the validation vectors once with the LSTM model.
pred_lstm_val_y = model.predict([val_vects], batch_size=1024, verbose=1)

# Sweep decision thresholds 0.10 .. 0.50 in steps of 0.01 and record the F1 score of each.
thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    res = metrics.f1_score(val_y, (pred_lstm_val_y > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    thresholds.append([thresh, res])

# Highest F1 first; the stable sort keeps the lower threshold ahead on ties.
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)



INFO:plaidml:Analyzing Ops: 1730 of 2535 operations complete


F1 score at threshold 0.1 is 0.5185185185185185
F1 score at threshold 0.11 is 0.5280728376327769
F1 score at threshold 0.12 is 0.5318818040435459
F1 score at threshold 0.13 is 0.5352564102564102
F1 score at threshold 0.14 is 0.5392156862745099
F1 score at threshold 0.15 is 0.5509181969949917
F1 score at threshold 0.16 is 0.559726962457338
F1 score at threshold 0.17 is 0.5674255691768826
F1 score at threshold 0.18 is 0.5796064400715563
F1 score at threshold 0.19 is 0.5843920145190562
F1 score at threshold 0.2 is 0.5897435897435898
F1 score at threshold 0.21 is 0.5921787709497207
F1 score at threshold 0.22 is 0.6011342155009453
F1 score at threshold 0.23 is 0.6080305927342257
F1 score at threshold 0.24 is 0.6061776061776062
F1 score at threshold 0.25 is 0.6093749999999999
F1 score at threshold 0.26 is 0.6086956521739131
F1 score at threshold 0.27 is 0.6067864271457085
F1 score at threshold 0.28 is 0.6116700201207245
F1 score at threshold 0.29 is 0.6141414141414141
F1 score at threshold 0

##### Here the model is deleted, gc (the garbage collector) reclaims whatever the model left behind, and the program idles for 10 seconds.

In [12]:
# Drop the reference to the current model, force a garbage-collection pass, then pause
# for 10 seconds before the next model is built.
del model
gc.collect()
time.sleep(10)

#### Simple rnn

##### This model is a simple recurrent neural network. It has only one recurrent layer, and the input shape is the same. However, return_sequences is set to False so that the layer emits only its final output, one (None, 64) vector per question, which the Dense layer turns into a single prediction; otherwise the fit_generator batch-generating method cannot be used. SimpleRNN is a simpler model, a kind of baseline for recurrent neural networks. The LSTM's average training time of 130-140 seconds per epoch drops to 22-26 seconds here, but this is reflected in its F1 score.

In [13]:
# Baseline recurrent classifier: one 64-unit SimpleRNN layer and a sigmoid output unit.
model = Sequential()
for layer in (
    # only the final state is returned, giving one (None, 64) vector per question
    SimpleRNN(64, return_sequences=False, input_shape=(30, 300)),
    Dense(1, activation="sigmoid"),
):
    model.add(layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 64)                23360     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 23,425
Trainable params: 23,425
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Check the shapes of the validation inputs and labels.
val_vects.shape, val_y.shape

((3000, 30, 300), (3000,))

In [15]:
# Build a fresh batch generator over the training split and train the SimpleRNN model for
# one epoch of 1000 batches, validating on (val_vects, val_y).
mg = batch_gen(train_data_frame)
model.fit_generator(mg, epochs=1,
                    steps_per_epoch=1000,
                    validation_data=(val_vects, val_y),
                    verbose=True)

Epoch 1/1


<keras.callbacks.History at 0x1ab2515a58>

In [16]:
# Score the validation vectors once with the SimpleRNN model.
pred_simple_rnn_val_y = model.predict([val_vects], batch_size=1024, verbose=1)

# Sweep decision thresholds 0.10 .. 0.50 in steps of 0.01 and record the F1 score of each.
thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    res = metrics.f1_score(val_y, (pred_simple_rnn_val_y > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    thresholds.append([thresh, res])

# Highest F1 first; the stable sort keeps the lower threshold ahead on ties.
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.5565529622980251
F1 score at threshold 0.11 is 0.5595667870036101
F1 score at threshold 0.12 is 0.5636363636363637
F1 score at threshold 0.13 is 0.562043795620438
F1 score at threshold 0.14 is 0.5567765567765568
F1 score at threshold 0.15 is 0.5588235294117647
F1 score at threshold 0.16 is 0.5598526703499078
F1 score at threshold 0.17 is 0.5598526703499078
F1 score at threshold 0.18 is 0.5619223659889095
F1 score at threshold 0.19 is 0.5629629629629629
F1 score at threshold 0.2 is 0.5602968460111317
F1 score at threshold 0.21 is 0.5613382899628253
F1 score at threshold 0.22 is 0.5634328358208955
F1 score at threshold 0.23 is 0.5666041275797373
F1 score at threshold 0.24 is 0.569811320754717
F1 score at threshold 0.25 is 0.5719696969696969
F1 score at threshold 0.26 is 0.5725190839694657
F1 score at threshold 0.27 is 0.5758157389635316
F1 score at threshold 0.28 is 0.5780346820809249
F1 score at threshold 0.29 is 0.5802707930367504
F1 score at threshold 0.

In [17]:
# Drop the reference to the SimpleRNN model, force a garbage-collection pass, then pause
# for 10 seconds before the next model is built.
del model
gc.collect()
time.sleep(10)

#### GRU

##### The third model is the gated recurrent unit (GRU). Two layers are defined, with parameters that differ from the LSTM and SimpleRNN models. In a nutshell, the difference between LSTM and GRU is that a GRU has two gates (update and reset) while an LSTM has three (input, output, forget). Furthermore, the GRU is slightly faster, at 113-133 seconds average training time per epoch, whereas the LSTM takes 130-140 seconds. The best F1 score of the GRU is 0.63, while the LSTM reaches 0.70.

In [18]:
# Two stacked 32-unit GRU layers sharing the same dropout settings, then a sigmoid output unit.
gru_dropout = dict(dropout=0.5, recurrent_dropout=0.2)
model = Sequential()
for layer in (
    # the first GRU returns the full sequence so the second GRU gets one vector per time step
    GRU(32, return_sequences=True, input_shape=(30, 300), **gru_dropout),
    GRU(32, **gru_dropout),
    Dense(1, activation="sigmoid"),
):
    model.add(layer)
model.build()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 30, 32)            31968     
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                6240      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 38,241
Trainable params: 38,241
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Build a fresh batch generator over the training split and train the GRU model for
# one epoch of 1000 batches, validating on (val_vects, val_y).
mg = batch_gen(train_data_frame)
model.fit_generator(mg, epochs=1,
                    steps_per_epoch=1000,
                    validation_data=(val_vects, val_y),
                    verbose=True)

Epoch 1/1


INFO:plaidml:Analyzing Ops: 2716 of 5320 operations complete




INFO:plaidml:Analyzing Ops: 2114 of 2116 operations complete




<keras.callbacks.History at 0x1ab2500da0>

In [20]:
# Score the validation vectors once with the GRU model.
pred_gru_val_y = model.predict([val_vects], batch_size=1024, verbose=1)

# Sweep decision thresholds 0.10 .. 0.50 in steps of 0.01 and record the F1 score of each.
thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    res = metrics.f1_score(val_y, (pred_gru_val_y > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    thresholds.append([thresh, res])

# Highest F1 first; the stable sort keeps the lower threshold ahead on ties.
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.5385878489326764
F1 score at threshold 0.11 is 0.5469798657718121
F1 score at threshold 0.12 is 0.5506756756756758
F1 score at threshold 0.13 is 0.552901023890785
F1 score at threshold 0.14 is 0.5542168674698795
F1 score at threshold 0.15 is 0.562390158172232
F1 score at threshold 0.16 is 0.5668449197860963
F1 score at threshold 0.17 is 0.5771324863883848
F1 score at threshold 0.18 is 0.5824175824175823
F1 score at threshold 0.19 is 0.5878003696857671
F1 score at threshold 0.2 is 0.5880149812734082
F1 score at threshold 0.21 is 0.5913370998116761
F1 score at threshold 0.22 is 0.5900383141762452
F1 score at threshold 0.23 is 0.5941747572815533
F1 score at threshold 0.24 is 0.5984251968503937
F1 score at threshold 0.25 is 0.5976095617529881
F1 score at threshold 0.26 is 0.6008064516129031
F1 score at threshold 0.27 is 0.595482546201232
F1 score at threshold 0.28 is 0.5933609958506224
F1 score at threshold 0.29 is 0.5953878406708596
F1 score at threshold 0.3

In [21]:
# Drop the reference to the GRU model and force a garbage-collection pass.
# This cell pauses for 1 second only (the earlier cleanup cells pause for 10).
del model
gc.collect()
time.sleep(1)

#### Bidirectional LSTM
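- Sequential means a linear stack of layers: one input, one output
- each LSTM(64) produces 64 dimensions per direction, so a Bidirectional layer outputs 128
- the first Bidirectional layer takes in the input shape's size, (30, 300)
- the second Bidirectional LSTM layer takes in 128 dimensions
- and outputs 128 (64 per direction)
- Dense(8) takes in 128 dimensions
- and outputs 8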


In [22]:
# Two stacked bidirectional 64-unit LSTM layers, an 8-unit Dense layer and a sigmoid output unit.
model = Sequential()
for layer in (
    # the first wrapper returns the full sequence (forward and backward outputs concatenated)
    Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5), input_shape=(30, 300)),
    Bidirectional(LSTM(64)),
    Dense(8),
    Dense(1, activation='sigmoid'),
):
    model.add(layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 30, 128)           186880    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_4 (Dense)              (None, 8)                 1032      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 9         
Total params: 286,737
Trainable params: 286,737
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Build a fresh batch generator over the training split and train the bidirectional LSTM
# model for one epoch of 1000 batches, validating on (val_vects, val_y).
mg = batch_gen(train_data_frame)
model.fit_generator(mg, epochs=1,
                    steps_per_epoch=1000,
                    validation_data=(val_vects, val_y),
                    verbose=True)

Epoch 1/1


INFO:plaidml:Analyzing Ops: 2432 of 12343 operations complete
INFO:plaidml:Analyzing Ops: 6000 of 12343 operations complete
INFO:plaidml:Analyzing Ops: 9063 of 12343 operations complete
INFO:plaidml:Analyzing Ops: 11904 of 12343 operations complete




INFO:plaidml:Analyzing Ops: 3032 of 5120 operations complete
INFO:plaidml:Analyzing Ops: 3767 of 5120 operations complete




<keras.callbacks.History at 0x1aa88d1390>

In [24]:
# Threshold sweep for the bidirectional LSTM model.
thresholds = []

# Stored under its own name: the original reused pred_simple_rnn_val_y here, which
# silently replaced the SimpleRNN predictions with the bidirectional LSTM ones.
pred_bilstm_val_y = model.predict([val_vects], batch_size=1024, verbose=1)

# Try decision thresholds 0.10 .. 0.50 in steps of 0.01 and record the F1 score of each.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (pred_bilstm_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# Highest F1 first; the stable sort keeps the lower threshold ahead on ties.
thresholds.sort(key=lambda x: x[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

INFO:plaidml:Analyzing Ops: 2182 of 5086 operations complete




INFO:plaidml:Analyzing Ops: 3220 of 5086 operations complete


F1 score at threshold 0.1 is 0.5537974683544303
F1 score at threshold 0.11 is 0.5584415584415585
F1 score at threshold 0.12 is 0.5681063122923589
F1 score at threshold 0.13 is 0.5816326530612245
F1 score at threshold 0.14 is 0.5808695652173913
F1 score at threshold 0.15 is 0.5945945945945946
F1 score at threshold 0.16 is 0.5947955390334573
F1 score at threshold 0.17 is 0.5996204933586337
F1 score at threshold 0.18 is 0.5996131528046422
F1 score at threshold 0.19 is 0.6123260437375745
F1 score at threshold 0.2 is 0.6178861788617886
F1 score at threshold 0.21 is 0.6239669421487603
F1 score at threshold 0.22 is 0.628691983122363
F1 score at threshold 0.23 is 0.6322580645161291
F1 score at threshold 0.24 is 0.6301969365426697
F1 score at threshold 0.25 is 0.6283185840707964
F1 score at threshold 0.26 is 0.6202247191011236
F1 score at threshold 0.27 is 0.6129032258064516
F1 score at threshold 0.28 is 0.6179245283018868
F1 score at threshold 0.29 is 0.6161137440758294
F1 score at threshold 0