#### Description
In this evaluation a non-Sequentail Bidirectional GRU model is being evaluated firstly with no embeddings, each of the embeddings (glove, paragram and fasttext) and lastly the blend of these embeddings.

All read file variables (like train.csv, embeddings files etc) are set to relative path, which means just drag the competition files in to the program's folder and it reads in them automatically. 
Furthermore, I turned off the support for AMD Radeon GPUs, turn it on if necessary (for quicker computing on Radeon machines).

In [2]:
# support for AMD Radeon GPU - if you run this on AMD Radeon GPU computer, then use it

# import plaidml.keras
# plaidml.keras.install_backend()
# import os
# os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"


# keras layers, tokenizer, model, sequential etc.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import os
# time for idle the system after deleting models and embedding to test in one notebook
import time
# linear algebra
import numpy as np 
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd 
# monitor loading time where it is supported
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

import gc

In [3]:
# train_df = pd.read_csv("/Users/sneakysneak/Downloads/quora_dataset/train.csv")
# test_df = pd.read_csv("/Users/sneakysneak/Downloads/quora_dataset/test.csv")]
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:
## split the data frame to train and val
train_data_frame, value_data_frame = train_test_split(train_data_frame, test_size=0.1, random_state=2018)

## configuration values 
# 300 
embedding_size = 30 # the size of each word vector
# 50000
max_features = 500 # the size of unique words in use - the number of rows in the embedding vector
# 100 
max_length_question = 10 # the size of the number of words in each question

## first fill all missing values up
train_X = train_data_frame["question_text"].fillna("_na_").values
val_X = value_data_frame["question_text"].fillna("_na_").values
test_X = test_data_frame["question_text"].fillna("_na_").values

## tokenize with Keras
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## sentence padding
train_X = pad_sequences(train_X, maxlen=max_length_question)
val_X = pad_sequences(val_X, maxlen=max_length_question)
test_X = pad_sequences(test_X, maxlen=max_length_question)

## save the target values as train_y and val_y
train_y = train_data_frame['target'].values
val_y = value_data_frame['target'].values

In [4]:
model_input = Input(shape=(max_length_question,))
layer = Embedding(max_features, embedding_size)(model_input)
layer = Bidirectional(GRU(64, return_sequences=True))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dense(16, activation="relu")(layer)
layer = Dropout(0.1)(layer)
layer = Dense(1, activation="sigmoid")(layer)
model = Model(inputs=model_input, outputs=layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

INFO:plaidml:Opening device "metal_amd_radeon_pro_560.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 30)            15000     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 10, 128)           36480     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total para

In [5]:
## Train the model 
# Set to 1 epoch because of time consuming training time
model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/1


<keras.callbacks.History at 0x1a50242438>

In [6]:
thresholds = []

prediction_noembeddings_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (prediction_noembeddings_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    
thresholds.sort(key=lambda layer: layer[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.16909653191959353
F1 score at threshold 0.11 is 0.18121509643335304
F1 score at threshold 0.12 is 0.19520216419790742
F1 score at threshold 0.13 is 0.21089543260413457
F1 score at threshold 0.14 is 0.22616293180163416
F1 score at threshold 0.15 is 0.24234003135242985
F1 score at threshold 0.16 is 0.26012457998467314
F1 score at threshold 0.17 is 0.2752037752037752
F1 score at threshold 0.18 is 0.28917236071919966
F1 score at threshold 0.19 is 0.3026044028008665
F1 score at threshold 0.2 is 0.3164851833793813
F1 score at threshold 0.21 is 0.3278069742458117
F1 score at threshold 0.22 is 0.33926956304774974
F1 score at threshold 0.23 is 0.3493951747644189
F1 score at threshold 0.24 is 0.3571047103642254
F1 score at threshold 0.25 is 0.3676195565054769
F1 score at threshold 0.26 is 0.3747925689075971
F1 score at threshold 0.27 is 0.37911970171002446
F1 score at threshold 0.28 is 0.38636363636363635
F1 score at threshold 0.29 is 0.38949525761403175
F1 score a

In [7]:
'''Set predictions as well and save them'''
prediction_noembeddings_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [8]:
del model, model_input, layer
gc.collect()
time.sleep(10)

#### Glove
Use embeddings and rebuild the model again to see the performance

In [9]:
# embedding_file = '/Users/sneakysneak/Downloads/quora_dataset/glove.840B.300d/glove.840B.300d.txt'
embedding_file = 'glove.840B.300d.txt'

def get_coefficient(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefficient(*o.split(" ")) for o in 
                        open(embedding_file))

all_embeddings = np.stack(embeddings_index.values())
mean_embeddings, std_embeddings = all_embeddings.mean(), all_embeddings.std()
embedding_size = all_embeddings.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# EMBEDDING MATRIX
embedding_matrix = np.random.normal(mean_embeddings, std_embeddings, (nb_words, embedding_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
model_input = Input(shape=(max_length_question,))
layer = Embedding(max_features, embedding_size)(model_input)
layer = Bidirectional(GRU(64, return_sequences=True))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dense(16, activation="relu")(layer)
layer = Dropout(0.1)(layer)
layer = Dense(1, activation="sigmoid")(layer)
model = Model(inputs=model_input, outputs=layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

  if (yield from self.run_code(code, result)):


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10, 300)           150000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 10, 128)           140160    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total para

In [None]:
## Train the model 
model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/1

In [None]:
thresholds = []

prediction_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (prediction_glove_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    
thresholds.sort(key=lambda layer: layer[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

In [None]:
'''Set predictions as well and save them'''
prediction_glove_test_y = model.predict([test_X], batch_size=1024, verbose=1)

In [None]:
del word_index, embeddings_index, all_embeddings, embedding_matrix, model, 
model_input, layer
gc.collect()
time.sleep(10)

#### FastText Embeddings:
FastText trained on WikiNews corpus and rebuild the model

In [4]:
# embedding_file = '/Users/sneakysneak/Downloads/quora_dataset/wiki-news-300d-1M/wiki-news-300d-1M.vec'
embedding_file = 'wiki-news-300d-1M.vec'

def get_coefficient(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefficient(*o.split(" ")) for o in open(embedding_file) if len(o)>100)

all_embeddings = np.stack(embeddings_index.values())
mean_embeddings,std_embeddings = all_embeddings.mean(), all_embeddings.std()
embedding_size = all_embeddings.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(mean_embeddings, std_embeddings, (nb_words, embedding_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

model_input = Input(shape=(max_length_question,))
layer = Embedding(max_features, embedding_size)(model_input)
layer = Bidirectional(GRU(64, return_sequences=True))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dense(16, activation="relu")(layer)
layer = Dropout(0.1)(layer)
layer = Dense(1, activation="sigmoid")(layer)
model = Model(inputs=model_input, outputs=layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

  if (yield from self.run_code(code, result)):
INFO:plaidml:Opening device "metal_amd_radeon_pro_560.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 300)           150000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 10, 128)           140160    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total para

In [5]:
model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/1


<keras.callbacks.History at 0x1a1f936d30>

In [6]:
thresholds = []

prediction_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (prediction_fasttext_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    
thresholds.sort(key=lambda layer: layer[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.4182891504418289
F1 score at threshold 0.11 is 0.4263967235916854
F1 score at threshold 0.12 is 0.43572329090243606
F1 score at threshold 0.13 is 0.4429638431505687
F1 score at threshold 0.14 is 0.4480522345259628
F1 score at threshold 0.15 is 0.4526850707320423
F1 score at threshold 0.16 is 0.45549067084460904
F1 score at threshold 0.17 is 0.4569666111818271
F1 score at threshold 0.18 is 0.4600848827809216
F1 score at threshold 0.19 is 0.46061990550854054
F1 score at threshold 0.2 is 0.4607858947481589
F1 score at threshold 0.21 is 0.4611310344827586
F1 score at threshold 0.22 is 0.46190449215430807
F1 score at threshold 0.23 is 0.45986078886310905
F1 score at threshold 0.24 is 0.45974412377268664
F1 score at threshold 0.25 is 0.4594693281402142
F1 score at threshold 0.26 is 0.4573667516636607
F1 score at threshold 0.27 is 0.4548456750920869
F1 score at threshold 0.28 is 0.45226652812053514
F1 score at threshold 0.29 is 0.449485903814262
F1 score at thre

In [7]:
'''Set predictions as well and save them'''
prediction_fasttext_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [10]:
del word_index, embeddings_index, all_embeddings, embedding_matrix, model, 
model_input, layer
gc.collect()
time.sleep(10)

NameError: name 'word_index' is not defined

#### Paragram Embeddings:

In [11]:
# embedding_file = '/Users/sneakysneak/Downloads/quora_dataset/paragram_300_sl999/paragram_300_sl999.txt'
embedding_file = 'paragram_300_sl999.txt'


def get_coefficient(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefficient(*o.split(" ")) for o in open(embedding_file, encoding="utf8", errors='ignore') if len(o)>100)

all_embeddings = np.stack(embeddings_index.values())
mean_embeddings,std_embeddings = all_embeddings.mean(), all_embeddings.std()
embedding_size = all_embeddings.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(mean_embeddings, std_embeddings, (nb_words, embedding_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
        
model_input = Input(shape=(max_length_question,))
layer = Embedding(max_features, embedding_size)(model_input)
layer = Bidirectional(GRU(64, return_sequences=True))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dense(16, activation="relu")(layer)
layer = Dropout(0.1)(layer)
layer = Dense(1, activation="sigmoid")(layer)
model = Model(inputs=model_input, outputs=layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10, 300)           150000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 10, 128)           140160    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total para

In [None]:
model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/1

ERROR:plaidml:Caused GPU Timeout Error (IOAF code 2)
ERROR:plaidml:Caused GPU Timeout Error (IOAF code 2)


In [None]:
thresholds = []

prediction_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (prediction_paragram_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    
thresholds.sort(key=lambda layer: layer[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

In [None]:
prediction_paragram_test_y = model.predict([test_X], batch_size=1024, verbose=1)

In [None]:
del word_index, embeddings_index, all_embeddings, embedding_matrix, model, 
model_input, layer
gc.collect()
time.sleep(10)

#### Observations:
- Overall pretrained embeddings seem to give better results comapred to non-pretrained model.
- The performance of the different pretrained embeddings are almost similar.

#### Final thoughts:
Despite the results of the models with different pre-trained embeddings are similiar, there is a good chance that they might capture different type of information from the data. So let us do a mixture of these three models by averaging their predictions.

In [None]:
pred_val_y = 0.33*prediction_glove_val_y + 0.33*prediction_fasttext_val_y + 0.34*prediction_paragram_val_y 
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_val_y>thresh).astype(int))))
thresholds.sort(key=lambda layer: layer[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

In [None]:
pred_test_y = 0.33*prediction_glove_test_y + 0.33*prediction_fasttext_test_y + 0.34*prediction_paragram_test_y
pred_test_y = (pred_test_y>0.35).astype(int)
out_df = pd.DataFrame({"qid":test_data_frame["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission_embeddings.csv", index=False)