# Notebook Objective:

The objective of this notebook is to look at the different pretrained embeddings provided with the dataset and to see how useful they are in the model-building process.

First let us import the necessary modules and read the input data.

In [11]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [20]:
# Load the train and test CSVs from the working directory and report their sizes.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [1]:
# Show every device TensorFlow reports, to check whether a GPU is visible.
from tensorflow.python.client import device_lib
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11816342070403021042
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3164969369
locality {
  bus_id: 1
  links {
  }
}
incarnation: 379451362428587691
physical_device_desc: "device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


# Next steps are as follows:

* Split the training dataset into train and val sample. Cross validation is a time consuming process and so let us do simple train val split.
* Fill up the missing values in the text column with the placeholder '_na_'
* Tokenize the text column and convert them to vector sequences
* Pad the sequences as needed - if the number of words in the text is greater than 'maxlen', truncate it to 'maxlen'; if the number of words is smaller than 'maxlen', add zeros for the remaining values.

In [22]:
# Hold out 10% of the training rows for validation (fixed seed).
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

# Configuration values used by the rest of the notebook.
embed_size = 300      # dimensionality of each word vector
max_features = 50000  # number of distinct words kept (rows of the embedding table)
maxlen = 100          # number of tokens kept per question

# Raw question strings, with missing entries replaced by a placeholder.
train_X, val_X, test_X = (
    frame['question_text'].fillna("_na_").values
    for frame in (train_df, val_df, test_df)
)

# The vocabulary is learned from the training questions only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

# Convert each split to integer sequences and pad/truncate them to maxlen.
train_X, val_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (train_X, val_X, test_X)
)

# Target labels for the two labelled splits.
train_y, val_y = train_df['target'].values, val_df['target'].values

In [23]:
# Display the dimensions of the padded training matrix as a quick sanity check.
train_X.shape

(1175509, 100)

In [24]:
EMBEDDING_FILE = "embeddings/glove.840B.300d/glove.840B.300d.txt"

def get_coefs(word, *arr):
    """Turn one split embedding-file line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')

# Parse the file inside a context manager so the handle is closed as soon as
# every line has been read (the original left it open).
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding value), so the table needs
# len(word_index) + 1 rows whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random draw with the same overall
# mean/std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Bidirectional GRU -> max-pool over time -> small dense head with sigmoid output.
inp = Input(shape=(maxlen,))
# The Embedding row count must equal the number of rows in embedding_matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total para

In [25]:
# Train the GloVe-initialised model, using the held-out split as validation data.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=5,
    validation_data=(val_X, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/5


 103424/1175509 [=>............................] - ETA: 1:00:05 - loss: 1.3555 - acc: 0.06 - ETA: 32:33 - loss: 1.1541 - acc: 0.0537 - ETA: 23:21 - loss: 0.9963 - acc: 0.22 - ETA: 18:39 - loss: 0.8868 - acc: 0.38 - ETA: 15:52 - loss: 0.7961 - acc: 0.50 - ETA: 13:59 - loss: 0.7230 - acc: 0.57 - ETA: 12:39 - loss: 0.6663 - acc: 0.62 - ETA: 11:39 - loss: 0.6180 - acc: 0.66 - ETA: 10:52 - loss: 0.5835 - acc: 0.69 - ETA: 10:14 - loss: 0.5485 - acc: 0.71 - ETA: 9:43 - loss: 0.5182 - acc: 0.7370 - ETA: 9:17 - loss: 0.4949 - acc: 0.753 - ETA: 8:55 - loss: 0.4772 - acc: 0.767 - ETA: 8:37 - loss: 0.4630 - acc: 0.779 - ETA: 8:21 - loss: 0.4527 - acc: 0.788 - ETA: 8:07 - loss: 0.4391 - acc: 0.798 - ETA: 7:55 - loss: 0.4255 - acc: 0.807 - ETA: 7:43 - loss: 0.4159 - acc: 0.814 - ETA: 7:33 - loss: 0.4053 - acc: 0.821 - ETA: 7:24 - loss: 0.3968 - acc: 0.827 - ETA: 7:15 - loss: 0.3893 - acc: 0.832 - ETA: 7:07 - loss: 0.3826 - acc: 0.837 - ETA: 7:00 - loss: 0.3761 - acc: 0.841 - ETA: 6:53 - loss: 0.3733



















Epoch 2/5


 104448/1175509 [=>............................] - ETA: 6:26 - loss: 0.0862 - acc: 0.976 - ETA: 5:36 - loss: 0.0819 - acc: 0.973 - ETA: 5:18 - loss: 0.0798 - acc: 0.974 - ETA: 5:08 - loss: 0.0853 - acc: 0.969 - ETA: 5:01 - loss: 0.0850 - acc: 0.969 - ETA: 4:59 - loss: 0.0915 - acc: 0.965 - ETA: 5:02 - loss: 0.0919 - acc: 0.964 - ETA: 5:02 - loss: 0.0891 - acc: 0.966 - ETA: 5:02 - loss: 0.0945 - acc: 0.964 - ETA: 5:01 - loss: 0.0913 - acc: 0.966 - ETA: 5:00 - loss: 0.0931 - acc: 0.965 - ETA: 4:59 - loss: 0.0946 - acc: 0.964 - ETA: 4:57 - loss: 0.0942 - acc: 0.964 - ETA: 4:56 - loss: 0.0930 - acc: 0.964 - ETA: 4:55 - loss: 0.0926 - acc: 0.965 - ETA: 4:54 - loss: 0.0914 - acc: 0.965 - ETA: 4:53 - loss: 0.0903 - acc: 0.965 - ETA: 4:52 - loss: 0.0897 - acc: 0.966 - ETA: 4:51 - loss: 0.0916 - acc: 0.964 - ETA: 4:51 - loss: 0.0915 - acc: 0.965 - ETA: 4:51 - loss: 0.0919 - acc: 0.965 - ETA: 4:50 - loss: 0.0931 - acc: 0.964 - ETA: 4:50 - loss: 0.0941 - acc: 0.963 - ETA: 4:50 - loss: 0.0938 - ac



















Epoch 3/5


 104448/1175509 [=>............................] - ETA: 7:05 - loss: 0.0949 - acc: 0.964 - ETA: 6:12 - loss: 0.0813 - acc: 0.970 - ETA: 5:49 - loss: 0.0849 - acc: 0.969 - ETA: 5:36 - loss: 0.0819 - acc: 0.970 - ETA: 5:28 - loss: 0.0805 - acc: 0.969 - ETA: 5:22 - loss: 0.0811 - acc: 0.968 - ETA: 5:18 - loss: 0.0839 - acc: 0.968 - ETA: 5:15 - loss: 0.0833 - acc: 0.967 - ETA: 5:12 - loss: 0.0838 - acc: 0.967 - ETA: 5:10 - loss: 0.0842 - acc: 0.967 - ETA: 5:09 - loss: 0.0855 - acc: 0.967 - ETA: 5:07 - loss: 0.0846 - acc: 0.967 - ETA: 5:05 - loss: 0.0856 - acc: 0.967 - ETA: 5:03 - loss: 0.0848 - acc: 0.967 - ETA: 5:02 - loss: 0.0848 - acc: 0.967 - ETA: 5:01 - loss: 0.0850 - acc: 0.967 - ETA: 5:00 - loss: 0.0840 - acc: 0.967 - ETA: 4:59 - loss: 0.0837 - acc: 0.967 - ETA: 5:00 - loss: 0.0820 - acc: 0.968 - ETA: 5:00 - loss: 0.0815 - acc: 0.968 - ETA: 4:59 - loss: 0.0804 - acc: 0.969 - ETA: 4:59 - loss: 0.0812 - acc: 0.968 - ETA: 4:59 - loss: 0.0807 - acc: 0.968 - ETA: 4:59 - loss: 0.0814 - ac



















Epoch 4/5


 104448/1175509 [=>............................] - ETA: 6:19 - loss: 0.0412 - acc: 0.986 - ETA: 5:34 - loss: 0.0420 - acc: 0.984 - ETA: 5:18 - loss: 0.0536 - acc: 0.979 - ETA: 5:10 - loss: 0.0609 - acc: 0.976 - ETA: 5:04 - loss: 0.0577 - acc: 0.977 - ETA: 5:01 - loss: 0.0591 - acc: 0.978 - ETA: 4:59 - loss: 0.0568 - acc: 0.980 - ETA: 4:57 - loss: 0.0562 - acc: 0.980 - ETA: 4:57 - loss: 0.0543 - acc: 0.980 - ETA: 4:55 - loss: 0.0569 - acc: 0.980 - ETA: 4:53 - loss: 0.0573 - acc: 0.979 - ETA: 4:52 - loss: 0.0570 - acc: 0.979 - ETA: 4:51 - loss: 0.0578 - acc: 0.979 - ETA: 4:50 - loss: 0.0563 - acc: 0.979 - ETA: 4:50 - loss: 0.0562 - acc: 0.979 - ETA: 4:49 - loss: 0.0593 - acc: 0.977 - ETA: 4:49 - loss: 0.0588 - acc: 0.977 - ETA: 4:48 - loss: 0.0593 - acc: 0.977 - ETA: 4:48 - loss: 0.0593 - acc: 0.977 - ETA: 4:48 - loss: 0.0588 - acc: 0.978 - ETA: 4:48 - loss: 0.0598 - acc: 0.977 - ETA: 4:47 - loss: 0.0592 - acc: 0.978 - ETA: 4:47 - loss: 0.0591 - acc: 0.977 - ETA: 4:46 - loss: 0.0593 - ac



















Epoch 5/5


 104448/1175509 [=>............................] - ETA: 6:20 - loss: 0.0424 - acc: 0.990 - ETA: 5:34 - loss: 0.0397 - acc: 0.988 - ETA: 5:19 - loss: 0.0402 - acc: 0.987 - ETA: 5:12 - loss: 0.0428 - acc: 0.986 - ETA: 5:11 - loss: 0.0433 - acc: 0.985 - ETA: 5:09 - loss: 0.0421 - acc: 0.986 - ETA: 5:06 - loss: 0.0423 - acc: 0.985 - ETA: 5:03 - loss: 0.0426 - acc: 0.985 - ETA: 5:02 - loss: 0.0429 - acc: 0.985 - ETA: 5:00 - loss: 0.0421 - acc: 0.985 - ETA: 4:59 - loss: 0.0432 - acc: 0.984 - ETA: 5:00 - loss: 0.0452 - acc: 0.984 - ETA: 5:00 - loss: 0.0452 - acc: 0.983 - ETA: 5:00 - loss: 0.0461 - acc: 0.983 - ETA: 4:58 - loss: 0.0460 - acc: 0.983 - ETA: 4:59 - loss: 0.0458 - acc: 0.982 - ETA: 4:59 - loss: 0.0454 - acc: 0.983 - ETA: 4:59 - loss: 0.0454 - acc: 0.983 - ETA: 4:58 - loss: 0.0458 - acc: 0.983 - ETA: 4:58 - loss: 0.0475 - acc: 0.982 - ETA: 4:57 - loss: 0.0477 - acc: 0.982 - ETA: 4:56 - loss: 0.0471 - acc: 0.982 - ETA: 4:55 - loss: 0.0473 - acc: 0.982 - ETA: 4:54 - loss: 0.0473 - ac





















<keras.callbacks.History at 0x13f720efcc0>

In [26]:
# Score the validation split with the current (GloVe) model.
pred_glove_val_y = model.predict(
    [val_X],
    batch_size=1024,
    verbose=1,
)



In [27]:
# Sweep the decision threshold and print the validation F1 at each step.
for cutoff in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(cutoff, 2)
    glove_labels = (pred_glove_val_y>thresh).astype(int)
    glove_f1 = metrics.f1_score(val_y, glove_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, glove_f1))

F1 score at threshold 0.1 is 0.6365679264555669
F1 score at threshold 0.11 is 0.63900372054568
F1 score at threshold 0.12 is 0.6404511984960051
F1 score at threshold 0.13 is 0.6422811363516577
F1 score at threshold 0.14 is 0.6423823451209786
F1 score at threshold 0.15 is 0.6436472988937816
F1 score at threshold 0.16 is 0.6442276422764228
F1 score at threshold 0.17 is 0.645189472534673
F1 score at threshold 0.18 is 0.6460844202300369
F1 score at threshold 0.19 is 0.6468174761587935
F1 score at threshold 0.2 is 0.6463482831897999
F1 score at threshold 0.21 is 0.6470819967358883
F1 score at threshold 0.22 is 0.646905463613693
F1 score at threshold 0.23 is 0.6470319634703197
F1 score at threshold 0.24 is 0.6469473321463443
F1 score at threshold 0.25 is 0.646854764107308
F1 score at threshold 0.26 is 0.6477199930114729
F1 score at threshold 0.27 is 0.6473033905252679
F1 score at threshold 0.28 is 0.6462753418198962
F1 score at threshold 0.29 is 0.6453450424256809
F1 score at threshold 0.3 i

Results seem to be better than the model without pretrained embeddings.

In [75]:
# Score the test split with the current (GloVe) model; kept for the final blend.
pred_glove_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [76]:
# Drop the GloVe lookup tables and the model graph, then force a collection
# and pause briefly before the next embedding file is loaded.
del word_index, embeddings_index, all_embs, embedding_matrix
del model, inp, x
import gc
gc.collect()
time.sleep(10)

# Wiki News FastText Embeddings:

Now let us use the FastText embeddings trained on the Wiki News corpus in place of the GloVe embeddings and rebuild the model.

In [77]:
EMBEDDING_FILE = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

def get_coefs(word, *arr):
    """Turn one split embedding-file line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')

# Lines of 100 characters or fewer are skipped (presumably the .vec header
# line -- confirm against the file). The context manager closes the file
# handle once parsing is done; the original left it open.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100
    )

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding value), so the table needs
# len(word_index) + 1 rows whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random draw with the same overall
# mean/std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Same architecture as the GloVe model: BiGRU -> max-pool -> dense head.
inp = Input(shape=(maxlen,))
# The Embedding row count must equal the number of rows in embedding_matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total para

In [78]:
# Train the FastText-initialised model, using the held-out split as validation data.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2


















Epoch 2/2






















<keras.callbacks.History at 0x130c1003470>

In [79]:
# Score the validation split with the current (FastText) model.
pred_fasttext_val_y = model.predict(
    [val_X],
    batch_size=1024,
    verbose=1,
)



In [80]:
# Sweep the decision threshold and print the validation F1 at each step.
for cutoff in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(cutoff, 2)
    fasttext_labels = (pred_fasttext_val_y>thresh).astype(int)
    fasttext_f1 = metrics.f1_score(val_y, fasttext_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, fasttext_f1))

F1 score at threshold 0.1 is 0.5969949357552422
F1 score at threshold 0.11 is 0.6041033109178909
F1 score at threshold 0.12 is 0.6118893363684118
F1 score at threshold 0.13 is 0.6186009112838381
F1 score at threshold 0.14 is 0.624556616643929
F1 score at threshold 0.15 is 0.6291149175862228
F1 score at threshold 0.16 is 0.6355701548568747
F1 score at threshold 0.17 is 0.6393762183235868
F1 score at threshold 0.18 is 0.6439646428053905
F1 score at threshold 0.19 is 0.6481880509304603
F1 score at threshold 0.2 is 0.6518327207708353
F1 score at threshold 0.21 is 0.6547684882610226
F1 score at threshold 0.22 is 0.6578035859820701
F1 score at threshold 0.23 is 0.6598864223025297
F1 score at threshold 0.24 is 0.6631127835267064
F1 score at threshold 0.25 is 0.664765525982256
F1 score at threshold 0.26 is 0.6670941541092231
F1 score at threshold 0.27 is 0.6690095155709342
F1 score at threshold 0.28 is 0.6707843673134737
F1 score at threshold 0.29 is 0.6711920529801324
F1 score at threshold 0.

In [81]:
# Score the test split with the current (FastText) model; kept for the final blend.
pred_fasttext_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [83]:
# Drop the FastText lookup tables and the model graph, then force a collection
# and pause briefly before the next embedding file is loaded.
del word_index, embeddings_index, all_embs, embedding_matrix
del model, inp, x
import gc
gc.collect()
time.sleep(10)

# Paragram Embeddings:

In this section, we can use the paragram embeddings and build the model and make predictions.

In [84]:
EMBEDDING_FILE = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'

def get_coefs(word, *arr):
    """Turn one split embedding-file line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')

# Undecodable bytes are dropped (errors='ignore') and lines of 100 characters
# or fewer are skipped. The context manager closes the file handle once
# parsing is done; the original left it open.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
    embeddings_index = dict(
        get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100
    )

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding value), so the table needs
# len(word_index) + 1 rows whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random draw with the same overall
# mean/std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Same architecture as the previous models: BiGRU -> max-pool -> dense head.
inp = Input(shape=(maxlen,))
# The Embedding row count must equal the number of rows in embedding_matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [85]:
# Train the Paragram-initialised model, using the held-out split as validation data.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2




















Epoch 2/2






















<keras.callbacks.History at 0x13009f2ea20>

In [87]:
# Score the validation split with the current (Paragram) model.
pred_paragram_val_y = model.predict(
    [val_X],
    batch_size=1024,
    verbose=1,
)



In [88]:
# Sweep the decision threshold and print the validation F1 at each step.
for cutoff in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(cutoff, 2)
    paragram_labels = (pred_paragram_val_y>thresh).astype(int)
    paragram_f1 = metrics.f1_score(val_y, paragram_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, paragram_f1))

F1 score at threshold 0.1 is 0.6026412571046472
F1 score at threshold 0.11 is 0.6121715312847726
F1 score at threshold 0.12 is 0.6199965041076736
F1 score at threshold 0.13 is 0.6265317944833119
F1 score at threshold 0.14 is 0.6315360217489805
F1 score at threshold 0.15 is 0.6365060351976412
F1 score at threshold 0.16 is 0.641553579784745
F1 score at threshold 0.17 is 0.6463600076031173
F1 score at threshold 0.18 is 0.6498121930078012
F1 score at threshold 0.19 is 0.6534817853305988
F1 score at threshold 0.2 is 0.6576469422142928
F1 score at threshold 0.21 is 0.6608817775994382
F1 score at threshold 0.22 is 0.6632476896516706
F1 score at threshold 0.23 is 0.6647843942505134
F1 score at threshold 0.24 is 0.6658365758754864
F1 score at threshold 0.25 is 0.6684149795447393
F1 score at threshold 0.26 is 0.6707349429556911
F1 score at threshold 0.27 is 0.6719579557033303
F1 score at threshold 0.28 is 0.6739236393176279
F1 score at threshold 0.29 is 0.6747225107988409
F1 score at threshold 0

In [89]:
# Score the test split with the current (Paragram) model; kept for the final blend.
pred_paragram_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [90]:
# Drop the Paragram lookup tables and the model graph, then force a collection
# and pause briefly.
del word_index, embeddings_index, all_embs, embedding_matrix
del model, inp, x
import gc
gc.collect()
time.sleep(10)

# Observations:

* Overall, pretrained embeddings seem to give better results compared to a non-pretrained model.
* The performance of the different pretrained embeddings is broadly similar.

# Final Blend:

* Though the results of the models with different pre-trained embeddings are similar, there is a good chance that they capture different types of information from the data. So let us blend these three models by taking a (near-equal) weighted average of their predictions.

In [91]:
# Near-equal weighted average of the three models' validation predictions.
w_glove, w_fasttext, w_paragram = 0.33, 0.33, 0.34
pred_val_y = (
    w_glove * pred_glove_val_y
    + w_fasttext * pred_fasttext_val_y
    + w_paragram * pred_paragram_val_y
)

In [92]:
# Sweep the decision threshold and print the validation F1 of the blend at each step.
for cutoff in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(cutoff, 2)
    blend_labels = (pred_val_y>thresh).astype(int)
    blend_f1 = metrics.f1_score(val_y, blend_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, blend_f1))

F1 score at threshold 0.1 is 0.6103994606438564
F1 score at threshold 0.11 is 0.6195530244197084
F1 score at threshold 0.12 is 0.6269484808454425
F1 score at threshold 0.13 is 0.6325169032373619
F1 score at threshold 0.14 is 0.6377959927140255
F1 score at threshold 0.15 is 0.6439720253809459
F1 score at threshold 0.16 is 0.6500518232356545
F1 score at threshold 0.17 is 0.6546305583416916
F1 score at threshold 0.18 is 0.6586924939467311
F1 score at threshold 0.19 is 0.6628358135651491
F1 score at threshold 0.2 is 0.6658041401273885
F1 score at threshold 0.21 is 0.6701649931883547
F1 score at threshold 0.22 is 0.6720179968301038
F1 score at threshold 0.23 is 0.6748371082842072
F1 score at threshold 0.24 is 0.6763446475195823
F1 score at threshold 0.25 is 0.6782324404134149
F1 score at threshold 0.26 is 0.6794243070362473
F1 score at threshold 0.27 is 0.6800839476941292
F1 score at threshold 0.28 is 0.6815539255637055
F1 score at threshold 0.29 is 0.6826110806363137
F1 score at threshold 

The result seems to be better than the individual pre-trained models, so let us create a submission file using this model blend.

In [93]:
# Blend the three models' test predictions with the same weights as on validation.
blend_test_scores = (
    0.33*pred_glove_test_y
    + 0.33*pred_fasttext_test_y
    + 0.34*pred_paragram_test_y
)
# Binarise at the fixed 0.35 cut-off.
pred_test_y = (blend_test_scores>0.35).astype(int)

# Build the qid/prediction table and write it out without the index column.
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)