# Notebook Objective:

Objective of the notebook is to look at the different pretrained embeddings provided in the dataset and to see how they are useful in the model building process.

First let us import the necessary modules and read the input data.

In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [67]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5905101638513582087
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3164969369
locality {
  bus_id: 1
  links {
  }
}
incarnation: 15931788729865234787
physical_device_desc: "device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


# Next steps are as follows:

* Split the training dataset into train and val sample. Cross validation is a time consuming process and so let us do simple train val split.
* Fill up the missing values in the text column with 'na'
* Tokenize the text column and convert them to vector sequences
* Pad the sequence as needed - if the number of words in the text is greater than 'max_len' trunacate them to 'max_len' or if the number of words in the text is lesser than 'max_len' add zeros for remaining values.

In [68]:
# split to train and val
train_df, val_df = train_test_split(train_df, test_size = 0.1, random_state = 2018)

# some config values
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max numbers of words in a question to use

# fill up missing values

train_X = train_df['question_text'].fillna("_na_").values
val_X = val_df['question_text'].fillna("_na_").values
test_X = test_df['question_text'].fillna("_na_").values

# Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

#Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [69]:
train_X.shape

(1175509, 100)

# Without Pretrained Embeddings:

Now that we are done with all the necessary preprocessing steps, we can first train a Bidirectional GRU model. We will not use any pre-trained word embeddings for this model and the embeddings will be learnt from scratch. Please check out the model summary for the details of the layers used.

In [11]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total para

Train the model using train sample and monitor the metric on the valid sample. This is just a sample model running for 2 epochs. Changing the epochs, batch_size and model parameters might give us a better model.

In [12]:
## Train the model 
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2


 103424/1175509 [=>............................] - ETA: 3:24:35 - loss: 0.6872 - acc: 0.74 - ETA: 1:44:43 - loss: 0.6733 - acc: 0.84 - ETA: 1:11:21 - loss: 0.6602 - acc: 0.86 - ETA: 54:38 - loss: 0.6445 - acc: 0.8877 - ETA: 44:34 - loss: 0.6282 - acc: 0.89 - ETA: 37:51 - loss: 0.6120 - acc: 0.90 - ETA: 33:04 - loss: 0.5961 - acc: 0.90 - ETA: 29:28 - loss: 0.5770 - acc: 0.91 - ETA: 26:40 - loss: 0.5587 - acc: 0.91 - ETA: 24:26 - loss: 0.5396 - acc: 0.91 - ETA: 22:36 - loss: 0.5217 - acc: 0.91 - ETA: 21:05 - loss: 0.5030 - acc: 0.92 - ETA: 19:47 - loss: 0.4870 - acc: 0.92 - ETA: 18:41 - loss: 0.4713 - acc: 0.92 - ETA: 17:43 - loss: 0.4540 - acc: 0.92 - ETA: 16:53 - loss: 0.4411 - acc: 0.92 - ETA: 16:08 - loss: 0.4278 - acc: 0.92 - ETA: 15:29 - loss: 0.4179 - acc: 0.92 - ETA: 14:54 - loss: 0.4069 - acc: 0.92 - ETA: 14:22 - loss: 0.3982 - acc: 0.92 - ETA: 13:53 - loss: 0.3907 - acc: 0.92 - ETA: 13:27 - loss: 0.3851 - acc: 0.92 - ETA: 13:03 - loss: 0.3785 - acc: 0.93 - ETA: 12:41 - loss: 0.



















Epoch 2/2


 104448/1175509 [=>............................] - ETA: 6:10 - loss: 0.0945 - acc: 0.959 - ETA: 5:22 - loss: 0.0918 - acc: 0.962 - ETA: 5:07 - loss: 0.1074 - acc: 0.960 - ETA: 4:58 - loss: 0.0988 - acc: 0.964 - ETA: 4:54 - loss: 0.0988 - acc: 0.965 - ETA: 4:50 - loss: 0.1000 - acc: 0.965 - ETA: 4:47 - loss: 0.0989 - acc: 0.964 - ETA: 4:50 - loss: 0.1015 - acc: 0.964 - ETA: 4:51 - loss: 0.0968 - acc: 0.966 - ETA: 4:50 - loss: 0.0964 - acc: 0.964 - ETA: 4:49 - loss: 0.0961 - acc: 0.964 - ETA: 4:48 - loss: 0.0968 - acc: 0.963 - ETA: 4:47 - loss: 0.0969 - acc: 0.962 - ETA: 4:46 - loss: 0.0957 - acc: 0.962 - ETA: 4:44 - loss: 0.0964 - acc: 0.962 - ETA: 4:43 - loss: 0.0971 - acc: 0.962 - ETA: 4:43 - loss: 0.0971 - acc: 0.962 - ETA: 4:42 - loss: 0.0957 - acc: 0.962 - ETA: 4:42 - loss: 0.0952 - acc: 0.963 - ETA: 4:42 - loss: 0.0951 - acc: 0.963 - ETA: 4:41 - loss: 0.0945 - acc: 0.963 - ETA: 4:41 - loss: 0.0946 - acc: 0.963 - ETA: 4:41 - loss: 0.0944 - acc: 0.963 - ETA: 4:40 - loss: 0.0945 - ac





















<keras.callbacks.History at 0x149b9371208>

In [25]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)



In [26]:
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_noemb_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.5601145776883177
F1 score at threshold 0.11 is 0.5685676969960819
F1 score at threshold 0.12 is 0.5777202072538861
F1 score at threshold 0.13 is 0.5850620695343753
F1 score at threshold 0.14 is 0.5920753764616808
F1 score at threshold 0.15 is 0.5988855550792971
F1 score at threshold 0.16 is 0.6054869804161033
F1 score at threshold 0.17 is 0.6114632740812979
F1 score at threshold 0.18 is 0.6162989644304367
F1 score at threshold 0.19 is 0.6213042087465157
F1 score at threshold 0.2 is 0.6263563015858296
F1 score at threshold 0.21 is 0.6286842476378508
F1 score at threshold 0.22 is 0.6322562196168144
F1 score at threshold 0.23 is 0.6359509320969767
F1 score at threshold 0.24 is 0.6397022818529038
F1 score at threshold 0.25 is 0.642089093701997
F1 score at threshold 0.26 is 0.6457403600260742
F1 score at threshold 0.27 is 0.6483003551496702
F1 score at threshold 0.28 is 0.65004105090312
F1 score at threshold 0.29 is 0.6515206654536001
F1 score at threshold 0.3

Now let us get the test set predictions as well and save them

In [27]:
pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)



Now that our model building is done, it might be a good idea to clean up some memory before we go to the next step.

In [29]:
del model, inp, x
import gc; gc.collect()
time.sleep(10)

In [71]:
EMBEDDING_FILE = "embeddings/glove.840B.300d/glove.840B.300d.txt"
def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE,encoding="utf8"))

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector
        
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total para

In [72]:
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2


 103424/1175509 [=>............................] - ETA: 2:59:30 - loss: 1.0187 - acc: 0.08 - ETA: 1:34:54 - loss: 0.8868 - acc: 0.19 - ETA: 1:06:24 - loss: 0.7702 - acc: 0.42 - ETA: 52:31 - loss: 0.6802 - acc: 0.5513 - ETA: 43:51 - loss: 0.6134 - acc: 0.62 - ETA: 38:03 - loss: 0.5622 - acc: 0.67 - ETA: 34:23 - loss: 0.5237 - acc: 0.71 - ETA: 31:21 - loss: 0.4895 - acc: 0.74 - ETA: 28:51 - loss: 0.4626 - acc: 0.76 - ETA: 26:51 - loss: 0.4366 - acc: 0.78 - ETA: 25:19 - loss: 0.4255 - acc: 0.79 - ETA: 24:09 - loss: 0.4114 - acc: 0.80 - ETA: 23:03 - loss: 0.3969 - acc: 0.81 - ETA: 22:10 - loss: 0.3869 - acc: 0.82 - ETA: 21:24 - loss: 0.3756 - acc: 0.83 - ETA: 20:41 - loss: 0.3679 - acc: 0.83 - ETA: 20:10 - loss: 0.3610 - acc: 0.84 - ETA: 19:35 - loss: 0.3549 - acc: 0.85 - ETA: 19:02 - loss: 0.3475 - acc: 0.85 - ETA: 18:35 - loss: 0.3456 - acc: 0.85 - ETA: 18:06 - loss: 0.3402 - acc: 0.86 - ETA: 17:42 - loss: 0.3360 - acc: 0.86 - ETA: 17:18 - loss: 0.3306 - acc: 0.86 - ETA: 16:58 - loss: 0.

 207872/1175509 [====>.........................] - ETA: 8:09 - loss: 0.1832 - acc: 0.935 - ETA: 8:08 - loss: 0.1830 - acc: 0.936 - ETA: 8:07 - loss: 0.1828 - acc: 0.936 - ETA: 8:07 - loss: 0.1825 - acc: 0.936 - ETA: 8:06 - loss: 0.1825 - acc: 0.936 - ETA: 8:05 - loss: 0.1821 - acc: 0.936 - ETA: 8:05 - loss: 0.1819 - acc: 0.936 - ETA: 8:04 - loss: 0.1817 - acc: 0.936 - ETA: 8:04 - loss: 0.1815 - acc: 0.936 - ETA: 8:03 - loss: 0.1812 - acc: 0.936 - ETA: 8:02 - loss: 0.1810 - acc: 0.936 - ETA: 8:02 - loss: 0.1807 - acc: 0.936 - ETA: 8:01 - loss: 0.1804 - acc: 0.936 - ETA: 8:00 - loss: 0.1802 - acc: 0.936 - ETA: 8:00 - loss: 0.1800 - acc: 0.936 - ETA: 7:59 - loss: 0.1797 - acc: 0.937 - ETA: 7:59 - loss: 0.1795 - acc: 0.937 - ETA: 7:59 - loss: 0.1793 - acc: 0.937 - ETA: 7:58 - loss: 0.1791 - acc: 0.937 - ETA: 7:58 - loss: 0.1789 - acc: 0.937 - ETA: 7:57 - loss: 0.1786 - acc: 0.937 - ETA: 7:56 - loss: 0.1784 - acc: 0.937 - ETA: 7:56 - loss: 0.1782 - acc: 0.937 - ETA: 7:55 - loss: 0.1779 - ac



















Epoch 2/2


 104448/1175509 [=>............................] - ETA: 6:33 - loss: 0.0807 - acc: 0.970 - ETA: 5:32 - loss: 0.0970 - acc: 0.965 - ETA: 5:15 - loss: 0.0883 - acc: 0.965 - ETA: 5:05 - loss: 0.0932 - acc: 0.963 - ETA: 4:58 - loss: 0.0951 - acc: 0.964 - ETA: 4:53 - loss: 0.0974 - acc: 0.962 - ETA: 4:50 - loss: 0.0966 - acc: 0.961 - ETA: 4:49 - loss: 0.0950 - acc: 0.962 - ETA: 4:47 - loss: 0.0946 - acc: 0.963 - ETA: 4:45 - loss: 0.0927 - acc: 0.963 - ETA: 4:45 - loss: 0.0930 - acc: 0.964 - ETA: 4:44 - loss: 0.0934 - acc: 0.963 - ETA: 4:44 - loss: 0.0939 - acc: 0.964 - ETA: 4:45 - loss: 0.0947 - acc: 0.963 - ETA: 4:44 - loss: 0.0955 - acc: 0.963 - ETA: 4:44 - loss: 0.0942 - acc: 0.963 - ETA: 4:44 - loss: 0.0964 - acc: 0.963 - ETA: 4:44 - loss: 0.0981 - acc: 0.962 - ETA: 4:44 - loss: 0.0962 - acc: 0.963 - ETA: 4:43 - loss: 0.0961 - acc: 0.963 - ETA: 4:43 - loss: 0.0962 - acc: 0.963 - ETA: 4:43 - loss: 0.0956 - acc: 0.963 - ETA: 4:43 - loss: 0.0951 - acc: 0.963 - ETA: 4:42 - loss: 0.0945 - ac

 208896/1175509 [====>.........................] - ETA: 4:11 - loss: 0.0957 - acc: 0.962 - ETA: 4:11 - loss: 0.0957 - acc: 0.962 - ETA: 4:11 - loss: 0.0957 - acc: 0.962 - ETA: 4:10 - loss: 0.0957 - acc: 0.962 - ETA: 4:10 - loss: 0.0957 - acc: 0.962 - ETA: 4:10 - loss: 0.0957 - acc: 0.962 - ETA: 4:10 - loss: 0.0959 - acc: 0.962 - ETA: 4:10 - loss: 0.0958 - acc: 0.962 - ETA: 4:10 - loss: 0.0957 - acc: 0.962 - ETA: 4:10 - loss: 0.0957 - acc: 0.962 - ETA: 4:10 - loss: 0.0957 - acc: 0.962 - ETA: 4:10 - loss: 0.0955 - acc: 0.962 - ETA: 4:09 - loss: 0.0955 - acc: 0.962 - ETA: 4:09 - loss: 0.0954 - acc: 0.962 - ETA: 4:09 - loss: 0.0954 - acc: 0.962 - ETA: 4:09 - loss: 0.0955 - acc: 0.962 - ETA: 4:09 - loss: 0.0955 - acc: 0.962 - ETA: 4:09 - loss: 0.0955 - acc: 0.962 - ETA: 4:09 - loss: 0.0954 - acc: 0.962 - ETA: 4:09 - loss: 0.0955 - acc: 0.962 - ETA: 4:08 - loss: 0.0955 - acc: 0.962 - ETA: 4:08 - loss: 0.0955 - acc: 0.962 - ETA: 4:08 - loss: 0.0955 - acc: 0.962 - ETA: 4:08 - loss: 0.0956 - ac





















<keras.callbacks.History at 0x130c6535dd8>

In [73]:
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)



In [74]:
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.6246177031159966
F1 score at threshold 0.11 is 0.6319880688751299
F1 score at threshold 0.12 is 0.6391496410822749
F1 score at threshold 0.13 is 0.6450226561405147
F1 score at threshold 0.14 is 0.6483084930210551
F1 score at threshold 0.15 is 0.6514729872373093
F1 score at threshold 0.16 is 0.6545913659981547
F1 score at threshold 0.17 is 0.6570418385385974
F1 score at threshold 0.18 is 0.6592173222089789
F1 score at threshold 0.19 is 0.6617115307352351
F1 score at threshold 0.2 is 0.6652445527959775
F1 score at threshold 0.21 is 0.6681384125680255
F1 score at threshold 0.22 is 0.6699165067676192
F1 score at threshold 0.23 is 0.6720083791568473
F1 score at threshold 0.24 is 0.6738889182476351
F1 score at threshold 0.25 is 0.6750559880558815
F1 score at threshold 0.26 is 0.6762032804517343
F1 score at threshold 0.27 is 0.6765758923727895
F1 score at threshold 0.28 is 0.6780421110199617
F1 score at threshold 0.29 is 0.6788413793103448
F1 score at threshold 

Results seem to be better than the model without pretrained embeddings.

In [75]:
pred_glove_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [76]:
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
import gc; gc.collect()
time.sleep(10)

# Wiki News FastText Embeddings:

Now let us use the FastText embeddings trained on Wiki News corpus in place of Glove embeddings and rebuild the model.

In [77]:
EMBEDDING_FILE = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE,encoding='utf8') if len(o)>100)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total para

In [78]:
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2


 103424/1175509 [=>............................] - ETA: 52:24 - loss: 0.7877 - acc: 0.05 - ETA: 28:34 - loss: 0.7346 - acc: 0.35 - ETA: 20:33 - loss: 0.6874 - acc: 0.54 - ETA: 16:31 - loss: 0.6462 - acc: 0.64 - ETA: 14:06 - loss: 0.6092 - acc: 0.70 - ETA: 12:28 - loss: 0.5768 - acc: 0.74 - ETA: 11:19 - loss: 0.5452 - acc: 0.76 - ETA: 10:28 - loss: 0.5214 - acc: 0.78 - ETA: 9:48 - loss: 0.4965 - acc: 0.8047 - ETA: 9:15 - loss: 0.4735 - acc: 0.817 - ETA: 8:49 - loss: 0.4551 - acc: 0.827 - ETA: 8:27 - loss: 0.4382 - acc: 0.837 - ETA: 8:08 - loss: 0.4206 - acc: 0.845 - ETA: 7:52 - loss: 0.4070 - acc: 0.852 - ETA: 7:38 - loss: 0.3988 - acc: 0.856 - ETA: 7:26 - loss: 0.3925 - acc: 0.860 - ETA: 7:15 - loss: 0.3824 - acc: 0.865 - ETA: 7:05 - loss: 0.3719 - acc: 0.870 - ETA: 6:57 - loss: 0.3645 - acc: 0.874 - ETA: 6:49 - loss: 0.3558 - acc: 0.878 - ETA: 6:42 - loss: 0.3503 - acc: 0.881 - ETA: 6:36 - loss: 0.3435 - acc: 0.884 - ETA: 6:30 - loss: 0.3405 - acc: 0.886 - ETA: 6:24 - loss: 0.3337 - a

 207872/1175509 [====>.........................] - ETA: 4:17 - loss: 0.1729 - acc: 0.939 - ETA: 4:17 - loss: 0.1728 - acc: 0.939 - ETA: 4:17 - loss: 0.1726 - acc: 0.939 - ETA: 4:17 - loss: 0.1723 - acc: 0.939 - ETA: 4:16 - loss: 0.1720 - acc: 0.939 - ETA: 4:16 - loss: 0.1716 - acc: 0.939 - ETA: 4:16 - loss: 0.1715 - acc: 0.939 - ETA: 4:16 - loss: 0.1713 - acc: 0.939 - ETA: 4:16 - loss: 0.1711 - acc: 0.939 - ETA: 4:16 - loss: 0.1709 - acc: 0.939 - ETA: 4:15 - loss: 0.1706 - acc: 0.939 - ETA: 4:15 - loss: 0.1706 - acc: 0.939 - ETA: 4:15 - loss: 0.1704 - acc: 0.939 - ETA: 4:15 - loss: 0.1700 - acc: 0.939 - ETA: 4:15 - loss: 0.1699 - acc: 0.939 - ETA: 4:14 - loss: 0.1697 - acc: 0.939 - ETA: 4:14 - loss: 0.1696 - acc: 0.939 - ETA: 4:14 - loss: 0.1693 - acc: 0.939 - ETA: 4:14 - loss: 0.1691 - acc: 0.939 - ETA: 4:14 - loss: 0.1691 - acc: 0.939 - ETA: 4:14 - loss: 0.1689 - acc: 0.939 - ETA: 4:13 - loss: 0.1687 - acc: 0.939 - ETA: 4:13 - loss: 0.1684 - acc: 0.940 - ETA: 4:13 - loss: 0.1682 - ac



















Epoch 2/2


 104448/1175509 [=>............................] - ETA: 6:06 - loss: 0.0711 - acc: 0.984 - ETA: 5:24 - loss: 0.0899 - acc: 0.967 - ETA: 5:11 - loss: 0.0958 - acc: 0.962 - ETA: 5:06 - loss: 0.0988 - acc: 0.959 - ETA: 5:01 - loss: 0.0932 - acc: 0.961 - ETA: 4:57 - loss: 0.0913 - acc: 0.961 - ETA: 4:55 - loss: 0.0941 - acc: 0.960 - ETA: 4:54 - loss: 0.0914 - acc: 0.962 - ETA: 4:52 - loss: 0.0943 - acc: 0.961 - ETA: 4:50 - loss: 0.0942 - acc: 0.961 - ETA: 4:52 - loss: 0.0982 - acc: 0.961 - ETA: 4:53 - loss: 0.0975 - acc: 0.961 - ETA: 4:52 - loss: 0.0972 - acc: 0.961 - ETA: 4:52 - loss: 0.0960 - acc: 0.961 - ETA: 4:51 - loss: 0.0952 - acc: 0.962 - ETA: 4:50 - loss: 0.0946 - acc: 0.962 - ETA: 4:48 - loss: 0.0958 - acc: 0.962 - ETA: 4:47 - loss: 0.0949 - acc: 0.962 - ETA: 4:47 - loss: 0.0948 - acc: 0.963 - ETA: 4:47 - loss: 0.0944 - acc: 0.963 - ETA: 4:47 - loss: 0.0937 - acc: 0.964 - ETA: 4:46 - loss: 0.0924 - acc: 0.964 - ETA: 4:46 - loss: 0.0915 - acc: 0.964 - ETA: 4:45 - loss: 0.0905 - ac

 208896/1175509 [====>.........................] - ETA: 4:19 - loss: 0.0898 - acc: 0.964 - ETA: 4:18 - loss: 0.0897 - acc: 0.964 - ETA: 4:18 - loss: 0.0897 - acc: 0.964 - ETA: 4:18 - loss: 0.0898 - acc: 0.964 - ETA: 4:18 - loss: 0.0899 - acc: 0.964 - ETA: 4:18 - loss: 0.0899 - acc: 0.964 - ETA: 4:18 - loss: 0.0899 - acc: 0.964 - ETA: 4:18 - loss: 0.0898 - acc: 0.964 - ETA: 4:18 - loss: 0.0899 - acc: 0.964 - ETA: 4:17 - loss: 0.0898 - acc: 0.964 - ETA: 4:17 - loss: 0.0899 - acc: 0.964 - ETA: 4:17 - loss: 0.0899 - acc: 0.964 - ETA: 4:17 - loss: 0.0898 - acc: 0.964 - ETA: 4:17 - loss: 0.0898 - acc: 0.964 - ETA: 4:17 - loss: 0.0899 - acc: 0.964 - ETA: 4:17 - loss: 0.0899 - acc: 0.964 - ETA: 4:16 - loss: 0.0899 - acc: 0.964 - ETA: 4:16 - loss: 0.0899 - acc: 0.964 - ETA: 4:16 - loss: 0.0899 - acc: 0.964 - ETA: 4:16 - loss: 0.0898 - acc: 0.964 - ETA: 4:16 - loss: 0.0899 - acc: 0.964 - ETA: 4:16 - loss: 0.0899 - acc: 0.964 - ETA: 4:16 - loss: 0.0899 - acc: 0.964 - ETA: 4:16 - loss: 0.0899 - ac





















<keras.callbacks.History at 0x130c1003470>

In [79]:
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)



In [80]:
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_fasttext_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.5969949357552422
F1 score at threshold 0.11 is 0.6041033109178909
F1 score at threshold 0.12 is 0.6118893363684118
F1 score at threshold 0.13 is 0.6186009112838381
F1 score at threshold 0.14 is 0.624556616643929
F1 score at threshold 0.15 is 0.6291149175862228
F1 score at threshold 0.16 is 0.6355701548568747
F1 score at threshold 0.17 is 0.6393762183235868
F1 score at threshold 0.18 is 0.6439646428053905
F1 score at threshold 0.19 is 0.6481880509304603
F1 score at threshold 0.2 is 0.6518327207708353
F1 score at threshold 0.21 is 0.6547684882610226
F1 score at threshold 0.22 is 0.6578035859820701
F1 score at threshold 0.23 is 0.6598864223025297
F1 score at threshold 0.24 is 0.6631127835267064
F1 score at threshold 0.25 is 0.664765525982256
F1 score at threshold 0.26 is 0.6670941541092231
F1 score at threshold 0.27 is 0.6690095155709342
F1 score at threshold 0.28 is 0.6707843673134737
F1 score at threshold 0.29 is 0.6711920529801324
F1 score at threshold 0.

In [81]:
pred_fasttext_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [83]:
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
import gc; gc.collect()
time.sleep(10)

# Paragram Embeddings:

In this section, we can use the paragram embeddings and build the model and make predictions.

In [84]:
EMBEDDING_FILE = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [85]:
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2


 103424/1175509 [=>............................] - ETA: 1:00:00 - loss: 0.4700 - acc: 0.90 - ETA: 32:29 - loss: 0.4108 - acc: 0.9219 - ETA: 23:14 - loss: 0.3660 - acc: 0.92 - ETA: 18:32 - loss: 0.3306 - acc: 0.93 - ETA: 15:42 - loss: 0.3016 - acc: 0.93 - ETA: 13:49 - loss: 0.2971 - acc: 0.93 - ETA: 12:28 - loss: 0.2891 - acc: 0.93 - ETA: 11:28 - loss: 0.2787 - acc: 0.93 - ETA: 10:41 - loss: 0.2758 - acc: 0.93 - ETA: 10:03 - loss: 0.2717 - acc: 0.93 - ETA: 9:33 - loss: 0.2663 - acc: 0.9407 - ETA: 9:08 - loss: 0.2667 - acc: 0.940 - ETA: 8:46 - loss: 0.2654 - acc: 0.940 - ETA: 8:27 - loss: 0.2641 - acc: 0.940 - ETA: 8:11 - loss: 0.2614 - acc: 0.940 - ETA: 7:57 - loss: 0.2575 - acc: 0.940 - ETA: 7:44 - loss: 0.2531 - acc: 0.941 - ETA: 7:34 - loss: 0.2537 - acc: 0.940 - ETA: 7:24 - loss: 0.2502 - acc: 0.941 - ETA: 7:14 - loss: 0.2489 - acc: 0.940 - ETA: 7:06 - loss: 0.2484 - acc: 0.940 - ETA: 6:59 - loss: 0.2472 - acc: 0.940 - ETA: 6:52 - loss: 0.2470 - acc: 0.940 - ETA: 6:46 - loss: 0.2432

 207872/1175509 [====>.........................] - ETA: 4:20 - loss: 0.1565 - acc: 0.945 - ETA: 4:20 - loss: 0.1565 - acc: 0.945 - ETA: 4:20 - loss: 0.1562 - acc: 0.945 - ETA: 4:20 - loss: 0.1560 - acc: 0.945 - ETA: 4:19 - loss: 0.1558 - acc: 0.945 - ETA: 4:19 - loss: 0.1558 - acc: 0.945 - ETA: 4:19 - loss: 0.1555 - acc: 0.945 - ETA: 4:19 - loss: 0.1553 - acc: 0.945 - ETA: 4:19 - loss: 0.1552 - acc: 0.945 - ETA: 4:18 - loss: 0.1549 - acc: 0.945 - ETA: 4:18 - loss: 0.1549 - acc: 0.945 - ETA: 4:18 - loss: 0.1548 - acc: 0.945 - ETA: 4:18 - loss: 0.1546 - acc: 0.945 - ETA: 4:18 - loss: 0.1544 - acc: 0.945 - ETA: 4:17 - loss: 0.1540 - acc: 0.945 - ETA: 4:17 - loss: 0.1538 - acc: 0.945 - ETA: 4:17 - loss: 0.1535 - acc: 0.945 - ETA: 4:17 - loss: 0.1534 - acc: 0.945 - ETA: 4:17 - loss: 0.1534 - acc: 0.945 - ETA: 4:16 - loss: 0.1532 - acc: 0.945 - ETA: 4:16 - loss: 0.1530 - acc: 0.945 - ETA: 4:16 - loss: 0.1527 - acc: 0.945 - ETA: 4:16 - loss: 0.1525 - acc: 0.945 - ETA: 4:16 - loss: 0.1524 - ac



















Epoch 2/2


 104448/1175509 [=>............................] - ETA: 6:33 - loss: 0.0702 - acc: 0.976 - ETA: 5:36 - loss: 0.0811 - acc: 0.970 - ETA: 5:18 - loss: 0.0842 - acc: 0.966 - ETA: 5:09 - loss: 0.0879 - acc: 0.967 - ETA: 5:04 - loss: 0.0825 - acc: 0.969 - ETA: 5:01 - loss: 0.0922 - acc: 0.967 - ETA: 4:59 - loss: 0.0941 - acc: 0.965 - ETA: 4:56 - loss: 0.0924 - acc: 0.966 - ETA: 4:54 - loss: 0.0953 - acc: 0.964 - ETA: 4:52 - loss: 0.0950 - acc: 0.965 - ETA: 4:51 - loss: 0.0959 - acc: 0.964 - ETA: 4:50 - loss: 0.0957 - acc: 0.964 - ETA: 4:49 - loss: 0.0959 - acc: 0.963 - ETA: 4:48 - loss: 0.0967 - acc: 0.963 - ETA: 4:47 - loss: 0.0961 - acc: 0.963 - ETA: 4:47 - loss: 0.0971 - acc: 0.962 - ETA: 4:46 - loss: 0.0960 - acc: 0.963 - ETA: 4:46 - loss: 0.0939 - acc: 0.963 - ETA: 4:45 - loss: 0.0924 - acc: 0.964 - ETA: 4:45 - loss: 0.0930 - acc: 0.964 - ETA: 4:45 - loss: 0.0919 - acc: 0.964 - ETA: 4:45 - loss: 0.0921 - acc: 0.964 - ETA: 4:45 - loss: 0.0916 - acc: 0.964 - ETA: 4:44 - loss: 0.0904 - ac

 208896/1175509 [====>.........................] - ETA: 4:19 - loss: 0.0936 - acc: 0.962 - ETA: 4:19 - loss: 0.0936 - acc: 0.962 - ETA: 4:19 - loss: 0.0934 - acc: 0.962 - ETA: 4:19 - loss: 0.0935 - acc: 0.962 - ETA: 4:19 - loss: 0.0934 - acc: 0.962 - ETA: 4:19 - loss: 0.0935 - acc: 0.962 - ETA: 4:19 - loss: 0.0934 - acc: 0.962 - ETA: 4:19 - loss: 0.0933 - acc: 0.963 - ETA: 4:19 - loss: 0.0933 - acc: 0.962 - ETA: 4:19 - loss: 0.0933 - acc: 0.963 - ETA: 4:19 - loss: 0.0932 - acc: 0.963 - ETA: 4:18 - loss: 0.0932 - acc: 0.962 - ETA: 4:18 - loss: 0.0932 - acc: 0.962 - ETA: 4:18 - loss: 0.0934 - acc: 0.962 - ETA: 4:18 - loss: 0.0934 - acc: 0.962 - ETA: 4:18 - loss: 0.0934 - acc: 0.962 - ETA: 4:18 - loss: 0.0934 - acc: 0.962 - ETA: 4:18 - loss: 0.0932 - acc: 0.963 - ETA: 4:18 - loss: 0.0931 - acc: 0.963 - ETA: 4:17 - loss: 0.0930 - acc: 0.963 - ETA: 4:17 - loss: 0.0930 - acc: 0.963 - ETA: 4:17 - loss: 0.0931 - acc: 0.963 - ETA: 4:17 - loss: 0.0932 - acc: 0.962 - ETA: 4:17 - loss: 0.0932 - ac





















<keras.callbacks.History at 0x13009f2ea20>

In [87]:
pred_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)



In [88]:
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_paragram_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.6026412571046472
F1 score at threshold 0.11 is 0.6121715312847726
F1 score at threshold 0.12 is 0.6199965041076736
F1 score at threshold 0.13 is 0.6265317944833119
F1 score at threshold 0.14 is 0.6315360217489805
F1 score at threshold 0.15 is 0.6365060351976412
F1 score at threshold 0.16 is 0.641553579784745
F1 score at threshold 0.17 is 0.6463600076031173
F1 score at threshold 0.18 is 0.6498121930078012
F1 score at threshold 0.19 is 0.6534817853305988
F1 score at threshold 0.2 is 0.6576469422142928
F1 score at threshold 0.21 is 0.6608817775994382
F1 score at threshold 0.22 is 0.6632476896516706
F1 score at threshold 0.23 is 0.6647843942505134
F1 score at threshold 0.24 is 0.6658365758754864
F1 score at threshold 0.25 is 0.6684149795447393
F1 score at threshold 0.26 is 0.6707349429556911
F1 score at threshold 0.27 is 0.6719579557033303
F1 score at threshold 0.28 is 0.6739236393176279
F1 score at threshold 0.29 is 0.6747225107988409
F1 score at threshold 0

In [89]:
pred_paragram_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [90]:
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
import gc; gc.collect()
time.sleep(10)

# Observations:

* Overall pretrained embeddings seem to give better results comapred to non-pretrained model.
* The performance of the different pretrained embeddings are almost similar.

# Final Blend:

* Though the results of the models with different pre-trained embeddings are similar, there is a good chance that they might capture different type of information from the data. So let us do a blend of these three models by averaging their predictions.

In [91]:
pred_val_y = 0.33*pred_glove_val_y + 0.33*pred_fasttext_val_y + 0.34*pred_paragram_val_y

In [92]:
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.6103994606438564
F1 score at threshold 0.11 is 0.6195530244197084
F1 score at threshold 0.12 is 0.6269484808454425
F1 score at threshold 0.13 is 0.6325169032373619
F1 score at threshold 0.14 is 0.6377959927140255
F1 score at threshold 0.15 is 0.6439720253809459
F1 score at threshold 0.16 is 0.6500518232356545
F1 score at threshold 0.17 is 0.6546305583416916
F1 score at threshold 0.18 is 0.6586924939467311
F1 score at threshold 0.19 is 0.6628358135651491
F1 score at threshold 0.2 is 0.6658041401273885
F1 score at threshold 0.21 is 0.6701649931883547
F1 score at threshold 0.22 is 0.6720179968301038
F1 score at threshold 0.23 is 0.6748371082842072
F1 score at threshold 0.24 is 0.6763446475195823
F1 score at threshold 0.25 is 0.6782324404134149
F1 score at threshold 0.26 is 0.6794243070362473
F1 score at threshold 0.27 is 0.6800839476941292
F1 score at threshold 0.28 is 0.6815539255637055
F1 score at threshold 0.29 is 0.6826110806363137
F1 score at threshold 

The result seems to better than individual pre-trained models and so we let us create a submission file using this model blend.

In [93]:
pred_test_y = 0.33*pred_glove_test_y + 0.33*pred_fasttext_test_y + 0.34*pred_paragram_test_y
pred_test_y = (pred_test_y>0.35).astype(int)
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)