In [8]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.compat.v1.keras.layers import CuDNNGRU

In [10]:
# The CSVs live on Google Drive. Mount it here when it is not mounted yet so
# this cell also works on a fresh kernel (the notebook's explicit mount cell
# sits further down and would otherwise run too late).
if not os.path.isdir("/content/drive/MyDrive"):
    from google.colab import drive
    drive.mount('/content/drive')

train_df = pd.read_csv("/content/drive/MyDrive/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


Next steps are as follows:

Split the training dataset into train and val sample. Cross validation is a time consuming process and so let us do simple train val split.
Fill up the missing values in the text column with 'na'
Tokenize the text column and convert them to vector sequences
Pad the sequences as needed - if the number of words in the text is greater than 'maxlen', truncate it to 'maxlen'; if it is less than 'maxlen', add zeros for the remaining values.

In [1]:
# Make Google Drive available under /content/drive for the data and embedding files.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [11]:
## Hold out 10% of the labelled rows for validation (fixed seed).
## NOTE: train_df is rebound to the 90% portion, so re-running this cell
## splits the already-reduced frame again.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

## Configuration
embed_size = 300      # length of each word vector
max_features = 50000  # vocabulary cap (number of rows in the embedding matrix)
maxlen = 100          # number of tokens kept per question

## Question strings, with missing entries replaced by a placeholder
train_X, val_X, test_X = (
    frame["question_text"].fillna("_na_").values
    for frame in (train_df, val_df, test_df)
)

## The vocabulary is fitted on the training questions only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

## Text -> integer sequences -> arrays of width maxlen
train_X, val_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (train_X, val_X, test_X)
)

## Targets for the train and validation splits
train_y, val_y = train_df['target'].values, val_df['target'].values

Without Pretrained Embeddings:

Now that we are done with all the necessary preprocessing steps, we can first train a Bidirectional GRU model. We will not use any pre-trained word embeddings for this model and the embeddings will be learnt from scratch. Please check out the model summary for the details of the layers used.

In [12]:
# Baseline network: the embedding layer is given no pretrained weights, so
# the word vectors are learned during training.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
# Collapse the sequence axis by taking the per-feature maximum.
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: one probability-like score per question.
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# emitted an extra "None" line after the table.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17    

Train the model using train sample and monitor the metric on the valid sample. This is just a sample model running for 2 epochs. Changing the epochs, batch_size and model parameters might give us a better model.

In [None]:
## Fit for two epochs, with (val_X, val_y) passed as validation data
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f0f9171a8d0>

Now let us get the validation sample predictions and also get the best threshold for F1 score.

In [None]:
# Validation scores from the baseline model, then F1 at each cut-off
# from 0.10 to 0.50 in steps of 0.01.
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    # Round away floating-point noise from arange so the printed value is clean.
    thresh = np.round(raw_thresh, 2)
    hard_labels = (pred_noemb_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.0
F1 score at threshold 0.11 is 0.0
F1 score at threshold 0.12 is 0.0
F1 score at threshold 0.13 is 0.0
F1 score at threshold 0.14 is 0.0
F1 score at threshold 0.15 is 0.0
F1 score at threshold 0.16 is 0.0
F1 score at threshold 0.17 is 0.0
F1 score at threshold 0.18 is 0.0
F1 score at threshold 0.19 is 0.0
F1 score at threshold 0.2 is 0.0
F1 score at threshold 0.21 is 0.0
F1 score at threshold 0.22 is 0.0
F1 score at threshold 0.23 is 0.0
F1 score at threshold 0.24 is 0.0
F1 score at threshold 0.25 is 0.0
F1 score at threshold 0.26 is 0.0
F1 score at threshold 0.27 is 0.0
F1 score at threshold 0.28 is 0.0
F1 score at threshold 0.29 is 0.0
F1 score at threshold 0.3 is 0.0
F1 score at threshold 0.31 is 0.0
F1 score at threshold 0.32 is 0.0
F1 score at threshold 0.33 is 0.0
F1 score at threshold 0.34 is 0.0
F1 score at threshold 0.35 is 0.0
F1 score at threshold 0.36 is 0.0
F1 score at threshold 0.37 is 0.0
F1 score at threshold 0.38 is 0.0
F1 score at thres

Now let us get the test set predictions as well and save them

In [None]:
# Score the test questions with the baseline model.
pred_noemb_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



Now that our model building is done, it might be a good idea to clean up some memory before we go to the next step.

In [None]:
# Drop the references to the model and its input/output tensors, then force a
# garbage-collection pass and pause before the next section.
import gc

del model
del inp
del x
gc.collect()
time.sleep(10)

So we got some baseline GRU model without pre-trained embeddings. Now let us use the provided embeddings and rebuild the model again to see the performance.

We have four different types of embeddings.

GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
glove.840B.300d - https://nlp.stanford.edu/projects/glove/
paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html

A very good explanation of the different types of embeddings is given in this kernel. Please refer to it for more details.

Glove Embeddings:

In this section, let us use the Glove embeddings and rebuild the GRU model.

In [6]:
!unzip drive/My\ Drive/embeddings.zip

Archive:  drive/My Drive/embeddings.zip
   creating: GoogleNews-vectors-negative300/
   creating: glove.840B.300d/
   creating: paragram_300_sl999/
   creating: wiki-news-300d-1M/
  inflating: glove.840B.300d/glove.840B.300d.txt  
  inflating: GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin  
  inflating: wiki-news-300d-1M/wiki-news-300d-1M.vec  
  inflating: paragram_300_sl999/README.txt  
  inflating: paragram_300_sl999/paragram_300_sl999.txt  


In [13]:
EMBEDDING_FILE = '/content/glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Return ``(word, vector)`` with ``arr`` converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


# Read inside a context manager so the file handle is closed once the
# word -> vector dictionary has been built.
with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file)

# np.stack needs a real sequence; a dict view raises a FutureWarning on older
# NumPy and an error on newer releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is the padding value), so a vocabulary
# of N words needs N + 1 rows.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words without a pretrained vector keep a random draw that matches
# the mean/std of the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

inp = Input(shape=(maxlen,))
# Size the layer from nb_words so it always agrees with the weight matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so no print() wrapper.
model.summary()

  if self.run_code(code, result):


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17  

In [14]:
# Fit the GloVe-initialised model for two epochs, with (val_X, val_y) as validation data.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f45bbc9be90>

In [15]:
# Validation scores from the GloVe model, then F1 at each cut-off
# from 0.10 to 0.50 in steps of 0.01.
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    # Round away floating-point noise from arange so the printed value is clean.
    thresh = np.round(raw_thresh, 2)
    hard_labels = (pred_glove_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.5734113060428849
F1 score at threshold 0.11 is 0.5828580544809158
F1 score at threshold 0.12 is 0.591649031417874
F1 score at threshold 0.13 is 0.5986833933672836
F1 score at threshold 0.14 is 0.6046923044522134
F1 score at threshold 0.15 is 0.6108922760746703
F1 score at threshold 0.16 is 0.6168752172401808
F1 score at threshold 0.17 is 0.6231488011283497
F1 score at threshold 0.18 is 0.6286938666190519
F1 score at threshold 0.19 is 0.6328358208955225
F1 score at threshold 0.2 is 0.6361972733095436
F1 score at threshold 0.21 is 0.6401518659135105
F1 score at threshold 0.22 is 0.6440376422117141
F1 score at threshold 0.23 is 0.6465639474430476
F1 score at threshold 0.24 is 0.6501672240802676
F1 score at threshold 0.25 is 0.6521969806588531
F1 score at threshold 0.26 is 0.6550163072579469
F1 score at threshold 0.27 is 0.6581108324728328
F1 score at threshold 0.28 is 0.6601527626227556
F1 score at threshold 0.29 is 0.6620296236989591
F1 score at threshold 0

In [16]:
# Score the test questions with the GloVe model; the result feeds the final blend.
pred_glove_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [17]:
# Release the embedding tables and the model before loading the next
# embedding set, then force a garbage-collection pass and pause.
import gc

del word_index
del embeddings_index
del all_embs
del embedding_matrix
del model
del inp
del x
gc.collect()
time.sleep(10)

Wiki News FastText Embeddings:

Now let us use the FastText embeddings trained on Wiki News corpus in place of Glove embeddings and rebuild the model.

In [18]:
EMBEDDING_FILE = '/content/wiki-news-300d-1M/wiki-news-300d-1M.vec'


def get_coefs(word, *arr):
    """Return ``(word, vector)`` with ``arr`` converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


# Read inside a context manager so the file handle is closed once the
# word -> vector dictionary has been built. Lines of 100 characters or fewer
# are skipped, as in the original filter.
with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
    embeddings_index = dict(
        get_coefs(*o.split(" ")) for o in emb_file if len(o) > 100
    )

# np.stack needs a real sequence; a dict view raises a FutureWarning on older
# NumPy and an error on newer releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is the padding value), so a vocabulary
# of N words needs N + 1 rows.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words without a pretrained vector keep a random draw that matches
# the mean/std of the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

inp = Input(shape=(maxlen,))
# Size the layer from nb_words so it always agrees with the weight matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  if self.run_code(code, result):


In [19]:
# Fit the FastText-initialised model for two epochs, with (val_X, val_y) as validation data.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f45bea58d50>

In [20]:
# Validation scores from the FastText model, then F1 at each cut-off
# from 0.10 to 0.50 in steps of 0.01.
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    # Round away floating-point noise from arange so the printed value is clean.
    thresh = np.round(raw_thresh, 2)
    hard_labels = (pred_fasttext_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.5978356983451935
F1 score at threshold 0.11 is 0.6062325381474318
F1 score at threshold 0.12 is 0.6133765494284087
F1 score at threshold 0.13 is 0.6208678606433218
F1 score at threshold 0.14 is 0.6255035987506223
F1 score at threshold 0.15 is 0.6294206687766616
F1 score at threshold 0.16 is 0.6336928226106359
F1 score at threshold 0.17 is 0.6379992464204974
F1 score at threshold 0.18 is 0.6428605482717521
F1 score at threshold 0.19 is 0.6458805926926975
F1 score at threshold 0.2 is 0.6480839638760069
F1 score at threshold 0.21 is 0.6514381567911589
F1 score at threshold 0.22 is 0.6537158796573022
F1 score at threshold 0.23 is 0.6555678503168695
F1 score at threshold 0.24 is 0.6571834992887624
F1 score at threshold 0.25 is 0.6598318638507279
F1 score at threshold 0.26 is 0.6621188630490956
F1 score at threshold 0.27 is 0.6640237859266601
F1 score at threshold 0.28 is 0.6659301346801347
F1 score at threshold 0.29 is 0.6682942362806495
F1 score at threshold 

In [21]:
# Score the test questions with the FastText model; the result feeds the final blend.
pred_fasttext_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [22]:
# Release the embedding tables and the model before loading the next
# embedding set, then force a garbage-collection pass and pause.
import gc

del word_index
del embeddings_index
del all_embs
del embedding_matrix
del model
del inp
del x
gc.collect()
time.sleep(10)

Paragram Embeddings:

In this section, we can use the paragram embeddings and build the model and make predictions.

In [23]:
EMBEDDING_FILE = '/content/paragram_300_sl999/paragram_300_sl999.txt'


def get_coefs(word, *arr):
    """Return ``(word, vector)`` with ``arr`` converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


# Read inside a context manager so the file handle is closed once the
# word -> vector dictionary has been built. Undecodable bytes are dropped
# (errors='ignore') and lines of 100 characters or fewer are skipped, as in
# the original filter.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as emb_file:
    embeddings_index = dict(
        get_coefs(*o.split(" ")) for o in emb_file if len(o) > 100
    )

# np.stack needs a real sequence; a dict view raises a FutureWarning on older
# NumPy and an error on newer releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is the padding value), so a vocabulary
# of N words needs N + 1 rows.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words without a pretrained vector keep a random draw that matches
# the mean/std of the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

inp = Input(shape=(maxlen,))
# Size the layer from nb_words so it always agrees with the weight matrix.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  if self.run_code(code, result):


In [24]:
# Fit the Paragram-initialised model for two epochs, with (val_X, val_y) as validation data.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f459cb9bc90>

In [25]:
# Validation scores from the Paragram model, then F1 at each cut-off
# from 0.10 to 0.50 in steps of 0.01.
pred_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    # Round away floating-point noise from arange so the printed value is clean.
    thresh = np.round(raw_thresh, 2)
    hard_labels = (pred_paragram_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.6025027294868565
F1 score at threshold 0.11 is 0.6107342206955774
F1 score at threshold 0.12 is 0.6178605790256275
F1 score at threshold 0.13 is 0.6238654564869194
F1 score at threshold 0.14 is 0.6290001807991321
F1 score at threshold 0.15 is 0.6334862385321101
F1 score at threshold 0.16 is 0.6389121728602031
F1 score at threshold 0.17 is 0.6425638363147212
F1 score at threshold 0.18 is 0.6466739888257486
F1 score at threshold 0.19 is 0.6505685942414711
F1 score at threshold 0.2 is 0.6539384148430999
F1 score at threshold 0.21 is 0.6562778272484416
F1 score at threshold 0.22 is 0.6583745441830261
F1 score at threshold 0.23 is 0.661120354963949
F1 score at threshold 0.24 is 0.663270504330107
F1 score at threshold 0.25 is 0.6644723307961325
F1 score at threshold 0.26 is 0.6663895672052788
F1 score at threshold 0.27 is 0.667190693287219
F1 score at threshold 0.28 is 0.6673723207197672
F1 score at threshold 0.29 is 0.6694100042753313
F1 score at threshold 0.3

In [26]:
# Score the test questions with the Paragram model; the result feeds the final blend.
pred_paragram_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [27]:
# Release the embedding tables and the model now that all three pretrained
# variants have been scored, then force a garbage-collection pass and pause.
import gc

del word_index
del embeddings_index
del all_embs
del embedding_matrix
del model
del inp
del x
gc.collect()
time.sleep(10)

Observations:

Overall, pretrained embeddings seem to give better results compared to the non-pretrained model.
The performance of the different pretrained embeddings is very similar.
Final Blend:

Though the results of the models with different pre-trained embeddings are similar, there is a good chance that they might capture different type of information from the data. So let us do a blend of these three models by averaging their predictions.

In [29]:
# Weighted blend of the three pretrained-embedding models (weights sum to 1.0),
# followed by the same F1-vs-threshold sweep used for the individual models.
glove_part = 0.33 * pred_glove_val_y
fasttext_part = 0.33 * pred_fasttext_val_y
paragram_part = 0.34 * pred_paragram_val_y
pred_val_y = glove_part + fasttext_part + paragram_part
for raw_thresh in np.arange(0.1, 0.501, 0.01):
    # Round away floating-point noise from arange so the printed value is clean.
    thresh = np.round(raw_thresh, 2)
    hard_labels = (pred_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.590784155214228
F1 score at threshold 0.11 is 0.6008104531921932
F1 score at threshold 0.12 is 0.6088534413638858
F1 score at threshold 0.13 is 0.6159728487348025
F1 score at threshold 0.14 is 0.6229221347331583
F1 score at threshold 0.15 is 0.6286857905270179
F1 score at threshold 0.16 is 0.6345425013535463
F1 score at threshold 0.17 is 0.6393517670756271
F1 score at threshold 0.18 is 0.6441211896599649
F1 score at threshold 0.19 is 0.6487095260441108
F1 score at threshold 0.2 is 0.6526075805072671
F1 score at threshold 0.21 is 0.6565472187530023
F1 score at threshold 0.22 is 0.6591317002182877
F1 score at threshold 0.23 is 0.661601803744731
F1 score at threshold 0.24 is 0.6648809523809524
F1 score at threshold 0.25 is 0.66790278125783
F1 score at threshold 0.26 is 0.6707583058584835
F1 score at threshold 0.27 is 0.672371387579422
F1 score at threshold 0.28 is 0.6749354005167959
F1 score at threshold 0.29 is 0.6762785352348116
F1 score at threshold 0.3 i

The result seems to be better than the individual pre-trained models, so let us create a submission file using this model blend.

In [30]:
# Blend the test predictions with the same weights used on the validation set.
pred_test_y = 0.33*pred_glove_test_y + 0.33*pred_fasttext_test_y + 0.34*pred_paragram_test_y
# Binarise at the chosen cut-off (0.35).
pred_test_y = (pred_test_y>0.35).astype(int)

# Fail loudly if predictions and test rows ever get out of step.
assert len(pred_test_y) == len(test_df), "prediction/test row count mismatch"

out_df = pd.DataFrame({"qid":test_df["qid"].values})
# The models end in Dense(1), so predictions have shape (n_samples, 1);
# flatten explicitly rather than relying on pandas to squeeze a 2-D array
# into a column.
out_df['prediction'] = pred_test_y.ravel()
out_df.to_csv("submission.csv", index=False)