In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.compat.v1.keras.layers import CuDNNGRU

In [3]:
# Attach Google Drive to the Colab runtime; the CSV files read by the next
# cell are addressed through this mount point.
from google.colab import drive

drive.mount(
    '/content/drive',
)

Mounted at /content/drive


In [4]:
# Read the labelled training questions and the unlabelled test questions
# from the mounted Drive, then report the size of each frame.
train_df = pd.read_csv("/content/drive/MyDrive/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/test.csv")

for shape_label, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(shape_label, frame.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


Next steps are as follows:

Split the training dataset into train and val sample. Cross validation is a time consuming process and so let us do simple train val split.
Fill up the missing values in the text column with the placeholder '_na_'.
Tokenize the text column and convert them to vector sequences
Pad the sequences as needed - if the number of words in the text is greater than 'maxlen', truncate it to 'maxlen'; if the number of words is less than 'maxlen', add zeros for the remaining positions.

In [5]:
## Hold out 10% of the labelled rows for validation.
## NOTE: `train_df` is rebound to the 90% split here, so running this cell a
## second time splits the already-split frame again.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

## Configuration shared by every model in the notebook
embed_size = 300      # dimensionality of each word vector
max_features = 50000  # vocabulary size (number of rows in the embedding matrix)
maxlen = 100          # number of tokens kept per question

## Raw question text for each split, with missing values replaced by "_na_"
raw_train, raw_val, raw_test = (
    frame["question_text"].fillna("_na_").values
    for frame in (train_df, val_df, test_df)
)

## The tokenizer vocabulary is learnt from the training split only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(raw_train))

## Text -> integer id sequences -> fixed-length (maxlen) arrays
train_X, val_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (raw_train, raw_val, raw_test)
)

## Binary targets for the two labelled splits
train_y, val_y = train_df['target'].values, val_df['target'].values

Without Pretrained Embeddings:

Now that we are done with all the necessary preprocessing steps, we can first train a Bidirectional GRU model. We will not use any pre-trained word embeddings for this model and the embeddings will be learnt from scratch. Please check out the model summary for the details of the layers used.

In [6]:
# Baseline network with embeddings learnt from scratch:
# token ids -> Embedding -> bidirectional GRU (per-step outputs) ->
# max-pool across the sequence -> small dense head -> sigmoid probability.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17    

Train the model using train sample and monitor the metric on the valid sample. This is just a sample model running for 2 epochs. Changing the epochs, batch_size and model parameters might give us a better model.

In [7]:
## Fit for two epochs, reporting loss/accuracy on the held-out split after each
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f023f831710>

Now let us get the validation sample predictions and also get the best threshold for F1 score.

In [8]:
# Score the validation split, then report F1 for every decision threshold
# from 0.10 to 0.50 in steps of 0.01.
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

noemb_thresholds = [np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)]
for thresh in noemb_thresholds:
    hard_labels = (pred_noemb_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.5878599544230371
F1 score at threshold 0.11 is 0.5958095238095238
F1 score at threshold 0.12 is 0.6026117789500995
F1 score at threshold 0.13 is 0.6094515289237966
F1 score at threshold 0.14 is 0.6161593453531766
F1 score at threshold 0.15 is 0.6212100425298395
F1 score at threshold 0.16 is 0.6264936997256706
F1 score at threshold 0.17 is 0.6296785141887433
F1 score at threshold 0.18 is 0.6324288171221096
F1 score at threshold 0.19 is 0.635914811229429
F1 score at threshold 0.2 is 0.6387676004513565
F1 score at threshold 0.21 is 0.6412714179289793
F1 score at threshold 0.22 is 0.6439127375087965
F1 score at threshold 0.23 is 0.6454600977198697
F1 score at threshold 0.24 is 0.6469346785401761
F1 score at threshold 0.25 is 0.6485613010842369
F1 score at threshold 0.26 is 0.6502135738016137
F1 score at threshold 0.27 is 0.6507064782724608
F1 score at threshold 0.28 is 0.6508346795907377
F1 score at threshold 0.29 is 0.6514391626689925
F1 score at threshold 0

Now let us get the test set predictions as well and save them

In [9]:
# Probabilities for the test questions from the no-pretrained-embedding model
pred_noemb_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



Now that our model building is done, it might be a good idea to clean up some memory before we go to the next step.

In [10]:
import gc

# Drop the references to the finished model and its graph tensors, force a
# garbage-collection pass, then pause (presumably to let memory be released
# before the next model is built).
del model
del inp, x
gc.collect()
time.sleep(10)

So we got some baseline GRU model without pre-trained embeddings. Now let us use the provided embeddings and rebuild the model again to see the performance.

We have four different types of embeddings.

GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
glove.840B.300d - https://nlp.stanford.edu/projects/glove/
paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html

A very good explanation of the different types of embeddings is given in this kernel. Please refer to it for more details.

Glove Embeddings:

In this section, let us use the Glove embeddings and rebuild the GRU model.

In [11]:
!unzip drive/My\ Drive/embeddings.zip

Archive:  drive/My Drive/embeddings.zip
   creating: GoogleNews-vectors-negative300/
   creating: glove.840B.300d/
   creating: paragram_300_sl999/
   creating: wiki-news-300d-1M/
  inflating: glove.840B.300d/glove.840B.300d.txt  
  inflating: GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin  
  inflating: wiki-news-300d-1M/wiki-news-300d-1M.vec  
  inflating: paragram_300_sl999/README.txt  
  inflating: paragram_300_sl999/paragram_300_sl999.txt  


In [12]:
EMBEDDING_FILE = '/content/glove.840B.300d/glove.840B.300d.txt'

def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')

# Read inside a context manager so the (large) file handle is closed as soon
# as the lookup table has been built.
with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
    embeddings_index = dict(get_coefs(*line.split(" ")) for line in emb_file)

# np.stack expects a real sequence of arrays; passing the dict view directly
# is deprecated in NumPy and rejected by newer releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of N words
# needs N + 1 rows. With more than max_features words this is still
# max_features, exactly as before.
nb_words = min(max_features, len(word_index) + 1)

# Words missing from the pretrained file keep a random vector drawn from the
# same mean/std as the pretrained ones.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Same architecture as the baseline, but the Embedding layer starts from the
# pretrained matrix; its row count is tied to the matrix so the two always agree.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table and returns None; print() around it only adds "None".
model.summary()

  if self.run_code(code, result):


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17  

In [13]:
# Train the GloVe-initialised model for two epochs with validation monitoring
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f00b85936d0>

In [14]:
# Validation probabilities from the GloVe model, followed by an F1 report
# for each decision threshold from 0.10 to 0.50 (step 0.01).
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)

glove_thresholds = [np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)]
for thresh in glove_thresholds:
    hard_labels = (pred_glove_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.5890056588520614
F1 score at threshold 0.11 is 0.5980675530597076
F1 score at threshold 0.12 is 0.6068354430379748
F1 score at threshold 0.13 is 0.6154110767113864
F1 score at threshold 0.14 is 0.6222183283686701
F1 score at threshold 0.15 is 0.6277079433003477
F1 score at threshold 0.16 is 0.6328666576050019
F1 score at threshold 0.17 is 0.6374850409647428
F1 score at threshold 0.18 is 0.642763772175537
F1 score at threshold 0.19 is 0.6466578299564477
F1 score at threshold 0.2 is 0.6510556621880998
F1 score at threshold 0.21 is 0.6548440384802253
F1 score at threshold 0.22 is 0.6590271971671667
F1 score at threshold 0.23 is 0.6615576013933815
F1 score at threshold 0.24 is 0.662945642882285
F1 score at threshold 0.25 is 0.6665311308767471
F1 score at threshold 0.26 is 0.6682063125481139
F1 score at threshold 0.27 is 0.6697461981626616
F1 score at threshold 0.28 is 0.6708237266820373
F1 score at threshold 0.29 is 0.673401385730153
F1 score at threshold 0.3

In [15]:
# Test-set probabilities from the GloVe model (used later in the blend)
pred_glove_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [16]:
import gc

# Release the GloVe lookup table, the stacked vectors and the model before
# loading the next embedding file, then collect garbage and pause briefly.
del word_index, embeddings_index
del all_embs, embedding_matrix
del model, inp, x
gc.collect()
time.sleep(10)

Wiki News FastText Embeddings:

Now let us use the FastText embeddings trained on Wiki News corpus in place of Glove embeddings and rebuild the model.

In [17]:
EMBEDDING_FILE = '/content/wiki-news-300d-1M/wiki-news-300d-1M.vec'

def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')

# Read inside a context manager so the file handle is closed once the table
# is built. Lines of 100 characters or fewer are skipped (presumably the
# short header line of the .vec format - confirm against the file).
with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
    embeddings_index = dict(
        get_coefs(*line.split(" ")) for line in emb_file if len(line) > 100
    )

# np.stack expects a real sequence of arrays; passing the dict view directly
# is deprecated in NumPy and rejected by newer releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of N words
# needs N + 1 rows. With more than max_features words this is still
# max_features, exactly as before.
nb_words = min(max_features, len(word_index) + 1)

# Words missing from the pretrained file keep a random vector drawn from the
# same mean/std as the pretrained ones.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Same architecture as the baseline, initialised from the FastText matrix; the
# Embedding row count is tied to the matrix so the two always agree.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table and returns None; print() around it only adds "None".
model.summary()

  if self.run_code(code, result):


Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17  

In [18]:
# Train the FastText-initialised model for two epochs with validation monitoring
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f00c8eaf550>

In [19]:
# Validation probabilities from the FastText model, followed by an F1 report
# for each decision threshold from 0.10 to 0.50 (step 0.01).
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)

fasttext_thresholds = [np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)]
for thresh in fasttext_thresholds:
    hard_labels = (pred_fasttext_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.6250792213671345
F1 score at threshold 0.11 is 0.6310091148845602
F1 score at threshold 0.12 is 0.637918004912148
F1 score at threshold 0.13 is 0.643629547748354
F1 score at threshold 0.14 is 0.6486354537909486
F1 score at threshold 0.15 is 0.6532394086714951
F1 score at threshold 0.16 is 0.6560772092088067
F1 score at threshold 0.17 is 0.6592634099128929
F1 score at threshold 0.18 is 0.6610256939428336
F1 score at threshold 0.19 is 0.6635303956928545
F1 score at threshold 0.2 is 0.6653616210782498
F1 score at threshold 0.21 is 0.665881597259394
F1 score at threshold 0.22 is 0.6659088448630508
F1 score at threshold 0.23 is 0.6663386902809664
F1 score at threshold 0.24 is 0.6668877099911582
F1 score at threshold 0.25 is 0.6678571428571428
F1 score at threshold 0.26 is 0.6682833295736522
F1 score at threshold 0.27 is 0.6693263421862532
F1 score at threshold 0.28 is 0.670040835106689
F1 score at threshold 0.29 is 0.6697247706422018
F1 score at threshold 0.3 

In [20]:
# Test-set probabilities from the FastText model (used later in the blend)
pred_fasttext_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [None]:
import gc

# Release the FastText lookup table, the stacked vectors and the model before
# loading the next embedding file, then collect garbage and pause briefly.
del word_index, embeddings_index
del all_embs, embedding_matrix
del model, inp, x
gc.collect()
time.sleep(10)

Paragram Embeddings:

In this section, we can use the paragram embeddings and build the model and make predictions.

In [26]:
EMBEDDING_FILE = '/content/paragram_300_sl999/paragram_300_sl999.txt'

def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')

# Read inside a context manager so the file handle is closed once the table
# is built. errors='ignore' drops bytes that are not valid UTF-8, and lines
# of 100 characters or fewer are skipped.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as emb_file:
    embeddings_index = dict(
        get_coefs(*line.split(" ")) for line in emb_file if len(line) > 100
    )

# np.stack expects a real sequence of arrays; passing the dict view directly
# is deprecated in NumPy and rejected by newer releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of N words
# needs N + 1 rows. With more than max_features words this is still
# max_features, exactly as before.
nb_words = min(max_features, len(word_index) + 1)

# Words missing from the pretrained file keep a random vector drawn from the
# same mean/std as the pretrained ones.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Same architecture as the baseline, initialised from the Paragram matrix; the
# Embedding row count is tied to the matrix so the two always agree.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table and returns None; print() around it only adds "None".
model.summary()

  if self.run_code(code, result):


Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 17  

In [27]:
# Train the Paragram-initialised model for two epochs with validation monitoring
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f00c5bf2b10>

In [28]:
# Validation probabilities from the Paragram model, followed by an F1 report
# for each decision threshold from 0.10 to 0.50 (step 0.01).
pred_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)

paragram_thresholds = [np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)]
for thresh in paragram_thresholds:
    hard_labels = (pred_paragram_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.5789034896184843
F1 score at threshold 0.11 is 0.5883069922398734
F1 score at threshold 0.12 is 0.5965188561956071
F1 score at threshold 0.13 is 0.6034919028340081
F1 score at threshold 0.14 is 0.6107053870607441
F1 score at threshold 0.15 is 0.6169162918649841
F1 score at threshold 0.16 is 0.6226264418811003
F1 score at threshold 0.17 is 0.626675060706898
F1 score at threshold 0.18 is 0.6311180039228208
F1 score at threshold 0.19 is 0.6353897303287772
F1 score at threshold 0.2 is 0.6396404831008332
F1 score at threshold 0.21 is 0.6432260815997726
F1 score at threshold 0.22 is 0.6464016876827924
F1 score at threshold 0.23 is 0.6499417701863355
F1 score at threshold 0.24 is 0.6528009418228196
F1 score at threshold 0.25 is 0.6550235673530141
F1 score at threshold 0.26 is 0.6572115866492932
F1 score at threshold 0.27 is 0.6600800526929117
F1 score at threshold 0.28 is 0.6622340425531914
F1 score at threshold 0.29 is 0.6643259152166054
F1 score at threshold 0

In [30]:
# Test-set probabilities from the Paragram model (used later in the blend)
pred_paragram_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [31]:
import gc

# Release the Paragram lookup table, the stacked vectors and the model, then
# collect garbage and pause briefly.
del word_index, embeddings_index
del all_embs, embedding_matrix
del model, inp, x
gc.collect()
time.sleep(10)

Observations:

Overall, pretrained embeddings seem to give better results compared to the non-pretrained model.
The performance of the different pretrained embeddings is almost the same.
Final Blend:

Though the results of the models with different pre-trained embeddings are similar, there is a good chance that they might capture different type of information from the data. So let us do a blend of these three models by averaging their predictions.

In [32]:
# Weighted average of the three pretrained-embedding models on the validation
# split (weights 0.33 / 0.33 / 0.34), then the same F1-per-threshold report.
pred_val_y = (
    0.33*pred_glove_val_y
    + 0.33*pred_fasttext_val_y
    + 0.34*pred_paragram_val_y
)

blend_thresholds = [np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)]
for thresh in blend_thresholds:
    hard_labels = (pred_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.5984329490913565
F1 score at threshold 0.11 is 0.6074950690335306
F1 score at threshold 0.12 is 0.6144501005261581
F1 score at threshold 0.13 is 0.6222183436600052
F1 score at threshold 0.14 is 0.6299296715036055
F1 score at threshold 0.15 is 0.635655403569164
F1 score at threshold 0.16 is 0.6410268206284215
F1 score at threshold 0.17 is 0.6470945724837549
F1 score at threshold 0.18 is 0.6515734016801936
F1 score at threshold 0.19 is 0.6558798035249928
F1 score at threshold 0.2 is 0.6603598771151314
F1 score at threshold 0.21 is 0.6643273697736484
F1 score at threshold 0.22 is 0.6679337768218877
F1 score at threshold 0.23 is 0.6707848101265822
F1 score at threshold 0.24 is 0.6718158567774937
F1 score at threshold 0.25 is 0.6726680117957473
F1 score at threshold 0.26 is 0.6747214812490193
F1 score at threshold 0.27 is 0.6757070493879274
F1 score at threshold 0.28 is 0.6772955102911379
F1 score at threshold 0.29 is 0.6795196294900103
F1 score at threshold 0

The result seems to be better than the individual pre-trained models, so let us create a submission file using this model blend.

In [33]:
# Blend the three test-set predictions with the same weights used on the
# validation split, cut at 0.35 to get 0/1 labels, and write qid/prediction
# pairs to submission.csv.
blended_test_prob = (
    0.33*pred_glove_test_y
    + 0.33*pred_fasttext_test_y
    + 0.34*pred_paragram_test_y
)
pred_test_y = (blended_test_prob > 0.35).astype(int)

out_df = pd.DataFrame({"qid": test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)