In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, CuDNNLSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
## Load the labelled training set and the unlabelled test set
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

## Report (rows, columns) for each frame as a quick sanity check
for label, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(label, frame.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:
## Hold out 10% of the labelled rows for validation (fixed seed so the split is repeatable)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

## Config values (also consumed by the model definition cell)
embed_size = 300      # how big is each word vector
max_features = 50000  # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100          # max number of words in a question to use


def _question_texts(frame):
    """Return the question_text column as an array, with missing values replaced by "_na_"."""
    return frame["question_text"].fillna("_na_").values


## Build the vocabulary from the training questions only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(_question_texts(train_df)))


def _encode(frame):
    """Convert a frame's questions to integer sequences and pad them to maxlen."""
    sequences = tokenizer.texts_to_sequences(_question_texts(frame))
    return pad_sequences(sequences, maxlen=maxlen)


## Tokenize and pad every split with the same fitted tokenizer
train_X, val_X, test_X = (_encode(frame) for frame in (train_df, val_df, test_df))

## Get the target values
train_y, val_y = (frame['target'].values for frame in (train_df, val_df))

In [4]:
## Row counts of the train / validation split, one per line
print(len(train_df), len(val_df), sep="\n")

1175509
130613


In [5]:
## Baseline classifier: embedding -> bidirectional GRU -> max-pool -> small dense head.
## No pretrained weights are passed to the Embedding layer.
inp = Input(shape=(maxlen,))                                # (batch, maxlen) integer word ids
x = Embedding(max_features, embed_size)(inp)                # (batch, maxlen, embed_size)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)   # 64 units per direction -> 128 features per step
x = GlobalMaxPool1D()(x)                                    # max over the sequence axis -> (batch, 128)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)                       # single sigmoid output for the binary target
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit an extra "None" line after the table.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)  

In [6]:
## Train the model for two passes over the training split, scoring the
## held-out validation split after each epoch.
model.fit(
    x=train_X,
    y=train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fd210d42710>

In [12]:
## Score the validation split, then sweep decision thresholds from 0.10 to 0.50
## (step 0.01) and keep the one with the highest F1.
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# F1 is never negative, so starting below zero guarantees the first threshold is
# recorded even when every score is 0. best_threshold is therefore always a
# number after the loop, and the later `pred_noemb_test_y > best_threshold`
# comparison cannot fail on None. Whenever some score is positive the selected
# threshold is the same as with a starting value of 0.
best_score = -1.0
best_threshold = None
for thresh in np.arange(0.1, 0.501, 0.01):
    # arange with a float step is not guaranteed to yield exact two-decimal values
    thresh = np.round(thresh, 2)
    score = metrics.f1_score(val_y, (pred_noemb_val_y>thresh))
    print("F1 score at threshold {0} is {1}".format(thresh, score))
    # strict '>' keeps the lowest threshold among ties
    if score > best_score:
        best_score = score
        best_threshold = thresh

print('Best score: {0}, best threshold: {1}'.format(best_score, best_threshold))

F1 score at threshold 0.1 is 0.5981284229591617
F1 score at threshold 0.11 is 0.6070876914250408
F1 score at threshold 0.12 is 0.6146689549272875
F1 score at threshold 0.13 is 0.6206165614487812
F1 score at threshold 0.14 is 0.6261752538548326
F1 score at threshold 0.15 is 0.6313624186758515
F1 score at threshold 0.16 is 0.6348402182385035
F1 score at threshold 0.17 is 0.6380088254251575
F1 score at threshold 0.18 is 0.6406045340050378
F1 score at threshold 0.19 is 0.6440886069473577
F1 score at threshold 0.2 is 0.6469489414694894
F1 score at threshold 0.21 is 0.648384040425308
F1 score at threshold 0.22 is 0.6505149687816852
F1 score at threshold 0.23 is 0.6520917678812416
F1 score at threshold 0.24 is 0.6534740064505548
F1 score at threshold 0.25 is 0.6542056074766356
F1 score at threshold 0.26 is 0.6555431131019037
F1 score at threshold 0.27 is 0.6554279572325621
F1 score at threshold 0.28 is 0.6568941823179112
F1 score at threshold 0.29 is 0.6561342592592593
F1 score at threshold 0

In [13]:
## Predicted probabilities for the test questions
pred_noemb_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [14]:
## Turn probabilities into 0/1 labels using the threshold chosen on the validation split
is_positive = pred_noemb_test_y > best_threshold
pred_test_y = is_positive.astype(int)

## Write the submission file: one row per test question id with its predicted label
out_df = pd.DataFrame({"qid": test_df["qid"].values}).assign(prediction=pred_test_y)
out_df.to_csv("submission.csv", index=False)