In [2]:
# Install PyTorch from the nightly/cu92 wheel index, then fastai.
# NOTE(review): neither package is referenced by the cells visible in this notebook,
# and no versions are pinned — confirm these installs are still needed.
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html


In [0]:
# Code to read csv file into Colaboratory:
# PyDrive is installed/upgraded quietly (-U -q) first so the imports below resolve.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# The application-default credentials obtained after authenticate_user() are attached
# to the GoogleAuth object, and `drive` is the client later cells use to fetch files by id.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns


def read_drive_csv(link, filename, sep='~'):
    """Download a Google Drive file to `filename` and load it as a DataFrame.

    Parameters
    ----------
    link : str
        Share link of the form ``...open?id=<file id>``; the file id is taken
        as the text after the last ``=``.
    filename : str
        Local path the downloaded content is written to.
    sep : str, default '~'
        Field separator handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded file.

    Raises
    ------
    ValueError
        If `link` does not contain a non-empty ``=<file id>`` suffix.
    """
    # Use a dedicated name instead of `id` so the builtin is not shadowed
    # for the rest of the notebook session.
    _, marker, file_id = link.rpartition('=')
    if not marker or not file_id:
        raise ValueError("Drive link has no '=<file id>' part: {0!r}".format(link))
    drive_file = drive.CreateFile({'id': file_id})
    drive_file.GetContentFile(filename)
    return pd.read_csv(filename, sep=sep)


train_df = read_drive_csv(
    'https://drive.google.com/open?id=1udJeq9cYA5BTagIh2I_TcBiqYn8FxJaj', 'train.csv'
)
test_df = read_drive_csv(
    'https://drive.google.com/open?id=1P0NgTOvLwUznerU0grO-dU-6ASCKgeBn', 'test.csv'
)
test_df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,9602,A friend and I stayed in this hotel when we we...,Edge,Desktop
1,8749,I enjoy staying here when I have early flights...,Google Chrome,Mobile
2,15500,I stopped off in Seattle during a train tour o...,Chrome,Mobile
3,5495,I have stayed at this hotel - or - times now f...,Mozilla Firefox,Desktop
4,18570,Excellent location with hop on hop off city tr...,Edge,Mobile


In [3]:
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D ,GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


Next steps are as follows:

Split the training dataset into train and validation samples. Cross-validation is time-consuming, so we use a simple train/validation split instead.
Fill the missing values in the text column with '_na_'
Tokenize the text column and convert them to vector sequences
Pad the sequences as needed - if the number of words in a text is greater than 'maxlen', truncate it to 'maxlen'; if it is less than 'maxlen', add zeros for the remaining values.

In [0]:
# Encode the response labels as integers ('Good' -> 1, 'Bad' -> 0).
# regex=True makes the dict keys act as patterns rather than exact values.
stars_dict = dict(Good=1, Bad=0)
train_df["Is_Response"] = train_df["Is_Response"].replace(
    to_replace=stars_dict,
    regex=True,
)

In [0]:
## configuration
embed_size = 300      # dimensionality of each word vector
max_features = 50000  # vocabulary size cap (number of rows in the embedding matrix)
maxlen = 100          # number of tokens kept per description

## hold out 10% of the training rows for validation
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=29)


def _description_texts(frame):
    """Return the Description column as an array, with missing entries set to "_na_"."""
    return frame["Description"].fillna("_na_").values


train_text = _description_texts(train_df)
val_text = _description_texts(val_df)
test_text = _description_texts(test_df)

## the vocabulary is learned from the training texts only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_text))


def _to_padded_sequences(texts):
    """Map texts to integer sequences and pad/truncate each one to `maxlen`."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)


train_X = _to_padded_sequences(train_text)
val_X = _to_padded_sequences(val_text)
test_X = _to_padded_sequences(test_text)

## targets for the two labelled splits
train_y = train_df['Is_Response'].values
val_y = val_df['Is_Response'].values

Without Pretrained Embeddings:

Now that we are done with all the necessary preprocessing steps, we can first train a Bidirectional GRU model. We will not use any pre-trained word embeddings for this model and the embeddings will be learnt from scratch. Please check out the model summary for the details of the layers used.

In [8]:

# Bidirectional GRU classifier with an embedding layer learned from scratch:
# token ids -> embeddings -> BiGRU over the sequence -> max over time
# -> small dense layer with dropout -> single sigmoid output.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)  

In [9]:

## Fit the model for two epochs, evaluating on the validation split after each epoch
model.fit(
    x=train_X,
    y=train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 27154 samples, validate on 3018 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f72a3aa6b00>

Now let us get the validation sample predictions and also get the best threshold for F1 score.

In [10]:
# Predicted probabilities for the validation split.
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Sweep decision thresholds from 0.10 to 0.50 in steps of 0.01, print the F1
# score of each, and remember the best one so it does not have to be read off
# the printed list by eye.
best_thresh, best_f1 = None, -1.0
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)  # remove floating-point noise from arange
    f1 = metrics.f1_score(val_y, (pred_noemb_val_y > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > best_f1:
        best_thresh, best_f1 = thresh, f1

print("Best F1 score is {0} at threshold {1}".format(best_f1, best_thresh))

F1 score at threshold 0.1 is 0.8302114803625378
F1 score at threshold 0.11 is 0.839169381107492
F1 score at threshold 0.12 is 0.8457588827274595
F1 score at threshold 0.13 is 0.8531671858774662
F1 score at threshold 0.14 is 0.8619457869300273
F1 score at threshold 0.15 is 0.8690299299511781
F1 score at threshold 0.16 is 0.8747855917667239
F1 score at threshold 0.17 is 0.8792207792207791
F1 score at threshold 0.18 is 0.8840358156802796
F1 score at threshold 0.19 is 0.8859649122807017
F1 score at threshold 0.2 is 0.8877618522601984
F1 score at threshold 0.21 is 0.8909574468085106
F1 score at threshold 0.22 is 0.8927458834000891
F1 score at threshold 0.23 is 0.8936550491510278
F1 score at threshold 0.24 is 0.8954219030520646
F1 score at threshold 0.25 is 0.8958380202474692
F1 score at threshold 0.26 is 0.8966606498194948
F1 score at threshold 0.27 is 0.8974417025130179
F1 score at threshold 0.28 is 0.899432463110102
F1 score at threshold 0.29 is 0.90154968094804
F1 score at threshold 0.3 

In [11]:
# Now let us get the test set predictions as well.
# The predictions are only kept in memory here; nothing is written to disk in this cell.
pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)



In [0]:
# Download the sample submission file from Google Drive and load it.
link = 'https://drive.google.com/open?id=1Dci_Yju-vCMM9o7yEU9p7QEND28PUmFs'
# The file id is the text after the last '='. A dedicated name is used instead of
# `id` so the builtin is not shadowed, and a malformed link fails with a clear error.
_, marker, file_id = link.rpartition('=')
if not marker or not file_id:
    raise ValueError("Drive link has no '=<file id>' part: {0!r}".format(link))
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('sample_submission.csv')
submission_1 = pd.read_csv('sample_submission.csv', sep='~')

In [13]:
# Decision threshold applied to the predicted test probabilities.
# NOTE(review): the validation sweep output in this notebook shows F1 still rising
# past 0.15 (about 0.869 at 0.15 vs about 0.902 at 0.29) — confirm 0.15 is intentional.
SUBMISSION_THRESHOLD = 0.15

check_1 = (pred_noemb_test_y > SUBMISSION_THRESHOLD).astype(int)

# NOTE(review): the id column is written as "qid" although it holds User_ID values;
# check this against the header the submission format expects.
out_df = pd.DataFrame({"qid": test_df["User_ID"].values})
# Flatten before assigning so the column is built from a 1-D array
# (the predictions presumably have shape (n, 1) — one sigmoid output per row).
out_df['Is_Response'] = check_1.ravel()

# Map the integer classes back to their label names. The keys are integers, so an
# exact-value lookup is used; the previous regex=True flag had no effect on them.
stars_dict = {1: 'Good', 0: 'Bad'}
out_df["Is_Response"] = out_df['Is_Response'].map(stars_dict)

out_df.to_csv("submission_1.csv", index=False)
out_df.head()

Unnamed: 0,qid,Is_Response
0,9602,Good
1,8749,Good
2,15500,Good
3,5495,Good
4,18570,Good
