In [1]:
import enchant
import enchant.checker as checker
from keras.preprocessing.text import Tokenizer
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, LSTM, MaxPooling1D, Merge
from keras.layers import Dropout, Activation, Dense, Flatten, BatchNormalization, Reshape
from keras.models import Sequential
from keras.preprocessing import sequence
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
import json
import os
import csv
import pickle
import keras
from keras.utils.np_utils import to_categorical

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1050 Ti (CNMeM is disabled, cuDNN 5110)


In [208]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import lightgbm as lgb

## Preprocessing Text Data

In [2]:
def clean_text(text):
    # Remove punctuations
    text = re.sub(r'[^A-Za-z0-9\s]', r'', text)
    # Remove newlines
    text = text.replace('\n', ' ')
    # Convert everything to lowercase
    return text.lower().strip()

In [3]:
tripadvisor = pd.read_csv('tripadvisor_1245-m.csv')
tripadvisor.head()

Unnamed: 0,Review,Rating
0,"Usually stay near the airport, but this trip w...",5
1,Stayed at this Hilton for 2 nights. It was lik...,4
2,"Stayed there one night, December 16, on the wa...",4
3,I just stayed here last weekend and have alrea...,5
4,My mother who is 90 and I stayed one night on ...,5


In [4]:
len(tripadvisor)

724868

In [5]:
tripadvisor['Review'] = tripadvisor['Review'].apply(clean_text)

In [6]:
tripadvisor['Rating'] = tripadvisor['Rating'].apply(lambda x: 0 if x in ('1', '2') else 1)

In [7]:
tripadvisor.head()

Unnamed: 0,Review,Rating
0,usually stay near the airport but this trip we...,1
1,stayed at this hilton for 2 nights it was like...,1
2,stayed there one night december 16 on the way ...,1
3,i just stayed here last weekend and have alrea...,1
4,my mother who is 90 and i stayed one night on ...,1


In [8]:
tripadvisor = tripadvisor.sample(frac=1).reset_index(drop=True)

In [9]:
tripadvisor.head()

Unnamed: 0,Review,Rating
0,for our first visit to paris we decided to sta...,1
1,though it has a terrific location and occupies...,0
2,my first time to paris and i stayed at the ren...,1
3,this hotel caters to american tourists on crui...,0
4,cest un hotel a oublier tres vite,0


In [10]:
print(len(tripadvisor[tripadvisor['Rating'] == 1]))
print(len(tripadvisor[tripadvisor['Rating'] == 0]))

500609
224259


## Joint Learning of TripAdvisor Model

In [11]:
MAX_FEATURES = 6000
MAXLEN = 500
BATCH_SIZE = 256
EMBEDDING_DIMS = 300
FILTERS_1 = 250
FILTERS_2 = 500
HIDDEN_DIMS = 250
EPOCHS = 40
KERNEL_SIZE = 3
PRETRAINING_EPOCHS = 20

In [12]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(tripadvisor['Review'])

In [15]:
# Save tokenizer
with open('tokenizer-1234-m.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [3]:
# Load Tokenizer
tokenizer = pickle.load(open('tokenizer-1234.pkl', 'rb'))

In [15]:
tripadvisor_bow = tokenizer.texts_to_sequences(tripadvisor['Review'])

In [13]:
tripadvisor_target = tripadvisor['Rating']

In [16]:
tripadvisor_bow = sequence.pad_sequences(tripadvisor_bow, MAXLEN)

In [17]:
tripadvisor_bow.shape

(724868, 500)

In [18]:
x_train = tripadvisor_bow

In [19]:
x_train, x_validate, y_train, y_validate = train_test_split(x_train, tripadvisor_target, 
                                                            random_state=42, test_size=0.01)

In [20]:
print(x_train.shape)
print(y_train.shape)
print(x_validate.shape)
print(y_validate.shape)

(717619, 500)
(717619,)
(7249, 500)
(7249,)


In [6]:
train = pd.read_csv('train.csv')
train['Description'] = train['Description'].apply(clean_text)

In [7]:
train['Is_Response'] = train['Is_Response'].apply(lambda x: 1 if x == 'happy' else 0)

In [8]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,the room was kind of clean but had a very stro...,Edge,Mobile,0
1,id10327,i stayed at the crown plaza april april th...,Internet Explorer,Mobile,0
2,id10328,i booked this hotel through hotwire at the low...,Mozilla,Tablet,0
3,id10329,stayed here with husband and sons on the way t...,InternetExplorer,Desktop,1
4,id10330,my girlfriends and i stayed here to celebrate ...,Edge,Tablet,0


In [9]:
train_bow = tokenizer.texts_to_sequences(train['Description'])
train_bow = sequence.pad_sequences(train_bow, MAXLEN)

In [11]:
train_bow.shape

(38932, 500)

In [12]:
train_y = train['Is_Response']
print(len(train_y))

38932


In [13]:
train_hx, val_hx, train_hy, val_hy = train_test_split(train_bow, train_y, 
                                                      random_state=42, test_size=0.1)
print(train_hx.shape)
print(val_hy.shape)
print(val_hx.shape)
print(val_hy.shape)

(35038, 500)
(3894,)
(3894, 500)
(3894,)


## ConvNet with 2 Convolutional Layers

In [44]:
model = Sequential()
model.add(Embedding(MAX_FEATURES, EMBEDDING_DIMS, input_length=MAXLEN))
model.add(Dropout(0.2))
model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(HIDDEN_DIMS))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 200)          1200000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 500, 200)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 498, 250)          150250    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 249, 250)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 247, 500)          375500    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 500)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               125250    
__________

In [28]:
# Monitor training accuracy and save the overtrained models
fpath = 'data-aug-joint-1/data-aug-2-convs-tripadvisor-overtrained-{epoch:02d}-{acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, monitor='acc', verbose=1, save_best_only=True)
model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, 
          validation_data=(x_validate, y_validate), callbacks=[checkpoint])

Train on 358899 samples, validate on 39878 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
  2176/358899 [..............................] - ETA: 12:52 - loss: 0.0219 - acc: 0.9913

KeyboardInterrupt: 

In [30]:
model.save('data-aug-joint-2/data-aug-2-convs-tripadvisor-11-0.967.h5py')

## Finetune all but the Embedding Layer of the Model (#1)
The model trained on TripAdvisor dataset is fine-tuned. Use the model from the last iteration of the training procedure for TripAdvisor dataset

In [47]:
model.layers[0].trainable = False
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
              metrics=['accuracy'])

In [48]:
fpath = 'data-aug-joint-2/finetuned-1-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100

KeyboardInterrupt: 

In [19]:
def first_model(model_weights=None):
    model = Sequential()
    model.add(Embedding(MAX_FEATURES, EMBEDDING_DIMS, input_length=MAXLEN))
    model.add(Dropout(0.2))
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(HIDDEN_DIMS))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    if model_weights is not None:
        model.load_weights(model_weights)
    return model

In [50]:
model = first_model('data-aug-joint-2/finetuned-1-06-0.903.h5py')

In [25]:
test = pd.read_csv('test.csv')
test['Description'] = test['Description'].apply(clean_text)
test_x = tokenizer.texts_to_sequences(test['Description'])
test_x = sequence.pad_sequences(test_x, MAXLEN)

In [55]:
test_y = model.predict_classes(test_x)



In [56]:
submission = pd.concat([test['User_ID'], pd.Series(test_y.flatten())], axis=1)
submission.head()

Unnamed: 0,User_ID,0
0,id80132,0
1,id80133,1
2,id80134,1
3,id80135,0
4,id80136,1


In [57]:
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


In [58]:
submission.to_csv('submission-joint-learning-2-convs-1.csv')

## Finetune all but the Embedding Layer of the Model (#2)
The model trained on TripAdvisor dataset is fine-tuned. Use the model from the last iteration of the training procedure for TripAdvisor dataset. Use the best model obtained when trained on TripAdvisor dataset

In [20]:
model = first_model('data-aug-joint-2/data-aug-2-convs-tripadvisor-07-0.968.h5py')

In [27]:
model.layers[0].trainable = False
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
              metrics=['accuracy'])

In [28]:
fpath = 'data-aug-joint-2/finetuned-2-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

KeyboardInterrupt: 

In [29]:
model = first_model('data-aug-joint-2/finetuned-2-08-0.901.h5py')

In [33]:
test_y = model.predict_classes(test_x)



In [34]:
submission = pd.concat([test['User_ID'], pd.Series(test_y.flatten())], axis=1)
submission.head()

Unnamed: 0,User_ID,0
0,id80132,0
1,id80133,1
2,id80134,0
3,id80135,0
4,id80136,1


In [35]:
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,not_happy
3,id80135,not_happy
4,id80136,happy


In [36]:
submission.to_csv('submission-joint-learning-2-convs-2.csv')

## TripAdvisor Overtrained models

In [17]:
model = Sequential()
model.add(Embedding(MAX_FEATURES, EMBEDDING_DIMS, input_length=MAXLEN))
model.add(Dropout(0.2))
model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(HIDDEN_DIMS))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 200)          1200000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 200)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 498, 250)          150250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 249, 250)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 247, 500)          375500    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 500)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               125250    
__________

In [None]:
# Monitor training accuracy and save the overtrained models
fpath = 'data-aug-joint-1/data-aug-2-convs-tripadvisor-overtrained-{epoch:02d}-{acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, monitor='acc', verbose=1, save_best_only=True)
model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=PRETRAINING_EPOCHS, 
          validation_data=(x_validate, y_validate), callbacks=[checkpoint])

Train on 394789 samples, validate on 3988 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30

## Finetuning Overtrained Models (#4)
Don't freeze any layer and finetune the TripAdvisor model on HackerEarth data.

In [15]:
def second_model(model_weights=None):
    model = Sequential()
    model.add(Embedding(MAX_FEATURES, EMBEDDING_DIMS, input_length=MAXLEN))
    model.add(Dropout(0.2))
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(HIDDEN_DIMS))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
                  metrics=['accuracy'])
    if model_weights is not None:
        model.load_weights(model_weights)
    return model

In [16]:
model = second_model('data-aug-joint-1/data-aug-2-convs-tripadvisor-overtrained-14-0.993.h5py')

In [17]:
fpath = 'data-aug-joint-2/finetuned-4-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100

KeyboardInterrupt: 

## Finetuning Overtrained Models (#4)
Freeze the Embedding layer and finetune the TripAdvisor model on HackerEarth data.

In [21]:
model = second_model('data-aug-joint-1/data-aug-2-convs-tripadvisor-overtrained-13-0.993.h5py')

In [22]:
model.layers[0].trainable = False
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
                  metrics=['accuracy'])

In [23]:
fpath = 'data-aug-joint-2/finetuned-5-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
 2752/35038 [=>............................] - ETA: 58s - loss: 0.0269 - acc: 0.9916

KeyboardInterrupt: 

In [24]:
model = second_model('data-aug-joint-2/finetuned-5-04-0.901.h5py')

In [28]:
test_y = model.predict_classes(test_x)



In [29]:
submission = pd.concat([test['User_ID'], pd.Series(test_y.flatten())], axis=1)
submission.head()

Unnamed: 0,User_ID,0
0,id80132,0
1,id80133,1
2,id80134,0
3,id80135,0
4,id80136,1


In [30]:
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,not_happy
3,id80135,not_happy
4,id80136,happy


In [31]:
submission.to_csv('submission-joint-learning-data-augmentation-overtrained-fintuned-5.csv')

## Convolutional Model With 4 Convolution Layers and 300-d Embedding

In [65]:
def conv_4_e300(model_weights=None):
    model = Sequential()
    model.add(Embedding(MAX_FEATURES, EMBEDDING_DIMS, input_length=MAXLEN))
    model.add(Dropout(0.2))
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(HIDDEN_DIMS, activation='relu') )
    model.add(Dropout(0.5))
    model.add(Dense(HIDDEN_DIMS))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
                  metrics=['accuracy'])
    if model_weights is not None:
        model.load_weights(model_weights)
    return model

In [36]:
model = conv_4_e300()

In [37]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 300)          1800000   
_________________________________________________________________
dropout_6 (Dropout)          (None, 500, 300)          0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 498, 250)          225250    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 249, 250)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 247, 250)          187750    
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 123, 250)          0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 121, 500)          375500    
__________

In [38]:
fpath = 'data-aug-joint-3/data-aug-4-convs-tripadvisor-{epoch:02d}-{acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='acc', save_best_only=True)
model.fit(x_train, y_train, validation_data=(x_validate, y_validate), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 717619 samples, validate on 7249 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
  6912/717619 [..............................] - ETA: 38:52 - loss: 0.0297 - acc: 0.9893

KeyboardInterrupt: 

In [39]:
model.save('data-aug-joint-3/data-aug-4-convs-tripadvisor-06-interrupted.h5py')

In [40]:
train = pd.read_csv('train.csv')
train['Description'] = train['Description'].apply(clean_text)

In [41]:
train_bow = tokenizer.texts_to_sequences(train['Description'])

In [42]:
train['Is_Response'] = train['Is_Response'].apply(lambda x: 1 if x == 'happy' else 0)

In [43]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,the room was kind of clean but had a very stro...,Edge,Mobile,0
1,id10327,i stayed at the crown plaza april april th...,Internet Explorer,Mobile,0
2,id10328,i booked this hotel through hotwire at the low...,Mozilla,Tablet,0
3,id10329,stayed here with husband and sons on the way t...,InternetExplorer,Desktop,1
4,id10330,my girlfriends and i stayed here to celebrate ...,Edge,Tablet,0


In [44]:
train_y = train['Is_Response']

In [46]:
train_bow = sequence.pad_sequences(train_bow, MAXLEN)

In [47]:
train_bow.shape

(38932, 500)

In [48]:
train_hx, val_hx, train_hy, val_hy = train_test_split(train_bow, train_y, 
                                                      random_state=42, test_size=0.1)
print(train_hx.shape)
print(val_hy.shape)
print(val_hx.shape)
print(val_hy.shape)

(35038, 500)
(3894,)
(3894, 500)
(3894,)


In [49]:
fpath = 'data-aug-joint-3/finetuned-1-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
 7936/35038 [=====>........................] - ETA: 1:28 - loss: 0.0184 - acc: 0.9937

KeyboardInterrupt: 

In [50]:
model = conv_4_e300('data-aug-joint-3/finetuned-1-02-0.906.h5py')

In [51]:
test = pd.read_csv('test.csv')
test['Description'] = test['Description'].apply(clean_text)

In [52]:
test_x = tokenizer.texts_to_sequences(test['Description'])

In [53]:
test_x = sequence.pad_sequences(test_x, MAXLEN)

In [54]:
test_y = model.predict_classes(test_x)



In [55]:
submission = pd.concat([test['User_ID'], pd.Series(test_y.flatten())], axis=1)
submission.head()

Unnamed: 0,User_ID,0
0,id80132,0
1,id80133,1
2,id80134,1
3,id80135,0
4,id80136,1


In [56]:
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


In [57]:
submission.to_csv('submission-joint-learning-4-convs-1-7L.csv')

## Convolutional Model With 6 Convolution Layers and 300-d Embedding

In [58]:
MAX_FEATURES = 6000
MAXLEN = 500
BATCH_SIZE = 350
EMBEDDING_DIMS = 400
FILTERS_1 = 128
FILTERS_2 = 256
FILTERS_3 = 512
HIDDEN_DIMS = 250
EPOCHS = 40
KERNEL_SIZE = 3
PRETRAINING_EPOCHS = 20

In [63]:
def conv_6_e400(model_weights=None):
    model = Sequential()
    model.add(Embedding(MAX_FEATURES, EMBEDDING_DIMS, input_length=MAXLEN))
    model.add(Dropout(0.2))
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_3, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_3, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(HIDDEN_DIMS * 2, activation='relu') )
    model.add(Dropout(0.5))
    model.add(Dense(HIDDEN_DIMS))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
                  metrics=['accuracy'])
    if model_weights is not None:
        model.load_weights(model_weights)
    return model

In [64]:
model = conv_6_e400()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 400)          2400000   
_________________________________________________________________
dropout_18 (Dropout)         (None, 500, 400)          0         
_________________________________________________________________
conv1d_33 (Conv1D)           (None, 498, 128)          153728    
_________________________________________________________________
max_pooling1d_33 (MaxPooling (None, 249, 128)          0         
_________________________________________________________________
conv1d_34 (Conv1D)           (None, 247, 128)          49280     
_________________________________________________________________
max_pooling1d_34 (MaxPooling (None, 123, 128)          0         
_________________________________________________________________
conv1d_35 (Conv1D)           (None, 121, 256)          98560     
__________

In [65]:
fpath = 'data-aug-joint-3/data-aug-6-convs-tripadvisor-{epoch:02d}-{acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='acc', save_best_only=True)
model.fit(x_train, y_train, validation_data=(x_validate, y_validate), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 717619 samples, validate on 7249 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
 36050/717619 [>.............................] - ETA: 27:53 - loss: 0.0970 - acc: 0.9621

KeyboardInterrupt: 

In [66]:
fpath = 'data-aug-joint-3/finetuned-2-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
 3500/35038 [=>............................] - ETA: 1:17 - loss: 0.1790 - acc: 0.9366

KeyboardInterrupt: 

## Finetuning the 4 Conv Layer Model (Last Saved) by Freezing the Embedding Layer (Finetuning #3)

In [71]:
model = conv_4_e300('data-aug-joint-3/data-aug-4-convs-tripadvisor-06-interrupted.h5py')

In [72]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 300)          1800000   
_________________________________________________________________
dropout_27 (Dropout)         (None, 500, 300)          0         
_________________________________________________________________
conv1d_47 (Conv1D)           (None, 498, 250)          225250    
_________________________________________________________________
max_pooling1d_47 (MaxPooling (None, 249, 250)          0         
_________________________________________________________________
conv1d_48 (Conv1D)           (None, 247, 250)          187750    
_________________________________________________________________
max_pooling1d_48 (MaxPooling (None, 123, 250)          0         
_________________________________________________________________
conv1d_49 (Conv1D)           (None, 121, 500)          375500    
__________

In [74]:
model.layers[0].trainable = False

In [77]:
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
                  metrics=['accuracy'])

In [78]:
fpath = 'data-aug-joint-3/finetuned-3-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40

KeyboardInterrupt: 

In [79]:
model = conv_4_e300('data-aug-joint-3/finetuned-3-02-0.907.h5py')

In [81]:
test_y = model.predict_classes(test_x)



In [83]:
submission = pd.concat([test['User_ID'], pd.Series(test_y.flatten())], axis=1)

In [84]:
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


In [85]:
submission.to_csv('submission-joint-learning-4-convs-2-7L.csv')

## Finetuning the Best 4 Conv Layer Model (Finetuning #4)

In [86]:
model = conv_4_e300('data-aug-joint-3/data-aug-4-convs-tripadvisor-05-0.982.h5py')

In [87]:
fpath = 'data-aug-joint-3/finetuned-4-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
 7424/35038 [=====>........................] - ETA: 1:30 - loss: 0.0191 - acc: 0.9937

KeyboardInterrupt: 

In [88]:
model = conv_4_e300('data-aug-joint-3/finetuned-4-01-0.905.h5py')
test_y = model.predict_classes(test_x)



In [89]:
submission = pd.concat([test['User_ID'], pd.Series(test_y.flatten())], axis=1)
submission.head()

Unnamed: 0,User_ID,0
0,id80132,0
1,id80133,1
2,id80134,1
3,id80135,0
4,id80136,1


In [90]:
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


In [91]:
submission.to_csv('submission-joint-learning-4-convs-3-7L.csv')

## Convolutional Model With 4 Convolution Layers and 300-d Embedding

In [12]:
MAX_FEATURES = 6000
MAXLEN = 500
BATCH_SIZE = 256
EMBEDDING_DIMS = 300
FILTERS_1 = 256
FILTERS_2 = 512
HIDDEN_DIMS = 256
EPOCHS = 40
KERNEL_SIZE = 3
PRETRAINING_EPOCHS = 20

In [16]:
tripadvisor_target = tripadvisor['Rating']

In [17]:
x_train = tokenizer.texts_to_sequences(tripadvisor['Review'])

In [18]:
x_train = sequence.pad_sequences(x_train, MAXLEN)

In [26]:
# Add a batch norm layer after convolution
def conv_4_e300_2(model_weights=None):
    model = Sequential()
    model.add(Embedding(MAX_FEATURES, EMBEDDING_DIMS, input_length=MAXLEN))
    model.add(Dropout(0.2))
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(HIDDEN_DIMS, activation='relu') )
    model.add(Dropout(0.5))
    model.add(Dense(HIDDEN_DIMS))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
                  metrics=['accuracy'])
    if model_weights is not None:
        model.load_weights(model_weights)
    return model

In [27]:
model = conv_4_e300_2()

In [28]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 300)          1800000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 500, 300)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 498, 256)          230656    
_________________________________________________________________
batch_normalization_5 (Batch (None, 498, 256)          1024      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 249, 256)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 247, 256)          196864    
_________________________________________________________________
batch_normalization_6 (Batch (None, 247, 256)          1024      
__________

In [22]:
x_train, x_validate, y_train, y_validate = train_test_split(x_train, tripadvisor_target, 
                                                            random_state=42, test_size=0.01)

In [23]:
print(x_train.shape)
print(y_train.shape)
print(x_validate.shape)
print(y_validate.shape)

(717619, 500)
(717619,)
(7249, 500)
(7249,)


In [29]:
fpath = 'data-aug-joint-3/data-aug-4-convs-batchnorm-tripadvisor-{epoch:02d}-{acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='acc', save_best_only=True)
model.fit(x_train, y_train, validation_data=(x_validate, y_validate), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 717619 samples, validate on 7249 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
  6656/717619 [..............................] - ETA: 49:27 - loss: 0.0411 - acc: 0.9865

KeyboardInterrupt: 

In [30]:
model.save('data-aug-joint-3/data-aug-4-convs-batchnorm-tripadvisor-06-interrupted-0.981.h5py')

## Finetune the ConvNet with BatchNorm without freezing any layer (#6)

In [32]:
train = pd.read_csv('train.csv')
train['Description'] = train['Description'].apply(clean_text)
train['Is_Response'] = train['Is_Response'].apply(lambda x: 1 if x == 'happy' else 0)
train_bow = tokenizer.texts_to_sequences(train['Description'])
train_bow = sequence.pad_sequences(train_bow, MAXLEN)

In [33]:
train_y = train['Is_Response']
train_hx, val_hx, train_hy, val_hy = train_test_split(train_bow, train_y, 
                                                      random_state=42, test_size=0.1)
print(train_hx.shape)
print(val_hy.shape)
print(val_hx.shape)
print(val_hy.shape)

(35038, 500)
(3894,)
(3894, 500)
(3894,)


In [34]:
fpath = 'data-aug-joint-3/finetuned-6-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
 2304/35038 [>.............................] - ETA: 2:17 - loss: 0.0147 - acc: 0.9957

KeyboardInterrupt: 

In [35]:
fpath = 'data-aug-joint-3/data-aug-4-convs-batchnorm-II-tripadvisor-{epoch:02d}-{acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='acc', save_best_only=True)
model.fit(x_train, y_train, validation_data=(x_validate, y_validate), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 717619 samples, validate on 7249 samples
Epoch 1/40
Epoch 2/40
  1024/717619 [..............................] - ETA: 52:09 - loss: 0.0454 - acc: 0.9795

KeyboardInterrupt: 

In [36]:
# Now that we have done an iteration on supplementary data, 
# make one more pass on the training data
fpath = 'data-aug-joint-3/finetuned-7-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy, validation_data=(val_hx, val_hy), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
 1024/35038 [..............................] - ETA: 2:21 - loss: 0.0143 - acc: 0.9951

KeyboardInterrupt: 

In [37]:
model = conv_4_e300_2('data-aug-joint-3/finetuned-6-01-0.904.h5py')

In [38]:
test = pd.read_csv('test.csv')
test['Description'] = test['Description'].apply(clean_text)
test_x = tokenizer.texts_to_sequences(test['Description'])
test_x = sequence.pad_sequences(test_x, MAXLEN)

In [39]:
test_y = model.predict_classes(test_x)



In [40]:
submission = pd.concat([test['User_ID'], pd.Series(test_y.flatten())], axis=1)
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


In [41]:
submission.to_csv('submission-joint-learning-4-convs-4-7L.csv')

## 4 Convolutions without BatchNorm. One Hot Encoded Labels.

In [50]:
MAX_FEATURES = 6000
MAXLEN = 500
BATCH_SIZE = 256
EMBEDDING_DIMS = 300
FILTERS_1 = 250
FILTERS_2 = 500
HIDDEN_DIMS = 250
EPOCHS = 40
KERNEL_SIZE = 3
PRETRAINING_EPOCHS = 20

In [51]:
def conv_4_e300_1hot(model_weights=None):
    model = Sequential()
    model.add(Embedding(MAX_FEATURES, EMBEDDING_DIMS, input_length=MAXLEN))
    model.add(Dropout(0.2))
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_1, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(FILTERS_2, KERNEL_SIZE, padding='valid', strides=1, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(HIDDEN_DIMS, activation='relu') )
    model.add(Dropout(0.5))
    model.add(Dense(HIDDEN_DIMS))
    model.add(Dropout(0.5))
    model.add(Dense(2))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=3e-4), 
                  metrics=['accuracy'])
    if model_weights is not None:
        model.load_weights(model_weights)
    return model

In [52]:
y_train_1hot = to_categorical(y_train, num_classes=2)
y_validate_1hot = to_categorical(y_validate, num_classes=2)
print(y_train_1hot.shape)
print(y_validate_1hot.shape)

(717619, 2)
(7249, 2)


In [53]:
model = conv_4_e300_1hot()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 300)          1800000   
_________________________________________________________________
dropout_13 (Dropout)         (None, 500, 300)          0         
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 498, 250)          225250    
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 249, 250)          0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 247, 250)          187750    
_________________________________________________________________
max_pooling1d_18 (MaxPooling (None, 123, 250)          0         
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 121, 500)          375500    
__________

In [54]:
fpath = 'data-aug-joint-3/data-aug-4-convs-tripadvisor-1hot-{epoch:02d}-{acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='acc', save_best_only=True)
model.fit(x_train, y_train_1hot, validation_data=(x_validate, y_validate_1hot), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 717619 samples, validate on 7249 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
 28416/717619 [>.............................] - ETA: 37:16 - loss: 0.0350 - acc: 0.9869

KeyboardInterrupt: 

In [57]:
val_hy1hot = to_categorical(val_hy, num_classes=2)
train_hy1hot = to_categorical(train_hy, num_classes=2)
fpath = 'data-aug-joint-3/finetuned-8-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, monitor='val_acc', save_best_only=True)
model.fit(train_hx, train_hy1hot, validation_data=(val_hx, val_hy1hot), 
          callbacks=[checkpoint], batch_size=BATCH_SIZE, epochs=EPOCHS)

Train on 35038 samples, validate on 3894 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40

KeyboardInterrupt: 

In [58]:
model = conv_4_e300_1hot('data-aug-joint-3/finetuned-8-02-0.907.h5py')

In [59]:
test_y = model.predict_classes(test_x)



In [62]:
submission = pd.concat([test['User_ID'], pd.Series(test_y.flatten())], axis=1)
submission.head()

Unnamed: 0,User_ID,0
0,id80132,0
1,id80133,1
2,id80134,1
3,id80135,0
4,id80136,1


In [63]:
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


In [64]:
submission.to_csv('submission-joint-learning-4-convs-5-7L.csv')

## Use other features along with text predictions

In [219]:
model = conv_4_e300('data-aug-joint-3/finetuned-1-02-0.906.h5py')

In [66]:
train = pd.read_csv('train.csv')
train['Description'] = train['Description'].apply(clean_text)

In [67]:
train_bow = tokenizer.texts_to_sequences(train['Description'])
train_bow = sequence.pad_sequences(train_bow, MAXLEN)

In [68]:
predicted_y = model.predict(train_bow)

In [70]:
train['Predicted'] = pd.Series(predicted_y.flatten())

In [221]:
train['Predicted_Class'] = pd.Series(model.predict_classes(train_bow).flatten())



In [222]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Predicted,Browser_Val,Device_Val,Predicted_Class
0,id10326,the room was kind of clean but had a very stro...,Edge,Mobile,not happy,0.099726,1,1,0
1,id10327,i stayed at the crown plaza april april th...,Internet Explorer,Mobile,not happy,0.103372,5,1,0
2,id10328,i booked this hotel through hotwire at the low...,Mozilla,Tablet,not happy,0.007604,7,2,0
3,id10329,stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,0.999781,6,0,1
4,id10330,my girlfriends and i stayed here to celebrate ...,Edge,Tablet,not happy,0.208506,1,2,0


In [73]:
browser_labels = LabelEncoder()
browser_labels.fit(train['Browser_Used'])

LabelEncoder()

In [74]:
browser_vals = browser_labels.transform(train['Browser_Used'])

In [75]:
train['Browser_Val'] = pd.Series(browser_vals)

In [77]:
device_labels = LabelEncoder()
device_vals = device_labels.fit_transform(train['Device_Used'])

In [78]:
train['Device_Val'] = pd.Series(device_vals)

In [79]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Predicted,Browser_Val,Device_Val
0,id10326,the room was kind of clean but had a very stro...,Edge,Mobile,not happy,0.099726,1,1
1,id10327,i stayed at the crown plaza april april th...,Internet Explorer,Mobile,not happy,0.103372,5,1
2,id10328,i booked this hotel through hotwire at the low...,Mozilla,Tablet,not happy,0.007604,7,2
3,id10329,stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,0.999781,6,0
4,id10330,my girlfriends and i stayed here to celebrate ...,Edge,Tablet,not happy,0.208506,1,2


In [80]:
train.to_csv('train-modified.csv')

In [81]:
log_x_train = train[['Predicted', 'Browser_Val', 'Device_Val']]

In [83]:
log_y_train = train['Is_Response'].apply(lambda x: 1 if x == 'happy' else 0)

## XGBoost

In [109]:
lxtrain, lxval, lytrain, lyval = train_test_split(log_x_train, log_y_train, 
                                                  random_state=42, test_size=0.1)

In [110]:
xgbc = XGBClassifier()
xgbc.fit(lxtrain, lytrain)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [248]:
xgbc = XGBClassifier()
xgbc.fit(cxtrain, cytrain)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [250]:
predy = xgbc.predict(cxval)

In [251]:
accuracy_score(predy, cyval)

0.89445300462249611

In [120]:
test = pd.read_csv('test.csv')
test['Description'] = test['Description'].apply(clean_text)
ltext_x = tokenizer.texts_to_sequences(test['Description'])
ltext_x = sequence.pad_sequences(ltext_x, MAXLEN)

In [121]:
ltext_y = model.predict(ltext_x)

In [122]:
test['Predicted'] = pd.Series(ltext_y.flatten())

In [123]:
test['Browser_Val'] = pd.Series(pd.Series(browser_labels.transform(test['Browser_Used'])))

In [124]:
test['Device_Val'] = pd.Series(pd.Series(device_labels.transform(test['Device_Used'])))

In [125]:
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Predicted,Browser_Val
0,id80132,looking for a motel in close proximity to tv t...,Firefox,1,0.003549,2
1,id80133,walking distance to madison square garden and ...,InternetExplorer,0,0.89773,6
2,id80134,visited seattle on business spent nights in t...,IE,2,0.589763,4
3,id80135,this hotel location is excellent and the rooms...,Edge,1,0.01171,1
4,id80136,this hotel is awesome i love the service antho...,Mozilla,1,0.997413,7


In [129]:
log_x_test = test[['Predicted', 'Browser_Val', 'Device_Used']]

In [130]:
log_x_test.columns = ['Predicted', 'Browser_Val', 'Device_Val']

In [131]:
predictions = xgbc.predict(log_x_test)

In [132]:
submission = pd.concat([test['User_ID'], pd.Series(predictions.flatten())], axis=1)
submission.columns = ['User_ID', 'Is_Response']
submission['Is_Response'] = submission['Is_Response'].apply(lambda x: 'happy' if x == 1 else 'not_happy')
submission.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


In [133]:
submission.to_csv('submission-xgboost.csv')

## Logistic Regression

In [134]:
logreg = LogisticRegression()

In [136]:
logreg.fit(lxtrain, lytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [137]:
ys = logreg.predict(lxval)
accuracy_score(ys, lyval)

0.89753466872110943

In [143]:
print(min(train['Browser_Val']))
print(max(train['Browser_Val']))
print(min(train['Device_Val']))
print(max(train['Device_Val']))

0
10
0
2


## Light Gradient Boosting

In [214]:
d_train = lgb.Dataset(lxtrain, label=lytrain)

In [215]:
params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.05, 
    'max_depth': 7, 
    'num_leaves': 21, 
    'feature_fraction': 0.3, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 5}

In [216]:
lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)

[20]	cv_agg's binary_error: 0.681317 + 3.07796e-05
[40]	cv_agg's binary_error: 0.681317 + 3.07796e-05


In [217]:
nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))

In [218]:
## train the model
lgbmodel = lgb.train(params, d_train, num_boost_round=nround)

UnboundLocalError: local variable 'evaluation_result_list' referenced before assignment

## CatBoost

In [223]:
cat_x_train = train[['Predicted_Class', 'Browser_Val', 'Device_Val']]
cat_y_train = train['Is_Response'].apply(lambda x: 1 if x == 'happy' else 0)

In [224]:
cxtrain, cxval, cytrain, cyval = train_test_split(cat_x_train, cat_y_train, 
                                                  random_state=42, test_size=0.1)

In [231]:
cxtrain.dtypes

Predicted_Class    int32
Browser_Val        int64
Device_Val         int64
dtype: object

In [236]:
from catboost import CatBoostClassifier,cv, Pool

In [232]:
cat_cols = [0, 1, 2]
param = {
    'use_best_model':True,
    'loss_function':'CrossEntropy',
    'eval_metric':'Accuracy',
    'iterations':1000,
    'depth':6,
    'learning_rate':0.03,
    'rsm':0.3,
    'random_seed':2017,
    
    
}

In [233]:
my_dt =  Pool(cxtrain, 
           label=cytrain,
           cat_features=cat_cols,
           column_description=None,
           delimiter='\t',
           has_header=None,
           weight=None, 
           baseline=None,
           feature_names=None,
           thread_count=1)

In [238]:
ctb_cv = cv(param, my_dt, fold_count=5)



0: 0: 0: 0: 0: 1: 1: 1: 1: 1: 2: 2: 2: 2: 2: 3: 3: 3: 3: 3: 4: 4: 4: 4: 4: 5: 5: 5: 5: 5: 6: 6: 6: 6: 6: 7: 7: 7: 7: 7: 8: 8: 8: 8: 8: 9: 9: 9: 9: 9: 10: 10: 10: 10: 10: 11: 11: 11: 11: 11: 12: 12: 12: 12: 12: 13: 13: 13: 13: 13: 14: 14: 14: 14: 14: 15: 15: 15: 15: 15: 16: 16: 16: 16: 16: 17: 17: 17: 17: 17: 18: 18: 18: 18: 18: 19: 19: 19: 19: 19: 20: 20: 20: 20: 20: 21: 21: 21: 21: 21: 22: 22: 22: 22: 22: 23: 23: 23: 23: 23: 24: 24: 24: 24: 24: 25: 25: 25: 25: 25: 26: 26: 26: 26: 26: 27: 27: 27: 27: 27: 28: 28: 28: 28: 28: 29: 29: 29: 29: 29: 30: 30: 30: 30: 30: 31: 31: 31: 31: 31: 32: 32: 32: 32: 32: 33: 33: 33: 33: 33: 34: 34: 34: 34: 34: 35: 35: 35: 35: 35: 36: 36: 36: 36: 36: 37: 37: 37: 37: 37: 38: 38: 38: 38: 38: 39: 39: 39: 39: 39: 40: 40: 40: 40: 40: 41: 41: 41: 41: 41: 42: 42: 42: 42: 42: 43: 43: 43: 43: 43: 44: 44: 44: 44: 44: 45: 45: 45: 45: 45: 46: 46: 46: 46: 46: 47: 47: 47: 47: 47: 48: 48: 48: 48: 48: 49: 49: 49: 49: 49: 50: 50: 50: 50: 50: 51: 51: 51: 51: 51: 52: 52: 52

350: 350: 350: 351: 351: 351: 351: 351: 352: 352: 352: 352: 352: 353: 353: 353: 353: 353: 354: 354: 354: 354: 354: 355: 355: 355: 355: 355: 356: 356: 356: 356: 356: 357: 357: 357: 357: 357: 358: 358: 358: 358: 358: 359: 359: 359: 359: 359: 360: 360: 360: 360: 360: 361: 361: 361: 361: 361: 362: 362: 362: 362: 362: 363: 363: 363: 363: 363: 364: 364: 364: 364: 364: 365: 365: 365: 365: 365: 366: 366: 366: 366: 366: 367: 367: 367: 367: 367: 368: 368: 368: 368: 368: 369: 369: 369: 369: 369: 370: 370: 370: 370: 370: 371: 371: 371: 371: 371: 372: 372: 372: 372: 372: 373: 373: 373: 373: 373: 374: 374: 374: 374: 374: 375: 375: 375: 375: 375: 376: 376: 376: 376: 376: 377: 377: 377: 377: 377: 378: 378: 378: 378: 378: 379: 379: 379: 379: 379: 380: 380: 380: 380: 380: 381: 381: 381: 381: 381: 382: 382: 382: 382: 382: 383: 383: 383: 383: 383: 384: 384: 384: 384: 384: 385: 385: 385: 385: 385: 386: 386: 386: 386: 386: 387: 387: 387: 387: 387: 388: 388: 388: 388: 388: 389: 389: 389: 389: 389: 390: 390: 

679: 679: 679: 679: 680: 680: 680: 680: 680: 681: 681: 681: 681: 681: 682: 682: 682: 682: 682: 683: 683: 683: 683: 683: 684: 684: 684: 684: 684: 685: 685: 685: 685: 685: 686: 686: 686: 686: 686: 687: 687: 687: 687: 687: 688: 688: 688: 688: 688: 689: 689: 689: 689: 689: 690: 690: 690: 690: 690: 691: 691: 691: 691: 691: 692: 692: 692: 692: 692: 693: 693: 693: 693: 693: 694: 694: 694: 694: 694: 695: 695: 695: 695: 695: 696: 696: 696: 696: 696: 697: 697: 697: 697: 697: 698: 698: 698: 698: 698: 699: 699: 699: 699: 699: 700: 700: 700: 700: 700: 701: 701: 701: 701: 701: 702: 702: 702: 702: 702: 703: 703: 703: 703: 703: 704: 704: 704: 704: 704: 705: 705: 705: 705: 705: 706: 706: 706: 706: 706: 707: 707: 707: 707: 707: 708: 708: 708: 708: 708: 709: 709: 709: 709: 709: 710: 710: 710: 710: 710: 711: 711: 711: 711: 711: 712: 712: 712: 712: 712: 713: 713: 713: 713: 713: 714: 714: 714: 714: 714: 715: 715: 715: 715: 715: 716: 716: 716: 716: 716: 717: 717: 717: 717: 717: 718: 718: 718: 718: 718: 719: 

In [240]:
ctb_cv

defaultdict(list,
            {'Accuracy_test_avg': [0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
              0.9377479663193948,
         

In [241]:
best_round = ctb_cv['Accuracy_test_avg'].index(np.max(ctb_cv['Accuracy_test_avg']))

In [243]:
best_round

0

In [244]:
model = CatBoostClassifier(iterations=100, learning_rate=0.03,rsm = 0.3 ,depth=6, eval_metric='Accuracy', random_seed=2017)

In [245]:
model.fit(my_dt)

0: learn: 0.9376962	total: 46.9ms	remaining: 4.64s
1: learn: 0.9376962	total: 82.7ms	remaining: 4.05s
2: learn: 0.9377248	total: 109ms	remaining: 3.53s
3: learn: 0.9376962	total: 141ms	remaining: 3.39s
4: learn: 0.9376962	total: 169ms	remaining: 3.22s
5: learn: 0.9376962	total: 192ms	remaining: 3.01s
6: learn: 0.9376962	total: 209ms	remaining: 2.78s
7: learn: 0.9377248	total: 236ms	remaining: 2.71s
8: learn: 0.9377248	total: 262ms	remaining: 2.65s
9: learn: 0.9377248	total: 292ms	remaining: 2.63s
10: learn: 0.9377248	total: 314ms	remaining: 2.54s
11: learn: 0.9377248	total: 342ms	remaining: 2.51s
12: learn: 0.9377248	total: 360ms	remaining: 2.41s
13: learn: 0.9377248	total: 384ms	remaining: 2.36s
14: learn: 0.9377248	total: 412ms	remaining: 2.33s
15: learn: 0.9377248	total: 445ms	remaining: 2.34s
16: learn: 0.9377248	total: 463ms	remaining: 2.26s
17: learn: 0.9377248	total: 484ms	remaining: 2.2s
18: learn: 0.9377248	total: 504ms	remaining: 2.15s
19: learn: 0.9377248	total: 519ms	remain

<catboost.core.CatBoostClassifier at 0x7fb6b52e7d30>

In [246]:
preds = model.predict(cxval)

In [247]:
accuracy_score(preds, cyval)

0.89445300462249611

## Entity Embedding

In [188]:
models = []

In [189]:
model_browser = Sequential()
model_browser.add(Embedding(11, 7, input_length=1))
model_browser.add(Reshape(target_shape=(7,)))
models.append(model_browser)

In [190]:
model_device = Sequential()
model_device.add(Embedding(11, 2, input_length=1))
model_device.add(Reshape(target_shape=(2,)))
models.append(model_device)

In [191]:
model_text = Sequential()
model_text.add(Dense(1, input_dim=1))
models.append(model_text)

In [206]:
fmodel = Sequential()
fmodel.add(Merge(models, mode='concat'))
fmodel.add(Dense(512, activation='relu'))
fmodel.add(Dense(256, activation='relu'))
fmodel.add(Dense(1))
fmodel.add(Activation('sigmoid'))
fmodel.compile(loss='binary_crossentropy', optimizer='adam', 
                  metrics=['accuracy'])
fmodel.summary()

  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_10 (Merge)             (None, 10)                0         
_________________________________________________________________
dense_53 (Dense)             (None, 512)               5632      
_________________________________________________________________
dense_54 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_55 (Dense)             (None, 1)                 257       
_________________________________________________________________
activation_17 (Activation)   (None, 1)                 0         
Total params: 137,318
Trainable params: 137,318
Non-trainable params: 0
_________________________________________________________________


In [193]:
lxtrain_ = lxtrain.as_matrix()
lxval_ = lxval.as_matrix()
lytrain_ = lytrain.as_matrix()
lyval_ = lyval.as_matrix()

In [194]:
lytrain_[:10]

array([1, 0, 0, 1, 1, 1, 1, 1, 0, 0])

In [195]:
lxtrain_ = [lxtrain['Predicted'].as_matrix(), lxtrain['Browser_Val'].as_matrix(), 
            lxtrain['Device_Val'].as_matrix()]
lxval_ = [lxval['Predicted'].as_matrix(), lxval['Browser_Val'].as_matrix(), 
          lxval['Device_Val'].as_matrix()]
lytrain_ = lytrain.as_matrix()
lyval_ = lyval.as_matrix()

In [196]:
lxtrain_[0].shape

(35038,)

In [207]:
fpath = 'entity-embedding/entity-embedding-6-{epoch:02d}-{val_acc:.3f}.h5py'
checkpoint = ModelCheckpoint(filepath=fpath, verbose=1, 
                             monitor='val_acc', save_best_only=True)
fmodel.fit(lxtrain_, lytrain_, validation_data=(lxval_, lyval_), 
          callbacks=[checkpoint], batch_size=64, epochs=100)

Train on 35038 samples, validate on 3894 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

KeyboardInterrupt: 