In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from keras.utils.np_utils import to_categorical

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Input, Embedding, LSTM, Dense,Dropout,GRU,Masking
from keras.models import Model
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from gensim.models import Word2Vec

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix



Using TensorFlow backend.


In [2]:
# Load the labelled training reviews and the test reviews (which have no Is_Response column).
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Stack train on top of test so both go through identical preprocessing.
# ignore_index gives the combined frame a unique 0..N-1 index instead of
# repeating each input's own 0-based row labels.
tot_data = pd.concat([train, test], ignore_index=True)

In [3]:
# Preview the first rows of the training set, including the Is_Response label column.
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
# Preview the first rows of the test set (same columns as train, minus Is_Response).
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


In [5]:
# Work on an independent copy of the browser/device columns so that later
# assignments to other_data never write into a slice of tot_data.
other_data = tot_data[['Browser_Used', 'Device_Used']].copy()

In [6]:
# Preview the browser/device columns held in other_data.
other_data.head()

Unnamed: 0,Browser_Used,Device_Used
0,Edge,Mobile
1,Internet Explorer,Mobile
2,Mozilla,Tablet
3,InternetExplorer,Desktop
4,Edge,Tablet


In [7]:
# Count rows per browser label to see which spellings refer to the same browser.
other_data.Browser_Used.value_counts()

Firefox              13043
Edge                 12437
InternetExplorer      8191
Google Chrome         8050
Mozilla Firefox       7526
Mozilla               5425
Chrome                4356
IE                    4270
Internet Explorer     3700
Safari                 670
Opera                  668
Name: Browser_Used, dtype: int64

In [8]:
# Collapse the different spellings of each browser into one canonical label.
browser_aliases = {
    'Mozilla Firefox': 'Firefox',
    'Mozilla': 'Firefox',
    'IE': 'InternetExplorer',
    'Internet Explorer': 'InternetExplorer',
    'Chrome': 'Google Chrome',
}
# assign() builds a new frame instead of setting a column on other_data in
# place, so nothing is written into a slice of tot_data; running the cell
# again is a no-op because none of the canonical labels is itself an alias.
other_data = other_data.assign(
    Browser_Used=other_data['Browser_Used'].replace(browser_aliases)
)

In [9]:
# Re-count the browser labels after the replacements above.
other_data.Browser_Used.value_counts()

Firefox             25994
InternetExplorer    16161
Edge                12437
Google Chrome       12406
Safari                670
Opera                 668
Name: Browser_Used, dtype: int64

In [10]:
# Count rows per device type.
other_data.Device_Used.value_counts()

Desktop    26375
Mobile     26214
Tablet     15747
Name: Device_Used, dtype: int64

In [11]:
# One-hot encode the browser and device columns.
other_data=pd.get_dummies(other_data)

In [12]:
# Keep only the review text: the identifier, the browser/device columns and
# the label are removed from the frame used for the text pipeline.
# A single non-inplace drop with errors='ignore' means running this cell a
# second time on the already-trimmed frame does not raise KeyError.
tot_data = tot_data.drop(
    ['User_ID', 'Browser_Used', 'Device_Used', 'Is_Response'],
    axis=1,
    errors='ignore',
)
tot_data.head()

Unnamed: 0,Description
0,The room was kind of clean but had a VERY stro...
1,I stayed at the Crown Plaza April -- - April -...
2,I booked this hotel through Hotwire at the low...
3,Stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...


In [13]:
# Keep the Is_Response column of the training set as the prediction target.
target=train['Is_Response']

In [14]:
# Lower-case every review. From here on tot_data is a NumPy array of the
# Description values rather than a DataFrame.
tot_data = np.array(tot_data['Description'])
for idx, review in enumerate(tot_data):
    tot_data[idx] = review.lower()

In [15]:
# Split each review into runs of word characters (the \w+ pattern)
tokenizer = RegexpTokenizer(r'\w+')
for idx, review in enumerate(tot_data):
    tot_data[idx] = tokenizer.tokenize(review)

In [16]:
# English stop-word set used to filter the tokenised reviews
from nltk.corpus import stopwords
stop = {word for word in stopwords.words('english')}

In [17]:
# Inspect the tokens of the first review (stop words still present at this point).
tot_data[0]

['the',
 'room',
 'was',
 'kind',
 'of',
 'clean',
 'but',
 'had',
 'a',
 'very',
 'strong',
 'smell',
 'of',
 'dogs',
 'generally',
 'below',
 'average',
 'but',
 'ok',
 'for',
 'a',
 'overnight',
 'stay',
 'if',
 'you',
 're',
 'not',
 'too',
 'fussy',
 'would',
 'consider',
 'staying',
 'again',
 'if',
 'the',
 'price',
 'was',
 'right',
 'breakfast',
 'was',
 'free',
 'and',
 'just',
 'about',
 'better',
 'than',
 'nothing']

In [18]:
# Drop stop words, storing each review as one space-separated string
for idx, tokens in enumerate(tot_data):
    kept = (word for word in tokens if word not in stop)
    tot_data[idx] = " ".join(kept)

In [19]:
# Turn the space-joined, stop-word-free strings back into token lists
tokenizer = RegexpTokenizer(r'\w+')
split_words = tokenizer.tokenize
for pos in range(len(tot_data)):
    tot_data[pos] = split_words(tot_data[pos])

In [20]:
# Inspect the first review again after the stop-word filtering and re-tokenizing.
tot_data[0]

['room',
 'kind',
 'clean',
 'strong',
 'smell',
 'dogs',
 'generally',
 'average',
 'ok',
 'overnight',
 'stay',
 'fussy',
 'would',
 'consider',
 'staying',
 'price',
 'right',
 'breakfast',
 'free',
 'better',
 'nothing']

In [21]:
# Fit word vectors on the tokenised reviews: 100 dimensions, context window of 5,
# min_count of 1
model_wv = Word2Vec(
    tot_data,
    size=100,
    window=5,
    min_count=1,
)

In [22]:
# Pre-allocate the (reviews, 400 time steps, 100 embedding dims) input tensor.
# float32 instead of NumPy's float64 default halves the allocation, and the
# row count follows the data rather than a hard-coded 68336.
tot_seq = np.zeros([len(tot_data), 400, 100], dtype=np.float32)

In [23]:
# Fill tot_seq with one (400, 100) matrix per review: word vectors first,
# then zero rows up to 400 steps. Reviews longer than 400 tokens are truncated.
b = np.zeros([100])  # all-zero padding row
for i in range(0, len(tot_data)):
    tokens = tot_data[i][:400]  # tokens beyond 400 would be discarded anyway
    # An empty token list has no vectors to look up, so use an empty matrix.
    a = model_wv.wv[tokens] if len(tokens) > 0 else np.zeros([0, 100])
    # Write straight into the pre-allocated slot instead of growing `a` one
    # padding row at a time with np.vstack.
    tot_seq[i, :len(a)] = a
    tot_seq[i, len(a):] = b

In [25]:
# Release the word-vector model, the scratch arrays and the token lists now that tot_seq is filled
del model_wv, a, b, tot_data

In [26]:
# Dimensions of the padded sequence tensor
tot_seq.shape

(68336, 400, 100)

In [27]:
# Binary label: 1 where the review is labelled 'happy', 0 otherwise.
# Built directly from train rather than by re-assigning target from itself,
# so the cell gives the same result however many times it is run.
target = (train['Is_Response'] == 'happy').astype('uint8').rename('happy')

In [28]:
# tot_seq holds the training rows first and the test rows after them (the
# order they were concatenated in), so split at the training-set length
# instead of a hard-coded 38932.
n_train = len(train)
train_dat = tot_seq[:n_train]
test_dat = tot_seq[n_train:]

In [32]:
# Dimensions of the training slice of the sequence tensor
train_dat.shape

(38932, 400, 100)

In [33]:
# Sequence classifier: masked (400, 100) input -> LSTM(256) -> Dense(64) -> sigmoid unit
main_input = Input(shape=(400, 100))
# Time steps equal to mask_value (0.0) are masked for the layers that follow
masked_seq = Masking(mask_value=0.0)(main_input)

review_state = LSTM(256)(masked_seq)
features = Dense(64)(review_state)
output = Dense(1, activation='sigmoid')(features)


In [34]:
# Wrap the graph in a Model and compile it for binary classification
model = Model(inputs=main_input, outputs=output)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 400, 100)          0         
_________________________________________________________________
masking_1 (Masking)          (None, 400, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 382,081
Trainable params: 382,081
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Train for 4 epochs in batches of 64, with validation_split=0.2 of the training data
model.fit(
    train_dat,
    target,
    batch_size=64,
    epochs=4,
    validation_split=0.2,
)

Train on 31145 samples, validate on 7787 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x129b86320>

In [36]:
# Feature extractor: the trained network cut off at its last-but-one layer.
# model.layers is [Input, Masking, LSTM, Dense(64), Dense(1)], so index 3 is
# the 64-unit Dense layer. Reusing the layers of `model` directly shares the
# fitted weights, so there is no need to rebuild the stack and copy the
# weights across by hand.
model2 = Model(inputs=model.input, outputs=model.layers[3].output)

In [37]:
# Run both splits through the feature extractor
train_transformed, test_transformed = (
    model2.predict(reviews) for reviews in (train_dat, test_dat)
)

In [38]:
# Concatenate the output of the last-but-one layer with the one-hot
# browser/device columns provided alongside each review.
other_data = np.array(other_data)
n_train = len(train)  # training rows come first in the concatenated data
other_data_train = other_data[:n_train]
other_data_test = other_data[n_train:]

# Slice to the extractor's own output width before joining, so columns
# appended by an earlier run of this cell are dropped instead of piling up.
n_learned = model2.output_shape[-1]
train_transformed = np.concatenate(
    [train_transformed[:, :n_learned], other_data_train], axis=1
)

from sklearn.model_selection import train_test_split
# Fixed random_state so the hold-out split is the same on every run.
train_new, valid_new, y_train_new, y_valid_new = train_test_split(
    train_transformed, target, test_size=0.2, random_state=42
)

In [39]:
# Dimensions of the training part of the hold-out split
train_new.shape

(31145, 73)

In [40]:
# Feed-forward classifier over the 73 combined features:
# Dropout(0.2) -> Dense(100) -> Dropout(0.5) -> Dense(20) -> sigmoid unit
main_input3 = Input(shape=(73,))
hidden = Dropout(0.2)(main_input3)
hidden = Dense(100)(hidden)
hidden = Dropout(0.5)(hidden)
hidden = Dense(20)(hidden)
output3 = Dense(1, activation='sigmoid')(hidden)


model3 = Model(inputs=main_input3, outputs=output3)
model3.compile(
    optimizer='ADAM',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 73)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 73)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 100)               7400      
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 20)                2020      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 21        
Total params: 9,441
Trainable params: 9,441
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Train the feed-forward model for 100 epochs, evaluating on the hold-out split
# after each one. validation_data is documented as a tuple (x_val, y_val), so
# pass a tuple rather than a list.
model3.fit(
    train_new,
    y_train_new,
    batch_size=64,
    epochs=100,
    validation_data=(valid_new, y_valid_new),
)

Train on 31145 samples, validate on 7787 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x124329cc0>

In [44]:
# Append the one-hot browser/device columns to the learned test features.
# Slicing to the extractor's own output width first drops any columns added
# by an earlier run of this cell, so re-running does not keep widening the matrix.
test_transformed = np.concatenate(
    [test_transformed[:, :model2.output_shape[-1]], other_data_test], axis=1
)

In [45]:
# Score the combined test features with model3, whose final layer is a single sigmoid unit.
predictions=model3.predict(test_transformed)

In [47]:
# Write the raw model outputs to Predictions.npy (path is relative to the working directory).
np.save("Predictions.npy",predictions)