In [1]:
cd C:\Users\HP-15\Documents\Hotel_reviews

C:\Users\HP-15\Documents\Hotel_reviews


In [2]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [3]:
import pandas as pd

In [4]:
# load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [5]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [6]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [7]:
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [8]:
# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [9]:
# initialise the functions - we'll create separate models for each type.
countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)


In [10]:

# create features
bagofwords = countvec.fit_transform(alldata['Description'])
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

In [11]:

# label encode categorical features in data given
cols = ['Browser_Used','Device_Used']

for x in cols:
    lbl = LabelEncoder()
    alldata[x] = lbl.fit_transform(alldata[x])

In [12]:
# create dataframe for features
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidfdata.todense())


In [13]:
# set column names
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

In [14]:
# create separate data frame for bag of words and tf-idf

bow_df_train = bow_df[:len(train)]
bow_df_test = bow_df[len(train):]

tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]

In [15]:

# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

In [16]:
### set target variable

train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
# merge count (bag of word) features into train
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)

test_feats1.reset_index(drop=True, inplace=True)

In [18]:
# merge into a new data frame with tf-idf features
train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

In [19]:


mod1 = GaussianNB()
target = train_feats['Is_Response']

In [20]:
## Naive Bayes 1
print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.77208526  0.76110968  0.76753147  0.76663242  0.77626509]


In [21]:
## Naive Bayes 2 - tfidf is giving higher CV score
print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.80906523  0.81518109  0.80901618  0.81312612  0.80349345]


In [22]:


clf1 = GaussianNB()
clf1.fit(train_feats1, target)

clf2 = GaussianNB()
clf2.fit(train_feats2, target)

GaussianNB(priors=None)

In [23]:
preds1 = clf1.predict(test_feats1)
preds2 = clf2.predict(test_feats2)

In [24]:
def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

In [25]:
sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})
sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))

In [26]:
sub2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2})
sub2['Is_Response'] = sub2['Is_Response'].map(lambda x: to_labels(x))

In [27]:
sub1 = sub1[['User_ID', 'Is_Response']]
sub2 = sub2[['User_ID', 'Is_Response']]

In [31]:
## write submission files
sub1.to_csv('sub1_cv.csv', index=False)
sub2.to_csv('sub2_tf.csv', index=False)

In [32]:
import lightgbm as lgb

In [33]:

d_train = lgb.Dataset(train_feats1, label = target)

In [34]:
# set parameters


params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.05, 
    'max_depth': 7, 
    'num_leaves': 21, 
    'feature_fraction': 0.3, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 5}

In [35]:
lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)

[20]	cv_agg's binary_error: 0.200426 + 0.00433169
[40]	cv_agg's binary_error: 0.1826 + 0.00342877
[60]	cv_agg's binary_error: 0.169192 + 0.00419929
[80]	cv_agg's binary_error: 0.160125 + 0.00294213
[100]	cv_agg's binary_error: 0.153575 + 0.00349728
[120]	cv_agg's binary_error: 0.148875 + 0.00342645
[140]	cv_agg's binary_error: 0.144919 + 0.00272859
[160]	cv_agg's binary_error: 0.14122 + 0.0037417
[180]	cv_agg's binary_error: 0.138061 + 0.00322003
[200]	cv_agg's binary_error: 0.136289 + 0.00321441
[220]	cv_agg's binary_error: 0.134722 + 0.00311667
[240]	cv_agg's binary_error: 0.133283 + 0.00291104
[260]	cv_agg's binary_error: 0.131794 + 0.00306888
[280]	cv_agg's binary_error: 0.130715 + 0.00294311
[300]	cv_agg's binary_error: 0.129482 + 0.00314257
[320]	cv_agg's binary_error: 0.128865 + 0.00330403
[340]	cv_agg's binary_error: 0.128506 + 0.00342022
[360]	cv_agg's binary_error: 0.127864 + 0.00359088
[380]	cv_agg's binary_error: 0.127016 + 0.00329119
[400]	cv_agg's binary_error: 0.126913 +

In [36]:
# get nround value
nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))

In [37]:

# train model
model = lgb.train(params, d_train, num_boost_round=nround)

In [40]:
# make prediction
preds = model.predict(test_feats2)


In [41]:


def to_labels(x):
    if x > 0.66:
        return "happy"
    return "not_happy"

sub4 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})
sub4['Is_Response'] = sub4['Is_Response'].map(lambda x: to_labels(x))
sub4 = sub4[['User_ID','Is_Response']]
sub4.to_csv('sub4_lgb.csv', index=False) # 0.84925