In [341]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [342]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [343]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [344]:

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [345]:
PATH = "data/Predict_happiness/"

In [346]:
def get_combined():
    df_train = pd.read_csv(f'{PATH}train.csv', low_memory=False)
    df_test=pd.read_csv(f'{PATH}test.csv', low_memory=False)
    df_test['Is_Response'] = np.nan
    combined= df_train.append(df_test)
    combined.reset_index(inplace=True)
    combined.drop(['index','User_ID'],inplace=True,axis=1)
    return combined

In [347]:
combined_df= get_combined()

In [348]:
print(combined_df.shape)

(68336, 4)


In [349]:
print(combined_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68336 entries, 0 to 68335
Data columns (total 4 columns):
Description     68336 non-null object
Browser_Used    68336 non-null object
Device_Used     68336 non-null object
Is_Response     38932 non-null object
dtypes: object(4)
memory usage: 2.1+ MB
None


In [350]:
print(combined_df.dtypes)

Description     object
Browser_Used    object
Device_Used     object
Is_Response     object
dtype: object


In [351]:
combined_df_copy=combined_df

In [352]:
combined_df_copy.shape

(68336, 4)

In [355]:
combined_df['Browser_Used'].value_counts()

Mozilla_FireFox      25994
Internet_Explorer    16161
Edge                 12437
Google_Chrome        12406
Safari                 670
Opera                  668
Name: Browser_Used, dtype: int64

In [354]:
def clean_title(raw):
    if ('Chrome' in raw) or ('Google Chrome' in raw):
        return 'Google_Chrome'
    elif ('InternetExplorer' in raw) or ('IE' in raw) or ('Internet Explorer' in raw):
        return 'Internet_Explorer'
    elif ('Mozilla' in raw) or ('Firefox' in raw) or ('Mozilla Firefox' in raw):
        return 'Mozilla_FireFox'
    elif 'Edge' in raw:
        return 'Edge'
    elif 'Opera' in raw:
        return 'Opera'
    elif 'Safari' in raw :
        return 'Safari'
    else :
        return raw
    
combined_df['Browser_Used'] = combined_df.Browser_Used.apply(clean_title)

In [356]:
targets=combined_df['Is_Response'].map({'not happy':0,'happy':1})

In [357]:
combined_df.drop(['Description','Is_Response'],axis=1,inplace=True)

In [358]:
combined_df.head()

Unnamed: 0,Browser_Used,Device_Used
0,Edge,Mobile
1,Internet_Explorer,Mobile
2,Mozilla_FireFox,Tablet
3,Internet_Explorer,Desktop
4,Edge,Tablet


In [359]:
train_cats(combined_df)

In [360]:
combined_df.dtypes

Browser_Used    category
Device_Used     category
dtype: object

In [361]:
combined_df= pd.concat([combined_df,targets],axis=1)

In [362]:
df_tr, y_tr, nas = proc_df(combined_df,'Is_Response',max_n_cat=7)
print(df_tr.shape,y_tr.shape)

(68336, 11) (68336,)


In [363]:
df_tr.head()

Unnamed: 0,Browser_Used_Edge,Browser_Used_Google_Chrome,Browser_Used_Internet_Explorer,Browser_Used_Mozilla_FireFox,Browser_Used_Opera,Browser_Used_Safari,Browser_Used_nan,Device_Used_Desktop,Device_Used_Mobile,Device_Used_Tablet,Device_Used_nan
0,1,0,0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,1,0,0
2,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,0,0,0,1,0


In [364]:
y_tr

array([ 0.,  0.,  0., ..., nan, nan, nan])

In [365]:
combined_df_new= get_combined()

In [366]:
print(combined_df_new.shape)

(68336, 4)


In [367]:
# transformed the  Description  into lower case.
combined_df_new['Description'] = combined_df_new['Description'].apply(lambda x: " ".join(x.lower() for x in x.split()))
combined_df_new['Description'].head()

0    the room was kind of clean but had a very stro...
1    i stayed at the crown plaza april -- - april -...
2    i booked this hotel through hotwire at the low...
3    stayed here with husband and sons on the way t...
4    my girlfriends and i stayed here to celebrate ...
Name: Description, dtype: object

In [368]:
 #Remove punctuation, as it doesn’t add any extra information while treating text data
combined_df_new['Description'] = combined_df_new['Description'].str.replace('[^\w\s]','')
combined_df_new['Description'].tail(10)

68326    this hotel may not be the most glamorous in sf...
68327    stayed here for  nights  had a room overlookin...
68328    we stayed for three nights june  the location ...
68329    we spent  nights in a king castia the beds are...
68330    we booked our flighthotel package on expedia a...
68331    i stayed at the hotel and towers for a confere...
68332    trying to stay within the marriott family and ...
68333    we stayed for  nights with our little dogvery ...
68334    stayed at the yotel over the weekend and was v...
68335    the blakely is is comfortable is every way the...
Name: Description, dtype: object

In [369]:
# Removal of stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
combined_df_new['Description'] = combined_df_new['Description'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
combined_df_new['Description'].head()


0    room kind clean strong smell dogs generally av...
1    stayed crown plaza april april staff friendly ...
2    booked hotel hotwire lowest price could find g...
3    stayed husband sons way alaska cruise loved ho...
4    girlfriends stayed celebrate th birthdays plan...
Name: Description, dtype: object

In [370]:
# Most frequent words in our text
freq = pd.Series(' '.join(combined_df_new['Description']).split()).value_counts()[:30]
freq

hotel        124948
room         108113
stay          46365
great         44817
staff         42681
would         38717
rooms         35609
good          33094
one           33016
nice          32367
location      32350
stayed        29465
us            28008
clean         27816
night         27211
service       25523
breakfast     24087
get           23568
time          21954
also          21178
desk          20040
like          19261
bed           18855
friendly      18561
day           18518
could         18350
area          18276
well          18090
place         17920
small         17596
dtype: int64

In [371]:
# Rare word removal
freq_rare = pd.Series(' '.join(combined_df_new['Description']).split()).value_counts()[-10:]

In [372]:
print(freq_rare)

farias           1
hypoalergenic    1
openlayout       1
thisno           1
uhhokay          1
thius            1
productand       1
rockefellow      1
highpoints       1
mishapsthere     1
dtype: int64


In [373]:
freq_rare= list(freq_rare.index)
combined_df_new['Description'] = combined_df_new['Description'].apply(lambda x: " ".join(x for x in x.split() if x not in freq_rare))


In [374]:
# Lemmatization-  it converts the word into its root word, rather than just stripping the suffices.
from textblob import Word
combined_df_new['Description'] = combined_df_new['Description'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
combined_df_new['Description'].head(20)

0     room kind clean strong smell dog generally ave...
1     stayed crown plaza april april staff friendly ...
2     booked hotel hotwire lowest price could find g...
3     stayed husband son way alaska cruise loved hot...
4     girlfriend stayed celebrate th birthday planne...
5     room one nice clearly updated recently clean b...
6     husband stayed hotel time though fanciest hote...
7     wife stayed glorious city back sf expensive fo...
8     boyfriend stayed fairmont recent trip san fran...
9     wonderful staff great location definately pric...
10    step time square nice room stayed night great ...
11    wife kid stayed valentine weekend really nice ...
12    stay jolly madison xmas period main feature lo...
13    highly recommend hawthorne terrace affordable ...
14    found hotel clean nicely located good free shu...
15    stayed elan th th october liked much returned ...
16    priceline sent u hotel accepting bid usd night...
17    old cheap furnituresour chair simply destr

In [377]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=350, ngram_range=(1,1), min_df=150,analyzer = "word")
train_bow = bow.fit_transform(combined_df_new['Description'])
train_bow


<68336x350 sparse matrix of type '<class 'numpy.int64'>'
	with 2365578 stored elements in Compressed Sparse Row format>

In [None]:
#tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)

In [376]:
combined_df_new['Description'][:5]

0    room kind clean strong smell dog generally ave...
1    stayed crown plaza april april staff friendly ...
2    booked hotel hotwire lowest price could find g...
3    stayed husband son way alaska cruise loved hot...
4    girlfriend stayed celebrate th birthday planne...
Name: Description, dtype: object

In [316]:
df_tr.shape,y_tr.shape

((68336, 11), (68336,))

In [252]:
y_tr

array([ 0.,  0.,  0., ..., nan, nan, nan])

In [385]:
df_tr=pd.concat([df_tr,description],axis=1)

In [386]:
df_tr.head()

Unnamed: 0,Browser_Used_Edge,Browser_Used_Google_Chrome,Browser_Used_Internet_Explorer,Browser_Used_Mozilla_FireFox,Browser_Used_Opera,Browser_Used_Safari,Browser_Used_nan,Device_Used_Desktop,Device_Used_Mobile,Device_Used_Tablet,Device_Used_nan,Description
0,1,0,0,0,0,0,0,0,1,0,0,room kind clean strong smell dog generally ave...
1,0,0,1,0,0,0,0,0,1,0,0,stayed crown plaza april april staff friendly ...
2,0,0,0,1,0,0,0,0,0,1,0,booked hotel hotwire lowest price could find g...
3,0,0,1,0,0,0,0,1,0,0,0,stayed husband son way alaska cruise loved hot...
4,1,0,0,0,0,0,0,0,0,1,0,girlfriend stayed celebrate th birthday planne...


In [384]:
description=combined_df_new['Description']

In [378]:
bow_df = pd.DataFrame(train_bow.todense())

In [379]:
bow_df[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,340,341,342,343,344,345,346,347,348,349
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,2,0,0,0
2,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,3,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [380]:
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]

In [381]:
bow_df.columns

Index(['col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
       'col9',
       ...
       'col340', 'col341', 'col342', 'col343', 'col344', 'col345', 'col346',
       'col347', 'col348', 'col349'],
      dtype='object', length=350)

In [382]:
# create separate data frame for bag of words and tf-idf

bow_df_train = bow_df[:38932]
bow_df_test = bow_df[38932:]

In [383]:
bow_df_train.shape,bow_df_test.shape

((38932, 350), (29404, 350))

In [387]:
Response=combined_df_new['Is_Response']

In [388]:
df_tr=pd.concat([df_tr,Response],axis=1,inplace=True)

In [390]:
df_tr.drop(['Description'],axis=1,inplace=True)

In [391]:
df_tr.head()

Unnamed: 0,Browser_Used_Edge,Browser_Used_Google_Chrome,Browser_Used_Internet_Explorer,Browser_Used_Mozilla_FireFox,Browser_Used_Opera,Browser_Used_Safari,Browser_Used_nan,Device_Used_Desktop,Device_Used_Mobile,Device_Used_Tablet,Device_Used_nan,Is_Response
0,1,0,0,0,0,0,0,0,1,0,0,not happy
1,0,0,1,0,0,0,0,0,1,0,0,not happy
2,0,0,0,1,0,0,0,0,0,1,0,not happy
3,0,0,1,0,0,0,0,1,0,0,0,happy
4,1,0,0,0,0,0,0,0,0,1,0,not happy


In [472]:
train_feats = df_tr[~pd.isnull(df_tr.Is_Response)]
test_feats = df_tr[pd.isnull(df_tr.Is_Response)]

In [473]:
test_feats.head()

Unnamed: 0,Browser_Used_Edge,Browser_Used_Google_Chrome,Browser_Used_Internet_Explorer,Browser_Used_Mozilla_FireFox,Browser_Used_Opera,Browser_Used_Safari,Browser_Used_nan,Device_Used_Desktop,Device_Used_Mobile,Device_Used_Tablet,Device_Used_nan,Is_Response
38932,0,0,0,1,0,0,0,0,1,0,0,
38933,0,0,1,0,0,0,0,1,0,0,0,
38934,0,0,1,0,0,0,0,0,0,1,0,
38935,1,0,0,0,0,0,0,0,1,0,0,
38936,0,0,0,1,0,0,0,0,1,0,0,


In [474]:
### set target variable
train_feats['Is_Response']=train_feats['Is_Response'] .map({'not happy':0,'happy':1})
#train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [475]:
# merge count (bag of word) features into train
train_feats1 = pd.concat([train_feats, bow_df_train], axis = 1)
test_feats1 = pd.concat([test_feats, bow_df_test], axis=1)
#test_feats1.reset_index(drop=True, inplace=True)

In [476]:
train_feats1['Is_Response'][:20]

0     0
1     0
2     0
3     1
4     0
5     1
6     0
7     1
8     1
9     0
10    1
11    1
12    0
13    1
14    1
15    1
16    0
17    0
18    1
19    0
Name: Is_Response, dtype: int64

In [477]:
test_feats1['Is_Response'][:20]

38932    NaN
38933    NaN
38934    NaN
38935    NaN
38936    NaN
38937    NaN
38938    NaN
38939    NaN
38940    NaN
38941    NaN
38942    NaN
38943    NaN
38944    NaN
38945    NaN
38946    NaN
38947    NaN
38948    NaN
38949    NaN
38950    NaN
38951    NaN
Name: Is_Response, dtype: object

In [446]:
#test_feats1.reset_index(drop=True, inplace=True)

In [478]:
train_feats1.shape,test_feats1.shape

((38932, 362), (29404, 362))

In [479]:
test_feats1.head()

Unnamed: 0,Browser_Used_Edge,Browser_Used_Google_Chrome,Browser_Used_Internet_Explorer,Browser_Used_Mozilla_FireFox,Browser_Used_Opera,Browser_Used_Safari,Browser_Used_nan,Device_Used_Desktop,Device_Used_Mobile,Device_Used_Tablet,...,col340,col341,col342,col343,col344,col345,col346,col347,col348,col349
38932,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
38933,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
38934,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
38935,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,3,0,0,0
38936,0,0,0,1,0,0,0,0,1,0,...,0,0,0,2,0,0,0,0,0,0


In [480]:
#X=train_feats1.drop(['Is_Response'],axis=1,inplace=True)
y=train_feats1['Is_Response']

In [481]:
train_feats1.drop(['Is_Response'],axis=1,inplace=True)

In [485]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feats1, y, test_size=0.4, random_state=4)

In [484]:
mod1 = GaussianNB()
print(cross_val_score(mod1, train_feats1,y, cv=10, scoring='accuracy').mean())

0.7619690455235971


In [None]:
y_pred=m.predict(X_test)

In [486]:
clf1 = GaussianNB()
clf1.fit(X_train,y_train)

GaussianNB(priors=None)

In [488]:
y_pred=clf1.predict(X_test)

In [490]:
print(y_pred)
print(metrics.accuracy_score(y_test, y_pred))

[1 0 0 ... 1 1 1]
0.758042766326334


In [492]:
print('True:',y_test[0:10])
print('Pred:',y_pred[0:10])

True: 34408    0
23154    1
27772    0
22107    1
26000    0
12848    1
5256     1
29486    1
29247    0
4963     1
Name: Is_Response, dtype: int64
Pred: [1 0 0 1 1 1 1 1 0 1]


In [493]:
print(metrics.confusion_matrix(y_test, y_pred))

[[2682 2342]
 [1426 9123]]


In [494]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test,y_pred)))
print('Precision score: ', format(precision_score(y_test,y_pred)))
print('Recall score: ', format(recall_score(y_test,y_pred)))
print('F1 score: ', format(f1_score(y_test,y_pred)))

Accuracy score:  0.758042766326334
Precision score:  0.7957261229829917
Recall score:  0.8648213100767845
F1 score:  0.8288361951485419


In [495]:
test_feats1.shape

(29404, 362)

In [497]:
resp=test_feats1['Is_Response']

In [None]:
test_feats1.drop(['Is_Response'],axis=1,inplace=True)

In [502]:
y_predicted_test=clf1.predict(test_feats1)

In [505]:
print(y_predicted_test)

[0 1 0 ... 1 1 1]


In [506]:
df_test=pd.read_csv(f'{PATH}test.csv', low_memory=False)

In [512]:
df_test.shape

(29404, 4)

In [508]:

sub1 = pd.DataFrame({'User_ID':df_test.User_ID, 'Is_Response':y_predicted_test})

In [510]:
print(sub1.shape)

(29404, 2)


In [509]:
print(sub1)

       Is_Response   User_ID
0                0   id80132
1                1   id80133
2                0   id80134
3                0   id80135
4                1   id80136
5                1   id80137
6                1   id80138
7                0   id80139
8                1   id80140
9                1   id80141
10               1   id80142
11               1   id80143
12               0   id80144
13               1   id80145
14               1   id80146
15               1   id80147
16               1   id80148
17               1   id80149
18               1   id80150
19               1   id80151
20               1   id80152
21               1   id80153
22               1   id80154
23               1   id80155
24               0   id80156
25               0   id80157
26               1   id80158
27               0   id80159
28               1   id80160
29               0   id80161
...            ...       ...
29374            1  id109506
29375            0  id109507
29376         

In [513]:
def labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

In [514]:
sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: labels(x))

In [516]:
print(sub1.head(10))

  Is_Response  User_ID
0   not_happy  id80132
1       happy  id80133
2   not_happy  id80134
3   not_happy  id80135
4       happy  id80136
5       happy  id80137
6       happy  id80138
7   not_happy  id80139
8       happy  id80140
9       happy  id80141
