In [1]:
import os

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
from textblob import Word

In [2]:
# Single configurable root for the competition files. Set the HAPPINESS_DATA_DIR
# environment variable to point somewhere else; the default is the original location.
DATA_DIR = os.environ.get('HAPPINESS_DATA_DIR', '/home/priya/Downloads/PredictTheHappiness')

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [3]:
# Give the test rows a placeholder (NaN) target so both frames share the same columns,
# then stack train on top of test with a fresh 0..n-1 index.
test['Is_Response'] = np.nan
totaldata = pd.concat([train, test], ignore_index=True)

In [4]:
# Number of single-space-separated tokens per review (values are stringified first).
totaldata['word_count'] = [
    len(str(description).split(" ")) for description in totaldata['Description']
]
totaldata.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,word_count
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,46
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,208
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,229
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,93
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,294


In [5]:
# Character length of every review; spaces are counted too.
description_lengths = totaldata['Description'].str.len()
totaldata['char_count'] = description_lengths
totaldata.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,word_count,char_count
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,46,248
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,208,1077
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,229,1327
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,93,502
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,294,1613


In [6]:
def avg_word(sentence):
  """Return the mean length of the whitespace-separated words in ``sentence``.

  Args:
    sentence: text to analyse (a ``str``).

  Returns:
    The average number of characters per word, or 0.0 when ``sentence``
    contains no words (empty or whitespace-only) instead of raising
    ZeroDivisionError.
  """
  words = sentence.split()
  if not words:
    # Nothing to average over; avoid dividing by zero.
    return 0.0
  return sum(len(word) for word in words) / len(words)

# Average word length per review, computed by the avg_word helper.
totaldata['avg_word'] = totaldata['Description'].map(avg_word)
totaldata[['Description', 'avg_word']].head()

Unnamed: 0,Description,avg_word
0,The room was kind of clean but had a VERY stro...,4.413043
1,I stayed at the Crown Plaza April -- - April -...,4.182692
2,I booked this hotel through Hotwire at the low...,4.724138
3,Stayed here with husband and sons on the way t...,4.408602
4,My girlfriends and I stayed here to celebrate ...,4.434343


In [7]:
# Fetch the NLTK stopword corpus used by the following cells (the call reports True on success).
nltk.download('stopwords')  

[nltk_data] Downloading package stopwords to /home/priya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stop = stopwords.words('english')
# Membership tests run once per token of every review, so use a set (O(1) lookups)
# instead of scanning the list each time. The resulting counts are unchanged.
stop_set = set(stop)

# Count how many tokens of each review are stopwords.
# NOTE(review): Description has not been lower-cased yet (that happens in a later cell),
# so the comparison is case-sensitive; presumably capitalised stopwords such as "The"
# are therefore not counted - confirm this is intended.
totaldata['stopwords'] = totaldata['Description'].apply(
    lambda text: sum(token in stop_set for token in text.split())
)
totaldata[['Description', 'stopwords']].head()

Unnamed: 0,Description,stopwords
0,The room was kind of clean but had a VERY stro...,23
1,I stayed at the Crown Plaza April -- - April -...,82
2,I booked this hotel through Hotwire at the low...,91
3,Stayed here with husband and sons on the way t...,36
4,My girlfriends and I stayed here to celebrate ...,127


In [9]:
# Count the tokens in each review for which str.isupper() is true (e.g. "VERY", "I").
totaldata['upper'] = totaldata['Description'].apply(
    lambda text: sum(1 for token in text.split() if token.isupper())
)
totaldata[['Description', 'upper']].head()

Unnamed: 0,Description,upper
0,The room was kind of clean but had a VERY stro...,1
1,I stayed at the Crown Plaza April -- - April -...,8
2,I booked this hotel through Hotwire at the low...,9
3,Stayed here with husband and sons on the way t...,0
4,My girlfriends and I stayed here to celebrate ...,7


In [10]:
# Lower-case every token and re-join with single spaces (this also collapses repeated whitespace).
totaldata['Description'] = totaldata['Description'].apply(
    lambda text: " ".join(map(str.lower, text.split()))
)
totaldata['Description'].head()

0    the room was kind of clean but had a very stro...
1    i stayed at the crown plaza april -- - april -...
2    i booked this hotel through hotwire at the low...
3    stayed here with husband and sons on the way t...
4    my girlfriends and i stayed here to celebrate ...
Name: Description, dtype: object

In [11]:
# Drop every character that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
# The raw string avoids the invalid "\w" / "\s" escape sequences of a plain literal.
# (The regex keyword needs pandas >= 0.23.)
totaldata['Description'] = totaldata['Description'].str.replace(r'[^\w\s]', '', regex=True)
totaldata['Description'].head()

0    the room was kind of clean but had a very stro...
1    i stayed at the crown plaza april   april   th...
2    i booked this hotel through hotwire at the low...
3    stayed here with husband and sons on the way t...
4    my girlfriends and i stayed here to celebrate ...
Name: Description, dtype: object

In [12]:
stop = stopwords.words('english')
# A set gives O(1) membership checks; the list version rescans all stopwords for every token.
stop_set = set(stop)

# Keep only the tokens that are not stopwords and re-join them with single spaces.
totaldata['Description'] = totaldata['Description'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_set)
)
totaldata['Description'].head()

0    room kind clean strong smell dogs generally av...
1    stayed crown plaza april april staff friendly ...
2    booked hotel hotwire lowest price could find g...
3    stayed husband sons way alaska cruise loved ho...
4    girlfriends stayed celebrate th birthdays plan...
Name: Description, dtype: object

In [13]:
def _lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` with TextBlob's ``Word``."""
    return " ".join(Word(token).lemmatize() for token in text.split())

totaldata['Description'] = totaldata['Description'].apply(_lemmatize_text)
totaldata['Description'].head()

0    room kind clean strong smell dog generally ave...
1    stayed crown plaza april april staff friendly ...
2    booked hotel hotwire lowest price could find g...
3    stayed husband son way alaska cruise loved hot...
4    girlfriend stayed celebrate th birthday planne...
Name: Description, dtype: object

In [14]:
# First component of TextBlob's sentiment tuple (the polarity score) for each cleaned review.
totaldata['sentiment'] = totaldata['Description'].map(
    lambda text: TextBlob(text).sentiment.polarity
)
totaldata[['Description', 'sentiment']].head()

Unnamed: 0,Description,sentiment
0,room kind clean strong smell dog generally ave...,0.366964
1,stayed crown plaza april april staff friendly ...,0.082074
2,booked hotel hotwire lowest price could find g...,0.142882
3,stayed husband son way alaska cruise loved hot...,0.696
4,girlfriend stayed celebrate th birthday planne...,0.154574


In [15]:
# Both vectorizers share the same settings: unigram word features, terms kept only
# if they occur in at least 150 documents, capped at 500 features.
# A separate model is trained on each representation.
vectorizer_params = dict(analyzer='word', ngram_range=(1, 1), min_df=150, max_features=500)
countvec = CountVectorizer(**vectorizer_params)
tfidfvec = TfidfVectorizer(**vectorizer_params)

In [16]:
# Learn each vocabulary on the combined train+test text and build the sparse feature matrices.
corpus = totaldata['Description']
bagofwords = countvec.fit_transform(corpus)
tfidfdata = tfidfvec.fit_transform(corpus)

In [17]:
# Densify each sparse matrix and wrap it in a DataFrame (one row per review, one column per term).
bow_df, tfidf_df = (
    pd.DataFrame(sparse_matrix.todense()) for sparse_matrix in (bagofwords, tfidfdata)
)

In [18]:
# Replace the categorical browser / device columns with integer codes,
# fitting a fresh LabelEncoder per column.
cols = ['Browser_Used', 'Device_Used']

for column_name in cols:
    totaldata[column_name] = LabelEncoder().fit_transform(totaldata[column_name])

In [19]:
# Prefix the positional column labels with 'col' (0 -> 'col0', 1 -> 'col1', ...) in both frames.
for feature_frame in (bow_df, tfidf_df):
    feature_frame.columns = ['col' + str(label) for label in feature_frame.columns]

In [20]:
# The first len(train) rows of the stacked frames belong to train, the remainder to test.
n_train = len(train)

bow_df_train, bow_df_test = bow_df.iloc[:n_train], bow_df.iloc[n_train:]
tfid_df_train, tfid_df_test = tfidf_df.iloc[:n_train], tfidf_df.iloc[n_train:]

In [21]:
# Split the merged frame back into train and test: test rows are the ones whose
# target is the NaN placeholder.
# .copy() makes both results independent frames, so assigning columns to them later
# does not raise SettingWithCopyWarning.
is_test_row = pd.isnull(totaldata.Is_Response)
train_feats = totaldata[~is_test_row].copy()
test_feats = totaldata[is_test_row].copy()

In [22]:
### set target variable
# Encode the target as 1 for 'happy' and 0 for anything else.
# assign() returns a new frame rather than writing into a slice of totaldata,
# which avoids the SettingWithCopyWarning the in-place column assignment produced.
# NOTE: the cell is not idempotent - running it twice on the already-encoded column yields all zeros.
train_feats = train_feats.assign(
    Is_Response=(train_feats['Is_Response'] == 'happy').astype(int)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
# Feature set 1: encoded categorical columns side by side with the bag-of-words counts.
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis=1)
# The test rows get a fresh 0..n-1 index after the merge.
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1).reset_index(drop=True)

In [24]:
# Feature set 2: encoded categorical columns side by side with the tf-idf weights.
train_feats2, test_feats2 = (
    pd.concat([categorical_part, tfidf_part], axis=1)
    for categorical_part, tfidf_part in (
        (train_feats[cols], tfid_df_train),
        (test_feats[cols], tfid_df_test),
    )
)

In [25]:
# Set up the cross-validation check: the CV score serves as an estimate of the
# model's accuracy on unseen data.
target = train_feats['Is_Response']
mod1 = GaussianNB()

In [26]:
## Naive Bayes 1 - 5-fold CV accuracy on the bag-of-words features
accuracy_scorer = make_scorer(accuracy_score)
cv_scores_bow = cross_val_score(mod1, train_feats1, target, cv=5, scoring=accuracy_scorer)
print(cv_scores_bow)

[0.78094504 0.76714616 0.77061392 0.77446699 0.7806319 ]


In [27]:
## Naive Bayes 2 - 5-fold CV accuracy on the tf-idf features (higher than the bag-of-words scores)
accuracy_scorer = make_scorer(accuracy_score)
cv_scores_tfidf = cross_val_score(mod1, train_feats2, target, cv=5, scoring=accuracy_scorer)
print(cv_scores_tfidf)

[0.80829481 0.80850244 0.80387876 0.8079887  0.80644747]


In [28]:
# Train one Gaussian Naive Bayes model per feature set on the full training data:
# clf1 on bag-of-words, clf2 on tf-idf.
clf1, clf2 = GaussianNB(), GaussianNB()

clf1.fit(train_feats1, target)
clf2.fit(train_feats2, target)

GaussianNB(priors=None)

In [29]:
# Predict the test rows with each model on its matching feature set.
preds1, preds2 = clf1.predict(test_feats1), clf2.predict(test_feats2)

In [30]:
def to_labels(x):
    """Translate a numeric prediction into its submission label.

    Returns "happy" when ``x`` equals 1 and "not_happy" for every other value.
    """
    return "happy" if x == 1 else "not_happy"

In [31]:
# Submission frame for the bag-of-words model: user ids with their predicted labels as text.
sub1 = pd.DataFrame({
    'User_ID': test.User_ID,
    'Is_Response': [to_labels(prediction) for prediction in preds1],
})

In [32]:
# Submission frame for the tf-idf model: user ids with their predicted labels as text.
sub2 = pd.DataFrame({
    'User_ID': test.User_ID,
    'Is_Response': [to_labels(prediction) for prediction in preds2],
})

In [33]:
# Fix the column order of both submission frames: id first, prediction second.
submission_columns = ['User_ID', 'Is_Response']
sub1, sub2 = sub1[submission_columns], sub2[submission_columns]

In [34]:
## write submission files
# The root folder can be overridden with the HAPPINESS_DATA_DIR environment variable;
# the default keeps the original location.
SUBMISSION_DIR = os.path.join(
    os.environ.get('HAPPINESS_DATA_DIR', '/home/priya/Downloads/PredictTheHappiness'),
    'submissions',
)
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs(SUBMISSION_DIR, exist_ok=True)

sub1.to_csv(os.path.join(SUBMISSION_DIR, 'sub1_cv.csv'), index=False)
sub2.to_csv(os.path.join(SUBMISSION_DIR, 'sub2_tf.csv'), index=False)