In [1]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer
from StringIO import StringIO

# Read csv file from GCS into a variable
%storage read --object gs://databyte/alldata.csv --variable all_data
%storage read --object gs://databyte/train.csv --variable train_csv
%storage read --object gs://databyte/test.csv --variable test_csv

# Store in a pandas dataframe
alldata = pd.read_csv(StringIO(all_data))
train = pd.read_csv(StringIO(train_csv))
test = pd.read_csv(StringIO(test_csv))

In [3]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [5]:
alldata.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


### Feature 1 : Number of Capital Words 

In [9]:
string = alldata['Description'][0]

In [246]:
def NumberOfCaps(x):
    count = 0
    word_list = x.split(' ')
    for word in word_list:
      if word.isupper() and len(word)>3:
        count = count + 1
      else:
        pass
    return count

In [247]:
NumberOfCaps("HIS I AM SHAHebAZ AIYAZ absda adsadasfafad f . sddfgdff ")

1

### Feature 1 : Number of Question marks 

In [292]:
def NumberOfQuestion(x):
    count = 0 
    for letter in x:
        if letter in ["?", "!?", "??","???"]:
            count = count + 1
    return count    

In [293]:
alldata['Number of Question Marks'] = alldata['Description'].apply(NumberOfQuestion)

In [306]:
alldata[alldata['Number of Question Marks']>5].head(10)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Number of Question Marks,Number of Exclamation marks,Number of times Mention of Money
433,id10759,My husband and I stayed at the Palmer House fo...,Edge,Mobile,not happy,11,30,1
564,id10890,The hotel was terrible! After waiting for our ...,Mozilla Firefox,Tablet,not happy,14,4,0
672,id10998,I checked in and we had reservations for - roo...,Firefox,Mobile,not happy,14,0,0
675,id11001,For starters I would never stay at this hotel ...,Mozilla,Tablet,not happy,9,6,0
838,id11164,Tiny Rooms..Bathrooms the size of the closet.....,Google Chrome,Mobile,not happy,6,8,0
2209,id12535,We waited over an hour to be checked in. Other...,Firefox,Mobile,not happy,9,0,0
2328,id12654,Room: Stained carpet from a previous ceiling l...,Chrome,Desktop,not happy,7,25,0
2383,id12709,I just returned from a - day - night stay in H...,IE,Tablet,not happy,6,3,0
2504,id12830,It only received one bubble because I was unab...,IE,Tablet,not happy,11,7,1
5134,id15460,Trip in mid July was -rd or -th time staying i...,Google Chrome,Desktop,happy,9,0,0


### Feature 2 : Number of Exclamation marks 

In [299]:
def NumberOfExclamation(x):
    count = 0 
    for letter in x:
        if letter in ["!"] :
            count = count + 1
    return count    

In [302]:
alldata['Number of Exclamation marks'] = alldata['Description'].apply(NumberOfExclamation)

In [303]:
alldata.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Number of Question Marks,Number of Exclamation marks
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,0,0
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,0,1
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,0,0
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,0,3
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,0,1


### Feature 3 : Mention of Money 

In [289]:
def NumberOfDollar(x):
    count = 0 
    word_list = x.split(' ')
    for i in word_list:
      if i in ['dollar','$', 'USD', 'Dollar','$---', '$-.--','$$$','$$','$$$$','$--']:
        count = count +1
    return count

In [304]:
alldata['Number of times Mention of Money'] = alldata['Description'].apply(NumberOfDollar)

In [305]:
alldata.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Number of Question Marks,Number of Exclamation marks,Number of times Mention of Money
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,0,0,0
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,0,1,0
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,0,0,1
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,0,3,0
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,0,1,0


### Feature 4: Last 1/5th Sentimental Analysis

In [308]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [309]:
def sentimenal_pos(x):
    last_stand = ' '.join((x).split(' ')[len(x.split(' ')) - len(x.split(' '))/5:])
    ss = sid.polarity_scores(last_stand)
    return (ss['pos']-0.20)

In [310]:
alldata['Positive Polarity'] = alldata['Description'].apply(sentimenal_pos)

In [311]:
alldata.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Number of Question Marks,Number of Exclamation marks,Number of times Mention of Money,Positive Polarity
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,0,0,0,0.27
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,0,1,0,-0.2
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,0,0,1,-0.009
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,0,3,0,0.349
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,0,1,0,-0.09


## Cleaning the Data

In [314]:
import nltk
nltk.download('stopwords')
# function to clean data
stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [315]:
# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [318]:
alldata.drop('User_ID', axis=1, inplace=True)

In [342]:
alldata.head(4)

Unnamed: 0,Description,Browser_Used,Device_Used,Is_Response,Number of Question Marks,Number of Exclamation marks,Number of times Mention of Money,Positive Polarity
0,room kind clean strong smell dog gener averag ...,Edge,Mobile,not happy,0,0,0,0.27
1,stay crown plaza april april staff friendli at...,Internet Explorer,Mobile,not happy,0,1,0,-0.2
2,book hotel hotwir lowest price could find got ...,Mozilla,Tablet,not happy,0,0,1,-0.009
3,stay husband son way alaska cruis love hotel g...,InternetExplorer,Desktop,happy,0,3,0,0.349


## TF-IDF Vectorization

In [343]:
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,3), min_df = 30)

In [344]:
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

In [345]:
feature_names = tfidfvec.get_feature_names()
len(feature_names)

30031

In [346]:
# create dataframe for features
tfidf_df = pd.DataFrame(tfidfdata.todense())

# set column names
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

# create separate data frame for tf-idf
tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]

In [347]:
# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

In [348]:
### set target variable
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [349]:
cols = ['Browser_Used','Device_Used','Number of Question Marks','Number of Exclamation marks','Number of times Mention of Money','Positive Polarity']
# merge into a new data frame with tf-idf features
train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

train_feats2 = pd.get_dummies(train_feats2, prefix='_',columns=['Browser_Used','Device_Used'])
test_feats2= pd.get_dummies(test_feats2, prefix='_',columns=['Browser_Used','Device_Used'])

In [350]:
train_feats2.head(2)

Unnamed: 0,Number of Question Marks,Number of Exclamation marks,Number of times Mention of Money,Positive Polarity,col0,col1,col2,col3,col4,col5,...,__IE,__Internet Explorer,__InternetExplorer,__Mozilla,__Mozilla Firefox,__Opera,__Safari,__Desktop,__Mobile,__Tablet
0,0,0,0,0.27,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,-0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0


In [360]:
train_feats2.drop('Positive Polarity', axis=1, inplace=True)
test_feats2.drop('Positive Polarity', axis=1, inplace=True)

In [362]:
train_feats2.head()

Unnamed: 0,Number of Question Marks,Number of Exclamation marks,Number of times Mention of Money,col0,col1,col2,col3,col4,col5,col6,...,__IE,__Internet Explorer,__InternetExplorer,__Mozilla,__Mozilla Firefox,__Opera,__Safari,__Desktop,__Mobile,__Tablet
0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
2,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,1
3,0,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0
4,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [352]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Logarithm Models

In [363]:
X =  train_feats2
y =  train_feats['Is_Response']

In [364]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [365]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=3,penalty="l2",solver='lbfgs',warm_start=True)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [366]:
# Train Test Stack
logreg.fit(X_train, y_train)

LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=True)

In [367]:
y_logpred = logreg.predict(X_test)

In [368]:
print "Train Set Score: {}".format(logreg.score(X_train, y_train))
print "Test Set Score: {}".format(logreg.score(X_test, y_test))
print "Classification report"
print classification_report(y_test, y_logpred)
print "Confusion Matrix"
print confusion_matrix(y_test, y_logpred)

Train Set Score: 0.944724134388
Test Set Score: 0.905989931162
Classification report
             precision    recall  f1-score   support

          0       0.88      0.82      0.85      3092
          1       0.92      0.95      0.93      6641

avg / total       0.91      0.91      0.91      9733

Confusion Matrix
[[2534  558]
 [ 357 6284]]


In [369]:
logreg_sub  = LogisticRegression(C=3,penalty="l2",solver='lbfgs')

logreg_sub.fit(train_feats2, y)

log_pred = logreg_sub.predict(test_feats2)

def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':log_pred})
sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))

sublog = sub1[['User_ID', 'Is_Response']]

In [370]:
## write submission files
sublog.to_csv('cloud_featv3.csv', index=False)