# TripAdvisor Hotels Dataset

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import spacy
from pathlib import Path
from matplotlib import pyplot
import numpy as np
import pickle
%matplotlib inline

DATA_PATH=Path('../data')
CLAS_PATH=Path('../data/hotels_clas/')
TMP_PATH=Path(CLAS_PATH/'tmp/')
CLAS_PATH.mkdir(exist_ok=True)
TMP_PATH.mkdir(exist_ok=True)

In [2]:
data_train = pd.read_csv(CLAS_PATH/'train.csv', header=None)
data_val = pd.read_csv(CLAS_PATH/'val.csv', header=None)
data_test = pd.read_csv(CLAS_PATH/'test.csv', header=None)
data = pd.concat([data_train, data_val, data_test])
data.columns = ['label', 'body_text']
data['body_text'] = data['body_text'].apply(str)

In [3]:
nlp = spacy.load('en')
def clean_text(text):
#     text = "".join([word.lower() for word in text if word not in string.punctuation])
#     text = "".join([word.lower() for word in text])
    text = nlp(text)
    text = [word.lemma_ for word in text]
    return text

## Feature transformation and regularization

## Train/Test Split

In [4]:
len(data)

40000

In [5]:
neg_data = data[data['label']==0]
pos_data = data[data['label']==1]

neg_data = neg_data.sample(n=10000, random_state=10)
pos_data = pos_data.sample(n=10000, random_state=10)

from sklearn.model_selection import train_test_split

data = pd.concat([neg_data, pos_data])

X_train, X_test, y_train, y_test = train_test_split(data['body_text'], data['label'], test_size=0.2, random_state=10, stratify=data['label'])

print('Training set: ')
print(y_train.value_counts())
print('')
print('Testing set')
print(y_test.value_counts())

Training set: 
1    8000
0    8000
Name: label, dtype: int64

Testing set
1    2000
0    2000
Name: label, dtype: int64


## Vectorizing Text

In [6]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text, ngram_range=(1, 3), max_features=50000, max_df=0.99, min_df=1)
tfidf_vect_fit = tfidf_vect.fit(X_train)

tfidf_train = tfidf_vect_fit.transform(X_train)
tfidf_test = tfidf_vect_fit.transform(X_test)

X_train_vect = pd.DataFrame(tfidf_train.toarray())
X_test_vect = pd.DataFrame(tfidf_test.toarray())

X_train_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20198,20199,20200,20201,20202,20203,20204,20205,20206,20207
0,0.0,0.184856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.075635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
tfidf_vect.vocabulary_

{'old': 13151,
 'and': 2628,
 'tired': 18228,
 'room': 15636,
 '.': 105,
 ' ': 1,
 'electrical': 7230,
 'outlet': 13345,
 'at': 3017,
 'the': 18001,
 'desk': 6365,
 'that': 17996,
 'be': 3454,
 'loose': 11495,
 'will': 19804,
 'not': 12934,
 'reliably': 15147,
 'power': 14248,
 'device': 6413,
 'sleazy': 16610,
 '"': 12,
 'facility': 7872,
 'fee': 8023,
 'for': 8357,
 'thing': 18077,
 'already': 2505,
 'free': 8469,
 'as': 2941,
 'a': 1979,
 'gold': 8892,
 'starwood': 17188,
 'member': 12043,
 'this': 18089,
 'resort': 15312,
 'love': 11534,
 'turn': 18608,
 'down': 6896,
 'service': 16228,
 ',': 41,
 'make': 11710,
 '-PRON-': 65,
 'feel': 8032,
 'special': 16947,
 'w': 19440,
 'music': 12543,
 'in': 10058,
 'background': 3225,
 'water': 19583,
 'bottle': 3948,
 'do': 6784,
 'get': 8785,
 'chance': 4663,
 'to': 18249,
 'try': 18575,
 'any': 2720,
 'of': 13107,
 'restaurant': 15352,
 'or': 13250,
 'bar': 3342,
 'wish': 19853,
 'parking': 13597,
 'little': 11393,
 'cheap': 4732,
 'pay': 

## Modelling

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
import time

### Random Forest Classifier (RF)

In [15]:
rf = RandomForestClassifier(n_estimators=400, max_depth=None, n_jobs=-1, random_state=10)

In [16]:
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_rf = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 214.658 seconds
Prediction done in 1.387 seconds


In [17]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_rf, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_rf==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_rf))
print("F1 Score :", f1_score(y_test, y_pred_rf))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.87      0.88      0.87      2000
   positive       0.88      0.87      0.87      2000

avg / total       0.87      0.87      0.87      4000

accuracy: 0.87
ROC AUC Score : 0.8724999999999999
F1 Score : 0.8720521826392373


### Support Vector Machine Classifier (SVM)

In [9]:
clf = svm.LinearSVC(random_state=10)

In [10]:
start = time.time()
clf_fit = clf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_svm = clf_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))




Training done in 5.973 seconds
Prediction done in 0.679 seconds


In [11]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_svm, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_svm==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_svm))
print("f1 Score :", f1_score(y_test, y_pred_svm))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.90      0.90      0.90      2000
   positive       0.90      0.90      0.90      2000

avg / total       0.90      0.90      0.90      4000

accuracy: 0.9
ROC AUC Score : 0.9002499999999999
f1 Score : 0.9000751314800902


### Naive Bayes Classifier (NB)

In [12]:
nbc = MultinomialNB()

In [13]:
start = time.time()
nbc_fit = nbc.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_nb = nbc_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 3.427 seconds
Prediction done in 0.387 seconds


In [14]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_nb, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_nb==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_nb))
print("F1 Score :", f1_score(y_test, y_pred_nb))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.89      0.88      0.89      2000
   positive       0.88      0.89      0.89      2000

avg / total       0.89      0.89      0.89      4000

accuracy: 0.89
ROC AUC Score : 0.8865000000000001
F1 Score : 0.8869521912350596
