# TripAdvisor Restaurants Dataset

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import spacy
from pathlib import Path
from matplotlib import pyplot
import numpy as np
import pickle
%matplotlib inline

DATA_PATH=Path('../data')
CLAS_PATH=Path('../data/restaurants_clas/')
TMP_PATH=Path(CLAS_PATH/'tmp/')
CLAS_PATH.mkdir(exist_ok=True)
TMP_PATH.mkdir(exist_ok=True)

In [2]:
data_train = pd.read_csv(CLAS_PATH/'train.csv', header=None)
data_val = pd.read_csv(CLAS_PATH/'val.csv', header=None)
data_test = pd.read_csv(CLAS_PATH/'test.csv', header=None)
data = pd.concat([data_train, data_val, data_test])
data.columns = ['label', 'body_text']
data['body_text'] = data['body_text'].apply(str)

In [3]:
nlp = spacy.load('en')
def clean_text(text):
#     text = "".join([word.lower() for word in text if word not in string.punctuation])
#     text = "".join([word.lower() for word in text])
    text = nlp(text)
    text = [word.lemma_ for word in text]
    return text

## Feature transformation and regularization

## Train/Test Split

In [4]:
len(data)

40000

In [5]:
neg_data = data[data['label']==0]
pos_data = data[data['label']==1]

neg_data = neg_data.sample(n=10000, random_state=10)
pos_data = pos_data.sample(n=10000, random_state=10)

from sklearn.model_selection import train_test_split

data = pd.concat([neg_data, pos_data])

X_train, X_test, y_train, y_test = train_test_split(data['body_text'], data['label'], test_size=0.2, random_state=10, stratify=data['label'])

print('Training set: ')
print(y_train.value_counts())
print('')
print('Testing set')
print(y_test.value_counts())

Training set: 
1    8000
0    8000
Name: label, dtype: int64

Testing set
1    2000
0    2000
Name: label, dtype: int64


## Vectorizing Text

In [6]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text, ngram_range=(1, 3), max_features=50000, max_df=0.99, min_df=1)
tfidf_vect_fit = tfidf_vect.fit(X_train)

tfidf_train = tfidf_vect_fit.transform(X_train)
tfidf_test = tfidf_vect_fit.transform(X_test)

X_train_vect = pd.DataFrame(tfidf_train.toarray())
X_test_vect = pd.DataFrame(tfidf_test.toarray())

X_train_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19621,19622,19623,19624,19625,19626,19627,19628,19629,19630
0,0.0,0.0,0.062134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.160482,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
tfidf_vect.vocabulary_

{'the': 17450,
 'service': 15536,
 'be': 2677,
 'poor': 13585,
 '.': 102,
 '-PRON-': 60,
 'have': 8676,
 'to': 17668,
 'get': 8061,
 'up': 18477,
 'utensil': 18549,
 'menu': 11277,
 'look': 10652,
 'like': 10503,
 'drag': 6261,
 'and': 1847,
 'step': 16599,
 'on': 12435,
 'by': 3628,
 'hundred': 9078,
 'of': 12349,
 'people': 13135,
 'therefore': 17476,
 ',': 36,
 'cleanliness': 4445,
 'questionable': 14126,
 'chicken': 4239,
 'tikka': 17605,
 'masala': 11059,
 'tasteless': 17250,
 'wine': 19237,
 'bad': 2467,
 'ever': 6869,
 'taste': 17243,
 'this': 17513,
 'restaurant': 14675,
 'first': 7389,
 'thing': 17499,
 'see': 15445,
 'after': 1553,
 'pass': 12993,
 'security': 15442,
 'for': 7599,
 'canadian': 3755,
 'flight': 7483,
 'in': 9284,
 'gate': 7986,
 '3': 717,
 'suggest': 16848,
 'keep': 9998,
 'walk': 18925,
 '...': 104,
 'do': 6132,
 'not': 12188,
 'stop': 16657,
 '!': 2,
 'husband': 9099,
 "'s": 24,
 'prime': 13829,
 'rib': 14774,
 'good': 8207,
 'fine': 7358,
 '-': 38,
 'just':

## Modelling

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
import time

### Random Forest Classifier (RF)

In [10]:
rf = RandomForestClassifier(n_estimators=400, max_depth=None, n_jobs=-1, random_state=10)

In [11]:
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_rf = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 233.858 seconds
Prediction done in 1.695 seconds


In [12]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_rf, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_rf==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_rf))
print("F1 Score :", f1_score(y_test, y_pred_rf))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.86      0.87      0.87      2000
   positive       0.87      0.86      0.87      2000

avg / total       0.87      0.87      0.87      4000

accuracy: 0.87
ROC AUC Score : 0.8664999999999999
F1 Score : 0.8658291457286432


### Support Vector Machine Classifier (SVM)

In [13]:
clf = svm.LinearSVC(random_state=10)

In [14]:
start = time.time()
clf_fit = clf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_svm = clf_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))




Training done in 5.473 seconds
Prediction done in 0.674 seconds


In [15]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_svm, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_svm==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_svm))
print("f1 Score :", f1_score(y_test, y_pred_svm))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.90      0.89      0.89      2000
   positive       0.89      0.90      0.90      2000

avg / total       0.89      0.89      0.89      4000

accuracy: 0.89
ROC AUC Score : 0.8947499999999999
f1 Score : 0.8950386437297432


### Naive Bayes Classifier (NB)

In [16]:
nbc = MultinomialNB()

In [17]:
start = time.time()
nbc_fit = nbc.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_nb = nbc_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 2.897 seconds
Prediction done in 0.254 seconds


In [18]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_nb, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_nb==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_nb))
print("F1 Score :", f1_score(y_test, y_pred_nb))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.88      0.88      0.88      2000
   positive       0.88      0.87      0.88      2000

avg / total       0.88      0.88      0.88      4000

accuracy: 0.88
ROC AUC Score : 0.8782500000000001
F1 Score : 0.8777303540045192
