# TripAdvisor Activities Dataset

In [19]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import spacy
from pathlib import Path
from matplotlib import pyplot
import numpy as np
import pickle
%matplotlib inline

DATA_PATH=Path('../data')
CLAS_PATH=Path('../data/activities_clas/')
TMP_PATH=Path(CLAS_PATH/'tmp/')
CLAS_PATH.mkdir(exist_ok=True)
TMP_PATH.mkdir(exist_ok=True)

In [20]:
data_train = pd.read_csv(CLAS_PATH/'train.csv', header=None)
data_val = pd.read_csv(CLAS_PATH/'val.csv', header=None)
data_test = pd.read_csv(CLAS_PATH/'test.csv', header=None)
data = pd.concat([data_train, data_val, data_test])
data.columns = ['label', 'body_text']
data['body_text'] = data['body_text'].apply(str)

In [21]:
nlp = spacy.load('en')
def clean_text(text):
#     text = "".join([word.lower() for word in text if word not in string.punctuation])
#     text = "".join([word.lower() for word in text])
    text = nlp(text)
    text = [word.lemma_ for word in text]
    return text

## Feature transformation and regularization

## Train/Test Split

In [22]:
len(data)

28389

In [23]:
neg_data = data[data['label']==0]
pos_data = data[data['label']==1]

neg_data = neg_data.sample(n=10000, random_state=10)
pos_data = pos_data.sample(n=10000, random_state=10)

from sklearn.model_selection import train_test_split

data = pd.concat([neg_data, pos_data])

X_train, X_test, y_train, y_test = train_test_split(data['body_text'], data['label'], test_size=0.2, random_state=10, stratify=data['label'])

print('Training set: ')
print(y_train.value_counts())
print('')
print('Testing set')
print(y_test.value_counts())

Training set: 
1    8000
0    8000
Name: label, dtype: int64

Testing set
1    2000
0    2000
Name: label, dtype: int64


## Vectorizing Text

In [24]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text, ngram_range=(1, 3), max_features=50000, max_df=0.99, min_df=1)
tfidf_vect_fit = tfidf_vect.fit(X_train)

tfidf_train = tfidf_vect_fit.transform(X_train)
tfidf_test = tfidf_vect_fit.transform(X_test)

X_train_vect = pd.DataFrame(tfidf_train.toarray())
X_test_vect = pd.DataFrame(tfidf_test.toarray())

X_train_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21548,21549,21550,21551,21552,21553,21554,21555,21556,21557
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
tfidf_vect.vocabulary_

{'-PRON-': 51,
 'drive': 6968,
 'an': 2214,
 'hour': 10055,
 'and': 2228,
 'a': 1537,
 'half': 9503,
 'to': 19379,
 'get': 8969,
 'there': 19136,
 '.': 80,
 'when': 21017,
 'finally': 8215,
 'arrive': 2538,
 ',': 27,
 'be': 3129,
 'wait': 20742,
 'for': 8477,
 'ticket': 19275,
 'over': 13982,
 '90': 1445,
 'minute': 12737,
 'go': 9090,
 'up': 20293,
 'just': 11130,
 'grab': 9196,
 'something': 17840,
 'eat': 7135,
 'leave': 11593,
 'happen': 9573,
 'upon': 20318,
 'julie': 11100,
 "'s": 21,
 'website': 20919,
 'while': 21031,
 'do': 6781,
 'some': 17831,
 'research': 16152,
 'on': 13750,
 'thing': 19171,
 'in': 10377,
 'san': 16725,
 'diego': 6463,
 'from': 8689,
 'the': 19096,
 'moment': 12867,
 'first': 8264,
 'connect': 5313,
 'with': 21185,
 'know': 11336,
 'that': 19090,
 'have': 9640,
 'find': 8222,
 'gem': 8912,
 'thorough': 19197,
 'responsive': 16198,
 'take': 18822,
 'time': 19318,
 'understand': 20021,
 'what': 20997,
 'want': 20792,
 'experience': 7810,
 'family': 8002,
 'o

## Modelling

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
import time


### Random Forest Classifier (RF)

In [42]:
rf = RandomForestClassifier(n_estimators=400, max_depth=None, n_jobs=-1, random_state=10)

In [43]:
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_rf = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 257.922 seconds
Prediction done in 2.382 seconds


In [60]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_rf, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_rf==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_rf))
print("F1 Score :", f1_score(y_test, y_pred_rf))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.85      0.86      0.86      2000
   positive       0.86      0.85      0.86      2000

avg / total       0.86      0.86      0.86      4000

accuracy: 0.86
ROC AUC Score : 0.856
F1 Score : 0.855421686746988


### Support Vector Machine Classifier (SVM)

In [27]:
clf = svm.LinearSVC(random_state=10)

In [28]:
start = time.time()
clf_fit = clf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_svm = clf_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))


Training done in 5.341 seconds
Prediction done in 0.667 seconds


In [58]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_svm, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_svm==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_svm))
print("f1 Score :", f1_score(y_test, y_pred_svm))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.88      0.88      0.88      2000
   positive       0.88      0.88      0.88      2000

avg / total       0.88      0.88      0.88      4000

accuracy: 0.88
ROC AUC Score : 0.8764999999999998
f1 Score : 0.8766233766233766


### Naive Bayes Classifier (NB)

In [30]:
nbc = MultinomialNB()

In [31]:
start = time.time()
nbc_fit = nbc.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_nb = nbc_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 3.274 seconds
Prediction done in 0.518 seconds


In [59]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_nb, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_nb==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_nb))
print("F1 Score :", f1_score(y_test, y_pred_nb))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.87      0.88      0.88      2000
   positive       0.88      0.87      0.88      2000

avg / total       0.88      0.88      0.88      4000

accuracy: 0.88
ROC AUC Score : 0.878
F1 Score : 0.8773252890899951
