# TripAdvisor Activities Dataset

In [2]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import spacy
from pathlib import Path
from matplotlib import pyplot
import numpy as np
import pickle
%matplotlib inline

# Directory layout, relative to the notebook's working directory.
DATA_PATH = Path('../data')
CLAS_PATH = DATA_PATH / 'general_clas'
TMP_PATH = CLAS_PATH / 'tmp'

# parents=True: create any missing intermediate directory (e.g. ../data on a
# fresh checkout) instead of raising FileNotFoundError.
CLAS_PATH.mkdir(parents=True, exist_ok=True)
TMP_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
# Read the three header-less CSV splits and stack them into a single frame.
data_train, data_val, data_test = (
    pd.read_csv(CLAS_PATH / csv_name, header=None)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)
data = pd.concat([data_train, data_val, data_test])
data.columns = ['label', 'body_text']
# Coerce every review body to a Python str (non-string cells become their str() form).
data['body_text'] = data['body_text'].map(str)

In [4]:
# clean_text only reads token lemmas, which come from the tagger stage of the
# pipeline. The dependency parser and the named-entity recognizer are never
# used, so they are disabled to make every nlp() call cheaper.
nlp = spacy.load('en', disable=['parser', 'ner'])


def clean_text(text):
    """Tokenize ``text`` with spaCy and return the lemma of each token.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        One lemma per spaCy token, in document order. Nothing is filtered
        out: punctuation tokens are kept as-is.
    """
    return [token.lemma_ for token in nlp(text)]

## Feature transformation and regularization

## Train/Test Split

In [5]:
# Total number of reviews across the three concatenated splits.
data.shape[0]

90000

In [6]:
from sklearn.model_selection import train_test_split

# Balance the classes: draw the same number of reviews from each label,
# with a fixed seed so the subsample is repeatable.
neg_data = data.loc[data['label'] == 0].sample(n=10000, random_state=10)
pos_data = data.loc[data['label'] == 1].sample(n=10000, random_state=10)

data = pd.concat([neg_data, pos_data])

# Stratified 80/20 split keeps the 50/50 class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    data['body_text'],
    data['label'],
    test_size=0.2,
    random_state=10,
    stratify=data['label'],
)

# Class counts per partition, one item per output line.
print(
    'Training set: ',
    y_train.value_counts(),
    '',
    'Testing set',
    y_test.value_counts(),
    sep='\n',
)

Training set: 
1    8000
0    8000
Name: label, dtype: int64

Testing set
1    2000
0    2000
Name: label, dtype: int64


## Vectorizing Text

In [7]:
# NOTE: when `analyzer` is a callable, scikit-learn passes each raw document
# straight to it and skips its own n-gram extraction, so the former
# ngram_range=(1, 3) argument never had any effect (the features are the single
# lemmas returned by clean_text). It is dropped here to avoid implying that
# bi-/tri-grams are used; the fitted vocabulary is unchanged.
tfidf_vect = TfidfVectorizer(analyzer=clean_text, max_features=50000, max_df=0.99, min_df=1)

# fit_transform learns the vocabulary and vectorizes the training reviews in a
# single pass, so the (slow) spaCy analyzer runs over X_train once, not twice.
tfidf_train = tfidf_vect.fit_transform(X_train)
# Same object the original `tfidf_vect.fit(X_train)` returned; name kept for later use.
tfidf_vect_fit = tfidf_vect
tfidf_test = tfidf_vect_fit.transform(X_test)

# Dense copies of the sparse TF-IDF matrices, used by the model cells below.
# NOTE(review): densifying is memory-hungry for a vocabulary this size; the
# sparse matrices could likely be passed to the estimators directly -- confirm
# before changing, since it alters the type of X_train_vect / X_test_vect.
X_train_vect = pd.DataFrame(tfidf_train.toarray())
X_test_vect = pd.DataFrame(tfidf_test.toarray())

X_train_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27836,27837,27838,27839,27840,27841,27842,27843,27844,27845
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.083897,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Term -> column-index mapping learned during fit
# (tfidf_vect_fit is the same fitted vectorizer object as tfidf_vect).
tfidf_vect_fit.vocabulary_

{'-PRON-': 92,
 'buy': 5261,
 'this': 24831,
 'for': 10748,
 'samsung': 21627,
 'galaxy': 11178,
 'note': 17482,
 '3': 1082,
 'and': 2930,
 'be': 4077,
 'the': 24695,
 'slow': 22856,
 'charger': 5945,
 'have': 12270,
 'ever': 9742,
 'use': 26273,
 '.': 130,
 'will': 27218,
 'keep': 14461,
 'phone': 18936,
 'from': 11016,
 'die': 8267,
 'but': 5229,
 'percentage': 18777,
 'never': 17233,
 'go': 11546,
 'up': 26176,
 'not': 17471,
 'impressed': 13241,
 'continue': 6985,
 'to': 25082,
 'old': 17794,
 'verizon': 26468,
 'on': 17826,
 'new': 17240,
 'under': 25798,
 'renovation': 20769,
 'right': 21127,
 'now': 17527,
 ',': 39,
 'park': 18491,
 'so': 23002,
 'lovely': 15449,
 'with': 27308,
 'many': 15795,
 'thing': 24812,
 'do': 8626,
 'in': 13262,
 'one': 17838,
 'place': 19123,
 'if': 13093,
 'like': 15145,
 'outdoor': 18095,
 'without': 27317,
 'feel': 10269,
 'city': 6287,
 'an': 2909,
 'individual': 13418,
 'family': 10118,
 'or': 17936,
 'friend': 10969,
 'all': 2695,
 'year': 27599,

## Modelling

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
from sklearn.metrics import classification_report
import time

### Random Forest Classifier (RF)

In [17]:
# Random forest: 400 trees with no depth limit, trained on all available
# cores, with a fixed seed for repeatable results.
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    n_jobs=-1,
    random_state=10,
)

In [18]:
# Time fitting and prediction separately. time.perf_counter() is used instead
# of time.time(): it is a monotonic, high-resolution clock meant for measuring
# durations, whereas wall-clock time can jump if the system clock is adjusted.
start = time.perf_counter()
rf_model = rf.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = end - start
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.perf_counter()
y_pred_rf = rf_model.predict(X_test_vect)
end = time.perf_counter()
pred_time = end - start
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 386.004 seconds
Prediction done in 1.785 seconds


In [19]:
# Held-out metrics for the random forest.
print('Classification Report', '----------------------', sep='\n')
print(classification_report(
    y_test,
    y_pred_rf,
    labels=[0, 1],
    target_names=['negative', 'positive'],
))
# Fraction of test reviews whose predicted label equals the true label.
print('accuracy: {}'.format(round((y_pred_rf == y_test).mean(), 2)))
# NOTE(review): y_pred_rf holds hard 0/1 labels from .predict(), not scores or
# probabilities, so this AUC is computed from a single operating point.
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_rf))
print("F1 Score :", f1_score(y_test, y_pred_rf))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.84      0.86      0.85      2000
   positive       0.86      0.83      0.85      2000

avg / total       0.85      0.85      0.85      4000

accuracy: 0.85
ROC AUC Score : 0.84875
F1 Score : 0.8462515883100381


### Support Vector Machine Classifier (SVM)

In [11]:
# Linear SVM with library-default hyperparameters; only the seed is fixed.
clf = svm.LinearSVC(random_state=10)

In [12]:
# Time fitting and prediction separately. time.perf_counter() is used instead
# of time.time(): it is a monotonic, high-resolution clock meant for measuring
# durations, whereas wall-clock time can jump if the system clock is adjusted.
start = time.perf_counter()
clf_fit = clf.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = end - start
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.perf_counter()
y_pred_svm = clf_fit.predict(X_test_vect)
end = time.perf_counter()
pred_time = end - start
print("Prediction done in {} seconds".format(round(pred_time, 3)))




Training done in 6.017 seconds
Prediction done in 1.117 seconds


In [13]:
# Held-out metrics for the linear SVM.
print('Classification Report', '----------------------', sep='\n')
print(classification_report(
    y_test,
    y_pred_svm,
    labels=[0, 1],
    target_names=['negative', 'positive'],
))
# Fraction of test reviews whose predicted label equals the true label.
print('accuracy: {}'.format(round((y_pred_svm == y_test).mean(), 2)))
# NOTE(review): y_pred_svm holds hard 0/1 labels from .predict(), not decision
# scores, so this AUC is computed from a single operating point.
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_svm))
print("f1 Score :", f1_score(y_test, y_pred_svm))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.89      0.87      0.88      2000
   positive       0.87      0.89      0.88      2000

avg / total       0.88      0.88      0.88      4000

accuracy: 0.88
ROC AUC Score : 0.87875
f1 Score : 0.8798017348203222


### Naive Bayes Classifier (NB)

In [14]:
# Multinomial Naive Bayes with library-default settings (no arguments overridden).
nbc = MultinomialNB()

In [15]:
# Time fitting and prediction separately. time.perf_counter() is used instead
# of time.time(): it is a monotonic, high-resolution clock meant for measuring
# durations, whereas wall-clock time can jump if the system clock is adjusted.
start = time.perf_counter()
nbc_fit = nbc.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = end - start
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.perf_counter()
y_pred_nb = nbc_fit.predict(X_test_vect)
end = time.perf_counter()
pred_time = end - start
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 3.733 seconds
Prediction done in 0.938 seconds


In [16]:
# Held-out metrics for the Naive Bayes classifier.
print('Classification Report', '----------------------', sep='\n')
print(classification_report(
    y_test,
    y_pred_nb,
    labels=[0, 1],
    target_names=['negative', 'positive'],
))
# Fraction of test reviews whose predicted label equals the true label.
print('accuracy: {}'.format(round((y_pred_nb == y_test).mean(), 2)))
# NOTE(review): y_pred_nb holds hard 0/1 labels from .predict(), not class
# probabilities, so this AUC is computed from a single operating point.
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_nb))
print("F1 Score :", f1_score(y_test, y_pred_nb))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.84      0.86      0.85      2000
   positive       0.85      0.84      0.84      2000

avg / total       0.85      0.85      0.85      4000

accuracy: 0.85
ROC AUC Score : 0.84625
F1 Score : 0.8446577418540036
