In [12]:
##########################################
# --- Importing the standard libraries ---
##########################################

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [4]:
################################
# --- Importing the Dataset ---
################################

# Hotel review table. The preview below shows a Doc_ID, the raw Review text and
# one numeric rating column per aspect (CLEANLINESS ... OVERALL_RATING).
reviews_csv = '../Prepared Data/Hotel_Reviews.csv'
data = pd.read_csv(reviews_csv)

# Last expression of the cell -> rendered as the cell output.
data.head()

Unnamed: 0,Doc_ID,Review,CLEANLINESS,ROOM,SERVICE,LOCATION,VALUE,OVERALL_RATING
0,china_beijing_holiday_inn_central_plaza,"[""Just about everything about this hotel is fa...",4.786,4.631,4.733,3.553,4.699,4.481
1,china_beijing_hilton_beijing_wangfujing,"['An excellent hotel, with the best room I hav...",4.81,4.845,4.759,4.828,4.517,4.752
2,china_beijing_hotel_g,"['It was chic, everyone was friendly, service ...",4.769,4.75,4.577,4.375,4.654,4.625
3,china_beijing_the_regent_beijing,"[""My parents and I stayed here during their vi...",4.625,4.812,4.438,4.646,4.531,4.61
4,china_beijing_the_st_regis_beijing,['this hotel was fantastic. rooms were lovely....,4.846,4.646,4.615,4.492,4.185,4.557


In [1]:
##############################################
# --- Loading the Corpus of Cleaned Reviews ---
##############################################

# One corpus entry per line of the text file. Line terminators are kept
# as read from the file.
corpus = []
with open('../Corpus/corpus.txt') as corpus_file:
    # Iterating a text file yields its lines one at a time.
    corpus.extend(corpus_file)

## Modelling

In [78]:
################################################
# --- Choosing our Aspect for Classification ---
################################################

# Exactly one aspect has to be active: the Bag of Words cell reads `aspect` to
# build the class labels, so leaving every option commented out makes a fresh
# "Restart & Run All" fail with a NameError.
#
# Scores recorded for each rating column of `data`:
#
#   ROOM              Accuracy Score:-> 60.331 %
#   CLEANLINESS       Accuracy Score:-> 58.197 %
#   SERVICE           Accuracy Score:-> 62.918 %
#   LOCATION          Accuracy Score:-> 61.169 %
#   VALUE             Accuracy Score:-> 59.866 %
#   OVERALL_RATING    Accuracy Scores:-> 70.213 %
#
# NOTE(review): LOCATION is chosen as the default because the saved
# cross-validation output (61.169 %) equals the score recorded for it, so it
# was presumably the aspect used in the last run -- confirm, and change
# ASPECT_COLUMN to any of the column names listed above to model another aspect.
ASPECT_COLUMN = 'LOCATION'

aspect = data[ASPECT_COLUMN]

In [79]:
#########################################
# --- Creating the Bag of Words model ---
#########################################

from sklearn.feature_extraction.text import CountVectorizer

# Token-count features, with the vocabulary capped at 5000 terms.
cv = CountVectorizer(max_features=5000)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()  # dense document-term matrix

# Class labels: the selected aspect rating rounded to a whole number.
y = [int(rating) for rating in aspect.round()]

In [80]:
##################################################################
# --- Splitting the dataset into the Training set and Test set ---
##################################################################

from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [81]:
################################################################################
# --- Fitting the Classifier to the Training set and Predicting the results ---
################################################################################

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Train a multinomial Naive Bayes model on the training split
# (alpha is the additive smoothing parameter).
classifier = MultinomialNB(alpha=1e-5)
classifier.fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = classifier.predict(X_test)

# Report the share of correctly predicted test labels as a percentage
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:->", round(test_accuracy * 100, 3), '%')


# --- Accuracy scores recorded from earlier runs ---

# MultinomialNB      # Accuracy Scores:-> 70.213 % , 65.653 % , 69.301 % , 71.125 %
# SVC                # Accuracy Scores:-> 65.35 %

Accuracy Score:-> 55.319 %


In [82]:
#############################################################
# --- Analyzing the Results using K-Fold Cross Validation ---
#############################################################

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier, run on the training split only.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Summarise the per-fold scores. The mean is printed as a percentage,
# the standard deviation as a raw fraction.
mean_accuracy, std_accuracy = accuracies.mean(), accuracies.std()

print("Average Accuracy Score:->", round(mean_accuracy * 100, 3), '%')
print("Standard Deviation:->", round(std_accuracy, 3))

Average Accuracy Score:-> 61.169 %
Standard Deviation:-> 0.034


In [32]:
#######################################################
# --- Tuning the Hyper Parameters using Grid Search ---
#######################################################

from sklearn.model_selection import GridSearchCV

# Search history for the smoothing parameter `alpha` (each grid tried in turn):
# parameters = [{'alpha': [0.1,0.01,0.001]}]        # Best value: 0.001
# parameters = [{'alpha': [0.001,0.002,0.003]}]     # Best value: 0.001
# parameters = [{'alpha': [0.001,0.0001]}]          # Best value: 0.0001
# parameters = [{'alpha': [0.0001, 0.00001]}]       # Best value: 0.00001
# Final grid -> Best value: 0.00001 (Final Selected Value)
parameters = [{'alpha': [1e-6, 1e-5]}]

# Exhaustive search scored by accuracy with 10-fold cross-validation on the
# training split; n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    classifier,
    parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best mean cross-validated score and the parameter setting that produced it
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print("Best Accuracy Score:->", round(best_accuracy * 100, 3), '%')
print("Best Parameters:->", best_parameters)

Best Accuracy Score:-> 65.779 %
Best Parameters:-> {'alpha': 1e-05}


---