In [None]:
# Install spaCy and download the en_core_web_lg English model that is loaded
# later in this notebook.
# NOTE(review): neither install is version-pinned; the recorded output shows
# en_core_web_lg 2.2.5 - consider pinning for reproducibility.
!pip install spacy
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=2b615d28ef5f9043b43a7f4e8cad0e61e9dd3a6702322ba3d4682e25ada21c04
  Stored in directory: /tmp/pip-ephem-wheel-cache-q3dhm5pz/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
import re
import numpy as np
import pandas as pd
import string
import nltk

In [None]:
# Fetch the NLTK resources in the same order as before; the cell still
# displays the status returned by the last download call.
nltk_download_status = [nltk.download(resource) for resource in ('punkt', 'stopwords')]
nltk_download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Data Exploration

In [None]:
# Read the reviews CSV from the working directory and preview the first rows.
REVIEWS_CSV = "tripadvisor_hotel_reviews.csv"
data = pd.read_csv(REVIEWS_CSV)
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


# Preprocessing

## Drop NaN

In [None]:
# Per-column count of missing values (isna is the canonical spelling of isnull).
data.isna().sum()

Review    0
Rating    0
dtype: int64

In [None]:
# Drop rows with any missing value and renumber the index from 0 in a single
# chained expression, then re-count missing values per column.
data = data.dropna().reset_index(drop=True)
data.isna().sum()

Review    0
Rating    0
dtype: int64

## Drop Duplicates

In [None]:
# Number of rows that repeat an earlier row; the first occurrence is not counted.
data.duplicated(keep='first').sum()


0

In [None]:
# Remove repeated rows and renumber the index from 0 in a single chained
# expression, then re-count duplicates.
data = data.drop_duplicates().reset_index(drop=True)
data.duplicated(keep='first').sum()


0

## Create Label

In [None]:
def create_label(x):
    """Map a numeric rating to a binary label.

    Returns 1 when ``x >= 3`` and 0 when ``x < 3``. A value for which both
    comparisons are false (e.g. NaN) yields ``None``.
    """
    if x >= 3:
        return 1
    # Not simply `else 0`: NaN fails both comparisons and must map to None.
    return 0 if x < 3 else None

In [None]:
# Derive the binary label from each rating; the function is passed directly
# instead of being wrapped in a lambda.
data['label'] = data['Rating'].apply(create_label)
data.head()

Unnamed: 0,Review,Rating,label
0,nice hotel expensive parking got good deal sta...,4,1
1,ok nothing special charge diamond member hilto...,2,0
2,nice rooms not 4* experience hotel monaco seat...,3,1
3,"unique, great stay, wonderful time hotel monac...",5,1
4,"great stay great stay, went seahawk game aweso...",5,1


## Cleaning


In [None]:
# English stopword list and Porter stemmer shared by clean_text.
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# Frozen copy of the stopword list so each membership test is O(1).
_stopword_set = frozenset(stopwords)


def clean_text(text):
    """Strip punctuation, drop English stopwords and Porter-stem the rest.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining stemmed tokens joined by single spaces, with no
        leading or trailing whitespace.
    """
    # Remove every character listed in string.punctuation before tokenising.
    text = "".join([c for c in text if c not in string.punctuation])
    # Split into words. The pattern is a raw string so that `\W` is not
    # treated as an (invalid) string escape sequence.
    token = re.split(r'\W+', text)
    # re.split yields '' when the text starts or ends with a separator; those
    # empty tokens are skipped so they do not add stray spaces to the output.
    # NOTE(review): the stemmed output is later passed to spaCy word vectors;
    # stems such as "expens" are presumably out-of-vocabulary there - verify.
    text = " ".join(
        [ps.stem(word) for word in token if word and word not in _stopword_set]
    )
    return text


In [None]:
# Clean every review; the function is passed directly instead of via a lambda.
data['review_clean'] = data['Review'].apply(clean_text)
data.head()

Unnamed: 0,Review,Rating,label,review_clean
0,nice hotel expensive parking got good deal sta...,4,1,nice hotel expens park got good deal stay hote...
1,ok nothing special charge diamond member hilto...,2,0,ok noth special charg diamond member hilton de...
2,nice rooms not 4* experience hotel monaco seat...,3,1,nice room 4 experi hotel monaco seattl good ho...
3,"unique, great stay, wonderful time hotel monac...",5,1,uniqu great stay wonder time hotel monaco loca...
4,"great stay great stay, went seahawk game aweso...",5,1,great stay great stay went seahawk game awesom...


# Feature Extraction (spaCy Word Vectors)

In [None]:
# spaCy plus the en_core_web_lg model package installed by the first cell.
import spacy
import en_core_web_lg
# Load the pipeline once; get_vec reuses this `nlp` object for every review.
nlp = en_core_web_lg.load()

In [None]:
def get_vec(x):
    """Run the text ``x`` through the loaded spaCy pipeline and return the
    resulting document's ``vector`` attribute."""
    return nlp(x).vector

In [None]:
# Compute one document vector per cleaned review; the function is passed
# directly instead of being wrapped in a lambda.
data['vec'] = data['review_clean'].apply(get_vec)
data.head()

Unnamed: 0,Review,Rating,label,review_clean,vec
0,nice hotel expensive parking got good deal sta...,4,1,nice hotel expens park got good deal stay hote...,"[0.08455361, 0.12279382, -0.11445145, -0.13207..."
1,ok nothing special charge diamond member hilto...,2,0,ok noth special charg diamond member hilton de...,"[0.01376342, 0.11431644, -0.08469456, -0.08688..."
2,nice rooms not 4* experience hotel monaco seat...,3,1,nice room 4 experi hotel monaco seattl good ho...,"[0.112627976, 0.09836831, -0.00865985, -0.1204..."
3,"unique, great stay, wonderful time hotel monac...",5,1,uniqu great stay wonder time hotel monaco loca...,"[0.08088346, 0.07656127, -0.0533279, -0.135253..."
4,"great stay great stay, went seahawk game aweso...",5,1,great stay great stay went seahawk game awesom...,"[0.010666103, 0.02413493, -0.067824185, -0.111..."


In [None]:
# Pull the per-review vectors out of the frame as a single-column array
# (one row per review).
X = data['vec'].to_numpy().reshape(-1, 1)

In [None]:
# Stack the per-review vectors row-wise into one 2-D feature matrix.
# The matrix is rebuilt from data['vec'] rather than from the previous value
# of X, so re-running this cell is safe, and the vector width is inferred
# instead of being hard-coded.
X = np.vstack(data['vec'].tolist())
X.shape

(20491, 300)

# Split Data

In [None]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split (and every metric below) reproducible
# on Restart & Run All. stratify keeps the label ratio equal in both parts,
# which matters because the two classes are imbalanced.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X,
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# Model

## Gaussian NB

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training vectors (fit returns the estimator
# itself), then predict labels for the held-out vectors.
model = GaussianNB().fit(X_Train, Y_Train)
Y_pred = model.predict(X_Test)
Y_pred

array([1, 1, 1, ..., 0, 0, 1])

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Report accuracy as a percentage, followed by the confusion matrix and the
# per-class precision/recall/F1 table for the Gaussian NB predictions.
gnb_accuracy_pct = accuracy_score(Y_Test, Y_pred)*100
print("Hasil Akurasi Gaussian Naive Bayes:", gnb_accuracy_pct, "%")
print("Confusion Matrix", confusion_matrix(Y_Test, Y_pred), sep="\n")
print("Classification report", classification_report(Y_Test, Y_pred), sep="\n")

Hasil Akurasi Gaussian Naive Bayes: 78.16540619663333 %
Confusion Matrix
[[ 521  105]
 [ 790 2683]]
Classification report
              precision    recall  f1-score   support

           0       0.40      0.83      0.54       626
           1       0.96      0.77      0.86      3473

    accuracy                           0.78      4099
   macro avg       0.68      0.80      0.70      4099
weighted avg       0.88      0.78      0.81      4099



## Multinomial NB

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1] so the values are non-negative.
scaler = MinMaxScaler()
# Learn the per-feature min/max from the training data only.
X_train2 = scaler.fit_transform(X_Train)
# Apply the training min/max to the test data. Calling fit_transform here
# would refit the scaler on the test set, scaling train and test differently
# and leaking test statistics. Test values outside the training range are
# clipped so the features stay within [0, 1].
X_test2 = np.clip(scaler.transform(X_Test), 0.0, 1.0)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit Multinomial Naive Bayes on the min-max-scaled training vectors (fit
# returns the estimator itself), then predict labels for the scaled test set.
model = MultinomialNB().fit(X_train2, Y_Train)
Y_pred2 = model.predict(X_test2)
Y_pred2

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Report accuracy as a percentage, followed by the confusion matrix and the
# per-class precision/recall/F1 table for the Multinomial NB predictions.
mnb_accuracy_pct = accuracy_score(Y_Test, Y_pred2)*100
print("Hasil Akurasi Multinomial Naive Bayes:", mnb_accuracy_pct, "%")
print("Confusion Matrix", confusion_matrix(Y_Test, Y_pred2), sep="\n")
print("Classification report", classification_report(Y_Test, Y_pred2), sep="\n")

Hasil Akurasi Multinomial Naive Bayes: 84.72798243474018 %
Confusion Matrix
[[   0  626]
 [   0 3473]]
Classification report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       626
           1       0.85      1.00      0.92      3473

    accuracy                           0.85      4099
   macro avg       0.42      0.50      0.46      4099
weighted avg       0.72      0.85      0.78      4099



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with default settings on the unscaled
# training vectors (fit returns the estimator itself), then predict labels
# for the held-out vectors.
model = SVC().fit(X_Train, Y_Train)
Y_pred3 = model.predict(X_Test)
Y_pred3

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Report accuracy as a percentage, followed by the confusion matrix and the
# per-class precision/recall/F1 table for the SVM predictions.
svm_accuracy_pct = accuracy_score(Y_Test, Y_pred3)*100
print("Hasil Akurasi SVM:", svm_accuracy_pct, "%")
print("Confusion Matrix", confusion_matrix(Y_Test, Y_pred3), sep="\n")
print("Classification report", classification_report(Y_Test, Y_pred3), sep="\n")

Hasil Akurasi SVM: 91.87606733349597 %
Confusion Matrix
[[ 366  260]
 [  73 3400]]
Classification report
              precision    recall  f1-score   support

           0       0.83      0.58      0.69       626
           1       0.93      0.98      0.95      3473

    accuracy                           0.92      4099
   macro avg       0.88      0.78      0.82      4099
weighted avg       0.91      0.92      0.91      4099

