# Restaurant Review Classification Based on 3 Aspects

Micheila Jiemesha - 0706012110032
<br>Marsha Alexis Likorawung - 0706012110034
<br>Michelle Swastika Bianglala Nusantara - 0706012110002
<br>Rifqie Tilqa Reamizard - 0706012110025

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import joblib
import re
from nltk.corpus import stopwords

In [None]:
# Load the labelled restaurant reviews into a DataFrame.
data = pd.read_csv(filepath_or_buffer='restaurant_reviews_labelled.csv')

In [None]:
_ENGLISH_STOP_WORDS = None  # filled lazily on first use, then reused


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Normalize a review for vectorization.

    Non-word characters are replaced with spaces, the text is lower-cased,
    whitespace runs are collapsed and stop words are dropped.

    Args:
        text: Raw review text. ``None`` and NaN (e.g. a missing cell) yield an
            empty string; any other non-string is converted with ``str``.
        stop_words: Optional collection of lower-case words to remove.
            Defaults to the NLTK English stop-word list, which is loaded once
            and reused across calls.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = re.sub(r'\W', ' ', text).lower()
    # str.split() already collapses any run of whitespace, so joining the
    # surviving words with single spaces gives the normalized string.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Clean every review in place so the later steps work on normalized text.
data['Review'] = data['Review'].map(preprocess_text)

In [None]:
# Persist the cleaned reviews (without the index column) for reuse.
data.to_csv(path_or_buf='restaurant_reviews_preprocessed.csv', index=False)

In [None]:
# The three aspect columns; each one is modelled as its own target.
targets = [
    'Food & Drinks Quality & Price',
    'General (Ambience, Entertainment, & Experience)',
    'Service',
]
# A missing label for an aspect is replaced with 'Neutral'.
data[targets] = data[targets].fillna('Neutral')
datasets = {}

In [None]:
# Label distribution for the first aspect in `targets`.
data.loc[:, targets[0]].value_counts()

Food & Drinks Quality & Price
Good       3405
Bad        1307
Neutral     524
Name: count, dtype: int64

In [None]:
# Label distribution for the second aspect in `targets`.
data.loc[:, targets[1]].value_counts()

General (Ambience, Entertainment, & Experience)
Good       2760
Neutral    1384
Bad        1092
Name: count, dtype: int64

In [None]:
# Label distribution for the third aspect in `targets`.
data.loc[:, targets[2]].value_counts()

Service
Good       2594
Neutral    1550
Bad        1092
Name: count, dtype: int64

In [None]:
# Missing labels were already replaced with 'Neutral', so no rows are dropped
# here: each aspect gets the review text plus its own label column.
datasets = {target: data[['Review', target]] for target in targets}

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import joblib

def train_and_evaluate_model(datasets, model_class, **model_params):
    """Train, evaluate and persist one classifier per aspect.

    For every aspect in ``datasets`` the reviews are split 80/20, vectorized
    with TF-IDF (fitted on the training split only), the training split is
    oversampled with SMOTE, and a fresh ``model_class(**model_params)`` is
    fitted. Accuracy and a classification report for the held-out split are
    printed, and the model and vectorizer are written with joblib to
    ``'<aspect>_model.pkl'`` and ``'<aspect>_vectorizer.pkl'``.

    Args:
        datasets: Mapping of aspect name -> DataFrame holding a 'Review'
            column and a column named after the aspect whose values are
            'Good', 'Neutral' or 'Bad'.
        model_class: Estimator class exposing ``fit`` and ``predict``.
        **model_params: Keyword arguments forwarded to ``model_class``.

    Returns:
        dict mapping each aspect name to ``(fitted model, fitted vectorizer)``.

    Raises:
        ValueError: If an aspect column holds anything other than the three
            expected labels (including missing values).
    """
    label_to_id = {'Good': 2, 'Neutral': 1, 'Bad': 0}
    models = {}

    for target, dataset in datasets.items():
        X = dataset['Review']
        y = dataset[target].map(label_to_id)

        # Series.map turns every value outside label_to_id into NaN. Fail here
        # with a clear message instead of handing NaN targets to SMOTE/the model.
        unmapped = y.isna()
        if unmapped.any():
            unexpected = sorted({repr(value) for value in dataset.loc[unmapped, target].unique()})
            raise ValueError(
                f"Unexpected label(s) in '{target}': {', '.join(unexpected)}; "
                f"expected one of {sorted(label_to_id)}"
            )

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the vocabulary on the training split only so the test split
        # stays unseen.
        vectorizer = TfidfVectorizer(max_features=5000)
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # Oversample the training split only; the test split keeps its
        # original class distribution.
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vec, y_train)

        model = model_class(**model_params)
        model.fit(X_train_resampled, y_train_resampled)

        y_pred = model.predict(X_test_vec)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'Accuracy for {target}: {accuracy:.2f}')
        print(f'Performance for {target}:')
        print(classification_report(y_test, y_pred))

        joblib.dump(model, f'{target}_model.pkl')
        joblib.dump(vectorizer, f'{target}_vectorizer.pkl')
        models[target] = (model, vectorizer)
        print()

    return models

In [None]:
def predict_review(review, models):
    """Predict the sentiment label of one review for every trained aspect.

    Args:
        review: Review text to classify.
        models: Mapping of aspect name -> ``(fitted model, fitted vectorizer)``.

    Returns:
        dict mapping each aspect name to 'Good', 'Neutral' or 'Bad'.

    Raises:
        KeyError: If a model predicts a class id other than 0, 1 or 2.
    """
    # NOTE(review): the training reviews are passed through preprocess_text
    # before vectorization, while `review` is vectorized as given -- confirm
    # that this is intended.
    label_names = {2: 'Good', 1: 'Neutral', 0: 'Bad'}
    predictions = {}

    for target, (model, vectorizer) in models.items():
        review_vec = vectorizer.transform([review])
        # Some estimators return predictions as a 2-D array (seen with the
        # CatBoost models), so `predict(...)[0]` is itself an array and cannot
        # be used as a dict key. Flatten first and cast to a plain int.
        class_id = int(np.asarray(model.predict(review_vec)).ravel()[0])
        predictions[target] = label_names[class_id]

    return predictions


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# One logistic-regression classifier per aspect; the keyword arguments are
# forwarded to the LogisticRegression constructor.
lr_models = train_and_evaluate_model(
    datasets,
    LogisticRegression,
    random_state=42,
    max_iter=1000,
)

Accuracy for Food & Drinks Quality & Price: 0.79
Performance for Food & Drinks Quality & Price:
              precision    recall  f1-score   support

           0       0.75      0.78      0.76       251
           1       0.36      0.43      0.39       109
           2       0.90      0.86      0.88       688

    accuracy                           0.79      1048
   macro avg       0.67      0.69      0.68      1048
weighted avg       0.81      0.79      0.80      1048


Accuracy for General (Ambience, Entertainment, & Experience): 0.70
Performance for General (Ambience, Entertainment, & Experience):
              precision    recall  f1-score   support

           0       0.65      0.70      0.67       222
           1       0.55      0.57      0.56       268
           2       0.80      0.76      0.78       558

    accuracy                           0.70      1048
   macro avg       0.67      0.68      0.67      1048
weighted avg       0.70      0.70      0.70      1048


Accuracy

In [None]:
# Sample review used to spot-check the logistic-regression models.
test_review = (
    "We had a 5PM reservation on Saturday and arrived at 4:54. "
    "We were told seating began at 5PM and were asked to wait. "
    "Ten minutes later, we were still standing along with a minimum 50 others "
    "in a very small and increasingly crowded entryway and no one was being seated, "
    "although 4 hosts/hostesses talked among themselves standing idly behind the reception desk. "
    "Progress was not evident. "
    "We had a 7PM concert, so we left and were happily accommodated elsewhere."
)

lr_prediction = predict_review(test_review, lr_models)
print(lr_prediction)

{'Food & Drinks Quality & Price': 'Neutral', 'General (Ambience, Entertainment, & Experience)': 'Bad', 'Service': 'Bad'}


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# One random-forest classifier (100 trees) per aspect; the keyword arguments
# are forwarded to the RandomForestClassifier constructor.
rf_models = train_and_evaluate_model(
    datasets,
    RandomForestClassifier,
    random_state=42,
    n_estimators=100,
)

Accuracy for Food & Drinks Quality & Price: 0.77
Performance for Food & Drinks Quality & Price:
              precision    recall  f1-score   support

           0       0.67      0.69      0.68       251
           1       0.10      0.01      0.02       109
           2       0.81      0.92      0.86       688

    accuracy                           0.77      1048
   macro avg       0.53      0.54      0.52      1048
weighted avg       0.70      0.77      0.73      1048


Accuracy for General (Ambience, Entertainment, & Experience): 0.66
Performance for General (Ambience, Entertainment, & Experience):
              precision    recall  f1-score   support

           0       0.60      0.51      0.55       222
           1       0.56      0.54      0.55       268
           2       0.73      0.78      0.75       558

    accuracy                           0.66      1048
   macro avg       0.63      0.61      0.62      1048
weighted avg       0.66      0.66      0.66      1048


Accuracy

In [None]:
# Sample review used to spot-check the random-forest models.
test_review = (
    "We had a 5PM reservation on Saturday and arrived at 4:54. "
    "We were told seating began at 5PM and were asked to wait. "
    "Ten minutes later, we were still standing along with a minimum 50 others "
    "in a very small and increasingly crowded entryway and no one was being seated, "
    "although 4 hosts/hostesses talked among themselves standing idly behind the reception desk. "
    "Progress was not evident. "
    "We had a 7PM concert, so we left and were happily accommodated elsewhere."
)

rf_prediction = predict_review(test_review, rf_models)
print(rf_prediction)

{'Food & Drinks Quality & Price': 'Good', 'General (Ambience, Entertainment, & Experience)': 'Neutral', 'Service': 'Bad'}


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# One unrestricted-depth decision tree per aspect; the keyword arguments are
# forwarded to the DecisionTreeClassifier constructor.
dt_models = train_and_evaluate_model(
    datasets,
    DecisionTreeClassifier,
    random_state=42,
    max_depth=None,
)

Accuracy for Food & Drinks Quality & Price: 0.67
Performance for Food & Drinks Quality & Price:
              precision    recall  f1-score   support

           0       0.55      0.57      0.56       251
           1       0.24      0.32      0.27       109
           2       0.81      0.76      0.78       688

    accuracy                           0.67      1048
   macro avg       0.53      0.55      0.54      1048
weighted avg       0.69      0.67      0.68      1048


Accuracy for General (Ambience, Entertainment, & Experience): 0.57
Performance for General (Ambience, Entertainment, & Experience):
              precision    recall  f1-score   support

           0       0.48      0.51      0.50       222
           1       0.42      0.51      0.46       268
           2       0.71      0.62      0.66       558

    accuracy                           0.57      1048
   macro avg       0.54      0.55      0.54      1048
weighted avg       0.59      0.57      0.58      1048


Accuracy

In [None]:
# Sample review used to spot-check the decision-tree models.
test_review = (
    "We had a 5PM reservation on Saturday and arrived at 4:54. "
    "We were told seating began at 5PM and were asked to wait. "
    "Ten minutes later, we were still standing along with a minimum 50 others "
    "in a very small and increasingly crowded entryway and no one was being seated, "
    "although 4 hosts/hostesses talked among themselves standing idly behind the reception desk. "
    "Progress was not evident. "
    "We had a 7PM concert, so we left and were happily accommodated elsewhere."
)

dt_prediction = predict_review(test_review, dt_models)
print(dt_prediction)

{'Food & Drinks Quality & Price': 'Neutral', 'General (Ambience, Entertainment, & Experience)': 'Bad', 'Service': 'Good'}


## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# One 10-nearest-neighbours classifier per aspect; the keyword argument is
# forwarded to the KNeighborsClassifier constructor.
knn_models = train_and_evaluate_model(
    datasets,
    KNeighborsClassifier,
    n_neighbors=10,
)

Accuracy for Food & Drinks Quality & Price: 0.24
Performance for Food & Drinks Quality & Price:
              precision    recall  f1-score   support

           0       0.44      0.64      0.52       251
           1       0.12      0.74      0.21       109
           2       1.00      0.01      0.02       688

    accuracy                           0.24      1048
   macro avg       0.52      0.46      0.25      1048
weighted avg       0.77      0.24      0.16      1048


Accuracy for General (Ambience, Entertainment, & Experience): 0.32
Performance for General (Ambience, Entertainment, & Experience):
              precision    recall  f1-score   support

           0       0.34      0.84      0.49       222
           1       0.31      0.57      0.40       268
           2       0.33      0.00      0.00       558

    accuracy                           0.32      1048
   macro avg       0.33      0.47      0.30      1048
weighted avg       0.33      0.32      0.21      1048


Accuracy

In [None]:
# Sample review used to spot-check the KNN models.
test_review = (
    "We had a 5PM reservation on Saturday and arrived at 4:54. "
    "We were told seating began at 5PM and were asked to wait. "
    "Ten minutes later, we were still standing along with a minimum 50 others "
    "in a very small and increasingly crowded entryway and no one was being seated, "
    "although 4 hosts/hostesses talked among themselves standing idly behind the reception desk. "
    "Progress was not evident. "
    "We had a 7PM concert, so we left and were happily accommodated elsewhere."
)

knn_prediction = predict_review(test_review, knn_models)
print(knn_prediction)

{'Food & Drinks Quality & Price': 'Neutral', 'General (Ambience, Entertainment, & Experience)': 'Bad', 'Service': 'Bad'}


## XGBoost

In [None]:
from xgboost import XGBClassifier

# Each aspect has three classes (Bad / Neutral / Good), so use the
# multi-class log-loss metric; 'logloss' is XGBoost's binary variant.
xgb_models = train_and_evaluate_model(
    datasets=datasets,
    model_class=XGBClassifier,
    eval_metric='mlogloss',
    random_state=42
)

Accuracy for Food & Drinks Quality & Price: 0.78
Performance for Food & Drinks Quality & Price:
              precision    recall  f1-score   support

           0       0.72      0.71      0.72       251
           1       0.32      0.19      0.24       109
           2       0.84      0.90      0.87       688

    accuracy                           0.78      1048
   macro avg       0.63      0.60      0.61      1048
weighted avg       0.76      0.78      0.77      1048


Accuracy for General (Ambience, Entertainment, & Experience): 0.67
Performance for General (Ambience, Entertainment, & Experience):
              precision    recall  f1-score   support

           0       0.65      0.59      0.62       222
           1       0.51      0.60      0.55       268
           2       0.77      0.74      0.76       558

    accuracy                           0.67      1048
   macro avg       0.65      0.64      0.64      1048
weighted avg       0.68      0.67      0.67      1048


Accuracy

In [None]:
# Sample review used to spot-check the XGBoost models.
test_review = (
    "We had a 5PM reservation on Saturday and arrived at 4:54. "
    "We were told seating began at 5PM and were asked to wait. "
    "Ten minutes later, we were still standing along with a minimum 50 others "
    "in a very small and increasingly crowded entryway and no one was being seated, "
    "although 4 hosts/hostesses talked among themselves standing idly behind the reception desk. "
    "Progress was not evident. "
    "We had a 7PM concert, so we left and were happily accommodated elsewhere."
)

xgb_prediction = predict_review(test_review, xgb_models)
print(xgb_prediction)

{'Food & Drinks Quality & Price': 'Good', 'General (Ambience, Entertainment, & Experience)': 'Bad', 'Service': 'Bad'}


## LightGBM

In [None]:
from lightgbm import LGBMClassifier

# One gradient-boosted LightGBM classifier per aspect; the keyword arguments
# are forwarded to the LGBMClassifier constructor.
lgbm_models = train_and_evaluate_model(
    datasets,
    LGBMClassifier,
    random_state=42,
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081703 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 130654
[LightGBM] [Info] Number of data points in the train set: 8151, number of used features: 2786
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Accuracy for Food & Drinks Quality & Price: 0.79
Performance for Food & Drinks Quality & Price:
              precision    recall  f1-score   support

           0       0.73      0.72      0.72       251
           1       0.33      0.24      0.28       109
           2       0.86      0.90      0.88       688

    accuracy                           0.79      1048
   macro avg       0.64      0.62      0.63      1048
weighted avg       0.77      0.79      0.78      1048


[Lig

In [None]:
# Sample review used to spot-check the LightGBM models.
test_review = (
    "We had a 5PM reservation on Saturday and arrived at 4:54. "
    "We were told seating began at 5PM and were asked to wait. "
    "Ten minutes later, we were still standing along with a minimum 50 others "
    "in a very small and increasingly crowded entryway and no one was being seated, "
    "although 4 hosts/hostesses talked among themselves standing idly behind the reception desk. "
    "Progress was not evident. "
    "We had a 7PM concert, so we left and were happily accommodated elsewhere."
)

lgbm_prediction = predict_review(test_review, lgbm_models)
print(lgbm_prediction)

{'Food & Drinks Quality & Price': 'Good', 'General (Ambience, Entertainment, & Experience)': 'Neutral', 'Service': 'Bad'}


## CatBoost

In [None]:
from catboost import CatBoostClassifier

# One CatBoost classifier per aspect, trained silently (verbose=0); the
# keyword arguments are forwarded to the CatBoostClassifier constructor.
catboost_models = train_and_evaluate_model(
    datasets,
    CatBoostClassifier,
    random_seed=42,
    verbose=0,
    iterations=500,
    learning_rate=0.1,
    depth=6,
)

Accuracy for Food & Drinks Quality & Price: 0.77
Performance for Food & Drinks Quality & Price:
              precision    recall  f1-score   support

           0       0.72      0.67      0.70       251
           1       0.29      0.22      0.25       109
           2       0.84      0.89      0.87       688

    accuracy                           0.77      1048
   macro avg       0.62      0.59      0.60      1048
weighted avg       0.75      0.77      0.76      1048


Accuracy for General (Ambience, Entertainment, & Experience): 0.68
Performance for General (Ambience, Entertainment, & Experience):
              precision    recall  f1-score   support

           0       0.70      0.59      0.64       222
           1       0.51      0.69      0.59       268
           2       0.81      0.72      0.76       558

    accuracy                           0.68      1048
   macro avg       0.67      0.67      0.66      1048
weighted avg       0.71      0.68      0.69      1048


Accuracy

In [None]:
# Sample review used to spot-check the CatBoost models.
test_review = (
    "We had a 5PM reservation on Saturday and arrived at 4:54. "
    "We were told seating began at 5PM and were asked to wait. "
    "Ten minutes later, we were still standing along with a minimum 50 others "
    "in a very small and increasingly crowded entryway and no one was being seated, "
    "although 4 hosts/hostesses talked among themselves standing idly behind the reception desk. "
    "Progress was not evident. "
    "We had a 7PM concert, so we left and were happily accommodated elsewhere."
)

catboost_prediction = predict_review(test_review, catboost_models)
print(catboost_prediction)

TypeError: unhashable type: 'numpy.ndarray'