In [2]:
import pandas as pd

In [3]:
dataset = pd.read_csv("hotel reviews.csv")

In [4]:
combined_reviews = pd.concat([dataset['Positive_Review'], dataset['Negative_Review']], ignore_index=True)
labels = ['positive'] * len(dataset['Positive_Review']) + ['negative'] * len(dataset['Negative_Review'])

dataset = pd.DataFrame({
    'Review': combined_reviews,
    'Label': labels
})

In [5]:
dataset = dataset[~dataset['Review'].isin(['No Positive', 'No Negative'])]

print(dataset.head(5))
print(dataset.tail(5))


                                              Review     Label
0   Only the park outside of the hotel was beauti...  positive
1   No real complaints the hotel was great great ...  positive
2   Location was good and staff were ok It is cut...  positive
3   Great location in nice surroundings the bar a...  positive
4    Amazing location and building Romantic setting   positive
                                                    Review     Label
1031469   No parking Public parking garage is 15 Euro p...  negative
1031471   no trolly or staff to help you take the lugga...  negative
1031472           The hotel looks like 3 but surely not 4   negative
1031473   The ac was useless It was a hot week in vienn...  negative
1031475       I was in 3rd floor It didn t work Free Wife   negative


In [6]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download

In [7]:
download('punkt') 
download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Mukhammad
[nltk_data]     Rizki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Mukhammad
[nltk_data]     Rizki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
default_stopwords = set(stopwords.words('english'))

def preprocess_review(review):
    review = re.sub(r'#\w+', '', review)

    review = re.sub(r'[^\w\s]', '', review)

    review = re.sub(r'\s+', ' ', review).strip()

    tokens = word_tokenize(review)

    processed_tokens = [word for word in tokens if word.lower() not in default_stopwords]

    return ' '.join(processed_tokens)

In [9]:
dataset['Review'] = dataset['Review'].apply(preprocess_review)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [11]:
model = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())  # Model Naive Bayes
])

In [12]:
x_train, x_test, y_train, y_test = train_test_split(dataset['Review'], dataset['Label'], test_size=0.2, random_state=42)

In [13]:
model.fit(x_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', MultinomialNB())])

In [14]:
y_pred = model.predict(x_test)

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='positive', average='binary')
recall = recall_score(y_test, y_pred, pos_label='positive', average='binary')
f1 = f1_score(y_test, y_pred, pos_label='positive', average='binary')

conf_matrix = confusion_matrix(y_test, y_pred)

class_report = classification_report(y_test, y_pred)

In [16]:
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy: 0.91
Precision: 0.91
Recall: 0.93
F1-Score: 0.92

Confusion Matrix:
[[68501  9021]
 [ 6886 89120]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.88      0.90     77522
    positive       0.91      0.93      0.92     96006

    accuracy                           0.91    173528
   macro avg       0.91      0.91      0.91    173528
weighted avg       0.91      0.91      0.91    173528



In [17]:
# import matplotlib.pyplot as plt
# import seaborn as sns

In [18]:
# plt.figure(figsize=(6, 5))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.title('Confusion Matrix')
# plt.show()

In [19]:
from xgboost import XGBClassifier

In [22]:
model = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', XGBClassifier(tree_method='hist', n_estimators=100, max_depth=10, use_label_encoder=False, eval_metric='logloss'))
])

In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
encoder = LabelEncoder()

y_train_encoded = encoder.fit_transform(y_train)

In [26]:
model.fit(x_train, y_train_encoded)

Parameters: { "use_label_encoder" } are not used.



Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, device=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric='logloss',
                               feature_types=None, gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=10, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, multi_str

In [27]:
y_pred = model.predict(x_test)

In [29]:
y_test_encoded = encoder.fit_transform(y_test)

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, pos_label=1, average='binary')
recall = recall_score(y_test_encoded, y_pred, pos_label=1, average='binary')
f1 = f1_score(y_test_encoded, y_pred, pos_label=1, average='binary')

conf_matrix = confusion_matrix(y_test_encoded, y_pred)

class_report = classification_report(y_test_encoded, y_pred)

In [None]:
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)