In [27]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [28]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
# Load data
df = pd.read_csv('/content/tripadvisor_hotel_reviews.csv')

In [30]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [31]:
# Define the text cleaning function
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    # Join words to form the cleaned text
    cleaned_text = ' '.join(filtered_words)
    # Strip whitespace
    cleaned_text = cleaned_text.strip()
    return cleaned_text

In [32]:
# Apply the cleaning function to the review column
df['processed_review'] = df['Review'].apply(clean_text)

In [33]:
df.head(10)

Unnamed: 0,Review,Rating,processed_review
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms 4 experience hotel monaco seattle g...
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...
5,love monaco staff husband stayed hotel crazy w...,5,love monaco staff husband stayed hotel crazy w...
6,"cozy stay rainy city, husband spent 7 nights m...",5,cozy stay rainy city husband spent 7 nights mo...
7,"excellent staff, housekeeping quality hotel ch...",4,excellent staff housekeeping quality hotel cho...
8,"hotel stayed hotel monaco cruise, rooms genero...",5,hotel stayed hotel monaco cruise rooms generou...
9,excellent stayed hotel monaco past w/e delight...,5,excellent stayed hotel monaco past delight rec...


In [34]:
df['Rating'].value_counts()

Unnamed: 0_level_0,count
Rating,Unnamed: 1_level_1
5,9054
4,6039
3,2184
2,1793
1,1421


In [35]:
# vecrorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # Control the vector's dimensionality
X = vectorizer.fit_transform(df['processed_review'])

In [36]:
# Adjust the mapping function to categorize ratings into three groups
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x > 3 else ('neutral' if x == 3 else 'negative'))


In [37]:
df.head()

Unnamed: 0,Review,Rating,processed_review,Sentiment
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,positive
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,negative
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms 4 experience hotel monaco seattle g...,neutral
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...,positive
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,positive


In [38]:
y=df['Sentiment']

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
import numpy as np

# Split the data with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check the unique classes in the training set
print("Classes in y_train before SMOTE:", np.unique(y_train))

# Apply SMOTE to the training data only if there are multiple classes
if len(np.unique(y_train)) > 1:
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
else:
    X_train_resampled, y_train_resampled = X_train, y_train  # No resampling if only one class

# Initialize and train the classifier
model = MultinomialNB()
model.fit(X_train_resampled, y_train_resampled)


Classes in y_train before SMOTE: ['negative' 'neutral' 'positive']


In [40]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the predictions
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.738472798243474
              precision    recall  f1-score   support

    negative       0.64      0.74      0.69       643
     neutral       0.26      0.55      0.35       437
    positive       0.95      0.77      0.85      3019

    accuracy                           0.74      4099
   macro avg       0.62      0.69      0.63      4099
weighted avg       0.83      0.74      0.77      4099



Overall Interpretation:
Negative Class:

Precision: 0.64
Out of all instances predicted as negative, 64% were actually negative. There are false positives (FP) in this class.

Recall: 0.74
Out of all actual negative instances, 74% were correctly predicted. There are false negatives (FN) here.

F1-Score: 0.69
The harmonic mean of precision and recall, providing a balance between the two metrics.
Neutral Class:

Precision: 0.26
Only 26% of instances predicted as neutral were actually neutral, indicating a high number of false positives (FP).

Recall: 0.55
55% of actual neutral instances were correctly identified, indicating some false negatives (FN).

F1-Score: 0.35
A low score reflects the challenges in correctly identifying neutral instances.
Positive Class:

Precision: 0.95
95% of instances predicted as positive were indeed positive, indicating very few false positives (FP).

Recall: 0.77
77% of actual positive instances were correctly predicted, indicating some false negatives (FN).

F1-Score: 0.85
A strong score indicating good performance in identifying positive instances.