IMPORTING LIBRARIES


In [None]:
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import kaleido

# Load data
df = pd.read_csv("hotel_reviews.csv")

DATA PRE PROCESSING AND CLEANING

In [47]:
#SECTION A:DATA PRE PROCESSING AND CLEANING

# Remove rows with missing review text
df.dropna(subset=["Review_Text"], inplace=True)

# Convert review date to datetime format for time-based analysis
df["Review_Date"] = pd.to_datetime(df["Review_Date"], format='%b-%y', errors='coerce')

# Create sentiment label based on rating (threshold >= 7 is positive)
df['Sentiment'] = (df['Rating(Out of 10)'] >= 7).astype(int)

# Create a set of common stopwords for text cleaning
stopwords = set("""
a about above after again against all am an and any are aren't as at be because been
before being below between both but by can't cannot could couldn't did didn't do does
doesn't doing don't down during each few for from further had hadn't has hasn't have
haven't having he he'd he'll he's her here here's hers herself him himself his how
how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most
mustn't my myself no nor not of off on once only or other ought our ours ourselves out
over own same shan't she she'd she'll she's should shouldn't so some such than that
that's the their theirs them themselves then there there's these they they'd they'll
they're they've this those through to too under until up very was wasn't we we'd we'll
we're we've were weren't what what's when when's where where's which while who who's
whom why why's with won't would wouldn't you you'd you'll you're you've your yours
yourself yourselves
""".split())

# Define function to clean text by removing URLs, punctuation, stopwords, etc.
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return " ".join([word for word in text.split() if word not in stopwords and len(word) > 2])

# Apply text cleaning to review column and create 'Cleaned_Review' column
df['Cleaned_Review'] = df['Review_Text'].apply(clean_text) # This line was missing, causing the KeyError

# Create new feature: length of each review (in words)
df['Review_Length'] = df['Review_Text'].apply(lambda x: len(x.split()))

# Create star level bins based on rating score
df['Star_Level'] = pd.cut(df['Rating(Out of 10)'], bins=[0, 4, 6, 8, 10], labels=['Low', 'Mid', 'High', 'Very High'])

MODEL TRAINING

In [48]:
# SECTION B: TRAIN SENTIMENT CLASSIFIERS
X = df['Cleaned_Review']
y = df['Sentiment']

# Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X_vec = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Model 1: Logistic Regression
log_model = LogisticRegression(max_iter=200)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
print("\nLogistic Regression Report:\n")
print(classification_report(y_test, y_pred_log))

# Model 2: Multinomial Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("\nNaive Bayes Report:\n")
print(classification_report(y_test, y_pred_nb))



Logistic Regression Report:

              precision    recall  f1-score   support

           0       0.70      0.64      0.67       402
           1       0.86      0.89      0.87       997

    accuracy                           0.82      1399
   macro avg       0.78      0.77      0.77      1399
weighted avg       0.81      0.82      0.82      1399


Naive Bayes Report:

              precision    recall  f1-score   support

           0       0.71      0.58      0.64       402
           1       0.84      0.90      0.87       997

    accuracy                           0.81      1399
   macro avg       0.78      0.74      0.76      1399
weighted avg       0.80      0.81      0.81      1399



VISUALISATION

In [54]:
#SECTION C: VISUALISATION
# Function to display the plot
def display_plot(fig):
    fig.show()

In [55]:
# 1. Bar Chart - Top 20 Frequent Words
word_freq = Counter(" ".join(df['Cleaned_Review']).split()).most_common(20)
fig = px.bar(pd.DataFrame(word_freq, columns=['Word', 'Frequency']),
             x='Word', y='Frequency', title='Top 20 Frequent Words')
display_plot(fig)

# 2. Pie Chart - Sentiment Proportion
sent_counts = df['Sentiment'].value_counts().reset_index()
sent_counts.columns = ['Sentiment', 'Count']
fig = px.pie(sent_counts, names='Sentiment', values='Count',
             title='Sentiment Proportions')
display_plot(fig)

# 3. Box Plot - Rating by Sentiment
fig = px.box(df, x='Sentiment', y='Rating(Out of 10)',
             title='Box Plot: Ratings by Sentiment')
display_plot(fig)

# 4. Violin Plot - Review Length by Sentiment
fig = px.violin(df, x='Sentiment', y='Review_Length', box=True,
                title='Violin Plot: Review Length by Sentiment')
display_plot(fig)

# 5. Line Plot - Monthly Review Trend
monthly = df.groupby(df['Review_Date'].dt.to_period('M')).size().reset_index(name='Count')
monthly['Review_Date'] = monthly['Review_Date'].astype(str)
fig = px.line(monthly, x='Review_Date', y='Count',
              title='Line Chart: Monthly Review Trend')
display_plot(fig)


In [56]:
# 6. Scatter Plot - Review Length vs Rating
fig = px.scatter(df, x='Review_Length', y='Rating(Out of 10)',
                 trendline='ols', title='Scatter: Review Length vs Rating')
display_plot(fig)

# 7. Histogram - Review Length
fig = px.histogram(df, x='Review_Length', nbins=50,
                   title='Histogram: Review Length Distribution')
display_plot(fig)

# 8. Sunburst Plot - Area and Star Level
fig = px.sunburst(df, path=['Area', 'Star_Level'],
                  title='Sunburst: Area and Star Level')
display_plot(fig)

# 9. Funnel Chart - Star Levels by Sentiment
star_sent = df.groupby('Star_Level')['Sentiment'].mean().reset_index()
fig = px.funnel(star_sent, x='Sentiment', y='Star_Level',
               title='Funnel: Sentiment by Star Level')
display_plot(fig)

# 10. Treemap - Hotel vs Area
hotel_area = df.groupby(['Area', 'Name']).size().reset_index(name='Count')
fig = px.treemap(hotel_area, path=['Area', 'Name'], values='Count',
                 title='Treemap: Hotel Count by Area')
display_plot(fig)









In [57]:
# 11. Donut Chart - Star Level Distribution
star_counts = df['Star_Level'].value_counts().reset_index()
star_counts.columns = ['Star_Level', 'Count']
fig = px.pie(star_counts, names='Star_Level', values='Count', hole=0.4,
             title='Donut: Star Level Distribution')
display_plot(fig)

# 12. Area Chart - Monthly Sentiment Trend
monthly_sent = df.groupby([df['Review_Date'].dt.to_period('M'), 'Sentiment']).size().unstack().fillna(0)
monthly_sent.index = monthly_sent.index.astype(str)
fig = px.area(monthly_sent, title='Area Chart: Sentiment Over Time')
display_plot(fig)

# 13. Bubble Chart - Review Length, Rating, Sentiment
fig = px.scatter(df, x='Review_Length', y='Rating(Out of 10)',
                 size='Rating(Out of 10)', color='Sentiment',
                 title='Bubble Chart: Length vs Rating')
display_plot(fig)

# 14. Heatmap - Correlation Matrix
fig = px.imshow(df[["Rating(Out of 10)", "Sentiment", "Review_Length"]].corr(),
                text_auto=True, title='Heatmap: Feature Correlation')
display_plot(fig)

# 15. Strip Plot - Star Level vs Rating
fig = px.strip(df, x='Star_Level', y='Rating(Out of 10)',
               title='Strip: Star Level vs Rating')
display_plot(fig)


In [58]:
# 16. Histogram - Rating per Area
fig = px.histogram(df, x='Area', color='Sentiment',
                   title='Histogram: Rating Distribution by Area')
display_plot(fig)

# 17. Stacked Bar - Sentiment per Hotel
top_sent = df.groupby(['Name', 'Sentiment']).size().unstack().fillna(0).nlargest(10, 1).reset_index()
fig = px.bar(top_sent, x='Name', y=[0, 1],
             title='Stacked Bar: Sentiment by Hotel')
display_plot(fig)

# 18. Line Plot - Avg Rating Over Time
monthly_rating = df.groupby(df['Review_Date'].dt.to_period('M'))['Rating(Out of 10)'].mean().reset_index()
monthly_rating['Review_Date'] = monthly_rating['Review_Date'].astype(str)
fig = px.line(monthly_rating, x='Review_Date', y='Rating(Out of 10)',
              title='Line: Average Rating Over Time')
display_plot(fig)

# 19. ECDF - Empirical CDF of Ratings
fig = px.ecdf(df, x='Rating(Out of 10)', title='ECDF: Rating Distribution')
display_plot(fig)

# 20. Density Contour - Length vs Rating
fig = px.density_contour(df, x='Review_Length', y='Rating(Out of 10)',
                         title='Density Contour: Length vs Rating')
display_plot(fig)

# 21. Parallel Coordinates - Normalized Comparison
fig = px.parallel_coordinates(df[['Rating(Out of 10)', 'Review_Length', 'Sentiment']],
                              color='Sentiment', title='Parallel Coordinates')
display_plot(fig)
