## Install libraries

In [None]:
!pip install SnowNLP
from snownlp import SnowNLP

In [None]:
!pip install jieba textblob nltk
from textblob import TextBlob
import jieba
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import re
from wordcloud import WordCloud

In [None]:
from collections import Counter

In [None]:
import nltk
nltk.download('vader_lexicon')

In [None]:
!pip install plotly
import plotly.express as px

In [None]:
import scipy.stats as stats
from sklearn.linear_model import LinearRegression

In [None]:
import plotly.graph_objects as go

## Read the data

In [None]:
# Restaurant-level table; later cells read its 'Admin_District' and 'StoreId' columns.
restaurants_df = pd.read_csv('/content/18_district.csv')
# Review-level table; later cells read its 'text', 'restaurant_food' and 'StoreId' columns.
reviews_df = pd.read_csv('/content/hk_reviews_2025.csv')

In [None]:
restaurants_df

In [None]:
restaurants_df = restaurants_df[restaurants_df['Admin_District'] != 'Unknown'] # drop restaurants whose district is recorded as 'Unknown'

In [None]:
reviews_df

In [None]:
# Coerce every review to str; missing values turn into the literal string 'nan'.
reviews_df['text'] = reviews_df['text'].astype(str)

In [None]:
# Keep only reviews that have usable text (not missing, empty, or the string
# 'nan') and a non-missing 'restaurant_food' value.
review_text = reviews_df['text']
has_text = review_text.notna() & (review_text != '') & (review_text != 'nan')
has_food_score = reviews_df['restaurant_food'].notna()
cleaned_reviews_df = reviews_df[has_text & has_food_score]

In [None]:
cleaned_reviews_df

In [None]:
merged_df = pd.merge(restaurants_df, cleaned_reviews_df, on='StoreId', how='inner')
# Inner join on StoreId: only restaurants that have at least one cleaned review
# are kept, with one row per matching (restaurant, review) pair.

In [None]:
merged_df

## Sentiment Analysis

In [None]:
# Shared VADER analyzer used inside analyze_sentiment; it relies on the
# 'vader_lexicon' resource fetched with nltk.download.
sid = SentimentIntensityAnalyzer()

In [None]:
# Register the Cantonese sentiment vocabulary with jieba's dictionary, all with
# the same frequency and part-of-speech tag.
for _custom_word in ("好味", "好好", "正", "靚", "開心", "勁", "夠鑊氣", "滿意", "差", "貴"):
    jieba.add_word(_custom_word, freq=100, tag="adj")

In [None]:
# Polarity weight for each Cantonese term that analyze_sentiment looks up
# token by token; positive weights raise the score, negative ones lower it.
cantonese_lexicon = {
    "好味": 1,
    "好好": 1,
    "正": 1,
    "靚": 0.8,
    "開心": 1,
    "勁": 0.8,
    "夠鑊氣": 0.5,
    "抵食": 1,
    "好水準": 1,
    "差": -1,
    "貴": -0.5,
    "燶": -0.8,
    "唔會": -1,
    "唔": -1,
}

In [None]:
def analyze_sentiment(text):
    """Classify a mixed Chinese/English review as positive, negative or neutral.

    The text is split into its CJK characters and its ASCII-letter/whitespace
    characters. Each part is scored on its own, multiplied by a
    punctuation-based intensity factor, and the two scores are combined as a
    weighted average whose weights are each part's share of the text length.

    Args:
        text: The review. It is coerced with ``str()``, so non-string values
            are accepted.

    Returns:
        A ``(sentiment, polarity)`` tuple. ``polarity`` is clamped to
        [-1.0, 1.0]; ``sentiment`` is "positive" when polarity > 0.05,
        "negative" when polarity < -0.05 and "neutral" otherwise. Text with
        neither CJK characters nor ASCII letters/whitespace scores 0.
    """
    text = str(text)
    chinese_part = ''.join(re.findall(r'[\u4e00-\u9fff]+', text))
    english_part = ''.join(re.findall(r'[a-zA-Z\s]+', text))
    polarity = 0
    weight_total = 0

    def get_punctuation_intensity(text):
        """Return a multiplier (capped at 2.0) that grows with emphatic punctuation."""
        intensity = 1.0
        if "!!" in text or "！" in text:
            intensity *= 1.3
        if "??" in text or "？？" in text:
            intensity *= 1.2
        if "…" in text:
            intensity *= 1.1
        return min(intensity, 2.0)

    # The factor depends only on the full text, so compute it once for both parts.
    punctuation_intensity = get_punctuation_intensity(text)

    if english_part:
        blob = TextBlob(english_part)
        eng_polarity = blob.sentiment.polarity * punctuation_intensity
        weight_eng = len(english_part) / (len(text) + 1e-5)
        polarity += eng_polarity * weight_eng
        weight_total += weight_eng

    if chinese_part:
        # jieba.cut returns a one-shot generator. It must be materialised,
        # otherwise the join below exhausts it and the lexicon scan further
        # down never sees a single token.
        tokens = list(jieba.cut(chinese_part))
        tokenized_text = " ".join(tokens)

        vader_scores = sid.polarity_scores(tokenized_text)
        chn_polarity = vader_scores["compound"]

        # Sum the weights of every token found in the Cantonese lexicon.
        lexicon_score = sum(cantonese_lexicon.get(token, 0) for token in tokens)

        if lexicon_score != 0:
            chn_polarity = (chn_polarity + lexicon_score) / 2

        # Fall back to SnowNLP when the score so far is close to zero;
        # its [0, 1] sentiment is rescaled to [-1, 1].
        if abs(chn_polarity) < 0.1:
            s = SnowNLP(tokenized_text)
            chn_polarity = (s.sentiments - 0.5) * 2

        chn_polarity *= punctuation_intensity

        weight_chn = len(chinese_part) / (len(text) + 1e-5)
        polarity += chn_polarity * weight_chn
        weight_total += weight_chn

    # Normalise by the total weight so digits/punctuation do not dilute the score.
    if weight_total > 0:
        polarity /= weight_total

    polarity = max(min(polarity, 1.0), -1.0)

    if polarity > 0.05:
        sentiment = "positive"
    elif polarity < -0.05:
        sentiment = "negative"
    else:
        sentiment = "neutral"

    return sentiment, polarity

## Apply to our data

In [None]:
# Score each review: analyze_sentiment returns (sentiment, polarity), and
# wrapping it in pd.Series expands the pair into the two new columns.
merged_df[['sentiment', 'polarity']] = merged_df['text'].apply(
    lambda x: pd.Series(analyze_sentiment(x))
)

In [None]:
def _sentiment_breakdown(labels):
    """Return a {label: count} dict for one district's sentiment labels."""
    return labels.value_counts().to_dict()


# Per-district roll-up over the review rows: mean rating, mean polarity and
# the count of each sentiment label.
district_aggregations = {
    'AverageRating': 'mean',
    'polarity': 'mean',
    'sentiment': _sentiment_breakdown,
}
sentiment_summary = (
    merged_df.groupby('Admin_District')
    .agg(district_aggregations)
    .reset_index()
)

In [None]:
# Lift the column-width limit so the per-district sentiment dict prints in full.
pd.set_option('display.max_colwidth', None)
print(sentiment_summary)

In [None]:
merged_df.columns

In [None]:
from scipy import stats

# Least-squares fit of each district's mean polarity on its mean rating.
district_fit = stats.linregress(
    sentiment_summary['AverageRating'],
    sentiment_summary['polarity'],
)
slope, intercept = district_fit.slope, district_fit.intercept
r_value, p_value, std_err = district_fit.rvalue, district_fit.pvalue, district_fit.stderr
regression_line = intercept + slope * sentiment_summary['AverageRating']


In [None]:
# Scatter of each district's mean rating against its mean polarity, with the
# district name printed above every point.
axis_labels = {'AverageRating': 'Average Rating', 'polarity': 'Average Sentiment Score'}
marker_style = dict(size=10, opacity=0.7, line=dict(width=1, color='white'))
layout_options = dict(
    width=1000,
    height=600,
    showlegend=False,
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    font=dict(size=12),
    margin=dict(l=50, r=50, t=80, b=50),
)

fig = px.scatter(
    sentiment_summary,
    x='AverageRating',
    y='polarity',
    text='Admin_District',
    title='District Ratings vs Sentiment Scores',
    labels=axis_labels,
)
fig.update_traces(textposition='top center', marker=marker_style)
fig.update_layout(**layout_options)
fig.show()

## Check validity

In [None]:
# Frequency table of CJK strings across all reviews.
# NOTE(review): the pattern captures whole runs of consecutive CJK characters,
# not segmented words, so each "Word" may be an entire phrase.
all_reviews = ' '.join(merged_df['text'])
chinese_words = re.findall(r'[\u4e00-\u9fff]+', all_reviews)
word_counts = Counter(chinese_words)

word_counts_df = pd.DataFrame(
    word_counts.items(), columns=['Word', 'Frequency']
).sort_values(by='Frequency', ascending=False)

top_chinese_words = word_counts_df.head(20)
print(top_chinese_words)

In [None]:
# Frequency table of ASCII-letter words across all reviews (case-sensitive).
english_words = re.findall(r'[a-zA-Z]+', all_reviews)
english_word_counts = Counter(english_words)

english_word_counts_df = pd.DataFrame(
    english_word_counts.items(), columns=['Word', 'Frequency']
).sort_values(by='Frequency', ascending=False)

top_english_words = english_word_counts_df.head(20)
print(top_english_words)

In [None]:
def preprocess_text(text):
    """Remove every character that is not a word character, whitespace or CJK ideograph."""
    unwanted_chars = re.compile(r'[^\w\s\u4e00-\u9fff]')
    return unwanted_chars.sub('', text)
# Join the punctuation-stripped reviews into one corpus and render a word cloud.
# NOTE(review): no font_path is given, so CJK text may not render with the
# default font — confirm on the target machine.
all_reviews = ' '.join(preprocess_text(review) for review in merged_df['text'])

cloud_settings = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_settings).generate(all_reviews)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes; only the image matters
plt.title('Word Cloud of Reviews')
plt.show()

In [None]:
# Score every review (this repeats the scoring done when the 'sentiment' and
# 'polarity' columns were first created).
merged_df[["sentiment", "polarity"]] = merged_df["text"].apply(
    lambda x: pd.Series(analyze_sentiment(x))
)


def _finer_level(score):
    """Map a polarity score onto one of the five finer sentiment buckets."""
    if score > 0.3:
        return "very positive"
    if 0.05 < score <= 0.3:
        return "slightly positive"
    if -0.05 <= score <= 0.05:
        return "neutral"
    if -0.3 <= score < -0.05:
        return "slightly negative"
    # Everything that matched no range above, i.e. below -0.3.
    return "very negative"


def _print_levels(heading, levels):
    """Print a heading followed by each level's reviews and their polarity."""
    print(heading)
    for level_name, entries in levels.items():
        print(f"\n{level_name.capitalize()} Reviews ({len(entries)}):")
        for review_text, score in entries:
            print(f" - {review_text} (Polarity: {score:.2f})")


# Buckets of (review, polarity) pairs, keyed by sentiment level.
basic_levels = {label: [] for label in ("positive", "negative", "neutral")}
finer_levels = {
    label: []
    for label in (
        "very positive",      # > 0.3
        "slightly positive",  # 0.05 to 0.3
        "neutral",            # -0.05 to 0.05
        "slightly negative",  # -0.3 to -0.05
        "very negative",      # < -0.3
    )
}

scored_reviews = list(
    zip(merged_df["text"], merged_df["sentiment"], merged_df["polarity"])
)

# File each review under its basic label and its finer bucket.
for review_text, label, score in scored_reviews:
    basic_levels[label].append((review_text, score))
    finer_levels[_finer_level(score)].append((review_text, score))

# Dump every review with its score for manual validation, then by level.
print("=== Checking Validity of Each Comment in merged_df ===")
for review_text, label, score in scored_reviews:
    print(f"Review: {review_text}")
    print(f"Sentiment: {label}, Polarity: {score:.2f}")
    print()

_print_levels("=== Basic Sentiment Levels ===", basic_levels)
_print_levels("=== Finer Sentiment Levels (Polarity as Sentiment Level) ===", finer_levels)

## Statistical Analysis: Are ratings and sentiment related?

### 1. Some brief graphs in general

In [None]:
# Mean AverageRating over the review rows of each FoodType, as a bar chart.
food_type_means = merged_df.groupby('FoodType')['AverageRating'].mean()
avg_rating_by_food_type = food_type_means.reset_index()

fig = px.bar(
    avg_rating_by_food_type,
    x='FoodType',
    y='AverageRating',
    title='Average Rating by Food Type',
    labels={'AverageRating': 'Average Rating', 'FoodType': 'Food Type'},
)
fig.show()

In [None]:
# Number of reviews (non-missing 'text' values) in each district.
review_count_per_district = (
    merged_df.groupby('Admin_District')['text']
    .count()
    .reset_index(name='ReviewCount')
)

fig = px.bar(
    review_count_per_district,
    x='Admin_District',
    y='ReviewCount',
    title='Review Count by District',
    labels={'ReviewCount': 'Number of Reviews', 'Admin_District': 'District'},
)
fig.show()

In [None]:
# Overall share of each sentiment label, as a pie chart.
sentiment_counts = (
    merged_df['sentiment']
    .value_counts()
    .rename_axis('Sentiment')
    .reset_index(name='Count')
)

fig = px.pie(
    sentiment_counts,
    values='Count',
    names='Sentiment',
    title='Sentiment Distribution of Restaurant Reviews',
)
fig.show()

In [None]:
# One point per review row: its polarity against the AverageRating column,
# with an ordinary-least-squares trend line.
polarity_rating_labels = {
    'polarity': 'Sentiment Polarity',
    'AverageRating': 'Average Rating',
}
fig = px.scatter(
    merged_df,
    x='polarity',
    y='AverageRating',
    title='Polarity vs. Average Rating',
    labels=polarity_rating_labels,
    trendline='ols',
)
fig.show()

In [None]:
# merged_df holds one row per review, so collapse it to one row per restaurant
# before ranking; otherwise the same restaurant fills several slots and its
# bars stack. Keep 10 rows so the data matches the chart title.
top_restaurants = (
    merged_df[['StoreId', 'CompleteStoreName', 'AverageRating']]
    .drop_duplicates(subset='StoreId')
    .sort_values('AverageRating', ascending=False)
    .head(10)
)

fig = px.bar(top_restaurants, x='CompleteStoreName', y='AverageRating',
             title='Top 10 Restaurants by Average Rating',
             labels={'AverageRating': 'Average Rating', 'CompleteStoreName': 'Restaurant Name'})
fig.show()

### 2. Further insights on Admin_Districts

In [None]:
# Total number of reviews per district, sorted ascending by ReviewCount
# (smallest first) for the horizontal bar chart.
review_count_by_district = (
    merged_df.groupby('Admin_District')['text']
    .count()
    .reset_index(name='ReviewCount')
    .sort_values(by='ReviewCount', ascending=True)
)

# Horizontal bars: district on the y axis, review count as bar length and colour.
fig = px.bar(
    review_count_by_district,
    y='Admin_District',
    x='ReviewCount',
    orientation='h',
    title='Total Number of Reviews by District',
    labels={'ReviewCount': 'Number of Reviews', 'Admin_District': 'District'},
    color='ReviewCount',
    color_continuous_scale=px.colors.sequential.Plasma,
)

# The wide left margin keeps long district names from being cut off.
fig.update_layout(
    height=600,
    margin=dict(l=200),
    coloraxis_colorbar=dict(title='Number of Reviews'),
)

# Write the exact count next to the end of each bar.
annotation_font = dict(size=10, color='black')
for district, n_reviews in zip(
    review_count_by_district['Admin_District'],
    review_count_by_district['ReviewCount'],
):
    fig.add_annotation(
        y=district,
        x=n_reviews,
        text=f"{int(n_reviews)}",
        showarrow=False,
        xshift=30,
        font=annotation_font,
    )

fig.show()

In [None]:
# Review count for every (district, sentiment) pair, drawn as bars coloured by sentiment.
district_sentiment_sizes = merged_df.groupby(['Admin_District', 'sentiment']).size()
sentiment_distribution = district_sentiment_sizes.reset_index(name='Count')

fig = px.bar(
    sentiment_distribution,
    x='Admin_District',
    y='Count',
    color='sentiment',
    title='Sentiment Distribution by District',
    labels={'Count': 'Number of Reviews', 'Admin_District': 'District'},
)
fig.show()

In [None]:
# Count of reviews for every (district, sentiment) pair.
sentiment_distribution = merged_df.groupby(['Admin_District', 'sentiment']).size().reset_index(name='Count')

# One row per district, one column per sentiment; missing combinations count as 0.
pivot_df = sentiment_distribution.pivot(index='Admin_District', columns='sentiment', values='Count').fillna(0)

# Total reviews per district.
pivot_df['Total'] = pivot_df.sum(axis=1)

# Share of positive reviews; .get() covers the case where no review is positive at all.
pivot_df['Positive_Percentage'] = (pivot_df.get('positive', 0) / pivot_df['Total'] * 100).round(2)

# Turn Admin_District back into a regular column.
pivot_df = pivot_df.reset_index()

# Sort ascending by Positive_Percentage (lowest first) for the horizontal bar chart.
pivot_df = pivot_df.sort_values(by='Positive_Percentage', ascending=True)

fig = go.Figure()

# Red -> yellow -> green as the positive share increases.
color_scale = [
    [0, 'red'],
    [0.5, 'yellow'],
    [1, 'green']
]

# Rescale the percentages to 0-1 for the colour mapping. When every district
# has the same percentage (or there is only one district) the span is zero and
# the division would produce NaN colours, so use the midpoint colour instead.
min_percentage = pivot_df['Positive_Percentage'].min()
max_percentage = pivot_df['Positive_Percentage'].max()
percentage_span = max_percentage - min_percentage
if percentage_span > 0:
    normalized_percentages = (pivot_df['Positive_Percentage'] - min_percentage) / percentage_span
else:
    normalized_percentages = pd.Series(0.5, index=pivot_df.index)

# One horizontal bar per district; bar length is the positive percentage.
fig.add_trace(go.Bar(
    y=pivot_df['Admin_District'],
    x=pivot_df['Positive_Percentage'],
    orientation='h',
    marker=dict(
        color=normalized_percentages,
        colorscale=color_scale,
        cmin=0,  # pin the colour range so the tick values below always line up
        cmax=1,
        showscale=True,
        colorbar=dict(
            title='Positive Sentiment %',
            tickvals=[0, 0.5, 1],
            ticktext=[f'{min_percentage:.2f}%', f'{(min_percentage + max_percentage)/2:.2f}%', f'{max_percentage:.2f}%']
        )
    )
))

fig.update_layout(
    title='Positive Sentiment Percentage by District',
    yaxis_title='District',
    xaxis_title='Positive Sentiment Percentage (%)',
    xaxis=dict(range=[0, 100]),  # percentages always lie in 0-100
    height=600,
    margin=dict(l=200),
)

# Write the exact percentage next to the end of each bar.
for district, positive_percentage in zip(pivot_df['Admin_District'], pivot_df['Positive_Percentage']):
    fig.add_annotation(
        y=district,
        x=positive_percentage,
        text=f"{positive_percentage}%",
        showarrow=False,
        xshift=30,
        font=dict(size=10, color='black')
    )

fig.show()

In [None]:
# Mean polarity per district, sorted ascending (lowest first) for the
# horizontal bar chart.
avg_polarity_by_district = (
    merged_df.groupby('Admin_District')['polarity']
    .mean()
    .reset_index()
    .sort_values(by='polarity', ascending=True)
)

# Horizontal bars: district on the y axis, mean polarity as bar length and colour.
fig = px.bar(
    avg_polarity_by_district,
    y='Admin_District',
    x='polarity',
    orientation='h',
    title='Average Sentiment Polarity by District',
    labels={'polarity': 'Average Polarity', 'Admin_District': 'District'},
    color='polarity',
    color_continuous_scale=px.colors.sequential.Inferno,
)

# Fix the x axis to [-1, 1], the range analyze_sentiment clamps polarity to.
polarity_colorbar = dict(
    title='Average Polarity',
    tickvals=[-1, 0, 1],
    ticktext=['-1 (Negative)', '0 (Neutral)', '1 (Positive)'],
)
fig.update_layout(
    height=600,
    margin=dict(l=200),  # room for long district names
    xaxis=dict(range=[-1, 1]),
    coloraxis_colorbar=polarity_colorbar,
)

# Label each bar with its value, placed beyond the bar end on the side it points to.
for district, mean_polarity in zip(
    avg_polarity_by_district['Admin_District'],
    avg_polarity_by_district['polarity'],
):
    label_shift = 30 if mean_polarity >= 0 else -30
    fig.add_annotation(
        y=district,
        x=mean_polarity,
        text=f"{mean_polarity:.2f}",
        showarrow=False,
        xshift=label_shift,
        font=dict(size=10, color='black'),
    )

fig.show()

### 3. Regression equation

In [None]:
# Pearson correlation between district mean rating and district mean polarity,
# judged at the 5% significance level.
correlation, p_value = stats.pearsonr(
    sentiment_summary['AverageRating'],
    sentiment_summary['polarity'],
)
print(f"Pearson correlation coefficient: {correlation:.3f}")
print(f"P-value: {p_value:.3f}")
significance_message = (
    "The correlation is statistically significant (p < 0.05)"
    if p_value < 0.05
    else "The correlation is not statistically significant (p >= 0.05)"
)
print(significance_message)

#### By group (each district)

In [None]:
sentiment_summary

In [None]:
# Simple linear regression at district level: mean polarity explained by mean rating.
X1 = sentiment_summary[['AverageRating']].to_numpy()  # predictor as an (n, 1) column
y1 = sentiment_summary['polarity'].to_numpy()  # response
model = LinearRegression().fit(X1, y1)
slope, intercept = model.coef_[0], model.intercept_
r_squared = model.score(X1, y1)  # R² on the fitted data
print(f"\nRegression equation: polarity = {slope:.3f} * AverageRating + {intercept:.3f}")
print(f"R² value: {r_squared:.3f}")

In [None]:
# District scatter with the fitted regression line overlaid in red.
regression_line = model.predict(X1)

regression_axis_labels = {'AverageRating': 'Average Rating', 'polarity': 'Average Sentiment Score'}
point_style = dict(size=10, opacity=0.7, line=dict(width=1, color='white'))
regression_layout = dict(
    width=1000,
    height=600,
    showlegend=True,
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    font=dict(size=12),
    margin=dict(l=50, r=50, t=80, b=50),
)

fig = px.scatter(
    sentiment_summary,
    x='AverageRating',
    y='polarity',
    text='Admin_District',
    title='District Ratings vs Sentiment Scores',
    labels=regression_axis_labels,
)
fig.add_scatter(
    x=sentiment_summary['AverageRating'],
    y=regression_line,
    mode='lines',
    name='Regression Line',
    line=dict(color='red'),
)
fig.update_traces(textposition='top center', marker=point_style)
fig.update_layout(**regression_layout)

fig.show()

#### By individual (each unique reviewer ID)

In [None]:
merged_df

In [None]:
merged_df.groupby("reviewerId")["polarity"].mean()

In [None]:
merged_df.groupby("reviewerId")["restaurant_food"].mean()

In [None]:
# Simple linear regression at reviewer level: each reviewer's mean
# restaurant_food value explained by their mean polarity. The printed equation
# names the variables that are actually fitted (it previously claimed
# "polarity = slope * AverageRating").
reviews_by_reviewer = merged_df.groupby("reviewerId")
X2 = reviews_by_reviewer["polarity"].mean().values.reshape(-1, 1)  # Independent variable (mean polarity per reviewer)
y2 = reviews_by_reviewer["restaurant_food"].mean().values  # Dependent variable (mean restaurant_food per reviewer)
model = LinearRegression()
model.fit(X2, y2)
slope = model.coef_[0]  # Slope
intercept = model.intercept_  # Intercept
r_squared = model.score(X2, y2)  # R² value
print(f"\nRegression equation: restaurant_food = {slope:.3f} * polarity + {intercept:.3f}")
print(f"R² value: {r_squared:.3f}")

### 4. Metrics (more variables) (optional)

In [None]:
merged_df.columns

In [None]:
# Multiple regression on individual review rows: polarity explained by the
# AverageRating and restaurant_food columns.
predictor_columns = ["AverageRating", "restaurant_food"]
X3 = merged_df[predictor_columns]  # predictors
y3 = merged_df["polarity"]  # response
model = LinearRegression().fit(X3, y3)

In [None]:
# Coefficients come back in the order the predictor columns were passed to fit().
coef_avg_rating, coef_restaurant_food = model.coef_[0], model.coef_[1]
intercept = model.intercept_
print(f"Regression Equation: polarity = {coef_avg_rating:.3f} * AverageRating + {coef_restaurant_food:.3f} * restaurant_food + {intercept:.3f}")

In [None]:
# R² of the two-predictor model, evaluated on the same data it was fitted to.
r_squared = model.score(X3, y3)
r_squared

In [None]:
# Per-district mean of polarity and of each candidate predictor column.
summary_columns = ("polarity", "AverageRating", "restaurant_food", "Reviewers")
district_means = merged_df.groupby("Admin_District").agg(
    {column: "mean" for column in summary_columns}
)
district_summary = district_means.reset_index()

In [None]:
# Annotated heatmap of the district means: one row per district, one column per variable.
# NOTE(review): all columns share one colour scale, so a column with much
# larger values may wash out the others — confirm this is intended.
heatmap_data = district_summary.set_index("Admin_District")

plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, annot=True, cmap="YlGnBu", fmt=".2f")
plt.title("Heatmap of Polarity and Predictors by Admin_District")
plt.show()