In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DoubleType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from wordcloud import WordCloud
import numpy as np
import warnings
# Install a blanket "ignore" filter so no Python warning is displayed from here on.
# NOTE(review): this also hides deprecation and plotting warnings that may
# point at real problems — confirm the blanket suppression is intended.
warnings.filterwarnings('ignore')

# Set chart style: matplotlib's "default" style sheet, seaborn's "husl"
# colour palette, and a (12, 8) default figure size for every later figure.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Reached only if every import and styling statement above ran without raising.
print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Start (or reuse) the local Spark session, then read the cleaned comments
# written by the data-cleaning notebook (step 2).
spark = (
    SparkSession.builder
    .appName("TweetAnalysis_EDA_Sentiment")
    .master("local[*]")
    .config("spark.driver.memory", "16g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")

# Location of the parquet output produced in step 2
cleaned_data_path = "/home/jovyan/work/data/processed/cleaned_comments.parquet"
print(f"Attempting to load data from: {cleaned_data_path}")

try:
    # Read, mark for caching, and count; the count is the first action on the frame
    df_cleaned = spark.read.parquet(cleaned_data_path)
    df_cleaned.cache()
    record_count = df_cleaned.count()
    print(f"✅ Cleaned data loaded successfully, total {record_count:,} records")

    # Schema dump
    print("\nData structure:")
    df_cleaned.printSchema()

    # Three-row preview; the dotted column name has to be back-quoted
    print("\nData preview (first 3 rows):")
    preview_columns = ["id", "`subreddit.name`", "cleaned_body", "sentiment"]
    df_cleaned.select(*preview_columns).show(3, truncate=False)

except Exception as load_error:
    print(f"❌ Data loading failed: {load_error}")
    print("Please ensure you have completed the data cleaning notebook from step 2")
    # Fall back to None so the later cells can detect the failed load
    df_cleaned = None


Spark Version: 3.5.0
Attempting to load data from: /home/jovyan/work/data/processed/cleaned_comments.parquet
✅ Cleaned data loaded successfully, total 459,171 records

Data structure:
root
 |-- id: string (nullable = true)
 |-- subreddit.name: string (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- body: string (nullable = true)
 |-- cleaned_body: string (nullable = true)
 |-- tokens_cleaned: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentiment: double (nullable = true)
 |-- score: long (nullable = true)


Data preview (first 3 rows):
+-------+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [3]:
# Basic data overview and statistics
print("=== Basic Data Overview ===")

# Verify if data was loaded successfully
if df_cleaned is None:
    print("❌ Data not loaded, please run the previous cell and ensure data loads successfully")
else:
    # Count the rows once; the figures below all reuse this value instead of
    # launching a fresh Spark job each time.
    total_count = df_cleaned.count()
    column_names = df_cleaned.columns

    # 1. Basic data information
    print("1. Basic data information:")
    print(f"Total records: {total_count:,}")
    print(f"Total columns: {len(column_names)}")

    # 2. Data completeness check for each column
    print("\n2. Data completeness:")
    if total_count == 0:
        # A percentage is undefined without rows; avoid dividing by zero.
        print("  (no records, completeness is undefined)")
    else:
        # F.count(column) counts only non-null values, so a single aggregation
        # yields the non-null tally of every column in one pass over the data.
        # Names are always back-quoted so names containing dots resolve as a
        # single column; results are aliased by position to keep the aliases
        # free of special characters.
        non_null_counts = df_cleaned.agg(*[
            F.count(F.col(f"`{name}`")).alias(f"non_null_{position}")
            for position, name in enumerate(column_names)
        ]).first()
        for position, col_name in enumerate(column_names):
            complete_pct = non_null_counts[position] / total_count * 100
            print(f"  {col_name}: {complete_pct:.1f}% complete")

    # 3. Sentiment score statistics (count / mean / stddev / min / max)
    print("\n3. Sentiment score distribution:")
    sentiment_stats = df_cleaned.select("sentiment").describe()
    sentiment_stats.show()

    # 4. Subreddit distribution: the ten subreddits with the most comments
    print("4. Subreddit distribution (Top 10):")
    subreddit_dist = df_cleaned.groupBy("`subreddit.name`").count().orderBy(F.desc("count")).limit(10)
    subreddit_dist.show(truncate=False)


=== Basic Data Overview ===
1. Basic data information:
Total records: 459,171
Total columns: 9

2. Data completeness:
  id: 100.0% complete
  subreddit.name: 100.0% complete
  created_utc: 100.0% complete
  timestamp: 100.0% complete
  body: 100.0% complete
  cleaned_body: 100.0% complete
  tokens_cleaned: 100.0% complete
  sentiment: 98.9% complete
  score: 100.0% complete

3. Sentiment score distribution:
+-------+--------------------+
|summary|           sentiment|
+-------+--------------------+
|  count|              454264|
|   mean|-0.00644086346265...|
| stddev|  0.6588389261269937|
|    min|             -0.9999|
|    max|              0.9999|
+-------+--------------------+

4. Subreddit distribution (Top 10):
+--------------+-----+
|subreddit.name|count|
+--------------+-----+
|politics      |36989|
|worldnews     |35283|
|askreddit     |25863|
|news          |9524 |
|collapse      |9490 |
|futurology    |8904 |
|science       |7063 |
|environment   |6819 |
|canada        |6722

In [None]:
# Re-analyze sentiment using VADER (improve original sentiment scores)
print("=== VADER Sentiment Analysis ===")

# Verify if data is available
if df_cleaned is None:
    print("❌ Data not loaded, please run previous cells and ensure data loads successfully")
else:
    # Create VADER sentiment analyzer (captured by the UDF closure below)
    analyzer = SentimentIntensityAnalyzer()

    def analyze_sentiment_vader(text):
        """Return the VADER compound score for ``text``.

        Args:
            text: The comment text; any non-None value is converted with ``str``.

        Returns:
            The ``'compound'`` entry of ``analyzer.polarity_scores`` as a
            built-in ``float``, or ``None`` when ``text`` is ``None``.
        """
        if text is None:
            return None
        scores = analyzer.polarity_scores(str(text))
        # Coerce to a built-in float so the value matches the UDF's DoubleType.
        return float(scores['compound'])

    # Create UDF for Spark
    sentiment_udf = F.udf(analyze_sentiment_vader, DoubleType())

    # Apply VADER sentiment analysis to cleaned text
    print("Applying VADER sentiment analysis...")
    df_with_vader = df_cleaned.withColumn("vader_sentiment", sentiment_udf(F.col("cleaned_body")))

    # Cache results. withColumn and cache are both lazy, so run one action now:
    # this performs the scoring pass and fills the cache before the
    # "completed" message is printed, and the queries below read cached rows.
    df_with_vader.cache()
    df_with_vader.count()

    print("VADER sentiment analysis completed!")

    # Compare original sentiment scores vs VADER scores (rows where both exist)
    print("\nOriginal sentiment score vs VADER score comparison:")
    comparison = df_with_vader.select("sentiment", "vader_sentiment").filter(
        F.col("sentiment").isNotNull() & F.col("vader_sentiment").isNotNull()
    ).limit(10)
    comparison.show()

    # VADER sentiment score statistics
    print("\nVADER sentiment score statistics:")
    vader_stats = df_with_vader.select("vader_sentiment").describe()
    vader_stats.show()


=== VADER Sentiment Analysis ===
Applying VADER sentiment analysis...
VADER sentiment analysis completed!

Original sentiment score vs VADER score comparison:


In [None]:
# Sentiment distribution analysis and visualization
print("=== Sentiment Distribution Analysis ===")

# Check only for the upstream DataFrame. Probing the namespace (instead of
# wrapping the whole cell in `except NameError`) means a NameError raised by
# the analysis code itself is no longer misreported as "previous cells not run".
if "df_with_vader" not in globals():
    print("❌ VADER sentiment analysis not completed, please run previous cells")
else:
    print("Preparing visualization data...")

    # Score cut-offs separating the three sentiment classes
    POSITIVE_THRESHOLD = 0.05
    NEGATIVE_THRESHOLD = -0.05

    # Classify with native column expressions rather than a Python UDF:
    #   null score  -> "Unknown"
    #   > 0.05      -> "Positive"
    #   < -0.05     -> "Negative"
    #   otherwise   -> "Neutral"
    # NaN is routed to "Neutral" explicitly because Spark orders NaN above
    # every other number, whereas Python comparisons with NaN are all False.
    vader_score = F.col("vader_sentiment")
    df_categorized = df_with_vader.withColumn(
        "sentiment_category",
        F.when(vader_score.isNull(), "Unknown")
        .when(F.isnan(vader_score), "Neutral")
        .when(vader_score > POSITIVE_THRESHOLD, "Positive")
        .when(vader_score < NEGATIVE_THRESHOLD, "Negative")
        .otherwise("Neutral"),
    )

    # Convert to Pandas for analysis
    sentiment_dist = df_categorized.groupBy("sentiment_category").count().toPandas()

    # groupBy returns rows in no guaranteed order. Fix both the display order
    # and the colour of each category so the charts are identical on every run
    # and a colour always means the same sentiment.
    category_colors = {
        "Positive": '#99ff99',
        "Neutral": '#66b3ff',
        "Negative": '#ff9999',
        "Unknown": '#ffcc99',
    }
    category_rank = {name: position for position, name in enumerate(category_colors)}
    sentiment_dist = (
        sentiment_dist
        .assign(_rank=sentiment_dist["sentiment_category"].map(category_rank))
        .sort_values("_rank")
        .drop(columns="_rank")
        .reset_index(drop=True)
    )
    colors = [category_colors[name] for name in sentiment_dist["sentiment_category"]]

    print("\nSentiment distribution:")
    print(sentiment_dist)

    # One figure, two panels: pie chart on the left, bar chart on the right
    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(10, 6))

    # Pie chart
    pie_ax.pie(sentiment_dist['count'], labels=sentiment_dist['sentiment_category'],
               autopct='%1.1f%%', colors=colors, startangle=90)
    pie_ax.set_title('Climate Change Comment Sentiment Distribution')

    # Bar chart
    bars = bar_ax.bar(sentiment_dist['sentiment_category'], sentiment_dist['count'], color=colors)
    bar_ax.set_title('Comment Count by Sentiment Category')
    bar_ax.set_xlabel('Sentiment Category')
    bar_ax.set_ylabel('Comment Count')
    bar_ax.tick_params(axis='x', labelrotation=45)

    # Add value labels on bar chart
    for bar in bars:
        height = bar.get_height()
        bar_ax.text(bar.get_x() + bar.get_width() / 2., height,
                    f'{int(height):,}', ha='center', va='bottom')

    fig.tight_layout()
    plt.show()

    print("Sentiment distribution visualization completed!")


In [None]:
# Per-subreddit sentiment analysis
print("=== 子版块sentiment analysis ===")

# The bare name below raises NameError when the categorised DataFrame has not
# been built yet; the handler at the bottom turns that into a hint.
try:
    df_categorized

    # Row-level tests for each sentiment class
    category = F.col("sentiment_category")
    is_positive = category == "Positive"
    is_negative = category == "Negative"
    is_neutral = category == "Neutral"

    # Per subreddit: comment volume, mean VADER score and per-class tallies,
    # kept only for subreddits with at least 1000 comments, busiest first
    per_subreddit = df_categorized.groupBy("`subreddit.name`").agg(
        F.count("*").alias("comment_count"),
        F.avg("vader_sentiment").alias("avg_sentiment"),
        F.sum(F.when(is_positive, 1).otherwise(0)).alias("positive_count"),
        F.sum(F.when(is_negative, 1).otherwise(0)).alias("negative_count"),
        F.sum(F.when(is_neutral, 1).otherwise(0)).alias("neutral_count"),
    )
    subreddit_sentiment = (
        per_subreddit
        .filter(F.col("comment_count") >= 1000)
        .orderBy(F.desc("comment_count"))
    )

    # Bring the (small) aggregate to pandas and show the head
    subreddit_sentiment_pd = subreddit_sentiment.toPandas()
    print("main子版块sentiment analysisresults:")
    print(subreddit_sentiment_pd.head(10))

    # Comparison figure for the ten most active subreddits
    plt.figure(figsize=(15, 8))
    top_subreddits = subreddit_sentiment_pd.head(10)
    bar_positions = range(len(top_subreddits))
    subreddit_labels = top_subreddits['subreddit.name']

    # Upper panel: mean sentiment score per subreddit
    # NOTE(review): several labels contain CJK characters — confirm the
    # configured matplotlib font can render them.
    plt.subplot(2, 1, 1)
    bars = plt.bar(bar_positions, top_subreddits['avg_sentiment'])
    plt.title('main子版块平均情感分数')
    plt.xlabel('子版块')
    plt.ylabel('平均情感分数')
    plt.xticks(bar_positions, subreddit_labels, rotation=45, ha='right')
    plt.axhline(y=0, color='red', linestyle='--', alpha=0.7, label='Neutral线')
    plt.legend()

    # Colour each bar by its mean score: green above 0.05, red below -0.05,
    # gray in between
    for bar, mean_score in zip(bars, top_subreddits['avg_sentiment']):
        if mean_score > 0.05:
            shade = 'green'
        elif mean_score < -0.05:
            shade = 'red'
        else:
            shade = 'gray'
        bar.set_color(shade)

    # Lower panel: comment volume per subreddit
    plt.subplot(2, 1, 2)
    plt.bar(bar_positions, top_subreddits['comment_count'], color='skyblue')
    plt.title('main子版块评论数量')
    plt.xlabel('子版块')
    plt.ylabel('评论数量')
    plt.xticks(bar_positions, subreddit_labels, rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

    print("子版块sentiment analysis可视化completing！")

except NameError:
    print("❌ 情感classificationdata未Preparing好，请先running前面的cells")


In [None]:
# Time-series sentiment analysis
print("=== 时间序列sentiment analysis ===")

# The bare name below raises NameError when the categorised DataFrame has not
# been built yet; the handler at the bottom turns that into a hint.
try:
    df_categorized

    # Derive the calendar year from the timestamp column
    df_with_year = df_categorized.withColumn("year", F.year(F.col("timestamp")))

    # Row-level tests for each sentiment class
    category = F.col("sentiment_category")
    is_positive = category == "Positive"
    is_negative = category == "Negative"
    is_neutral = category == "Neutral"

    # Per year: comment volume, mean VADER score and per-class tallies,
    # ordered chronologically
    per_year = df_with_year.groupBy("year").agg(
        F.count("*").alias("total_comments"),
        F.avg("vader_sentiment").alias("avg_sentiment"),
        F.sum(F.when(is_positive, 1).otherwise(0)).alias("positive_count"),
        F.sum(F.when(is_negative, 1).otherwise(0)).alias("negative_count"),
        F.sum(F.when(is_neutral, 1).otherwise(0)).alias("neutral_count"),
    )
    yearly_sentiment = per_year.orderBy("year")

    # Bring the aggregate to pandas for plotting
    yearly_sentiment_pd = yearly_sentiment.toPandas()
    print("年度情感趋势Analyzing:")
    print(yearly_sentiment_pd)

    # (sentiment class prefix, marker, legend label, line colour) per series
    series_styles = [
        ('positive', 'o', '正面', 'green'),
        ('negative', 's', '负面', 'red'),
        ('neutral', '^', 'Neutral', 'gray'),
    ]

    # 2x2 figure of yearly trends
    plt.figure(figsize=(15, 10))
    years = yearly_sentiment_pd['year']

    # Panel 1: comment volume per year
    plt.subplot(2, 2, 1)
    plt.plot(years, yearly_sentiment_pd['total_comments'], marker='o', linewidth=2)
    plt.title('年度评论数量趋势')
    plt.xlabel('年份')
    plt.ylabel('评论数量')
    plt.grid(True, alpha=0.3)

    # Panel 2: mean sentiment per year, with a dashed zero reference line
    plt.subplot(2, 2, 2)
    plt.plot(years, yearly_sentiment_pd['avg_sentiment'],
             marker='o', linewidth=2, color='orange')
    plt.title('年度平均情感趋势')
    plt.xlabel('年份')
    plt.ylabel('平均情感分数')
    plt.axhline(y=0, color='red', linestyle='--', alpha=0.7, label='Neutral线')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 3: absolute count of each sentiment class per year
    plt.subplot(2, 2, 3)
    for prefix, marker, label, line_color in series_styles:
        plt.plot(years, yearly_sentiment_pd[f'{prefix}_count'],
                 marker=marker, label=label, color=line_color, linewidth=2)
    plt.title('年度情感class别分布')
    plt.xlabel('年份')
    plt.ylabel('评论数量')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 4: share of each sentiment class per year; the ratio columns are
    # stored on the pandas frame as <class>_ratio
    plt.subplot(2, 2, 4)
    for prefix, _, _, _ in series_styles:
        yearly_sentiment_pd[f'{prefix}_ratio'] = (
            yearly_sentiment_pd[f'{prefix}_count'] / yearly_sentiment_pd['total_comments']
        )
    for prefix, marker, label, line_color in series_styles:
        plt.plot(years, yearly_sentiment_pd[f'{prefix}_ratio'],
                 marker=marker, label=label + '比例', color=line_color, linewidth=2)
    plt.title('年度情感比例变化')
    plt.xlabel('年份')
    plt.ylabel('比例')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("时间序列sentiment analysiscompleting！")

except NameError:
    print("❌ 情感classificationdata未Preparing好，请先running前面的cells")


In [None]:
# Word frequency analysis and word cloud generation
print("=== Word Frequency Analysis and Word Cloud Generation ===")

# The bare name below raises NameError when the categorised DataFrame has not
# been built yet; the handler at the bottom turns that into a hint.
try:
    df_categorized

    print("Analyzing word frequency...")

    # Shared expressions: one output row per token, and "most frequent first"
    exploded_token = F.explode(F.col("tokens_cleaned")).alias("token")
    most_frequent_first = F.desc("count")
    category = F.col("sentiment_category")

    # Token frequencies over every comment
    all_tokens = df_categorized.select(exploded_token)
    word_freq = all_tokens.groupBy("token").count().orderBy(most_frequent_first)

    # The 50 most frequent tokens, as a pandas frame
    top_words = word_freq.limit(50).toPandas()
    print("Top 20 high-frequency words:")
    print(top_words.head(20))

    print("\nAnalyzing word frequency for different sentiment categories...")

    # 30 most frequent tokens among positive comments
    positive_tokens = df_categorized.filter(category == "Positive").select(exploded_token)
    positive_word_freq = (
        positive_tokens.groupBy("token").count()
        .orderBy(most_frequent_first).limit(30).toPandas()
    )

    # 30 most frequent tokens among negative comments
    negative_tokens = df_categorized.filter(category == "Negative").select(exploded_token)
    negative_word_freq = (
        negative_tokens.groupBy("token").count()
        .orderBy(most_frequent_first).limit(30).toPandas()
    )

    print("Generating word clouds...")

    def _draw_cloud(slot, cloud, heading):
        """Render ``cloud`` in panel ``slot`` of the 2x2 grid, titled ``heading``, axes hidden."""
        plt.subplot(2, 2, slot)
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(heading)
        plt.axis('off')

    # token -> frequency mapping for the overall cloud
    word_freq_dict = dict(zip(top_words['token'], top_words['count']))

    plt.figure(figsize=(15, 10))

    # Panel 1: overall cloud
    wordcloud = WordCloud(
        width=400, height=400, background_color='white',
        max_words=100, colormap='viridis',
    ).generate_from_frequencies(word_freq_dict)
    _draw_cloud(1, wordcloud, 'Overall High-Frequency Word Cloud')

    # Panel 2: positive comments
    positive_dict = dict(zip(positive_word_freq['token'], positive_word_freq['count']))
    positive_wordcloud = WordCloud(
        width=400, height=400, background_color='white',
        max_words=50, colormap='Greens',
    ).generate_from_frequencies(positive_dict)
    _draw_cloud(2, positive_wordcloud, 'Positive Comments Word Cloud')

    # Panel 3: negative comments
    negative_dict = dict(zip(negative_word_freq['token'], negative_word_freq['count']))
    negative_wordcloud = WordCloud(
        width=400, height=400, background_color='white',
        max_words=50, colormap='Reds',
    ).generate_from_frequencies(negative_dict)
    _draw_cloud(3, negative_wordcloud, 'Negative Comments Word Cloud')

    # Panel 4: horizontal bars for the 15 most frequent tokens, largest on top
    plt.subplot(2, 2, 4)
    top_15_words = top_words.head(15)
    bar_slots = range(len(top_15_words))
    plt.barh(bar_slots, top_15_words['count'])
    plt.yticks(bar_slots, top_15_words['token'])
    plt.xlabel('Word Frequency')
    plt.title('Top 15 High-Frequency Words')
    plt.gca().invert_yaxis()

    plt.tight_layout()
    plt.show()

    print("Word frequency analysis and word cloud generation completed!")

except NameError:
    print("❌ Classification data not ready, please run previous cells")


In [None]:



# Time-series sentiment analysis
# NOTE(review): this cell is a verbatim repeat of the earlier time-series cell
# in this notebook; it recomputes the same yearly_sentiment_pd and redraws the
# same figure. Consider deleting one of the two copies.
print("=== 时间序列sentiment analysis ===")

# Check that the categorised sentiment data is available: the bare name raises
# NameError when df_categorized is undefined, which the handler below reports.
try:
    df_categorized
    
    # Extract the year from the timestamp column
    df_with_year = df_categorized.withColumn("year", F.year(F.col("timestamp")))
    
    # Aggregate per year: comment count, mean VADER score, and the number of
    # Positive / Negative / Neutral comments; sorted by year
    yearly_sentiment = df_with_year.groupBy("year").agg(
        F.count("*").alias("total_comments"),
        F.avg("vader_sentiment").alias("avg_sentiment"),
        F.sum(F.when(F.col("sentiment_category") == "Positive", 1).otherwise(0)).alias("positive_count"),
        F.sum(F.when(F.col("sentiment_category") == "Negative", 1).otherwise(0)).alias("negative_count"),
        F.sum(F.when(F.col("sentiment_category") == "Neutral", 1).otherwise(0)).alias("neutral_count")
    ).orderBy("year")
    
    # Convert to pandas for visualisation
    yearly_sentiment_pd = yearly_sentiment.toPandas()
    print("年度情感趋势Analyzing:")
    print(yearly_sentiment_pd)
    
    # Draw the time-series figure (2x2 grid of panels)
    plt.figure(figsize=(15, 10))
    
    # Panel 1: yearly comment count
    plt.subplot(2, 2, 1)
    plt.plot(yearly_sentiment_pd['year'], yearly_sentiment_pd['total_comments'], marker='o', linewidth=2)
    plt.title('年度评论数量趋势')
    plt.xlabel('年份')
    plt.ylabel('评论数量')
    plt.grid(True, alpha=0.3)
    
    # Panel 2: yearly mean sentiment, with a dashed reference line at zero
    plt.subplot(2, 2, 2)
    plt.plot(yearly_sentiment_pd['year'], yearly_sentiment_pd['avg_sentiment'], 
             marker='o', linewidth=2, color='orange')
    plt.title('年度平均情感趋势')
    plt.xlabel('年份')
    plt.ylabel('平均情感分数')
    plt.axhline(y=0, color='red', linestyle='--', alpha=0.7, label='Neutral线')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Panel 3: yearly count of each sentiment category
    plt.subplot(2, 2, 3)
    plt.plot(yearly_sentiment_pd['year'], yearly_sentiment_pd['positive_count'], 
             marker='o', label='正面', color='green', linewidth=2)
    plt.plot(yearly_sentiment_pd['year'], yearly_sentiment_pd['negative_count'], 
             marker='s', label='负面', color='red', linewidth=2)
    plt.plot(yearly_sentiment_pd['year'], yearly_sentiment_pd['neutral_count'], 
             marker='^', label='Neutral', color='gray', linewidth=2)
    plt.title('年度情感class别分布')
    plt.xlabel('年份')
    plt.ylabel('评论数量')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Panel 4: yearly share of each sentiment category; the three ratio
    # columns are added to yearly_sentiment_pd (category count / total comments)
    plt.subplot(2, 2, 4)
    yearly_sentiment_pd['positive_ratio'] = yearly_sentiment_pd['positive_count'] / yearly_sentiment_pd['total_comments']
    yearly_sentiment_pd['negative_ratio'] = yearly_sentiment_pd['negative_count'] / yearly_sentiment_pd['total_comments']
    yearly_sentiment_pd['neutral_ratio'] = yearly_sentiment_pd['neutral_count'] / yearly_sentiment_pd['total_comments']
    
    plt.plot(yearly_sentiment_pd['year'], yearly_sentiment_pd['positive_ratio'], 
             marker='o', label='正面比例', color='green', linewidth=2)
    plt.plot(yearly_sentiment_pd['year'], yearly_sentiment_pd['negative_ratio'], 
             marker='s', label='负面比例', color='red', linewidth=2)
    plt.plot(yearly_sentiment_pd['year'], yearly_sentiment_pd['neutral_ratio'], 
             marker='^', label='Neutral比例', color='gray', linewidth=2)
    plt.title('年度情感比例变化')
    plt.xlabel('年份')
    plt.ylabel('比例')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    print("时间序列sentiment analysiscompleting！")
    
except NameError:
    # Reached when df_categorized (or any other name used above) is undefined
    print("❌ 情感classificationdata未Preparing好，请先running前面的cells")

In [None]:
# Analysis summary and save results
print("=== Exploratory Data Analysis Summary ===")

# Verify if all necessary data is available: each bare name raises NameError
# when the corresponding earlier cell has not been run.
try:
    df_categorized
    subreddit_sentiment_pd
    yearly_sentiment_pd

    # Generate analysis report
    total_comments = df_categorized.count()
    sentiment_dist_final = df_categorized.groupBy("sentiment_category").count().toPandas()

    # Derive the overview facts from the computed tables instead of
    # hard-coding them, so the report always matches the data analysed.
    observed_years = yearly_sentiment_pd['year'].dropna()
    if len(observed_years) > 0:
        time_span = f"{int(observed_years.min())}-{int(observed_years.max())}"
    else:
        time_span = "unknown"
    # Three subreddits with the most comments among those in the subreddit table
    main_subreddits = ", ".join(
        subreddit_sentiment_pd.nlargest(3, 'comment_count')['subreddit.name'].astype(str)
    ) or "n/a"

    print("📊 **Data Overview**:")
    print(f"   Total comments: {total_comments:,}")
    print(f"   Analysis time span: {time_span}")
    print(f"   Main subreddits: {main_subreddits}")

    print("\n🎭 **Sentiment Distribution**:")
    for _, row in sentiment_dist_final.iterrows():
        percentage = (row['count'] / total_comments) * 100
        print(f"   {row['sentiment_category']}: {row['count']:,} ({percentage:.1f}%)")

    print("\n📈 **Key Findings**:")
    # Ignore rows without a usable year or mean score so the extremes below
    # always refer to a real year.
    yearly_summary = yearly_sentiment_pd.dropna(subset=['year', 'avg_sentiment'])
    if len(yearly_summary) > 0:
        most_negative_year = int(yearly_summary.loc[yearly_summary['avg_sentiment'].idxmin(), 'year'])
        most_positive_year = int(yearly_summary.loc[yearly_summary['avg_sentiment'].idxmax(), 'year'])
        busiest_year = int(yearly_summary.loc[yearly_summary['total_comments'].idxmax(), 'year'])
        print(f"   Most negative year: {most_negative_year}")
        print(f"   Most positive year: {most_positive_year}")
        print(f"   Year with most comments: {busiest_year}")

    print("\n🏷️ **Subreddit Characteristics**:")
    if len(subreddit_sentiment_pd) > 0:
        most_positive = subreddit_sentiment_pd.loc[subreddit_sentiment_pd['avg_sentiment'].idxmax()]
        most_negative = subreddit_sentiment_pd.loc[subreddit_sentiment_pd['avg_sentiment'].idxmin()]
        print(f"   Most positive subreddit: {most_positive['subreddit.name']} (score: {most_positive['avg_sentiment']:.3f})")
        print(f"   Most negative subreddit: {most_negative['subreddit.name']} (score: {most_negative['avg_sentiment']:.3f})")

    print("\n💾 **Saving Enhanced Data**:")
    output_path = "/home/jovyan/work/data/processed/sentiment_analyzed_comments.parquet"
    print(f"Saving to: {output_path}")

    # A failed write is reported but does not abort the rest of the summary.
    try:
        df_categorized.select(
            "id", "`subreddit.name`", "created_utc", "timestamp",
            "body", "cleaned_body", "tokens_cleaned", "sentiment",
            "vader_sentiment", "sentiment_category", "score"
        ).write.mode("overwrite").parquet(output_path)
        print("✅ Sentiment analysis results saved successfully!")
    except Exception as e:
        print(f"❌ Save failed: {e}")

    print("\n🎯 **Next Steps Recommendations**:")
    print("   1. Perform topic modeling (LDA) based on sentiment analysis results")
    print("   2. Train machine learning classification models")
    print("   3. Analyze impact of specific events on sentiment")
    print("   4. Build sentiment prediction models")

    print("\n✅ Exploratory data analysis and sentiment analysis completed!")

except NameError:
    print("❌ Analysis data not ready, please run all previous cells first")
