In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, IDF, VectorAssembler
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# Set chart style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("TweetAnalysis_TopicModeling") \
    .master("local[*]") \
    .config("spark.driver.memory", "16g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

print(f"Spark Version: {spark.version}")
print(f"Available cores: {spark.sparkContext.defaultParallelism}")


Spark Version: 3.5.0
Available cores: 20


In [3]:
# Load cleaned data and sentiment analysis results
sentiment_data_path = "/home/jovyan/work/data/processed/sentiment_analyzed_comments.parquet"

try:
    df_sentiment = spark.read.parquet(sentiment_data_path)
    df_sentiment.cache()
    record_count = df_sentiment.count()
    print(f"✅ Sentiment analysis data loaded successfully, total {record_count:,} records")
    
    print("\nData structure:")
    df_sentiment.printSchema()
    
    # Check if tokens_cleaned column exists
    if 'tokens_cleaned' in df_sentiment.columns:
        print("\n✅ Found tokens_cleaned column, can proceed directly with topic modeling")
    else:
        print("\n❌ tokens_cleaned column not found, need to re-tokenize")
        
except Exception as e:
    print(f"❌ Data loading failed: {e}")
    print("Trying to load cleaned data...")
    
    # Alternative solution: load cleaned data
    cleaned_data_path = "/home/jovyan/work/data/processed/cleaned_comments.parquet"
    df_sentiment = spark.read.parquet(cleaned_data_path)
    df_sentiment.cache()
    record_count = df_sentiment.count()
    print(f"✅ Cleaned data loaded successfully, total {record_count:,} records")


✅ Sentiment analysis data loaded successfully, total 459,171 records

Data structure:
root
 |-- id: string (nullable = true)
 |-- subreddit.name: string (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- body: string (nullable = true)
 |-- cleaned_body: string (nullable = true)
 |-- tokens_cleaned: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentiment: double (nullable = true)
 |-- vader_sentiment: double (nullable = true)
 |-- sentiment_category: string (nullable = true)
 |-- score: long (nullable = true)


✅ Found tokens_cleaned column, can proceed directly with topic modeling


In [4]:
# Data preprocessing: filter and prepare topic modeling data
print("=== Data Preprocessing ===")

# 1. Filter out documents with too few tokens (improve topic quality)
df_filtered = df_sentiment.filter(
    F.size(F.col("tokens_cleaned")) >= 5  # at least 5 words
)

filtered_count = df_filtered.count()
print(f"Data size after filtering: {filtered_count:,} records")
print(f"Retention ratio: {filtered_count/record_count*100:.1f}%")

# 2. Further filter Climate Change related keywords to ensure topic relevance
climate_keywords = [
    'climate', 'warming', 'carbon', 'emission', 'greenhouse', 'temperature',
    'fossil', 'renewable', 'energy', 'pollution', 'environment', 'sustainability',
    'weather', 'ice', 'sea', 'level', 'drought', 'flood'
]

# Create filter condition: contains at least one climate-related word
def contains_climate_keywords(tokens):
    if tokens is None:
        return False
    tokens_lower = [token.lower() for token in tokens]
    return any(keyword in tokens_lower for keyword in climate_keywords)

contains_climate_udf = F.udf(contains_climate_keywords, BooleanType())

df_climate = df_filtered.filter(
    contains_climate_udf(F.col("tokens_cleaned"))
)

climate_count = df_climate.count()
print(f"Climate-related comments: {climate_count:,} records")
print(f"Percentage of filtered data: {climate_count/filtered_count*100:.1f}%")

# Cache final data for modeling
df_climate.cache()
print(f"\nFinal data for topic modeling: {climate_count:,} records")


=== Data Preprocessing ===
Data size after filtering: 447,889 records
Retention ratio: 97.5%
Climate-related comments: 439,212 records
Percentage of filtered data: 98.1%

Final data for topic modeling: 439,212 records


In [5]:
# Build vocabulary feature vectors
print("=== Building Vocabulary Feature Vectors ===")

# 1. Sample data to reduce memory pressure
print("1. Sampling data to reduce computational burden...")
sample_fraction = 0.3  # Use 30% of data for topic modeling
df_sample = df_climate.sample(fraction=sample_fraction, seed=42)
sample_count = df_sample.count()
print(f"Data size after sampling: {sample_count:,} records ({sample_fraction*100}% of original data)")

# 2. CountVectorizer: Convert tokens to term frequency vectors
# Set smaller vocabulary size and higher minimum word frequency to avoid memory issues
count_vectorizer = CountVectorizer(
    inputCol="tokens_cleaned", 
    outputCol="raw_features",
    vocabSize=2000,  # Reduce vocabulary size
    minDF=10.0       # Increase minimum document frequency
)

print("2. Training CountVectorizer...")
count_model = count_vectorizer.fit(df_sample)
df_vectorized = count_model.transform(df_sample)

# 3. TF-IDF: Calculate word importance weights
idf = IDF(inputCol="raw_features", outputCol="features")
print("3. Training IDF...")
idf_model = idf.fit(df_vectorized)
df_tfidf = idf_model.transform(df_vectorized)

print(f"Vocabulary size: {len(count_model.vocabulary)}")
print(f"Feature vector dimensions: {len(count_model.vocabulary)}")

# Display some example words in vocabulary
print("\nVocabulary examples (first 20 words):")
for i, word in enumerate(count_model.vocabulary[:20]):
    print(f"{i}: {word}")

# Update climate_count to post-sampling count
climate_count = sample_count


=== Building Vocabulary Feature Vectors ===
1. Sampling data to reduce computational burden...
Data size after sampling: 132,035 records (30.0% of original data)
2. Training CountVectorizer...
3. Training IDF...
Vocabulary size: 2000
Feature vector dimensions: 2000

Vocabulary examples (first 20 words):
0: climate
1: change
2: people
3: like
4: re
5: think
6: one
7: even
8: .
9: m
10: get
11: change.
12: also
13: going
14: us
15: much
16: make
17: world
18: need
19: global


In [6]:
# LDA Topic Modeling
print("=== LDA Topic Modeling ===")

# Set number of topics (reduce number of topics to fit smaller dataset)
NUM_TOPICS = 5  # Reduce number of topics

# Create LDA model
lda = LDA(
    featuresCol="features", 
    topicsCol="topic_distribution",
    k=NUM_TOPICS,
    maxIter=10,  # Reduce iteration count to speed up training
    seed=42
)

print(f"Starting LDA model training ({NUM_TOPICS} topics)...")
lda_model = lda.fit(df_tfidf)

print("\n✅ LDA model training completed!")
print(f"Model perplexity: {lda_model.logPerplexity(df_tfidf):.2f}")
print(f"Model log likelihood: {lda_model.logLikelihood(df_tfidf):.2f}")


=== LDA Topic Modeling ===


TypeError: LDA.__init__() got an unexpected keyword argument 'topicsCol'

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 54022)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

In [None]:
# Extract and analyze topic keywords
print("=== Topic Keywords Analysis ===")

# Get keywords for each topic
topics = lda_model.describeTopics(maxTermsPerTopic=15)
vocabulary = count_model.vocabulary

def get_topic_words(topic_data):
    """Convert topic word indices to actual words"""
    topics_list = []
    
    for row in topic_data.collect():
        topic_id = row['topic']
        term_indices = row['termIndices']
        term_weights = row['termWeights']
        
        # Convert indices to words
        words = [vocabulary[idx] for idx in term_indices]
        
        topics_list.append({
            'topic_id': topic_id,
            'words': words,
            'weights': term_weights
        })
    
    return topics_list

topic_words = get_topic_words(topics)

# Display keywords for each topic
print("\n=== Topic Keywords List ===")
for topic in topic_words:
    topic_id = topic['topic_id']
    words = topic['words'][:10]  # Display top 10 keywords
    weights = topic['weights'][:10]
    
    print(f"\n🔍 Topic {topic_id}:")
    for word, weight in zip(words, weights):
        print(f"  {word}: {weight:.4f}")
    
    # Generate topic description (based on keywords)
    key_terms = ", ".join(words[:5])
    print(f"  💡 Key terms: {key_terms}")


In [None]:
# Assign topics to documents
print("=== Document Topic Assignment ===")

# Transform documents to get topic distribution
df_topics = lda_model.transform(df_tfidf)

# Assign dominant topic for each document (topic with highest probability)
def get_dominant_topic(topic_distribution):
    """Get dominant topic ID"""
    if topic_distribution is None:
        return -1
    return int(np.argmax(topic_distribution.toArray()))

def get_topic_probability(topic_distribution):
    """Get probability of dominant topic"""
    if topic_distribution is None:
        return 0.0
    return float(np.max(topic_distribution.toArray()))

get_dominant_topic_udf = F.udf(get_dominant_topic, IntegerType())
get_topic_prob_udf = F.udf(get_topic_probability, DoubleType())

# Add dominant topic columns
df_topics = df_topics.withColumn(
    "dominant_topic", 
    get_dominant_topic_udf(F.col("topic_distribution"))
).withColumn(
    "topic_probability",
    get_topic_prob_udf(F.col("topic_distribution"))
)

print("✅ Topic assignment completed")

# Display topic distribution statistics
print("\nTopic distribution statistics:")
topic_dist = df_topics.groupBy("dominant_topic").count().orderBy("dominant_topic")
topic_dist.show()

# Convert to Pandas for detailed analysis
topic_dist_pd = topic_dist.toPandas()
total_docs = topic_dist_pd['count'].sum()
topic_dist_pd['percentage'] = (topic_dist_pd['count'] / total_docs * 100).round(2)

print("\nDetailed topic distribution:")
for _, row in topic_dist_pd.iterrows():
    topic_id = int(row['dominant_topic'])
    count = int(row['count'])
    pct = row['percentage']
    
    # Get topic keywords
    if topic_id >= 0 and topic_id < len(topic_words):
        topic_keywords = ", ".join(topic_words[topic_id]['words'][:3])
    else:
        topic_keywords = "undefined"
    
    print(f"Topic {topic_id} ({topic_keywords}): {count:,} documents ({pct}%)")


In [None]:
# Analyze sentiment distribution for each topic
print("=== Topic Sentiment Analysis ===")

# Check if VADER sentiment scores exist
sentiment_cols = [col for col in df_topics.columns if 'vader' in col.lower() or 
                 col in ['sentiment', 'compound_score', 'pos_score', 'neu_score', 'neg_score']]

print(f"Found sentiment-related columns: {sentiment_cols}")

# Use VADER score if available, otherwise use original sentiment
if 'compound_score' in df_topics.columns:
    sentiment_col = 'compound_score'
    print("Using VADER compound score for sentiment analysis")
elif 'sentiment' in df_topics.columns:
    sentiment_col = 'sentiment'
    print("Using original sentiment score for sentiment analysis")
else:
    print("❌ Sentiment score column not found, skipping sentiment analysis")
    sentiment_col = None

if sentiment_col:
    # Define sentiment classification function
    def classify_sentiment(score):
        if score is None:
            return "Unknown"
        elif score > 0.1:
            return "Positive"
        elif score < -0.1:
            return "Negative"
        else:
            return "Neutral"
    
    classify_sentiment_udf = F.udf(classify_sentiment, StringType())
    
    # Add sentiment classification column
    df_topic_sentiment = df_topics.withColumn(
        "sentiment_label",
        classify_sentiment_udf(F.col(sentiment_col))
    )
    
    # Calculate sentiment distribution for each topic
    topic_sentiment_dist = df_topic_sentiment.groupBy("dominant_topic", "sentiment_label").count().orderBy("dominant_topic", "sentiment_label")
    
    print("\nSentiment distribution for each topic:")
    topic_sentiment_dist.show()
    
    # Convert to pivot table format for analysis
    topic_sentiment_pd = topic_sentiment_dist.toPandas()
    pivot_sentiment = topic_sentiment_pd.pivot(index='dominant_topic', columns='sentiment_label', values='count').fillna(0)
    
    # Calculate proportions
    pivot_sentiment_pct = pivot_sentiment.div(pivot_sentiment.sum(axis=1), axis=0) * 100
    
    print("\nSentiment distribution percentage for each topic (%):")
    print(pivot_sentiment_pct.round(2))
    
    # Analyze sentiment characteristics for each topic
    print("\n=== Topic Sentiment Characteristics Analysis ===")
    for topic_id in range(NUM_TOPICS):
        if topic_id in pivot_sentiment_pct.index:
            row = pivot_sentiment_pct.loc[topic_id]
            pos_pct = row.get('Positive', 0)
            neg_pct = row.get('Negative', 0)
            neu_pct = row.get('Neutral', 0)
            
            # Get topic keywords
            keywords = ", ".join(topic_words[topic_id]['words'][:5])
            
            # Determine topic sentiment tendency
            if pos_pct > neg_pct + 10:
                tendency = "Positive-leaning"
            elif neg_pct > pos_pct + 10:
                tendency = "Negative-leaning"
            else:
                tendency = "Neutral"
            
            print(f"\nTopic {topic_id} ({keywords}):")
            print(f"  Sentiment tendency: {tendency}")
            print(f"  Positive: {pos_pct:.1f}% | Neutral: {neu_pct:.1f}% | Negative: {neg_pct:.1f}%")
            
            # Calculate sentiment polarization degree
            polarization = abs(pos_pct - neg_pct)
            print(f"  Sentiment polarization degree: {polarization:.1f}% ({'High' if polarization > 20 else 'Medium' if polarization > 10 else 'Low'})")


In [None]:
# Save topic modeling results
print("=== Save Results ===")

# 1. Save data with topic information
output_path = "/home/jovyan/work/data/processed/topic_analyzed_comments.parquet"

# Select columns to save
columns_to_save = [
    "id", "`subreddit.name`", "timestamp", "cleaned_body", 
    "dominant_topic", "topic_probability"
]

# If sentiment analysis results exist, include them
if sentiment_col:
    columns_to_save.extend([sentiment_col, "sentiment_label"])

# Ensure columns exist before saving
available_cols = [col for col in columns_to_save if col in df_topic_sentiment.columns]

df_result = df_topic_sentiment.select(available_cols)
df_result.write.mode("overwrite").parquet(output_path)

print(f"✅ Topic analysis results saved to: {output_path}")
print(f"Saved columns: {len(available_cols)}")
print(f"Saved records: {df_result.count():,}")

# 2. Save topic keywords summary
import json

topic_summary = {
    "model_info": {
        "num_topics": NUM_TOPICS,
        "vocab_size": len(count_model.vocabulary),
        "total_documents": climate_count,
        "log_perplexity": lda_model.logPerplexity(df_tfidf),
        "log_likelihood": lda_model.logLikelihood(df_tfidf)
    },
    "topics": []
}

for topic_id, topic_data in enumerate(topic_words):
    topic_info = {
        "topic_id": topic_id,
        "keywords": topic_data['words'][:10],
        "weights": [float(w) for w in topic_data['weights'][:10]],
        "document_count": int(topic_dist_pd[topic_dist_pd['dominant_topic'] == topic_id]['count'].iloc[0]) if topic_id in topic_dist_pd['dominant_topic'].values else 0
    }
    
    # If sentiment data exists, add sentiment distribution
    if sentiment_col and topic_id in pivot_sentiment_pct.index:
        topic_info["sentiment_distribution"] = {
            "positive": float(pivot_sentiment_pct.loc[topic_id].get('Positive', 0)),
            "neutral": float(pivot_sentiment_pct.loc[topic_id].get('Neutral', 0)),
            "negative": float(pivot_sentiment_pct.loc[topic_id].get('Negative', 0))
        }
    
    topic_summary["topics"].append(topic_info)

# Save topic summary to JSON file
summary_path = "/home/jovyan/work/data/processed/topic_summary.json"
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(topic_summary, f, ensure_ascii=False, indent=2)

print(f"✅ Topic summary saved to: {summary_path}")

print("\n=== LDA Topic Modeling Completed! ===")
print(f"Identified {NUM_TOPICS} topics")
print(f"Processed {climate_count:,} climate-related comments")
print("Topic analysis results are ready for subsequent classification modeling")
