In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, IDF, VectorAssembler
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# Plot defaults for the whole notebook: stock matplotlib style first (it resets rcParams),
# then the figure size and the seaborn "husl" colour palette.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette("husl")

print("Libraries imported successfully!")


In [None]:
# Create (or reuse) the local Spark session that every later cell relies on.
_spark_settings = {
    "spark.driver.memory": "16g",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
}

_builder = SparkSession.builder.appName("TweetAnalysis_TopicModeling").master("local[*]")
for _setting, _value in _spark_settings.items():
    _builder = _builder.config(_setting, _value)
spark = _builder.getOrCreate()

print(f"Spark Version: {spark.version}")
print(f"Available cores: {spark.sparkContext.defaultParallelism}")


In [None]:
# Load the sentiment-analyzed comments; if that fails, fall back to the cleaned comments.
sentiment_data_path = "/home/jovyan/work/data/processed/sentiment_analyzed_comments.parquet"
cleaned_data_path = "/home/jovyan/work/data/processed/cleaned_comments.parquet"

try:
    df_sentiment = spark.read.parquet(sentiment_data_path)
    df_sentiment.cache()
    record_count = df_sentiment.count()  # first action on the cached frame
    print(f"✅ sentiment analysis data loadingcompleted，total {record_count:,} records")
except Exception as e:
    # Deliberate best effort: any failure on the primary file falls through to the
    # cleaned data. A failure of the fallback itself is not caught and stops the cell.
    print(f"❌ Data loading failed: {e}")
    print("try loading cleaned data...")

    df_sentiment = spark.read.parquet(cleaned_data_path)
    df_sentiment.cache()
    record_count = df_sentiment.count()
    print(f"✅ cleaned data loadingcompleted，total {record_count:,} records")

# Report the schema and verify the token column for WHICHEVER dataset was loaded.
# Previously the fallback branch skipped both, so a missing tokens_cleaned column
# only surfaced later as an error in the preprocessing cell.
print("\nData structure:")
df_sentiment.printSchema()

if 'tokens_cleaned' in df_sentiment.columns:
    print("\n✅ Found tokens_cleaned column，can proceed directly with topic modeling")
else:
    print("\n❌ tokens_cleaned column not found，need to re-tokenize")


In [None]:
# Data preprocessing: filter the comments and prepare the topic-modeling input
print("=== data preprocessing ===")

# 1. Drop documents with too few tokens (short comments hurt topic quality)
MIN_TOKENS = 5  # keep documents with at least this many tokens
df_filtered = df_sentiment.filter(
    F.size(F.col("tokens_cleaned")) >= MIN_TOKENS
)

filtered_count = df_filtered.count()
# Guard the ratio: an empty input (record_count == 0) would raise ZeroDivisionError
retention_pct = filtered_count / record_count * 100 if record_count else 0.0
print(f"Data size after filtering: {filtered_count:,} records")
print(f"Retention ratio: {retention_pct:.1f}%")

# 2. Keywords used to keep only climate-change related comments (topic relevance)
climate_keywords = [
    'climate', 'warming', 'carbon', 'emission', 'greenhouse', 'temperature',
    'fossil', 'renewable', 'energy', 'pollution', 'environment', 'sustainability',
    'weather', 'ice', 'sea', 'level', 'drought', 'flood'
]

# Filter predicate: the document contains at least one climate-related word
def contains_climate_keywords(tokens, keywords=None):
    """Return True when at least one token equals a climate keyword (case-insensitive).

    Args:
        tokens: Sequence of token strings, or None. None entries inside the
            sequence are skipped instead of raising on ``.lower()``.
        keywords: Optional iterable of lowercase keywords to match against.
            Defaults to the notebook-level ``climate_keywords`` list.

    Returns:
        bool: False for None or empty input, otherwise whether any keyword matches.
    """
    if tokens is None:
        return False
    if keywords is None:
        # Looked up at call time so the notebook-level list stays the default
        keywords = climate_keywords
    # Lower-case once into a set: each keyword lookup is then a hash probe
    # rather than a scan over the whole token list.
    token_set = {token.lower() for token in tokens if token is not None}
    return any(keyword in token_set for keyword in keywords)

# Wrap the Python predicate as a Spark UDF that returns a boolean
contains_climate_udf = F.udf(contains_climate_keywords, BooleanType())

df_climate = df_filtered.filter(
    contains_climate_udf(F.col("tokens_cleaned"))
)

# Mark the frame for caching BEFORE the first action. cache() is lazy, so calling it
# after count() (as before) meant the Python UDF filter ran once for the count and
# again when a later cell first used df_climate.
df_climate.cache()

climate_count = df_climate.count()
# Guard the ratio: no surviving documents (filtered_count == 0) would raise ZeroDivisionError
climate_pct = climate_count / filtered_count * 100 if filtered_count else 0.0
print(f"Climate-related comments: {climate_count:,} records")
print(f"percentage of filtered data: {climate_pct:.1f}%")

print(f"\nfinal data for topic modeling: {climate_count:,} records")


In [None]:
# Turn the token lists into TF-IDF feature vectors
print("=== Build vocabulary feature vectors ===")

# Step 1: work on a random sample to reduce memory pressure
print("1. 采样data以reduce computational burden...")
sample_fraction = 0.3  # share of the climate-related comments kept for topic modeling
df_sample = df_climate.sample(seed=42, fraction=sample_fraction)
sample_count = df_sample.count()
print(f"data size after sampling: {sample_count:,} records (of original data {sample_fraction*100}%)")

# Step 2: term-frequency vectors. A small vocabulary cap and a higher minimum
# document frequency keep the feature space small and avoid memory issues.
count_vectorizer = CountVectorizer(
    minDF=10.0,
    vocabSize=2000,
    outputCol="raw_features",
    inputCol="tokens_cleaned",
)

print("2. trainingCountVectorizer...")
count_model = count_vectorizer.fit(df_sample)
df_vectorized = count_model.transform(df_sample)

# Step 3: IDF re-weighting of the raw term-frequency vectors
idf = IDF(outputCol="features", inputCol="raw_features")
print("3. trainingIDF...")
idf_model = idf.fit(df_vectorized)
df_tfidf = idf_model.transform(df_vectorized)

_vocab_size = len(count_model.vocabulary)
print(f"Vocabulary size: {_vocab_size}")
print(f"Feature vector dimensions: {_vocab_size}")

# Show the first entries of the learned vocabulary
print("\nVocabulary examples（first 20 words）:")
for _position, _term in enumerate(count_model.vocabulary[:20]):
    print(f"{_position}: {_term}")

# From here on climate_count holds the post-sampling document count
climate_count = sample_count


In [None]:
# Fit the LDA topic model on the sampled feature vectors
print("=== LDA Topic Modeling ===")

# Number of topics, kept small to fit the reduced dataset
NUM_TOPICS = 5

# NOTE(review): the "features" column holds IDF-weighted values, while Spark's LDA
# documentation describes its input as term-count vectors. Consider training on
# "raw_features" instead; confirm which is intended before changing results.
lda = LDA(
    k=NUM_TOPICS,
    maxIter=10,  # few iterations to keep training fast
    seed=42,
    featuresCol="features",
    topicsCol="topic_distribution",
)

print(f"start training LDA model（{NUM_TOPICS}topics）...")
lda_model = lda.fit(df_tfidf)

print("\n✅ LDA model training completed!")
_log_perplexity = lda_model.logPerplexity(df_tfidf)
print(f"model perplexity: {_log_perplexity:.2f}")
_log_likelihood = lda_model.logLikelihood(df_tfidf)
print(f"model log likelihood: {_log_likelihood:.2f}")


In [None]:
# Pull the top terms of every topic out of the fitted LDA model
print("=== Topic Keywords Analysis ===")

# The term indices in `topics` are resolved against this CountVectorizer vocabulary
vocabulary = count_model.vocabulary
topics = lda_model.describeTopics(maxTermsPerTopic=15)

def get_topic_words(topic_data, vocab=None):
    """Convert topic term indices into actual words.

    Args:
        topic_data: Object whose ``collect()`` returns rows with the keys
            'topic', 'termIndices' and 'termWeights' (the describeTopics output).
        vocab: Optional sequence mapping a term index to its word. Defaults to
            the notebook-level ``vocabulary``.

    Returns:
        list[dict]: One dict per topic with keys 'topic_id', 'words' and
        'weights', sorted by 'topic_id'. Other cells index this list by topic
        id, so the ordering is enforced here rather than assumed.
    """
    if vocab is None:
        # Looked up at call time so the notebook-level vocabulary stays the default
        vocab = vocabulary

    topics_list = [
        {
            'topic_id': row['topic'],
            'words': [vocab[idx] for idx in row['termIndices']],
            'weights': row['termWeights'],
        }
        for row in topic_data.collect()
    ]
    topics_list.sort(key=lambda entry: entry['topic_id'])
    return topics_list

topic_words = get_topic_words(topics)

# Print the ten strongest terms of each topic with their weights
print("\n=== Topic Keywords List ===")
for _topic in topic_words:
    _top_terms = zip(_topic['words'][:10], _topic['weights'][:10])

    print(f"\n🔍 Topic {_topic['topic_id']}:")
    for _term, _term_weight in _top_terms:
        print(f"  {_term}: {_term_weight:.4f}")

    # Short topic description built from the five leading keywords
    key_terms = ", ".join(_topic['words'][:5])
    print(f"  💡 Key terms: {key_terms}")


In [None]:
# Assign topics to documents
print("=== documentTopic分配 ===")

# Transform the documents with the fitted LDA model; this adds the
# "topic_distribution" column configured as the model's topicsCol
df_topics = lda_model.transform(df_tfidf)
# Assign each document its dominant topic (the topic with the highest probability)
def get_dominant_topic(topic_distribution):
    """Return the index of the largest entry of a topic distribution, or -1 for None.

    `topic_distribution` must expose ``toArray()``.
    """
    return -1 if topic_distribution is None else int(np.argmax(topic_distribution.toArray()))

def get_topic_probability(topic_distribution):
    """Return the largest entry of a topic distribution as a float, or 0.0 for None.

    `topic_distribution` must expose ``toArray()``.
    """
    return 0.0 if topic_distribution is None else float(np.max(topic_distribution.toArray()))

get_dominant_topic_udf = F.udf(get_dominant_topic, IntegerType())
get_topic_prob_udf = F.udf(get_topic_probability, DoubleType())

# Derive both per-document columns from the topic distribution vector
_distribution = F.col("topic_distribution")
df_topics = (
    df_topics
    .withColumn("dominant_topic", get_dominant_topic_udf(_distribution))
    .withColumn("topic_probability", get_topic_prob_udf(_distribution))
)

print("✅ Topic分配completed")

# Number of documents per dominant topic
print("\nTopicdistribution统计:")
topic_dist = df_topics.groupBy("dominant_topic").count().orderBy("dominant_topic")
topic_dist.show()

# Bring the small aggregate into pandas and add each topic's share of all documents
topic_dist_pd = topic_dist.toPandas()
total_docs = topic_dist_pd['count'].sum()
topic_dist_pd['percentage'] = (topic_dist_pd['count'] / total_docs * 100).round(2)

print("\ndetailedTopicdistribution:")
_summary_rows = zip(
    topic_dist_pd['dominant_topic'],
    topic_dist_pd['count'],
    topic_dist_pd['percentage'],
)
for _raw_topic, _raw_count, pct in _summary_rows:
    topic_id, count = int(_raw_topic), int(_raw_count)

    # Label the topic with its three leading keywords when the id is a valid list position
    if 0 <= topic_id < len(topic_words):
        topic_keywords = ", ".join(topic_words[topic_id]['words'][:3])
    else:
        topic_keywords = "undefined"

    print(f"Topic {topic_id} ({topic_keywords}): {count:,} document ({pct}%)")


In [None]:
# Analyze the sentiment distribution within each topic
print("=== Topicsentiment analysis ===")

# Columns that look sentiment-related: a known score name, or any name containing "vader"
_known_sentiment_names = {'sentiment', 'compound_score', 'pos_score', 'neu_score', 'neg_score'}
sentiment_cols = [
    _name for _name in df_topics.columns
    if _name in _known_sentiment_names or 'vader' in _name.lower()
]

print(f"Found sentiment-related columns: {sentiment_cols}")

# Prefer the VADER compound score, then the original sentiment column; otherwise
# sentiment_col stays None
_score_candidates = (
    ('compound_score', "usingVADER compound scoreperformsentiment analysis"),
    ('sentiment', "using原始sentiment scoreperformsentiment analysis"),
)
sentiment_col = next(
    (_name for _name, _ in _score_candidates if _name in df_topics.columns),
    None,
)
if sentiment_col is None:
    print("❌ Sentiment score column not found，skipsentiment analysis")
else:
    print(dict(_score_candidates)[sentiment_col])

if sentiment_col:
    def classify_sentiment(score):
        """Map a numeric sentiment score to a label.

        Returns "Unknown" for a missing score (None or NaN), "Positive" above
        0.1, "Negative" below -0.1 and "Neutral" otherwise.
        """
        # NaN is the only value that differs from itself. Without this check a NaN
        # score failed both comparisons and was reported as "Neutral".
        if score is None or score != score:
            return "Unknown"
        if score > 0.1:
            return "Positive"
        if score < -0.1:
            return "Negative"
        return "Neutral"

    classify_sentiment_udf = F.udf(classify_sentiment, StringType())

    # Add the sentiment label column
    df_topic_sentiment = df_topics.withColumn(
        "sentiment_label",
        classify_sentiment_udf(F.col(sentiment_col))
    )

    # Document counts per (topic, sentiment label)
    topic_sentiment_dist = (
        df_topic_sentiment
        .groupBy("dominant_topic", "sentiment_label")
        .count()
        .orderBy("dominant_topic", "sentiment_label")
    )

    print("\neachTopic情感distribution:")
    topic_sentiment_dist.show()

    # Pivot to one row per topic and one column per label; absent combinations become 0
    topic_sentiment_pd = topic_sentiment_dist.toPandas()
    pivot_sentiment = topic_sentiment_pd.pivot(
        index='dominant_topic', columns='sentiment_label', values='count'
    ).fillna(0)

    # Row-wise shares in percent
    pivot_sentiment_pct = pivot_sentiment.div(pivot_sentiment.sum(axis=1), axis=0) * 100

    print("\neachTopic情感distribution比例(%)：")
    print(pivot_sentiment_pct.round(2))

    # Look keywords up by topic id instead of relying on the list position in topic_words
    _keywords_by_topic = {entry['topic_id']: entry['words'] for entry in topic_words}

    # Describe the sentiment profile of every topic
    print("\n=== Topic情感featureanalyze ===")
    for topic_id in range(NUM_TOPICS):
        if topic_id not in pivot_sentiment_pct.index:
            continue

        row = pivot_sentiment_pct.loc[topic_id]
        pos_pct = row.get('Positive', 0)
        neg_pct = row.get('Negative', 0)
        neu_pct = row.get('Neutral', 0)

        keywords = ", ".join(_keywords_by_topic.get(topic_id, [])[:5])

        # A topic leans one way only when that share leads by more than 10 points
        if pos_pct > neg_pct + 10:
            tendency = "偏Positive"
        elif neg_pct > pos_pct + 10:
            tendency = "偏Negative"
        else:
            tendency = "Neutral"

        print(f"\nTopic {topic_id} ({keywords}):")
        print(f"  Sentiment tendency: {tendency}")
        print(f"  Positive: {pos_pct:.1f}% | Neutral: {neu_pct:.1f}% | Negative: {neg_pct:.1f}%")

        # Polarization = gap between the positive and negative shares.
        # The middle band was previously labelled 'in', which is not a meaningful level.
        polarization = abs(pos_pct - neg_pct)
        if polarization > 20:
            polarization_level = 'High'
        elif polarization > 10:
            polarization_level = 'Medium'
        else:
            polarization_level = 'Low'
        print(f"  Sentiment polarization degree: {polarization:.1f}% ({polarization_level})")


In [None]:
# Save the topic modeling results
print("=== Save Results ===")

# 1. Save the per-document data with topic information
output_path = "/home/jovyan/work/data/processed/topic_analyzed_comments.parquet"

# Raw column names, WITHOUT backticks: they are compared against DataFrame.columns,
# which holds unquoted names. The previous "`subreddit.name`" entry never matched,
# so that column was silently dropped from the output.
columns_to_save = [
    "id", "subreddit.name", "timestamp", "cleaned_body",
    "dominant_topic", "topic_probability"
]

# df_topic_sentiment only exists when a sentiment column was found. Fall back to
# df_topics otherwise instead of failing with a NameError.
if sentiment_col:
    columns_to_save.extend([sentiment_col, "sentiment_label"])
    df_to_save = df_topic_sentiment
else:
    df_to_save = df_topics

# Keep only the columns that are actually present
available_cols = [name for name in columns_to_save if name in df_to_save.columns]

# Backtick-quote at selection time so a dot inside a column name is not parsed as
# struct field access
df_result = df_to_save.select([F.col(f"`{name}`") for name in available_cols])
df_result.write.mode("overwrite").parquet(output_path)

print(f"✅ Topicanalyzeresultsalreadysavingto: {output_path}")
print(f"Saved columns: {len(available_cols)}")
print(f"Saved records: {df_result.count():,}")

# 2. Save the topic keyword summary
import json

topic_summary = {
    "model_info": {
        "num_topics": NUM_TOPICS,
        "vocab_size": len(count_model.vocabulary),
        "total_documents": climate_count,
        "log_perplexity": lda_model.logPerplexity(df_tfidf),
        "log_likelihood": lda_model.logLikelihood(df_tfidf)
    },
    "topics": []
}

# dominant_topic -> document count, built once instead of filtering the frame per topic
_doc_counts = {
    int(_topic): int(_count)
    for _topic, _count in zip(topic_dist_pd['dominant_topic'], topic_dist_pd['count'])
}

for topic_data in topic_words:
    # Use the id stored with the topic rather than its position in the list
    topic_id = int(topic_data['topic_id'])
    topic_info = {
        "topic_id": topic_id,
        "keywords": topic_data['words'][:10],
        "weights": [float(w) for w in topic_data['weights'][:10]],
        "document_count": _doc_counts.get(topic_id, 0)
    }

    # Add the sentiment shares when sentiment data is available. pivot_sentiment_pct
    # only exists when sentiment_col is set, and the short-circuit `and` keeps it
    # from being evaluated otherwise.
    if sentiment_col and topic_id in pivot_sentiment_pct.index:
        _shares = pivot_sentiment_pct.loc[topic_id]
        topic_info["sentiment_distribution"] = {
            "positive": float(_shares.get('Positive', 0)),
            "neutral": float(_shares.get('Neutral', 0)),
            "negative": float(_shares.get('Negative', 0))
        }

    topic_summary["topics"].append(topic_info)

# Write the topic summary to a JSON file
summary_path = "/home/jovyan/work/data/processed/topic_summary.json"
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(topic_summary, f, ensure_ascii=False, indent=2)

print(f"✅ Topicsummaryalreadysavingto: {summary_path}")

print("\n=== LDA Topic Modelingcompleted! ===")
print(f"Identified {NUM_TOPICS} topics")
print(f"Processed {climate_count:,} climate-related comments")
print("Topicanalyzeresults可forsubsequent的classificationmodeling")
print("Topicanalyzeresults可forsubsequent的classificationmodeling")
