In [1]:
# 导入必要的库
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, IDF, VectorAssembler
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# 设置图表样式
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("库导入完成！")


库导入完成！


In [2]:
# 初始化Spark Session
spark = SparkSession.builder \
    .appName("TweetAnalysis_TopicModeling") \
    .master("local[*]") \
    .config("spark.driver.memory", "16g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

print(f"Spark Version: {spark.version}")
print(f"Available cores: {spark.sparkContext.defaultParallelism}")


Spark Version: 3.5.0
Available cores: 20


In [3]:
# 加载清洗后的数据和情感分析结果
sentiment_data_path = "/home/jovyan/work/data/processed/sentiment_analyzed_comments.parquet"

try:
    df_sentiment = spark.read.parquet(sentiment_data_path)
    df_sentiment.cache()
    record_count = df_sentiment.count()
    print(f"✅ 情感分析数据加载完成，共 {record_count:,} 条记录")
    
    print("\n数据结构:")
    df_sentiment.printSchema()
    
    # 检查tokens_cleaned列是否存在
    if 'tokens_cleaned' in df_sentiment.columns:
        print("\n✅ 找到tokens_cleaned列，可以直接进行主题建模")
    else:
        print("\n❌ 未找到tokens_cleaned列，需要重新分词")
        
except Exception as e:
    print(f"❌ 数据加载失败: {e}")
    print("尝试加载清洗后的数据...")
    
    # 备选方案：加载清洗后的数据
    cleaned_data_path = "/home/jovyan/work/data/processed/cleaned_comments.parquet"
    df_sentiment = spark.read.parquet(cleaned_data_path)
    df_sentiment.cache()
    record_count = df_sentiment.count()
    print(f"✅ 清洗后数据加载完成，共 {record_count:,} 条记录")


✅ 情感分析数据加载完成，共 459,171 条记录

数据结构:
root
 |-- id: string (nullable = true)
 |-- subreddit.name: string (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- body: string (nullable = true)
 |-- cleaned_body: string (nullable = true)
 |-- tokens_cleaned: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentiment: double (nullable = true)
 |-- vader_sentiment: double (nullable = true)
 |-- sentiment_category: string (nullable = true)
 |-- score: long (nullable = true)


✅ 找到tokens_cleaned列，可以直接进行主题建模


In [4]:
# 数据预处理：过滤和准备主题建模数据
print("=== 数据预处理 ===")

# 1. 过滤掉token数量过少的文档（提高主题质量）
df_filtered = df_sentiment.filter(
    F.size(F.col("tokens_cleaned")) >= 5  # 至少5个词
)

filtered_count = df_filtered.count()
print(f"过滤后数据量: {filtered_count:,} 条记录")
print(f"保留比例: {filtered_count/record_count*100:.1f}%")

# 2. 进一步过滤Climate Change相关关键词，确保主题相关性
climate_keywords = [
    'climate', 'warming', 'carbon', 'emission', 'greenhouse', 'temperature',
    'fossil', 'renewable', 'energy', 'pollution', 'environment', 'sustainability',
    'weather', 'ice', 'sea', 'level', 'drought', 'flood'
]

# 创建过滤条件：至少包含一个climate相关词汇
def contains_climate_keywords(tokens):
    if tokens is None:
        return False
    tokens_lower = [token.lower() for token in tokens]
    return any(keyword in tokens_lower for keyword in climate_keywords)

contains_climate_udf = F.udf(contains_climate_keywords, BooleanType())

df_climate = df_filtered.filter(
    contains_climate_udf(F.col("tokens_cleaned"))
)

climate_count = df_climate.count()
print(f"Climate相关评论: {climate_count:,} 条记录")
print(f"占过滤后数据: {climate_count/filtered_count*100:.1f}%")

# 缓存最终用于建模的数据
df_climate.cache()
print(f"\n最终用于主题建模的数据: {climate_count:,} 条记录")


=== 数据预处理 ===
过滤后数据量: 447,889 条记录
保留比例: 97.5%
Climate相关评论: 439,212 条记录
占过滤后数据: 98.1%

最终用于主题建模的数据: 439,212 条记录


In [5]:
# 构建词汇特征向量
print("=== 构建词汇特征向量 ===")

# 1. 采样数据以减少内存压力
print("1. 采样数据以减少计算负担...")
sample_fraction = 0.3  # 使用30%的数据进行主题建模
df_sample = df_climate.sample(fraction=sample_fraction, seed=42)
sample_count = df_sample.count()
print(f"采样后数据量: {sample_count:,} 条记录 (原数据的 {sample_fraction*100}%)")

# 2. CountVectorizer：将tokens转换为词频向量
# 设置更小的词汇表大小和更高的最小词频，避免内存问题
count_vectorizer = CountVectorizer(
    inputCol="tokens_cleaned", 
    outputCol="raw_features",
    vocabSize=2000,  # 减少词汇表大小
    minDF=10.0       # 提高最小文档频率
)

print("2. 训练CountVectorizer...")
count_model = count_vectorizer.fit(df_sample)
df_vectorized = count_model.transform(df_sample)

# 3. TF-IDF：计算词汇重要性权重
idf = IDF(inputCol="raw_features", outputCol="features")
print("3. 训练IDF...")
idf_model = idf.fit(df_vectorized)
df_tfidf = idf_model.transform(df_vectorized)

print(f"词汇表大小: {len(count_model.vocabulary)}")
print(f"特征向量维度: {len(count_model.vocabulary)}")

# 显示词汇表中的一些示例词汇
print("\n词汇表示例（前20个词）:")
for i, word in enumerate(count_model.vocabulary[:20]):
    print(f"{i}: {word}")

# 更新climate_count为采样后的数量
climate_count = sample_count


=== 构建词汇特征向量 ===
1. 采样数据以减少计算负担...
采样后数据量: 132,035 条记录 (原数据的 30.0%)
2. 训练CountVectorizer...
3. 训练IDF...
词汇表大小: 2000
特征向量维度: 2000

词汇表示例（前20个词）:
0: climate
1: change
2: people
3: like
4: re
5: think
6: one
7: even
8: .
9: m
10: get
11: change.
12: also
13: going
14: us
15: much
16: make
17: world
18: need
19: global


In [6]:
# LDA主题建模
print("=== LDA主题建模 ===")

# 设置主题数量（减少主题数量以适应较小的数据集）
NUM_TOPICS = 5  # 减少主题数量

# 创建LDA模型
lda = LDA(
    featuresCol="features", 
    topicsCol="topic_distribution",
    k=NUM_TOPICS,
    maxIter=10,  # 减少迭代次数以加快训练
    seed=42
)

print(f"开始训练LDA模型（{NUM_TOPICS}个主题）...")
lda_model = lda.fit(df_tfidf)

print("\n✅ LDA模型训练完成！")
print(f"模型困惑度: {lda_model.logPerplexity(df_tfidf):.2f}")
print(f"模型对数似然: {lda_model.logLikelihood(df_tfidf):.2f}")


=== LDA主题建模 ===


TypeError: LDA.__init__() got an unexpected keyword argument 'topicsCol'

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 49896)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

In [None]:
# 提取和分析主题关键词
print("=== 主题关键词分析 ===")

# 获取每个主题的关键词
topics = lda_model.describeTopics(maxTermsPerTopic=15)
vocabulary = count_model.vocabulary

def get_topic_words(topic_data):
    """将主题的词汇索引转换为实际词汇"""
    topics_list = []
    
    for row in topic_data.collect():
        topic_id = row['topic']
        term_indices = row['termIndices']
        term_weights = row['termWeights']
        
        # 转换索引为词汇
        words = [vocabulary[idx] for idx in term_indices]
        
        topics_list.append({
            'topic_id': topic_id,
            'words': words,
            'weights': term_weights
        })
    
    return topics_list

topic_words = get_topic_words(topics)

# 显示每个主题的关键词
print("\n=== 主题关键词列表 ===")
for topic in topic_words:
    topic_id = topic['topic_id']
    words = topic['words'][:10]  # 显示前10个关键词
    weights = topic['weights'][:10]
    
    print(f"\n🔍 主题 {topic_id}:")
    for word, weight in zip(words, weights):
        print(f"  {word}: {weight:.4f}")
    
    # 生成主题描述（基于关键词）
    key_terms = ", ".join(words[:5])
    print(f"  💡 关键术语: {key_terms}")


In [None]:
# 为文档分配主题
print("=== 文档主题分配 ===")

# 转换文档，获得主题分布
df_topics = lda_model.transform(df_tfidf)

# 为每个文档分配主导主题（概率最高的主题）
def get_dominant_topic(topic_distribution):
    """获取主导主题ID"""
    if topic_distribution is None:
        return -1
    return int(np.argmax(topic_distribution.toArray()))

def get_topic_probability(topic_distribution):
    """获取主导主题的概率"""
    if topic_distribution is None:
        return 0.0
    return float(np.max(topic_distribution.toArray()))

get_dominant_topic_udf = F.udf(get_dominant_topic, IntegerType())
get_topic_prob_udf = F.udf(get_topic_probability, DoubleType())

# 添加主导主题列
df_topics = df_topics.withColumn(
    "dominant_topic", 
    get_dominant_topic_udf(F.col("topic_distribution"))
).withColumn(
    "topic_probability",
    get_topic_prob_udf(F.col("topic_distribution"))
)

print("✅ 主题分配完成")

# 显示主题分布统计
print("\n主题分布统计:")
topic_dist = df_topics.groupBy("dominant_topic").count().orderBy("dominant_topic")
topic_dist.show()

# 转换为Pandas进行更详细的分析
topic_dist_pd = topic_dist.toPandas()
total_docs = topic_dist_pd['count'].sum()
topic_dist_pd['percentage'] = (topic_dist_pd['count'] / total_docs * 100).round(2)

print("\n详细主题分布:")
for _, row in topic_dist_pd.iterrows():
    topic_id = int(row['dominant_topic'])
    count = int(row['count'])
    pct = row['percentage']
    
    # 获取主题关键词
    if topic_id >= 0 and topic_id < len(topic_words):
        topic_keywords = ", ".join(topic_words[topic_id]['words'][:3])
    else:
        topic_keywords = "未定义"
    
    print(f"主题 {topic_id} ({topic_keywords}): {count:,} 文档 ({pct}%)")


In [None]:
# 分析每个主题下的情感分布
print("=== 主题情感分析 ===")

# 检查是否有VADER情感分数
sentiment_cols = [col for col in df_topics.columns if 'vader' in col.lower() or 
                 col in ['sentiment', 'compound_score', 'pos_score', 'neu_score', 'neg_score']]

print(f"找到的情感相关列: {sentiment_cols}")

# 如果有VADER分数，使用VADER；否则使用原始sentiment
if 'compound_score' in df_topics.columns:
    sentiment_col = 'compound_score'
    print("使用VADER compound score进行情感分析")
elif 'sentiment' in df_topics.columns:
    sentiment_col = 'sentiment'
    print("使用原始sentiment score进行情感分析")
else:
    print("❌ 未找到情感分数列，跳过情感分析")
    sentiment_col = None

if sentiment_col:
    # 定义情感分类函数
    def classify_sentiment(score):
        if score is None:
            return "未知"
        elif score > 0.1:
            return "积极"
        elif score < -0.1:
            return "消极"
        else:
            return "中性"
    
    classify_sentiment_udf = F.udf(classify_sentiment, StringType())
    
    # 添加情感分类列
    df_topic_sentiment = df_topics.withColumn(
        "sentiment_label",
        classify_sentiment_udf(F.col(sentiment_col))
    )
    
    # 计算每个主题的情感分布
    topic_sentiment_dist = df_topic_sentiment.groupBy("dominant_topic", "sentiment_label").count().orderBy("dominant_topic", "sentiment_label")
    
    print("\n各主题情感分布:")
    topic_sentiment_dist.show()
    
    # 转换为透视表格式便于分析
    topic_sentiment_pd = topic_sentiment_dist.toPandas()
    pivot_sentiment = topic_sentiment_pd.pivot(index='dominant_topic', columns='sentiment_label', values='count').fillna(0)
    
    # 计算比例
    pivot_sentiment_pct = pivot_sentiment.div(pivot_sentiment.sum(axis=1), axis=0) * 100
    
    print("\n各主题情感分布比例(%)：")
    print(pivot_sentiment_pct.round(2))
    
    # 分析每个主题的情感特征
    print("\n=== 主题情感特征分析 ===")
    for topic_id in range(NUM_TOPICS):
        if topic_id in pivot_sentiment_pct.index:
            row = pivot_sentiment_pct.loc[topic_id]
            pos_pct = row.get('积极', 0)
            neg_pct = row.get('消极', 0)
            neu_pct = row.get('中性', 0)
            
            # 获取主题关键词
            keywords = ", ".join(topic_words[topic_id]['words'][:5])
            
            # 判断主题情感倾向
            if pos_pct > neg_pct + 10:
                tendency = "偏积极"
            elif neg_pct > pos_pct + 10:
                tendency = "偏消极"
            else:
                tendency = "中性"
            
            print(f"\n主题 {topic_id} ({keywords}):")
            print(f"  情感倾向: {tendency}")
            print(f"  积极: {pos_pct:.1f}% | 中性: {neu_pct:.1f}% | 消极: {neg_pct:.1f}%")
            
            # 计算情感极化程度
            polarization = abs(pos_pct - neg_pct)
            print(f"  情感极化程度: {polarization:.1f}% ({'高' if polarization > 20 else '中' if polarization > 10 else '低'})")


In [None]:
# 保存主题建模结果
print("=== 保存结果 ===")

# 1. 保存带主题信息的数据
output_path = "/home/jovyan/work/data/processed/topic_analyzed_comments.parquet"

# 选择需要保存的列
columns_to_save = [
    "id", "`subreddit.name`", "timestamp", "cleaned_body", 
    "dominant_topic", "topic_probability"
]

# 如果有情感分析结果，也包含进去
if sentiment_col:
    columns_to_save.extend([sentiment_col, "sentiment_label"])

# 确保列存在再保存
available_cols = [col for col in columns_to_save if col in df_topic_sentiment.columns]

df_result = df_topic_sentiment.select(available_cols)
df_result.write.mode("overwrite").parquet(output_path)

print(f"✅ 主题分析结果已保存到: {output_path}")
print(f"保存列数: {len(available_cols)}")
print(f"保存记录数: {df_result.count():,}")

# 2. 保存主题关键词总结
import json

topic_summary = {
    "model_info": {
        "num_topics": NUM_TOPICS,
        "vocab_size": len(count_model.vocabulary),
        "total_documents": climate_count,
        "log_perplexity": lda_model.logPerplexity(df_tfidf),
        "log_likelihood": lda_model.logLikelihood(df_tfidf)
    },
    "topics": []
}

for topic_id, topic_data in enumerate(topic_words):
    topic_info = {
        "topic_id": topic_id,
        "keywords": topic_data['words'][:10],
        "weights": [float(w) for w in topic_data['weights'][:10]],
        "document_count": int(topic_dist_pd[topic_dist_pd['dominant_topic'] == topic_id]['count'].iloc[0]) if topic_id in topic_dist_pd['dominant_topic'].values else 0
    }
    
    # 如果有情感数据，添加情感分布
    if sentiment_col and topic_id in pivot_sentiment_pct.index:
        topic_info["sentiment_distribution"] = {
            "positive": float(pivot_sentiment_pct.loc[topic_id].get('积极', 0)),
            "neutral": float(pivot_sentiment_pct.loc[topic_id].get('中性', 0)),
            "negative": float(pivot_sentiment_pct.loc[topic_id].get('消极', 0))
        }
    
    topic_summary["topics"].append(topic_info)

# 保存主题总结到JSON文件
summary_path = "/home/jovyan/work/data/processed/topic_summary.json"
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(topic_summary, f, ensure_ascii=False, indent=2)

print(f"✅ 主题总结已保存到: {summary_path}")

print("\n=== LDA主题建模完成！ ===")
print(f"共识别出 {NUM_TOPICS} 个主题")
print(f"处理了 {climate_count:,} 条climate相关评论")
print("主题分析结果可用于后续的分类建模")
