# 02 互动网络分析

目标：构建回复/引用网络，评估中心性、社区结构与地理属性。当前版本仅实现了回复网络的构建与度中心性（degree centrality）的计算；引用网络、社区结构与地理属性分析尚未包含在本 notebook 中（TODO）。

In [1]:
import sys
from pathlib import Path

# Put the project root at the front of sys.path so that project packages
# (such as `src`) can be imported from this notebook. The membership check
# keeps the cell idempotent: re-running it never inserts a duplicate entry.
# NOTE(review): the root is a hard-coded absolute path — confirm it matches
# the environment this notebook is executed in.
project_root = Path('/workspace')
if sys.path.count(str(project_root)) == 0:
    sys.path.insert(0, str(project_root))

print(f"✅ Python 路径已配置: {project_root}")

✅ Python 路径已配置: /workspace


## 步骤 1: 加载已处理的数据

In [2]:
from src import analysis
import polars as pl
import networkx as nx
from pathlib import Path

# Load the already-processed (enriched) tweet table from parquet.
df = pl.read_parquet("../parquet/tweets_enriched.parquet")
total_rows = df.height
print(f"📊 数据加载完成: {total_rows:,} 行")

# Count reply tweets. `filter` keeps only rows where the predicate is True,
# so rows with a null `isReply` are not counted.
reply_count = df.filter(pl.col('isReply').eq(True)).height

# Guard the percentage so that an empty table reports 0.0% instead of
# raising ZeroDivisionError.
reply_share = reply_count / total_rows * 100 if total_rows else 0.0
print(f"  其中回复推文: {reply_count:,} 条 ({reply_share:.1f}%)")

📊 数据加载完成: 508,954 行
  其中回复推文: 241,881 条 (47.5%)


## 步骤 2: 构建互动网络边列表

In [3]:
# Build the reply edge list: tweet author -> user being replied to.
# Only reply tweets are passed to the projection helper. Judging by how the
# result is used below, it yields the two endpoint columns plus a `weight`
# column — NOTE(review): confirm the exact aggregation in
# `analysis.prepare_network_projection`.
reply_tweets = df.filter(pl.col('isReply').eq(True))
reply_edges = analysis.prepare_network_projection(
    reply_tweets,
    source_col='pseudo_author_userName',
    target_col='pseudo_inReplyToUsername',
)

# Summary statistics of the edge list.
edge_total = reply_edges.height
author_total = reply_edges['pseudo_author_userName'].n_unique()
target_total = reply_edges['pseudo_inReplyToUsername'].n_unique()

print("📈 回复网络边列表:")
print(f"  边数: {edge_total:,}")
print(f"  唯一作者: {author_total:,}")
print(f"  唯一目标: {target_total:,}")

# Show the five heaviest edges (largest `weight` first).
heaviest_edges = reply_edges.sort('weight', descending=True).head(5)
print("\n互动最频繁的连接 (top 5):")
print(heaviest_edges)

📈 回复网络边列表:
  边数: 211,883
  唯一作者: 131,592
  唯一目标: 73,599

互动最频繁的连接 (top 5):
shape: (5, 3)
┌────────────────────────┬──────────────────────────┬────────┐
│ pseudo_author_userName ┆ pseudo_inReplyToUsername ┆ weight │
│ ---                    ┆ ---                      ┆ ---    │
│ i64                    ┆ i64                      ┆ u32    │
╞════════════════════════╪══════════════════════════╪════════╡
│ 515433734353119        ┆ 486555417218587          ┆ 204    │
│ 515433734353119        ┆ 370567833381860          ┆ 162    │
│ 515433734353119        ┆ 993212410226533          ┆ 109    │
│ 515433734353119        ┆ 46800858424560           ┆ 81     │
│ 515433734353119        ┆ 888581008844448          ┆ 65     │
└────────────────────────┴──────────────────────────┴────────┘


## 步骤 3: 构建 NetworkX 图并计算中心性

In [4]:
# Build a directed reply graph: author -> replied-to user, with the edge
# list's `weight` column stored as the `weight` edge attribute.
G = nx.DiGraph()

# Bulk-insert the edges as (source, target, weight) tuples instead of calling
# `add_edge` once per row; selecting the columns explicitly fixes the tuple
# order regardless of the edge list's column order.
G.add_weighted_edges_from(
    reply_edges.select(
        ['pseudo_author_userName', 'pseudo_inReplyToUsername', 'weight']
    ).iter_rows(),
    weight='weight',
)

node_total = G.number_of_nodes()
# Mean of the total (in + out) degree over all nodes; guarded so that an
# empty graph reports 0.00 instead of raising ZeroDivisionError.
mean_degree = sum(deg for _, deg in G.degree()) / node_total if node_total else 0.0

print(f"🕸️ 网络图构建完成:")
print(f"  节点数: {node_total:,}")
print(f"  边数: {G.number_of_edges():,}")
print(f"  平均度数: {mean_degree:.2f}")

# Degree centrality for every node, then the ten highest-scoring nodes.
# `degree_centrality` is kept as a notebook-level name because the save step
# later in the notebook reads it.
degree_centrality = nx.degree_centrality(G)
top_degree = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]

print(f"\n📊 度中心性 Top 10:")
for i, (node, score) in enumerate(top_degree, 1):
    print(f"  {i}. {node}: {score:.4f}")

🕸️ 网络图构建完成:
  节点数: 189,369
  边数: 211,883
  平均度数: 2.24

📊 度中心性 Top 10:
  1. 515433734353119: 0.0909
  2. 845325022118434: 0.0125
  3. 922889366111035: 0.0063
  4. 100322182103332: 0.0057
  5. 302630543166588: 0.0056
  6. 26140580241844: 0.0055
  7. 492496883932672: 0.0054
  8. 268699992546200: 0.0051
  9. 318252820326635: 0.0050
  10. 659998184746904: 0.0046


## 步骤 4: 保存网络数据

In [5]:
from src import io

# NOTE: `io` in this cell is the project's `src.io` module imported at the top
# of the cell, not the standard-library `io`.

# Persist the reply edge list as parquet.
edges_path = Path("../parquet/network_edges.parquet")
io.materialize_parquet(reply_edges.lazy(), edges_path)
print(f"✅ 网络边列表已保存: {edges_path}")

# Persist the per-node degree centrality as a two-column table
# (`node`, `degree_centrality`); keys and values come from the same dict, so
# the two lists stay aligned.
centrality_path = Path("../parquet/network_centrality.parquet")
centrality_df = pl.DataFrame({
    'node': list(degree_centrality),
    'degree_centrality': list(degree_centrality.values()),
})
io.materialize_parquet(centrality_df.lazy(), centrality_path)
print(f"✅ 中心性指标已保存: {centrality_path}")

# List the parquet files reported by the project's I/O helper.
print("\n📂 生成的文件:")
for f in io.list_parquet_files():
    print(f"  - {f}")

✅ 网络边列表已保存: ../parquet/network_edges.parquet
✅ 中心性指标已保存: ../parquet/network_centrality.parquet

📂 生成的文件:
  - /workspace/src/notebooks/parquet/author_profiling.parquet
  - /workspace/src/notebooks/parquet/content_analysis.parquet
  - /workspace/src/notebooks/parquet/content_analysis_legacy.parquet
  - /workspace/src/notebooks/parquet/emotion_evolution.parquet
  - /workspace/src/notebooks/parquet/narrative_evolution.parquet
  - /workspace/src/notebooks/parquet/narrative_hourly.parquet
  - /workspace/src/notebooks/parquet/network_centrality.parquet
  - /workspace/src/notebooks/parquet/network_edges.parquet
  - /workspace/src/notebooks/parquet/top_50_influencers.parquet
  - /workspace/src/notebooks/parquet/topic_distribution.parquet
  - /workspace/src/notebooks/parquet/tweets_anomalies.parquet
  - /workspace/src/notebooks/parquet/tweets_daily.parquet
  - /workspace/src/notebooks/parquet/tweets_enriched.parquet
  - /workspace/src/notebooks/parquet/tweets_hourly.parquet
  - /workspace/src/no

## ✅ 网络分析完成！

数据已准备好用于 dashboard 可视化。