In [5]:
import networkx as nx
import pandas as pd
import numpy as np
from collections import defaultdict
import ast


In [6]:
def clique_percolation(G, k):
    """
    实现团渗透方法
    k: 团的大小
    """
    # 1. 找到所有k-cliques
    cliques = list(nx.find_cliques(G))
    k_cliques = [c for c in cliques if len(c) >= k]
    
    # 2. 构建clique重叠图
    clique_graph = nx.Graph()
    for i, c1 in enumerate(k_cliques):
        for j, c2 in enumerate(k_cliques[i+1:], i+1):
            if len(set(c1) & set(c2)) >= (k-1):  # 如果两个团共享k-1个节点
                clique_graph.add_edge(i, j)
    
    # 3. 找到连通分量（这些就是我们的社区）
    communities = []
    for component in nx.connected_components(clique_graph):
        community = set()
        for clique_id in component:
            community.update(k_cliques[clique_id])
        communities.append(community)
    
    return communities

In [7]:
def calculate_pairwise_accuracy(communities, G):
    total_pairs = 0
    correct_pairs = 0
    
    for community in communities:
        nodes = list(community)
        for i in range(len(nodes)):
            for j in range(i+1, len(nodes)):
                total_pairs += 1
                if G.nodes[nodes[i]]['in_playlist'] == G.nodes[nodes[j]]['in_playlist']:
                    correct_pairs += 1
    
    return correct_pairs / total_pairs if total_pairs > 0 else 0

In [8]:
def main():
    weeks = ['week1.csv', 'week2.csv', 'week3.csv']
    results = []
    
    for week_file in weeks:
        print(f"\n分析 {week_file}")
        
        # 加载数据
        df = pd.read_csv(week_file, index_col=0, low_memory=False)
        
        # 构建图
        G = nx.Graph()
        for idx, row in df.iterrows():
            G.add_node(idx, in_playlist=row['in_playlist'])
            if isinstance(row['collaborators'], str):
                collaborators = ast.literal_eval(row['collaborators'])
                for collaborator in collaborators.keys():
                    if collaborator in df.index:
                        G.add_edge(idx, collaborator)
        
        print(f"网络有 {G.number_of_nodes()} 个节点和 {G.number_of_edges()} 条边")
        
        # 对不同的k值进行团渗透
        for k in [3, 4]:  # k=3和k=4是常用的选择
            print(f"\n使用k={k}的团渗透方法")
            
            # 执行团渗透
            communities = clique_percolation(G, k)
            
            if communities:
                # 计算准确性
                accuracy = calculate_pairwise_accuracy(communities, G)
                
                # 记录结果
                results.append({
                    'week': week_file,
                    'k': k,
                    'n_communities': len(communities),
                    'accuracy': accuracy,
                    'avg_community_size': np.mean([len(c) for c in communities])
                })
                
                print(f"发现 {len(communities)} 个社区")
                print(f"平均社区大小: {results[-1]['avg_community_size']:.2f}")
                print(f"成对准确性: {accuracy:.3f}")
    
    # 保存结果
    results_df = pd.DataFrame(results)
    print("\n最终结果:")
    print(results_df)
    results_df.to_csv('community_percolation_results.csv')

In [None]:
if __name__ == "__main__":
    main()


分析 week1.csv
网络有 15126 个节点和 38651 条边

使用k=3的团渗透方法
发现 313 个社区
平均社区大小: 15.65
成对准确性: 0.928

使用k=4的团渗透方法
发现 131 个社区
平均社区大小: 10.34
成对准确性: 0.975

分析 week2.csv
网络有 15054 个节点和 38695 条边

使用k=3的团渗透方法
发现 310 个社区
平均社区大小: 15.78
成对准确性: 0.929

使用k=4的团渗透方法
发现 129 个社区
平均社区大小: 10.53
成对准确性: 0.976

分析 week3.csv
