In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict
from statsmodels.stats.proportion import proportions_ztest

def compute_stats(authors_pkl, papers_pkl, citations_pkl, paper_authors_pkl):
    """
    Compute per-author citation and self-citation statistics from pickled tables.

    A citation adds one to ``total_citations`` of every author of the cited
    paper. It additionally adds one to ``self_citations`` of every author who
    is on both the citing and the cited paper. Citations whose citing or cited
    paper has no entry in the paper-author table are skipped.

    Parameters
    ----------
    authors_pkl
        Pickled DataFrame with at least the columns ``id`` and ``name``.
    papers_pkl
        Kept so existing callers keep working. None of the statistics need the
        papers table, so the file is not loaded.
    citations_pkl
        Pickled DataFrame with at least the columns ``citing_paper_id`` and
        ``cited_paper_id``.
    paper_authors_pkl
        Pickled DataFrame with at least the columns ``paper_id`` and
        ``author_id``.

    Returns
    -------
    (df_merged, df_filtered)
        ``df_merged`` has the columns ``author_id``, ``total_citations``,
        ``self_citations``, ``self_citation_ratio``, ``paper_count`` and
        ``name`` for the authors found in the paper-author table; authors
        without any counted citation get 0 in the three citation columns.
        ``df_filtered`` is a copy of the rows with ``paper_count >= 3``.
    """
    # ======== 1) Read data ========
    # NOTE: unpickling can execute arbitrary code; only pass paths to files
    # that come from a trusted source.
    df_authors = pd.read_pickle(authors_pkl)
    df_citations = pd.read_pickle(citations_pkl)
    df_paper_authors = pd.read_pickle(paper_authors_pkl)

    # ======== 2) Build mapping from paper_id to set of author_ids ========
    paper_to_authors = df_paper_authors.groupby('paper_id')['author_id'].apply(set).to_dict()

    # ======== 3) Count total citations & self-citations for each author ========
    author_total_citations = defaultdict(int)
    author_self_citations = defaultdict(int)

    # Walk the two id columns directly instead of using iterrows(): iterrows()
    # packs every row into a single Series, which upcasts integer ids to float
    # as soon as the table holds any float column (large ids then stop matching
    # the mapping keys), and it is far slower.
    citation_pairs = zip(df_citations['citing_paper_id'],
                         df_citations['cited_paper_id'])
    for citing_pid, cited_pid in citation_pairs:
        citing_authors = paper_to_authors.get(citing_pid)
        cited_authors = paper_to_authors.get(cited_pid)
        if citing_authors is None or cited_authors is None:
            # At least one side has no known authors: nothing to attribute.
            continue

        # Every author of the cited paper receives one citation.
        for a in cited_authors:
            author_total_citations[a] += 1

        # Authors present on both papers are citing themselves.
        for a in citing_authors & cited_authors:
            author_self_citations[a] += 1

    # Build stats DataFrame (one row per author that received a citation)
    cited_author_ids = list(author_total_citations)
    df_citation_stats = pd.DataFrame({
        'author_id':       cited_author_ids,
        'total_citations': [author_total_citations[a] for a in cited_author_ids],
        'self_citations':  [author_self_citations.get(a, 0) for a in cited_author_ids],
    })
    # With no counted citations the columns would otherwise be of object dtype.
    df_citation_stats = df_citation_stats.astype(
        {'total_citations': 'int64', 'self_citations': 'int64'}
    )

    # ======== 4) Compute self-citation ratio ========
    df_citation_stats['self_citation_ratio'] = (
        df_citation_stats['self_citations'] / df_citation_stats['total_citations']
    ).replace([np.inf, np.nan], 0)

    # ======== 5) Count paper number for each author ========
    df_author_paper_count = (
        df_paper_authors.groupby('author_id')['paper_id']
        .nunique()
        .reset_index(name='paper_count')
    )

    # ======== 6) Merge data (outer to include authors with zero citations) ========
    df_merged = pd.merge(df_citation_stats,
                         df_author_paper_count,
                         on='author_id', how='outer')
    # Authors that never received a counted citation have no stats row: use 0.
    citation_cols = ['total_citations', 'self_citations', 'self_citation_ratio']
    df_merged[citation_cols] = df_merged[citation_cols].fillna(0)

    # Merge in author names (left join: authors without a name entry are kept)
    df_merged = pd.merge(
        df_merged,
        df_authors[['id', 'name']],
        left_on='author_id', right_on='id',
        how='left'
    ).drop(columns=['id'])

    # ======== 7) Filter authors with at least 3 papers ========
    df_filtered = df_merged[df_merged['paper_count'] >= 3].copy()

    return df_merged, df_filtered

# ====== Input files: replace these paths with your local copies ======
# Dataset reported as "Predatory" in the output below.
(bad_authors_pkl,
 bad_papers_pkl,
 bad_citations_pkl,
 bad_paper_authors_pkl) = (
    'data/pred/dataset/pkl/authors.pkl',
    'data/pred/dataset/pkl/papers.pkl',
    'data/pred/dataset/pkl/citations.pkl',
    'data/pred/dataset/pkl/paper_authors.pkl',
)

# Dataset reported as "Regular" in the output below.
(good_authors_pkl,
 good_papers_pkl,
 good_citations_pkl,
 good_paper_authors_pkl) = (
    'data/norm/dataset/pkl/authors.pkl',
    'data/norm/dataset/pkl/papers.pkl',
    'data/norm/dataset/pkl/citations.pkl',
    'data/norm/dataset/pkl/paper_authors.pkl',
)

# Compute the per-author statistics; only the filtered (>=3 papers) frame is used.
_, df_filtered_bad = compute_stats(
    authors_pkl=bad_authors_pkl,
    papers_pkl=bad_papers_pkl,
    citations_pkl=bad_citations_pkl,
    paper_authors_pkl=bad_paper_authors_pkl,
)
_, df_filtered_good = compute_stats(
    authors_pkl=good_authors_pkl,
    papers_pkl=good_papers_pkl,
    citations_pkl=good_citations_pkl,
    paper_authors_pkl=good_paper_authors_pkl,
)

# Sanity check on the group sizes.
print("Predatory authors (>=3 papers):", len(df_filtered_bad))  # expected: 1384
print("Regular   authors (>=3 papers):", len(df_filtered_good)) # expected: 1830

# Flag each author: 1 when the self-citation ratio is positive, otherwise 0.
df_filtered_bad['has_self'] = df_filtered_bad['self_citation_ratio'].gt(0).astype(int)
df_filtered_good['has_self'] = df_filtered_good['self_citation_ratio'].gt(0).astype(int)

# Group sizes and number of flagged authors per group.
n_bad = df_filtered_bad.shape[0]
n_good = df_filtered_good.shape[0]
count_bad = df_filtered_bad['has_self'].sum()
count_good = df_filtered_good['has_self'].sum()

# Two-sample proportion z-test; 'larger' makes the alternative
# "proportion of the first group > proportion of the second group".
z_stat, p_val = proportions_ztest(
    count=[count_bad, count_good],
    nobs=[n_bad, n_good],
    alternative='larger',
)

# Report the per-group proportions and the test result.
print(f"Predatory: {count_bad}/{n_bad} = {count_bad/n_bad:.2%}")
print(f"Regular:   {count_good}/{n_good} = {count_good/n_good:.2%}")
print(f"Z-test: z = {z_stat:.3f}, p = {p_val:.5f}")


Predatory authors (>=3 papers): 1384
Regular   authors (>=3 papers): 1830
Predatory: 380/1384 = 27.46%
Regular:   324/1830 = 17.70%
Z-test: z = 6.619, p = 0.00000
