In [24]:
#connect google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


1.谁最活跃

In [25]:
import pandas as pd

# 加载文件
articles_df = pd.read_csv('data/articles_field.csv')
authors_df = pd.read_csv('data/authors.schistosomiasis.csv')
paper_counts_df = pd.read_csv('data/paper_counts.csv')

# 研究活动分析
author_activity = authors_df.groupby('AuthorLastname').size().reset_index(name='PublicationCount')
top_active_authors = author_activity.sort_values(by='PublicationCount', ascending=False).head(20)

# 显示前20名活跃研究者
print("Top 20 Active Researchers in Schistosomiasis:")
print(top_active_authors)

FileNotFoundError: [Errno 2] No such file or directory: 'data/articles_field.csv'

In [None]:
 #可视化
 import matplotlib.pyplot as plt

 # 绘制活跃研究人员的柱状图
plt.figure(figsize=(10, 6))
plt.bar(top_active_authors['AuthorLastname'], top_active_authors['PublicationCount'])
plt.xticks(rotation=45, ha='right')
plt.title('Top 20 Active Researchers in Schistosomiasis')
plt.xlabel('Author Last Name')
plt.ylabel('Publication Count')
plt.show()

2.这些最活跃的与谁合作频率多

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 获取前20名活跃研究人员
author_activity = authors_df.groupby('AuthorLastname').size().reset_index(name='PublicationCount')
top_active_authors = author_activity.sort_values(by='PublicationCount', ascending=False).head(20)
top_authors = top_active_authors['AuthorLastname'].tolist()

# 筛选与前20名作者相关的合作数据
top_authors_data = authors_df[authors_df['AuthorLastname'].isin(top_authors)]

# 构建合作频率统计
cooperation = (
    top_authors_data.merge(top_authors_data, on='PMID')
    .query('AuthorLastname_x != AuthorLastname_y')
    .groupby(['AuthorLastname_x', 'AuthorLastname_y'])
    .size()
    .reset_index(name='CollaborationCount')
)

# 构建合作矩阵
cooperation_matrix = cooperation.pivot(
    index='AuthorLastname_x', columns='AuthorLastname_y', values='CollaborationCount'
).fillna(0)

# 绘制热力图
plt.figure(figsize=(12, 10))
sns.heatmap(
    cooperation_matrix,
    annot=False,
    cmap="Reds",
    square=True,
    cbar_kws={'label': 'Frequency of co-operation'}
)
plt.title("Collaboration heat map of top 20 researchers", fontsize=16)
plt.xlabel("researcher", fontsize=12)
plt.ylabel("researcher", fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(rotation=45, fontsize=10)
plt.tight_layout()
plt.show()

3. 低产但高质量研究者

In [None]:
# 统计每位作者的论文总数和总合作频率
author_stats = authors_df.groupby('AuthorLastname').size().reset_index(name='TotalPapers')
author_stats['TotalCollaborations'] = cooperation_matrix.sum(axis=1).reindex(author_stats['AuthorLastname']).fillna(0).values

# 定义低产研究者（如论文总数处于10%分位数以下）
low_output_threshold = author_stats['TotalPapers'].quantile(0.1)
low_output_authors = author_stats[author_stats['TotalPapers'] <= low_output_threshold]

# 识别高质量研究者（例如合作频率较高的低产作者）
high_quality_authors = low_output_authors[low_output_authors['TotalCollaborations'] > low_output_authors['TotalCollaborations'].mean()]

# 可视化结果
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.scatter(
    author_stats['TotalPapers'],
    author_stats['TotalCollaborations'],
    alpha=0.6, label='All Authors'
)
plt.scatter(
    high_quality_authors['TotalPapers'],
    high_quality_authors['TotalCollaborations'],
    color='red', label='High-Quality Low-Output Authors'
)
plt.axvline(low_output_threshold, color='green', linestyle='--', label='Low Output Threshold')
plt.xlabel('Total Papers')
plt.ylabel('Total Collaborations')
plt.title('Identifying High-Quality Low-Output Researchers')
plt.legend()
plt.show()