This notebook creates the following tables:

- publications
- zenodo works
- people on publications
- people on zenodo works
- citations

## Imports

In [1]:
import pandas as pd
import pandas as pd
import os
from pathlib import Path
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders
from pyalex import config
from tqdm import tqdm 
import json
import ast

In [2]:
from pyalex import config

config.email = "rde6mn@virginia.edu"
config.max_retries = 5
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]

In [3]:
pub_tracker = pd.read_excel('Data\Preliminary Work Data\publication_tracker_icor_250916.xlsx')

  pub_tracker = pd.read_excel('Data\Preliminary Work Data\publication_tracker_icor_250916.xlsx')


In [5]:
zenodo = pd.read_excel('Data\Preliminary Work Data\zenodo_dois.xlsx')

  zenodo = pd.read_excel('Data\Preliminary Work Data\zenodo_dois.xlsx')


## Get openalex id for work and lists of names, open alex ids, and orcids for people on publications with a doi_pub (for published works)

In [6]:
import pandas as pd
import numpy as np

# Initialize your new columns
pub_tracker['OpenAlex_ID'] = np.nan
pub_tracker['ListofAlexIds'] = [[] for _ in range(len(pub_tracker))]
pub_tracker['ListofNames'] = [[] for _ in range(len(pub_tracker))]
pub_tracker['ListofOrcids'] = [[] for _ in range(len(pub_tracker))]

# Iterate through the DOIs
for idx, row in pub_tracker.iterrows():
    doi_suffix = row['doi_pub']
    if pd.notna(doi_suffix):
        try:
            doi_url = f"https://doi.org/{doi_suffix}"
            work_json = Works()[doi_url]

            # Extract OpenAlex ID
            work_id = work_json.get('id', '')
            if work_id.startswith("https://openalex.org/"):
                pub_tracker.at[idx, 'OpenAlex_ID'] = work_id.replace("https://openalex.org/", "")

            # Extract authorship information
            alex_ids = []
            names = []
            orcids = []

            for author_info in work_json.get('authorships', []):
                author = author_info.get('author', {})

                # Extract and clean author ID
                author_id = author.get('id', '')
                if author_id.startswith("https://openalex.org/"):
                    alex_ids.append(author_id.replace("https://openalex.org/", ""))
                else:
                    alex_ids.append(None)

                # Extract name
                names.append(author.get('display_name', None))

                # Extract and clean ORCID
                orcid = author.get('orcid', '')
                if orcid and orcid.startswith("https://orcid.org/"):
                    orcids.append(orcid.replace("https://orcid.org/", ""))
                else:
                    orcids.append(None)

            pub_tracker.at[idx, 'ListofAlexIds'] = alex_ids
            pub_tracker.at[idx, 'ListofNames'] = names
            pub_tracker.at[idx, 'ListofOrcids'] = orcids

        except Exception as e:
            print(f"Error processing DOI {doi_suffix}: {e}")


  pub_tracker.at[idx, 'OpenAlex_ID'] = work_id.replace("https://openalex.org/", "")


Error processing DOI 10.1177/2515256423116249: 404 Client Error: Not Found for url: https://api.openalex.org/works/https%3A%2F%2Fdoi.org%2F10.1177%2F2515256423116249


## Get openalex id for work and lists of names, open alex ids, and orcids for people on publications with a doi_pre (for preprint works)

In [10]:
import pandas as pd
import numpy as np

# Initialize your new columns
pub_tracker['Pre_OpenAlex_ID'] = np.nan
pub_tracker['Pre_ListofAlexIds'] = [[] for _ in range(len(pub_tracker))]
pub_tracker['Pre_ListofNames'] = [[] for _ in range(len(pub_tracker))]
pub_tracker['Pre_ListofOrcids'] = [[] for _ in range(len(pub_tracker))]

# Iterate through the DOIs only for rows where OpenAlex_ID is missing
for idx, row in pub_tracker.iterrows():
    if pd.isna(row.get('OpenAlex_ID')) or row.get('OpenAlex_ID') == "":
        doi_suffix = row.get('doi_pre')
        if pd.notna(doi_suffix):
            try:
                doi_url = f"https://doi.org/{doi_suffix}"
                work_json = Works()[doi_url]

                # Extract OpenAlex ID
                work_id = work_json.get('id', '')
                if work_id.startswith("https://openalex.org/"):
                    pub_tracker.at[idx, 'Pre_OpenAlex_ID'] = work_id.replace("https://openalex.org/", "")

                # Extract authorship information
                alex_ids = []
                names = []
                orcids = []

                for author_info in work_json.get('authorships', []):
                    author = author_info.get('author', {})

                    # Extract and clean author ID
                    author_id = author.get('id', '')
                    if author_id.startswith("https://openalex.org/"):
                        alex_ids.append(author_id.replace("https://openalex.org/", ""))
                    else:
                        alex_ids.append(None)

                    # Extract name
                    names.append(author.get('display_name', None))

                    # Extract and clean ORCID
                    orcid = author.get('orcid', '')
                    if orcid and orcid.startswith("https://orcid.org/"):
                        orcids.append(orcid.replace("https://orcid.org/", ""))
                    else:
                        orcids.append(None)

                pub_tracker.at[idx, 'Pre_ListofAlexIds'] = alex_ids
                pub_tracker.at[idx, 'Pre_ListofNames'] = names
                pub_tracker.at[idx, 'Pre_ListofOrcids'] = orcids

            except Exception as e:
                print(f"Error processing DOI {doi_suffix}: {e}")


  pub_tracker.at[idx, 'Pre_OpenAlex_ID'] = work_id.replace("https://openalex.org/", "")


Error processing DOI NA_review: 404 Client Error: Not Found for url: https://api.openalex.org/works/https%3A%2F%2Fdoi.org%2FNA_review


## Merge publication and preprint information

In [13]:
# Merge OpenAlex_ID columns (scalar values)
pub_tracker['OpenAlex_ID'] = pub_tracker['OpenAlex_ID'].combine_first(pub_tracker['Pre_OpenAlex_ID'])

# Merge list-based columns
pub_tracker['ListofAlexIds'] = pub_tracker.apply(
    lambda row: row['ListofAlexIds'] if row['ListofAlexIds'] else row['Pre_ListofAlexIds'], axis=1)

pub_tracker['ListofNames'] = pub_tracker.apply(
    lambda row: row['ListofNames'] if row['ListofNames'] else row['Pre_ListofNames'], axis=1)

pub_tracker['ListofOrcids'] = pub_tracker.apply(
    lambda row: row['ListofOrcids'] if row['ListofOrcids'] else row['Pre_ListofOrcids'], axis=1)


## Get openalex id for work and lists of names, open alex ids, and orcids for zenodo works

In [14]:
import pandas as pd
import numpy as np

# Initialize your new columns
zenodo['OpenAlex_ID'] = np.nan
zenodo['ListofAlexIds'] = [[] for _ in range(len(zenodo))]
zenodo['ListofNames'] = [[] for _ in range(len(zenodo))]
zenodo['ListofOrcids'] = [[] for _ in range(len(zenodo))]

# Iterate through the DOIs
for idx, row in zenodo.iterrows():
    doi_suffix = row['pids.doi.identifier']
    if pd.notna(doi_suffix):
        try:
            doi_url = f"https://doi.org/{doi_suffix}"
            work_json = Works()[doi_url]

            # Extract OpenAlex ID
            work_id = work_json.get('id', '')
            if work_id.startswith("https://openalex.org/"):
                zenodo.at[idx, 'OpenAlex_ID'] = work_id.replace("https://openalex.org/", "")

            # Extract authorship information
            alex_ids = []
            names = []
            orcids = []

            for author_info in work_json.get('authorships', []):
                author = author_info.get('author', {})

                # Extract and clean author ID
                author_id = author.get('id', '')
                if author_id.startswith("https://openalex.org/"):
                    alex_ids.append(author_id.replace("https://openalex.org/", ""))
                else:
                    alex_ids.append(None)

                # Extract name
                names.append(author.get('display_name', None))

                # Extract and clean ORCID
                orcid = author.get('orcid', '')
                if orcid and orcid.startswith("https://orcid.org/"):
                    orcids.append(orcid.replace("https://orcid.org/", ""))
                else:
                    orcids.append(None)

            zenodo.at[idx, 'ListofAlexIds'] = alex_ids
            zenodo.at[idx, 'ListofNames'] = names
            zenodo.at[idx, 'ListofOrcids'] = orcids

        except Exception as e:
            print(f"Error processing DOI {doi_suffix}: {e}")


  zenodo.at[idx, 'OpenAlex_ID'] = work_id.replace("https://openalex.org/", "")


Error processing DOI 10.5281/zenodo.12690495: 404 Client Error: Not Found for url: https://api.openalex.org/works/https%3A%2F%2Fdoi.org%2F10.5281%2Fzenodo.12690495
Error processing DOI 10.5281/zenodo.7153540: 404 Client Error: Not Found for url: https://api.openalex.org/works/https%3A%2F%2Fdoi.org%2F10.5281%2Fzenodo.7153540
Error processing DOI 10.5281/zenodo.8204348: 404 Client Error: Not Found for url: https://api.openalex.org/works/https%3A%2F%2Fdoi.org%2F10.5281%2Fzenodo.8204348
Error processing DOI 10.5281/zenodo.8352764: 404 Client Error: Not Found for url: https://api.openalex.org/works/https%3A%2F%2Fdoi.org%2F10.5281%2Fzenodo.8352764
Error processing DOI 10.5281/zenodo.11244913: 404 Client Error: Not Found for url: https://api.openalex.org/works/https%3A%2F%2Fdoi.org%2F10.5281%2Fzenodo.11244913
Error processing DOI 10.5281/zenodo.10896376: 404 Client Error: Not Found for url: https://api.openalex.org/works/https%3A%2F%2Fdoi.org%2F10.5281%2Fzenodo.10896376
Error processing DOI 1

In [15]:
zenodo.to_excel('zenodo_df.xlsx')

#### save pub_tracker and zenodo tables as csv

In [46]:
pub_tracker.columns

Index(['Title', 'team', 'doi_pre', 'doi_pub', 'date_added_preprint',
       'date_added_publication', 'source_preprint', 'source_publication',
       'original', 'notes', 'Preprint in SF tracker',
       'publication in SF tracker', 'apc_amount', 'apc_status', 'OpenAlex_ID',
       'ListofAlexIds', 'ListofNames', 'ListofOrcids', 'Pre_OpenAlex_ID',
       'Pre_ListofAlexIds', 'Pre_ListofNames', 'Pre_ListofOrcids'],
      dtype='object')

In [47]:
pub_tracker.drop(columns=['ListofAlexIds', 'ListofNames', 'ListofOrcids', 'Pre_OpenAlex_ID',
       'Pre_ListofAlexIds', 'Pre_ListofNames', 'Pre_ListofOrcids'], inplace=True)

In [48]:
pub_tracker.to_csv("publication_tracker.csv", index=False)

In [49]:
zenodo.columns

Index(['id', 'created', 'pids.doi.identifier', 'metadata.title',
       'metadata.description', 'metadata.resource_type.title.en',
       'metadata.publication_date', 'metadata.creators.person_or_org.type',
       'metadata.creators.person_or_org.name', 'metadata.rights.id',
       'OpenAlex_ID', 'ListofAlexIds', 'ListofNames', 'ListofOrcids'],
      dtype='object')

In [50]:
zenodo.drop(columns=['ListofAlexIds', 'ListofNames', 'ListofOrcids'], inplace=True)

In [51]:
zenodo.to_csv("zenodo_tracker.csv", index=False)

## Create new df that maps people frmo roster to people in publications and zenodo works

In [19]:
import ast

for col in ['ListofAlexIds', 'ListofNames', 'ListofOrcids']:
        pub_tracker[col] = pub_tracker[col].apply(
                lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x
        )
# Ensure all list columns have the same length per row
assert (pub_tracker[['ListofAlexIds', 'ListofNames', 'ListofOrcids']]
        .applymap(len).nunique(axis=1) == 1).all(), "List lengths are not equal in some rows!"

# Explode all 3 columns together while maintaining alignment
exploded_pub_tracker = pub_tracker.explode(['ListofAlexIds', 'ListofNames', 'ListofOrcids'], ignore_index=True)



  .applymap(len).nunique(axis=1) == 1).all(), "List lengths are not equal in some rows!"


In [None]:
# Ensure all list columns have the same length per row
assert (zenodo[['ListofAlexIds', 'ListofNames', 'ListofOrcids']]
        .applymap(len).nunique(axis=1) == 1).all(), "List lengths are not equal in some rows!"

# Explode all 3 columns together while maintaining alignment
exploded_zenodo = zenodo.explode(['ListofAlexIds', 'ListofNames', 'ListofOrcids'], ignore_index=True)



In [22]:
combined_teams = pd.read_csv("final_team_roster.csv")

In [None]:
import re

def remove_middle_names(full_name):
    # Split the name into parts
    full_name = str(full_name)  # Ensure it's a string
    parts = full_name.strip().split()
    if len(parts) <= 2:
        return full_name  # No middle name to remove
    return f"{parts[0]} {parts[-1]}"  # Keep only first and last

# Apply this to the ListofNames column
exploded_pub_tracker['ListofNames'] = exploded_pub_tracker['ListofNames'].apply(remove_middle_names)

# Preview the cleaned names
#print(exploded_pub_tracker['ListofNames'].head())

# Apply this to the ListofNames column
exploded_zenodo['ListofNames'] = exploded_zenodo['ListofNames'].apply(remove_middle_names)

# Preview the cleaned names
#print(exploded_zenodo['ListofNames'].head())

In [24]:
# Combine First Name and Last Name, strip any extra spaces
combined_teams["Full Name"] = (
    combined_teams["First Name"].fillna("").str.strip() + " " +
    combined_teams["Last Name"].fillna("").str.strip()
).str.strip()


In [25]:
import ast

# Convert strings that look like lists into actual lists
exploded_pub_tracker["ListofAlexIds"] = exploded_pub_tracker["ListofAlexIds"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x
)


In [26]:
import ast

# Convert strings that look like lists into actual lists
exploded_zenodo["ListofAlexIds"] = exploded_zenodo["ListofAlexIds"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x
)


### Add Role and Team info to tracker

### drop columns from exploded_pub_tracker and exploded zenodo_tracker to create tables for mapping people to publications and zenodo

In [34]:
exploded_pub_tracker.columns

Index(['Title', 'team', 'doi_pre', 'doi_pub', 'date_added_preprint',
       'date_added_publication', 'source_preprint', 'source_publication',
       'original', 'notes', 'Preprint in SF tracker',
       'publication in SF tracker', 'apc_amount', 'apc_status', 'OpenAlex_ID',
       'ListofAlexIds', 'ListofNames', 'ListofOrcids', 'Pre_OpenAlex_ID',
       'Pre_ListofAlexIds', 'Pre_ListofNames', 'Pre_ListofOrcids',
       'AlexID_match'],
      dtype='object')

In [35]:
exploded_pub_tracker.drop(columns=['Title', 'team', 'doi_pre', 'doi_pub', 'date_added_preprint',
       'date_added_publication', 'source_preprint', 'source_publication',
       'original', 'notes', 'Preprint in SF tracker',
       'publication in SF tracker', 'apc_amount', 'apc_status', 'Pre_OpenAlex_ID',
       'Pre_ListofAlexIds', 'Pre_ListofNames', 'Pre_ListofOrcids',
       'AlexID_match'], inplace=True)

In [37]:
exploded_pub_tracker.rename(columns={'OpenAlex_ID': 'WorkOpenAlex_ID', 'ListofAlexIds': 'PersonOpenAlex_ID', 'ListofNames': 'PersonName', 'ListofOrcids': 'PersonOrcid'}, inplace=True)

In [40]:
exploded_zenodo.columns

Index(['id', 'created', 'pids.doi.identifier', 'metadata.title',
       'metadata.description', 'metadata.resource_type.title.en',
       'metadata.publication_date', 'metadata.creators.person_or_org.type',
       'metadata.creators.person_or_org.name', 'metadata.rights.id',
       'OpenAlex_ID', 'ListofAlexIds', 'ListofNames', 'ListofOrcids',
       'AlexID_match'],
      dtype='object')

In [42]:
exploded_zenodo.drop(columns=['id', 'created', 'pids.doi.identifier', 'metadata.title',
       'metadata.description', 'metadata.resource_type.title.en',
       'metadata.publication_date', 'metadata.creators.person_or_org.type',
       'metadata.creators.person_or_org.name', 'metadata.rights.id',
       'AlexID_match'], inplace=True)

In [44]:
exploded_zenodo.rename(columns={'OpenAlex_ID': 'ZenodoOpenAlex_ID', 'ListofAlexIds': 'PersonOpenAlex_ID', 'ListofNames': 'PersonName', 'ListofOrcids': 'PersonOrcid'}, inplace=True)

## Add Role and Team info to pub_tracker

In [52]:
import pandas as pd
import numpy as np

# Split pub_df into two: one with valid ListofAlexIds, one without
valid_ids = exploded_pub_tracker[exploded_pub_tracker['PersonOpenAlex_ID'].notna() & (exploded_pub_tracker['PersonOpenAlex_ID'] != "")]
missing_ids = exploded_pub_tracker[exploded_pub_tracker['PersonOpenAlex_ID'].isna() | (exploded_pub_tracker['PersonOpenAlex_ID'] == "")]

# Merge only the valid part
merged_valid = valid_ids.merge(
    combined_teams[['AllOpenAlex_ID', 'Role', 'Team']],
    left_on='PersonOpenAlex_ID',
    right_on='AllOpenAlex_ID',
    how='left'
)

# Drop the now-redundant join column
merged_valid.drop(columns=['AllOpenAlex_ID'], inplace=True)

# For missing IDs, fill Role and Team with NaN explicitly
missing_ids = missing_ids.copy()
missing_ids['Role'] = np.nan
missing_ids['Team'] = np.nan

# Concatenate the cleaned frames back together
pub_df = pd.concat([merged_valid, missing_ids], ignore_index=True)


In [53]:
def remove_middle_name(full_name):
    if pd.notna(full_name) and isinstance(full_name, str):
        parts = full_name.strip().split()
        if len(parts) >= 2:
            return f"{parts[0]} {parts[-1]}"  # Keep only first and last name
        return parts[0]  # If there's only one part, return as is
    return None

# Apply to the PersonName column
pub_df["FirstLastName"] = pub_df["PersonName"].apply(remove_middle_name)


In [55]:
# Map team from pub_tracker to pub_df based on matching OpenAlex_ID and WorkOpenAlex_ID
pub_df = pub_df.merge(
    pub_tracker[['OpenAlex_ID', 'team']],
    left_on='WorkOpenAlex_ID',
    right_on='OpenAlex_ID',
    how='left',
    suffixes=('', '_from_tracker')
)

# If pub_df already has a 'team' column, update only where it's missing
if 'team_x' in pub_df.columns and 'team_y' in pub_df.columns:
    pub_df['team'] = pub_df['team_x'].combine_first(pub_df['team_y'])
    pub_df.drop(['team_x', 'team_y'], axis=1, inplace=True)
elif 'team_y' in pub_df.columns:
    pub_df['team'] = pub_df['team_y']
    pub_df.drop(['team_y'], axis=1, inplace=True)

# Drop the extra OpenAlex_ID column from merge
if 'OpenAlex_ID' in pub_df.columns:
    pub_df.drop(['OpenAlex_ID'], axis=1, inplace=True)

In [58]:
import pandas as pd

# Only process rows in pub_df where Role is blank
pub_df_to_update = pub_df[pub_df['Role'].isna()].copy()

# Standardize combined_teams for matching
combined_teams['FullName'] = combined_teams['First Name'].str.strip() + ' ' + combined_teams['Last Name'].str.strip()
combined_teams['StrippedTeam'] = combined_teams['Team'].str.replace('Team ', '', regex=False).str.strip()

# Loop through the rows to perform rough match and update
for idx, row in pub_df_to_update.iterrows():
    name = row['FirstLastName']
    team = str(row['team']).strip()

    match = combined_teams[(combined_teams['FullName'] == name) & 
                         (combined_teams['StrippedTeam'] == team)]

    if not match.empty:
        matched_row = match.iloc[0]  # Use the first match found
        pub_df.at[idx, 'Role'] = matched_row['Role']
        pub_df.at[idx, 'Team'] = matched_row['Team']  # This will be the full 'Team AAA' style



In [59]:
pub_df["PersonName"] = pub_df["FirstLastName"]

In [60]:
pub_df.drop(columns=['FirstLastName'], inplace=True)

In [62]:
pub_df.drop(columns=['team'], inplace=True)

## Add people in teams not on works

In [64]:
import pandas as pd

# Create comparable columns in both dataframes
combined_teams['PersonName'] = combined_teams['First Name'].str.strip() + ' ' + combined_teams['Last Name'].str.strip()

# Select the relevant columns from both dataframes
pub_keys = pub_df[['PersonName', 'Role', 'Team']].dropna()
personnel_keys = combined_teams[['PersonName', 'Role', 'Team']].dropna()

# Perform anti-join: find rows in combined_teams that are not in pub_df
merged = personnel_keys.merge(pub_keys.drop_duplicates(), 
                               on=['PersonName', 'Role', 'Team'], 
                               how='left', indicator=True)

# Only rows not found in pub_df
missing_rows = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])

# Get corresponding full rows from combined_teams
rows_to_add = combined_teams.merge(missing_rows, 
                                 on=['PersonName', 'Role', 'Team'], 
                                 how='inner')

# Select and rename the columns to match pub_df format
rows_to_add = rows_to_add[['PersonName', 'Role', 'Team']]
rows_to_add['team'] = rows_to_add['Team'].str.replace('Team ', '', regex=False)  # create lowercase version

# Append to pub_df
pub_df = pd.concat([pub_df, rows_to_add], ignore_index=True)


## Locate OpenAlex ID for rows in publication people tracker missing Alex Ids

In [67]:
# Ensure matching keys are aligned
combined_teams['PersonName'] = combined_teams['First Name'].str.strip() + ' ' + combined_teams['Last Name'].str.strip()
combined_teams['Team_clean'] = combined_teams['Team']

# Loop through rows where PersonOpenAlex_ID is missing
for idx, row in pub_df[pub_df['PersonOpenAlex_ID'].isna()].iterrows():
    first_last = row['PersonName']
    team_clean = str(row['Team']).strip()  # Safely convert to string and strip

    match = combined_teams[
        (combined_teams['PersonName'] == first_last) &
        (combined_teams['Team_clean'] == team_clean)
    ]

    if not match.empty:
        pub_df.at[idx, 'PersonOpenAlex_ID'] = match.iloc[0]['AllOpenAlex_ID']


## Add corresponding author (yes/no) column to people to publication mapping

In [None]:
# Create temp column in pub_tracker for matching

In [69]:
# Add corresponding_authors column to pub_tracker
pub_tracker['corresponding_authors'] = pub_tracker['OpenAlex_ID'].apply(
    lambda openalex_id: Works()[f"https://openalex.org/{openalex_id}"].get("corresponding_author_ids", []) if pd.notna(openalex_id) else []
)

In [74]:
# Remove 'https://openalex.org/' from each item in the corresponding_authors list for every row
pub_tracker['corresponding_authors'] = pub_tracker['corresponding_authors'].apply(
    lambda lst: [x.replace('https://openalex.org/', '') if isinstance(x, str) else x for x in lst]
)

In [76]:
# Create a mapping from OpenAlex_ID to corresponding_authors list
corresponding_authors_map = pub_tracker.set_index('OpenAlex_ID')['corresponding_authors'].to_dict()

def is_corresponding_author(row):
    work_id = row['WorkOpenAlex_ID']
    person_id = row['PersonOpenAlex_ID']
    if pd.notna(work_id) and pd.notna(person_id):
        authors_list = corresponding_authors_map.get(work_id, [])
        return "Yes" if person_id in authors_list else "No"
    return "No"

pub_df['IsCorrespondingAuthor'] = pub_df.apply(is_corresponding_author, axis=1)

In [78]:
pub_tracker

Unnamed: 0,Title,team,doi_pre,doi_pub,date_added_preprint,date_added_publication,source_preprint,source_publication,original,notes,Preprint in SF tracker,publication in SF tracker,apc_amount,apc_status,OpenAlex_ID,corresponding_authors
0,"Three-step docking by WIPI2, ATG16L1 and ATG3 ...",Hurley,10.1101/2023.07.17.549391,10.1126/sciadv.adj8027,before_doi_tracker,2024-02-07 00:00:00,before_doi_tracker,other(email to DS),1,,1.0,1,4500.0,Paid,W4391629214,"[A5061877082, A5061450716, A5059945365]"
1,Multi-ancestry genome-wide meta-analysis in Pa...,GP2,10.1101/2022.08.04.22278432,10.1038/s41588-023-01584-8,before_doi_tracker,2024-02-07 00:00:00,before_doi_tracker,other(gp2_blog),1,,1.0,1,,,W4390345401,[A5031512056]
2,Fluorescence Microscopy Shadow Imaging for Neu...,Vila,review,10.3389/fncel.2024.1330100,NA_review,2024-02-07 00:00:00,NA_review,hub,0,,,,3295.0,Paid,W4391841711,[]
3,3D Bioprinting of Human Neural Tissues with Fu...,Scherzer,10.1101/2024.01.18.576289,10.1016/j.stem.2023.12.009,before_doi_tracker,2024-02-09 00:00:00,before_doi_tracker,google_alert,1,,1.0,1,,,W4391450102,[A5101605513]
4,Comprehensive structural variant detection: Fr...,Voet,10.1101/2022.04.04.487055,10.1038/s41587-023-02024-y,before_doi_tracker,2024-02-09 00:00:00,before_doi_tracker,hub,1,,1.0,1,,,W4390500910,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,Current safety recommendations for handling mo...,Volpicelli-Daley,does_not_exist,10.1016/j.nbd.2025.106820,,2025-06-05 00:00:00,,lens,0,,,1,,,W4407000621,[]
559,"Aging, cellular senescence and Parkinson's dis...",Lee,NA_review,10.1177/1877718X251316552,NA_review,2025-06-06 00:00:00,NA_review,lens,0,,,1,,,W4407112723,[A5037205059]
560,Targeting mitophagy in neurodegenerative diseases,Alessi,NA_review,10.1038/s41573-024-01105-0,NA_review,2025-06-06 00:00:00,NA_review,lens,0,,,1,,,W4406342572,[]
561,How Membrane Contact Sites Shape the Phagophore,Harper,NA_review,10.1177/25152564231162495,NA_review,2025-06-06 00:00:00,NA_review,lens,0,,,1,,,W4376459430,[A5006303094]


In [80]:
pub_df

Unnamed: 0,WorkOpenAlex_ID,PersonOpenAlex_ID,PersonName,PersonOrcid,Role,Team,IsCorrespondingAuthor
0,W4391629214,A5028861095,Shanlin Rao,0000-0003-4892-5523,,,No
1,W4391629214,A5071900726,Lisa Strong,0000-0002-4293-8131,,,No
2,W4391629214,A5041244181,Xuefeng Ren,0000-0002-4822-4316,Key Personnel,Hurley,No
3,W4391629214,A5033276727,Marvin Skulsuppaisarn,0000-0003-4041-7014,,,No
4,W4391629214,A5061877082,Michael Lazarou,0000-0003-2150-5545,Co-PI,Hurley,Yes
...,...,...,...,...,...,...,...
7065,,A5102819545,Hannah Clarke,,Key Personnel,Wood,No
7066,,A5008450447,Rebeka Popovic,,Key Personnel,Wood,No
7067,,,Joe Robin,,Key Personnel,Wood,No
7068,,A5082240217,Alexandre Almeida,,Key Personnel,Wood,No


In [79]:
pub_df.drop(columns=['team'], inplace=True)

## Save tables as csv

In [81]:
pub_df.to_csv('people_to_publication_mapping.csv')

In [45]:
exploded_zenodo.to_csv('people_to_zenodo_mapping.csv')

## Citation table

In [3]:
pub_tracker = pd.read_csv("publication_tracker.csv")
pub_people = pd.read_csv("people_to_publication_mapping.csv")
combined_teams = pd.read_csv("final_team_roster.csv")

In [4]:
network_df = pub_tracker.merge(pub_people[['WorkOpenAlex_ID', 'PersonOpenAlex_ID', 'PersonName',
       'PersonOrcid', 'Role', 'Team', 'IsCorrespondingAuthor']],
       left_on='OpenAlex_ID',
       right_on='WorkOpenAlex_ID',
       how = 'left').drop(columns=['WorkOpenAlex_ID'])

In [5]:
network_df.columns

Index(['Title', 'team', 'doi_pre', 'doi_pub', 'date_added_preprint',
       'date_added_publication', 'source_preprint', 'source_publication',
       'original', 'notes', 'Preprint in SF tracker',
       'publication in SF tracker', 'apc_amount', 'apc_status', 'OpenAlex_ID',
       'PersonOpenAlex_ID', 'PersonName', 'PersonOrcid', 'Role', 'Team',
       'IsCorrespondingAuthor'],
      dtype='object')

In [None]:
# List of columns that stayed the same (used for grouping)
group_cols = ['Title', 'team', 'doi_pre', 'doi_pub', 'date_added_preprint',
       'date_added_publication', 'source_preprint', 'source_publication',
       'original', 'notes', 'Preprint in SF tracker',
       'publication in SF tracker', 'apc_amount', 'apc_status', 'OpenAlex_ID']

# Columns that were unlisted (you want to re-list them)
list_cols = ['PersonOpenAlex_ID', 'PersonName', 'PersonOrcid', 'Role', 'Team',
       'IsCorrespondingAuthor']

# Sort the DataFrame to preserve original order before grouping
df_sorted = network_df.sort_values(by=group_cols).copy()

# Group and aggregate the list columns into lists
aggregated_df = df_sorted.groupby(group_cols, as_index=False).agg({
    col: lambda x: list(x) for col in list_cols
})



In [34]:
aggregated_df.size

84

column with list of references

In [39]:
def _safe_get_referenced(openalex_id):
    # handle NaN / empty
    if not openalex_id or (isinstance(openalex_id, float) and pd.isna(openalex_id)):
        return []
    openalex_id = str(openalex_id).strip()
    # strip full URL if present
    if openalex_id.startswith("https://openalex.org/"):
        openalex_id = openalex_id.replace("https://openalex.org/", "")
    try:
        work = Works()[openalex_id]
        return work.get("referenced_works", []) or []
    except Exception:
        # return empty list on any fetch error (404, rate limit, etc.)
        return []

network_df["ReferencedWorks"] = network_df["OpenAlex_ID"].apply(_safe_get_referenced)

In [40]:
network_df["ReferencedWorks"] = network_df["ReferencedWorks"].apply(
    lambda lst: [item.replace("https://openalex.org/", "") if isinstance(item, str) else item for item in lst]
)

column for number of references from asap network

In [41]:
# Create a set of all OpenAlex_IDs for fast lookup
openalex_id_set = set(network_df["OpenAlex_ID"])

# Count how many referenced works are also in OpenAlex_ID
network_df["NumInternalReferences"] = network_df["ReferencedWorks"].apply(
    lambda refs: sum(ref in openalex_id_set for ref in refs)
)

column for num times each work has been cited by other works 

In [43]:
def _safe_get_cited_by_count(openalex_id):
    # handle missing/NaN/empty
    if not openalex_id or (isinstance(openalex_id, float) and pd.isna(openalex_id)):
        return 0
    openalex_id = str(openalex_id).strip()
    # strip full URL if present
    if openalex_id.startswith("https://openalex.org/"):
        openalex_id = openalex_id.replace("https://openalex.org/", "")
    try:
        work = Works()[openalex_id]
        return work.get("cited_by_count", 0) if work else 0
    except Exception:
        # On any fetch error (404, rate limit, etc.) return 0
        return 0

network_df["CitedByCount"] = network_df["OpenAlex_ID"].apply(_safe_get_cited_by_count)

column for num times each work has been cited by works in the asap network

In [44]:
from collections import Counter

# For each row, count how many times its OpenAlex_ID appears in all ReferencedWorks lists

# Flatten all ReferencedWorks into a single list
all_referenced_ids = [ref for refs in network_df["ReferencedWorks"] for ref in refs]
referenced_counter = Counter(all_referenced_ids)

# Map each OpenAlex_ID to its count in referenced_counter
network_df["CitedByInternal"] = network_df["OpenAlex_ID"].map(lambda x: referenced_counter.get(x, 0))

In [45]:
import pandas as pd
import ast
import numpy as np
import re

# Step 1: Custom parser for stringified lists that contain nan
def parse_team(val):
    if isinstance(val, list):  # Already parsed
        return val
    if pd.isna(val):  # True NaN
        return []
    if isinstance(val, str) and val.strip().startswith('['):
        # Replace bare `nan` (not quoted) with 'None' so ast.literal_eval won't crash
        val_cleaned = re.sub(r'\bnan\b', 'None', val)
        try:
            parsed = ast.literal_eval(val_cleaned)
            if isinstance(parsed, list):
                return parsed
        except Exception:
            pass
    return [val] if pd.notna(val) else []

# Step 2: Remove real NaNs and deduplicate
def clean_team_list(team_list):
    seen = set()
    cleaned = []
    for item in team_list:
        if item is not None and pd.notna(item) and item not in seen:
            seen.add(item)
            cleaned.append(item)
    return cleaned

# Apply the functions
network_df['Team'] = network_df['Team'].apply(parse_team)
network_df['CleanedTeam'] = network_df['Team'].apply(clean_team_list)


In [46]:
import pandas as pd

# Ensure ReferencedWorks is parsed as a list (like we did with Team)
def parse_references(val):
    if isinstance(val, list):
        return val
    if pd.isna(val):
        return []
    if isinstance(val, str) and val.strip().startswith('['):
        try:
            return ast.literal_eval(val)
        except Exception:
            return []
    return [val]

network_df['ReferencedWorks'] = network_df['ReferencedWorks'].apply(parse_references)

# Step 1: Build a mapping from OpenAlex_ID to CleanedTeam
id_to_teams = network_df.set_index('OpenAlex_ID')['CleanedTeam'].to_dict()

# Step 2: For each row, check which other rows cite this one's OpenAlex_ID
# We'll reverse the logic: For each publication, collect the teams from rows where this ID appears in their ReferencedWorks
from collections import defaultdict

# Create a mapping from OpenAlex_ID to list of citing teams
teams_that_cited = defaultdict(list)

for _, row in network_df.iterrows():
    citing_teams = row['CleanedTeam']
    for ref_id in row['ReferencedWorks']:
        if pd.notna(ref_id):
            teams_that_cited[ref_id].extend(citing_teams)

# Step 3: Assign to new column, removing duplicates
def get_citing_teams(pub_id):
    raw_teams = teams_that_cited.get(pub_id, [])
    seen = set()
    return [team for team in raw_teams if pd.notna(team) and not (team in seen or seen.add(team))]

network_df['TeamsThatCited'] = network_df['OpenAlex_ID'].apply(get_citing_teams)


In [47]:
# Step 1: Create lookup from OpenAlex_ID to CleanedTeam
id_to_teams = network_df.set_index('OpenAlex_ID')['CleanedTeam'].to_dict()

# Step 2: Define functions to compute the new columns
def get_referenced_teams(ref_list):
    all_teams = []
    for ref_id in ref_list:
        if ref_id in id_to_teams:
            teams = id_to_teams[ref_id]
            if isinstance(teams, list):
                all_teams.extend([t for t in teams if pd.notna(t)])
    # Deduplicate while preserving order
    seen = set()
    return [t for t in all_teams if not (t in seen or seen.add(t))]

def get_int_references(ref_list):
    return [ref_id for ref_id in ref_list if ref_id in id_to_teams]

# Step 3: Apply to the DataFrame
network_df['ReferencedTeams'] = network_df['ReferencedWorks'].apply(get_referenced_teams)
network_df['IntReferences'] = network_df['ReferencedWorks'].apply(get_int_references)


In [49]:
import ast
import re
import pandas as pd

# Fix FirstLastName to be a true list (from its stringified version)
def parse_name_list(val):
    if isinstance(val, list):
        return val
    if pd.isna(val):
        return []
    if isinstance(val, str) and val.strip().startswith('['):
        try:
            val_cleaned = re.sub(r'\bnan\b', 'None', val)
            parsed = ast.literal_eval(val_cleaned)
            return parsed if isinstance(parsed, list) else []
        except Exception:
            return []
    return [val]

network_df['PersonName'] = network_df['PersonName'].apply(parse_name_list)
network_df['ReferencedWorks'] = network_df['ReferencedWorks'].apply(parse_name_list)


In [51]:
network_df.to_csv("halfway_through+save.csv")

In [52]:
id_to_names = network_df.set_index('OpenAlex_ID')['PersonName'].to_dict()


In [53]:
def get_referenced_individuals(ref_list):
    names = []
    for ref_id in ref_list:
        if ref_id in id_to_names:
            these_names = id_to_names[ref_id]
            if isinstance(these_names, list):
                names.extend([n for n in these_names if pd.notna(n)])
    # Deduplicate while preserving order
    seen = set()
    return [name for name in names if name not in seen and not seen.add(name)]

network_df['ReferencedIndividuals'] = network_df['ReferencedWorks'].apply(get_referenced_individuals)


In [54]:
def is_empty(val):
    if val is None:
        return True
    # If it's a list, tuple, or set, check length
    if isinstance(val, (list, tuple, set)):
        return len(val) == 0
    # If it's a numpy array or pandas Series, check length
    if hasattr(val, "__len__") and not isinstance(val, str):
        try:
            return len(val) == 0
        except Exception:
            pass
    # If it's a string like '[]', 'nan', etc.
    if isinstance(val, str):
        val = val.strip()
        return val in ('', '[]', 'nan')
    # For scalars, use pd.isna
    try:
        return pd.isna(val)
    except Exception:
        return False

import time

filled_count = 0

for idx, row in network_df.iterrows():
    openalex_id = str(row['OpenAlex_ID']).strip()
    if openalex_id.startswith("W") and is_empty(row['ReferencedWorks']):
        try:
            work = Works()[openalex_id]
            if work and 'referenced_works' in work:
                network_df.at[idx, 'ReferencedWorks'] = work['referenced_works']
                filled_count += 1
        except Exception as e:
            print(f"❌ Failed for {openalex_id}: {e}")
        time.sleep(0.25)  # avoid rate limiting

print(f"✅ Filled ReferencedWorks for {filled_count} rows.")


❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: Not Found for url: https://api.openalex.org/works/W4226263121
❌ Failed for W4226263121: 404 Client Error: No

In [57]:
# Group by OpenAlex_ID and collect all unique teams from the Team column
def unique_teams(team_series):
    seen = set()
    teams = []
    for val in team_series:
        if isinstance(val, list):
            for t in val:
                if pd.notna(t) and t not in seen:
                    teams.append(t)
                    seen.add(t)
        elif pd.notna(val) and val not in seen:
            teams.append(val)
            seen.add(val)
    return teams

cleaned_teams = network_df.groupby('OpenAlex_ID')['Team'].apply(unique_teams)

# Assign back to the DataFrame (align by OpenAlex_ID)
network_df['CleanedTeams'] = network_df['OpenAlex_ID'].map(cleaned_teams)

In [58]:
network_df['OpenAlex_ID'].nunique()

561

In [59]:
network_df.to_csv("citation_table_needs_dedup.csv")

## Create table for hit citations

In [73]:
df2020 = pd.read_csv('citation ranking data/new data/oa_c5rank_2020.csv.gz', compression='gzip')

In [74]:
df2021 = pd.read_csv('citation ranking data/new data/oa_c5rank_2021.csv.gz', compression='gzip')

In [75]:
df2022 = pd.read_csv('citation ranking data/new data/oa_c5rank_2022.csv.gz', compression='gzip')

In [76]:
df2023 = pd.read_csv('citation ranking data/new data/oa_c5rank_2023.csv.gz', compression='gzip')

In [77]:
df2024 = pd.read_csv('citation ranking data/new data/oa_c5rank_2024.csv.gz', compression='gzip')

In [78]:
df2025 = pd.read_csv('citation ranking data/new data/oa_c5rank_2025.csv.gz', compression='gzip')

In [79]:
combined_citation_df = pd.concat([
    df2020.assign(source='d0'),
    df2021.assign(source='d1'),
    df2022.assign(source='d2'),
    df2023.assign(source='d3'),
    df2024.assign(source='d4'),
    df2025.assign(source='d5')
], ignore_index=True)


In [80]:
combined_citation_df['OpenAlex_ID'] = 'W' + combined_citation_df['PublicationId'].astype(str)

In [81]:
combined_citation_df.to_csv('hit_citation_table.csv', index=False)

In [82]:
combined_citation_df.head()

Unnamed: 0,PublicationId,TopicId,SubFieldId,FieldId,Year,C5,TopicIdC5Rank,SubFieldIdC5Rank,FieldIdC5Rank,source,OpenAlex_ID
0,3000632562,11185,0,0,2020,131.0,0.989699,0.994733,0.994733,d0,W3000632562
1,3000632562,11007,0,0,2020,131.0,0.977364,0.994734,0.994734,d0,W3000632562
2,3000632562,11225,0,0,2020,131.0,0.99715,0.994734,0.994734,d0,W3000632562
3,3034450003,11873,0,0,2020,122.0,0.997792,0.993974,0.993974,d0,W3034450003
4,3034450003,12146,0,0,2020,122.0,0.998762,0.993975,0.993975,d0,W3034450003
