In [13]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset from disk into a DataFrame
df = pd.read_csv('../data/Hemonc_new.csv')

# Keep the first rows (pandas' default head size) as a small preview frame
df_demo = df.head()

# Inspect the 'pmids' cell of the row labelled 2; the recorded output shows it
# holds two comma-separated PMIDs
df.loc[2, "pmids"]

'18955563, 20418244'

In [12]:
# Display the preview frame (df_demo = df.head()) so the column layout can be checked
df_demo

Unnamed: 0,evidence,pubmed_urls,pmids,question 1,question 2,question 3,question 4,question 5,question 6,question 7,...,question 16,question 17,question 18,question 19,question 20,answer,option 1,option 2,option 3,NCT
0,"A double-blind, placebo-controlled, randomized...",https://pubmed.ncbi.nlm.nih.gov/20931299/,20931299,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that most clearly describes th...,Select the statement that best encapsulates th...,...,What option most accurately summarizes how Cis...,Choose the option that most effectively highli...,Which option best outlines the effectiveness o...,Identify the option that conveys the most accu...,Select the statement that appropriately descri...,2,superior,inferior,no difference,NCT00532818
1,"A double-blind, placebo-controlled, randomized...",https://pubmed.ncbi.nlm.nih.gov/20931299/,20931299,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that most clearly describes th...,Select the statement that best encapsulates th...,...,What option most accurately summarizes how Cis...,Choose the option that most effectively highli...,Which option best outlines the effectiveness o...,Identify the option that conveys the most accu...,Select the statement that appropriately descri...,1,superior,inferior,no difference,NCT00532818
2,Thalidomide-dexamethasone compared with melpha...,"https://pubmed.ncbi.nlm.nih.gov/18955563/, htt...","18955563, 20418244",Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that most clearly describes th...,Select the statement that best encapsulates th...,...,What option most accurately summarizes how Int...,Choose the option that most effectively highli...,Which option best outlines the effectiveness o...,Identify the option that conveys the most accu...,Select the statement that appropriately descri...,1,superior,inferior,no difference,NCT00205751
3,Thalidomide-dexamethasone compared with melpha...,"https://pubmed.ncbi.nlm.nih.gov/18955563/, htt...","18955563, 20418244",Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that most clearly describes th...,Select the statement that best encapsulates th...,...,What option most accurately summarizes how Int...,Choose the option that most effectively highli...,Which option best outlines the effectiveness o...,Identify the option that conveys the most accu...,Select the statement that appropriately descri...,2,superior,inferior,no difference,NCT00205751
4,"Randomized, Double-Blind, Placebo-Controlled P...",https://pubmed.ncbi.nlm.nih.gov/27298414/,27298414,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that most clearly describes th...,Select the statement that best encapsulates th...,...,What option most accurately summarizes how Tas...,Choose the option that most effectively highli...,Which option best outlines the effectiveness o...,Identify the option that conveys the most accu...,Select the statement that appropriately descri...,1,superior,inferior,no difference,NCT01234311


In [23]:
import json
import os
import time

import pandas as pd
from Bio import Entrez

# Set your email for NCBI. The NCBI_EMAIL environment variable overrides the
# default so the notebook can be shared/re-run without editing the source.
Entrez.email = os.environ.get("NCBI_EMAIL", "yuxin102@mit.edu")

# Optional NCBI API key (NCBI_API_KEY). When unset this stays None, which is
# Biopython's default; when set, NCBI permits a higher request rate, which
# helps avoid "HTTP Error 429: Too Many Requests" on large runs.
Entrez.api_key = os.environ.get("NCBI_API_KEY")

def fetch_pubmed_data(pubmed_id: str, max_retries: int = 3, retry_delay: float = 1.0) -> dict:
    """
    Fetches author list for a given PubMed ID.

    Network-level failures (any ``OSError``, which includes urllib's
    ``HTTPError`` such as "429 Too Many Requests" and ``URLError`` such as a
    DNS lookup failure) are retried with exponential backoff before giving up.
    This function never raises: every failure is reported with ``print`` and
    an empty author string is returned.

    Args:
        pubmed_id: The PubMed ID string.
        max_retries: How many extra attempts to make after the first failed
            request. Values <= 0 disable retrying.
        retry_delay: Seconds to wait before the first retry; the wait doubles
            after each further failure.

    Returns:
        A dictionary containing 'authors' (pipe-separated string). The string
        is empty when the record lists no authors, when no article was
        returned, or when fetching/parsing failed.
    """
    attempts = max(0, max_retries) + 1
    records = None

    for attempt in range(attempts):
        try:
            # Fetch the PubMed record in XML format
            handle = Entrez.efetch(db="pubmed", id=pubmed_id, retmode="xml")
            try:
                records = Entrez.read(handle)
            finally:
                # Always release the connection, even when parsing fails
                handle.close()
            break
        except OSError as e:
            # Transient network problem: back off and try again unless this
            # was the last allowed attempt.
            if attempt < attempts - 1:
                time.sleep(retry_delay * (2 ** attempt))
                continue
            print(f"Error fetching data for PubMed ID {pubmed_id}: {e}")
            return {"authors": ""}
        except Exception as e:
            # Non-network failure (e.g. unparsable response): retrying the
            # same request is not expected to help.
            print(f"Error fetching data for PubMed ID {pubmed_id}: {e}")
            return {"authors": ""}

    try:
        # Check if records were fetched and contain at least one article
        if not records or "PubmedArticle" not in records:
            return {"authors": ""}
        articles = records["PubmedArticle"]
        if not articles:
            return {"authors": ""}

        article = articles[0].get("MedlineCitation", {}).get("Article", {})

        # Extract the author list
        author_list = []
        for author in article.get("AuthorList", []):
            if "ForeName" in author and "LastName" in author:
                # Individual author with separate first and last names
                full_name = f"{author['ForeName']} {author['LastName']}"
            elif "CollectiveName" in author:
                # Group/consortium author
                full_name = author["CollectiveName"]
            else:
                # Fallback if name parts are missing
                full_name = author.get("LastName", "Name Not Available")
            author_list.append(full_name)
    except Exception as e:
        # Unexpected record structure; keep the best-effort contract
        print(f"Error fetching data for PubMed ID {pubmed_id}: {e}")
        return {"authors": ""}

    return {"authors": "|".join(author_list)}

def add_authors_column(df):
    """
    Add an 'authors' column to the dataframe mapping PMIDs to authors.

    Each cell of the new column is a JSON object string of the form
    ``{"<pmid>": "<pipe-separated authors>", ...}``. Rows whose 'pmids' cell
    is missing or blank get ``"{}"``. The input frame is not modified.

    Args:
        df: DataFrame containing a 'pmids' column with comma-separated PMIDs.
            Numeric cells (which pandas produces when every row holds a
            single PMID) are accepted and treated as one PMID.

    Returns:
        DataFrame with new 'authors' column
    """
    # PMID -> authors for lookups that already returned a non-empty result in
    # this call. Several rows can cite the same paper; reusing the answer
    # avoids repeating the network request. Empty results are not cached
    # because fetch_pubmed_data also returns "" on failure, and a later row
    # should get another chance.
    fetched = {}

    def parse_pmids(value):
        """Return the list of PMID strings contained in one 'pmids' cell."""
        if isinstance(value, str):
            text = value
        elif value is None or pd.isna(value):
            return []
        elif isinstance(value, float) and value.is_integer():
            # A float column (ints mixed with NaN) would otherwise yield "123.0"
            text = str(int(value))
        else:
            text = str(value)
        return [pmid.strip() for pmid in text.split(',') if pmid.strip()]

    # Function to process each row's PMIDs
    def process_pmids(pmid_string):
        result_dict = {}
        for pmid in parse_pmids(pmid_string):
            if pmid in fetched:
                result_dict[pmid] = fetched[pmid]
                continue
            try:
                # Extract just the authors string from the result dictionary
                authors = fetch_pubmed_data(pmid)["authors"]
            except Exception as e:
                print(f"Error processing PMID {pmid}: {e}")
                authors = "ERROR"
            else:
                if authors:
                    fetched[pmid] = authors
            result_dict[pmid] = authors

        # Return as JSON string (keep non-ASCII author names readable)
        return json.dumps(result_dict, ensure_ascii=False)

    # Apply the function to create the new column on a copy
    df_copy = df.copy()
    df_copy['authors'] = df_copy['pmids'].apply(process_pmids)
    return df_copy

# Read your CSV file
df = pd.read_csv('../data/Hemonc_new.csv')

# Add the authors column. NOTE: this runs PubMed lookups for the PMIDs of
# EVERY row in the file, so it is slow and can hit NCBI rate limits.
# For a quick smoke test, pass a small slice instead, e.g. add_authors_column(df.head()).
df_with_authors = add_authors_column(df)

# Save the updated dataframe
df_with_authors.to_csv('../data/Hemonc_new_with_authors.csv', index=False)

# Show a sample to verify the output (bare expression -> rich notebook display)
df_with_authors[['pmids', 'authors']].head()

Error fetching data for PubMed ID 31070690: HTTP Error 429: Too Many Requests
Error fetching data for PubMed ID 27132696: <urlopen error [Errno 8] nodename nor servname provided, or not known>
Error fetching data for PubMed ID 29054815: <urlopen error [Errno 8] nodename nor servname provided, or not known>
