In [12]:
import numpy as np
import requests
import time
from typing import List

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [13]:
projects = pd.read_csv("https://raw.githubusercontent.com/radxrad/dbgap-reporter/main/dev/radx-projects.csv")

# New Section

In [14]:
projects.head()

Unnamed: 0,radx_project,dbgap_accession,project_num,core_project_num,project_serial_num,principal_investigator,study_title
0,RADx-rad,phs002744.v1.p1,U01AA029345,U01AA029345,AA029345,Khalid Salaita,Rapid Acceleration of Diagnostics - Radical (RADx-rad): Rolosense: An Innovative Platform for Automatic Mobile Phone Readout of Active SARS-CoV-2
1,RADx-UP,phs002775.v1.p1,U54GM115677,U54GM115677,GM115677,Sharon Rounds,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): Developing a Realtime Monitoring System and Program to Improve COVID-19 Testing for Latinx Populations
2,RADx-UP,phs002761.v1.p1,3UL1TR003167-02S1,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): Addressing COVID-19 Testing Disparities in Vulnerable Populations Using a Community JITAI (Just in Time Adaptive Intervention) Approach-A UTHealth A UTHealth CTSA Program
3,RADx-UP,phs002761.v1.p1,3UL1TR003167-03S3,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): Addressing COVID-19 Testing Disparities in Vulnerable Populations Using a Community JITAI (Just in Time Adaptive Intervention) Approach-A UTHealth A UTHealth CTSA Program
4,RADx-UP,phs002759.v1.p1,1OT2HD107557,OT2HD107557,HD107557,Jason Newland,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): Assessing Testing Strategies for Safe Return to K-12 Schools in an Underserved Population


In [15]:
# Parameters for NIH Reporter Search
PROJECT_LIMIT = 100 # maximum number of records for project search


def get_projects(core_project_numbers: List[str], chunk_size: int = PROJECT_LIMIT) -> pd.DataFrame:
    """
    Retrieve project data in batches from a list of core project numbers.

    This function divides the list of core project numbers into smaller chunks
    to avoid overloading the API and then fetches data for each chunk. The results
    are concatenated to form a final dataset of project information.

    Parameters
    ----------
    core_project_numbers : List[str]
        A list of core project numbers to retrieve data for.

    chunk_size : int, optional
        The maximum number of core project numbers to include in each API request.
        Defaults to PROJECT_LIMIT.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing project data retrieved from the API.

    Examples
    --------
    >>> core_project_numbers = ["U01AA029316", "R01DC016112"]
    >>> get_projects(core_project_numbers)
        appl_id  subproject_id  fiscal_year        project_num  project_serial_num  ...
    0   10320986          None         2022    4U01AA029316-02            AA029316  ...
    1   ...
    2   10129336          None         2021    5R01DC016112-05            DC016112  ...
    ...
    """
    # Divide the core_project_numbers into chunks
    chunks = create_chunks(core_project_numbers, chunk_size)
    print(chunks)

    # Fetch project data for each chunk and concatenate the results
    batches = [search_project(chunk, chunk_size) for chunk in chunks]

    # Normalize the JSON data and concatenate into a DataFrame
    projects = [pd.json_normalize(data["results"]) for data in batches]

    # Concatenate the extracted data into a DataFrame
    return pd.concat(projects)


def create_chunks(data, chunk_size):
    """
    Split a list into smaller chunks of a specified size.

    Args:
        data (list): The input list to be divided into chunks.
        chunk_size (int): The maximum size of each chunk.

    Returns:
        list: A list of chunks, where each chunk is a sublist of 'data'.

    Example:
        >>> data = [1, 2, 3, 4, 5, 6, 7, 8]
        >>> chunk_size = 3
        >>> create_chunks(data, chunk_size)
        [[1, 2, 3], [4, 5, 6], [7, 8]]
    """
    # split list into chunks of max size: chunk_size
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]


def search_project(core_project_numbers, chunk_size):
    print("search_projects:", core_project_numbers)
    PROJECTS_URL = "https://api.reporter.nih.gov/v2/projects/search"
    HEADERS = {"accept": "application/json"}
    params = {"criteria": {"project_nums": core_project_numbers}, "limit": chunk_size, "include_active_projects": "true"}

    try:
        response = requests.post(PROJECTS_URL, headers=HEADERS, json=params)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.HTTPError as error:
        print(f"ERROR: nih_reporter HTTP error: {error}")
    except requests.exceptions.RequestException as error:
        print(f"ERROR: nih_reporter: {error}")

    time.sleep(1)
    return data

In [16]:
project_num = set(projects["core_project_num"].unique())

In [17]:
grants = get_projects(list(project_num))

[['R44DE030842', 'U18TR003775', '75N91020C00040', 'U01DA053941', 'UG1DA050071', 'U01MD018294', 'UL1TR002538', 'U54MD007601', 'U54MD007597', 'U54CA132384', 'UM1AI069423', 'U18TR003778', 'U18TR003812', 'R61HD105590', 'P42ES010337', 'U01DA053893', 'UH3CA233314', 'U01DA056000', 'U01DA053903', 'UL1TR003167', 'OT2HD107555', 'UG1DA050066', 'R01DK130067', 'U24LM013755', 'UG1DA050072', 'R42DE030832', 'U01AI169477', 'P30AI027763', 'U01HL152410', 'U18TR003795', 'U01MD017421', 'U01AA029324', 'R01CA220591', 'U01DA057849', '75N91020C00037', 'U01DA055982', 'U01DC019578', 'OT2HD107558', 'R01DA037628', 'R44DE030841', 'U01MD017437', 'UL1TR002366', 'OT2HD108110', 'R01DE031114', 'UL1TR002733', '75N91020C00035', 'R01DA036749', 'U01MD017432', 'R01AA024409', 'ZIAES103366', 'P30ES023513', 'UL1TR001436', 'OT2HD108101', 'R21MH122010', 'U01MD017436', 'UL1TR003017', 'R61HD105591', 'UH3AI133669', 'R01HD091218', 'U01GM132175', 'U01MD017414', 'U01AA029316', 'U01AA029348', 'U01DC019579', 'U54MD000502', 'R37CA245716',

In [18]:
grant_num = set(grants["core_project_num"].unique())
print("Matching grants:", len(grant_num))

Matching grants: 65


In [19]:
missing = list(project_num - grant_num)
print("Missing grants:", len(missing))

Missing grants: 95


In [20]:
projects[projects["core_project_num"].isin(missing)]

Unnamed: 0,radx_project,dbgap_accession,project_num,core_project_num,project_serial_num,principal_investigator,study_title
0,RADx-rad,phs002744.v1.p1,U01AA029345,U01AA029345,AA029345,Khalid Salaita,Rapid Acceleration of Diagnostics - Radical (RADx-rad): Rolosense: An Innovative Platform for Automatic Mobile Phone Readout of Active SARS-CoV-2
4,RADx-UP,phs002759.v1.p1,1OT2HD107557,OT2HD107557,HD107557,Jason Newland,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): Assessing Testing Strategies for Safe Return to K-12 Schools in an Underserved Population
6,RADx-UP,phs002800.v1.p1,1OT2HD107553-01,OT2HD107553,HD107553,John Foxe,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): COV-IDD: Testing for COVID-19 in High Risk Children with Intellectual and Developmental Disabilities
7,RADx-UP,phs002760.v1.p1,R01HL151292-01S1,R01HL151292,HL151292,Robert Gross,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): COVID Self-Testing Through Rapid Network Distribution (C-STRAND)
9,RADx-UP,phs002878.v1.p1,1OT2HD108112-01,OT2HD108112,HD108112,Susan Kiene,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): Communities Fighting COVID!: Returning Our Kids Back to School Safely
10,RADx-UP,phs002776.v1.p1,3R01DA036749-05S1,R01DA036749,DA036749,Janice Tsoh,Rapid Acceleration of Diagnostics - Underserved Populations (RADx-UP): Getting Asian Americans INFORMED To Facilitate COVID-19 Testing and Vaccination
12,RADx-rad,phs002778.v1.p1,1U18TR003793-01,U18TR003793,TR003793,Shannon Stott,Rapid Acceleration of Diagnostics - Radical (RADx-rad): Microfluidic Isolation and Characterization of SARS-CoV-2 and Virus Related Exosomes
14,RADx-UP,phs002651.v1.p1,3U01DA040381-05S1,U01DA040381,DA040381,Marianna Baum,Rapid Acceleration of Diagnostics - Underserved Populations(RADx-UP): Community Engaged Research on COVID-19 Testing Among Underserved and/or Vulnerable Populations
17,RADx-DHT,phs002533.v1.p1,75N91020C00038,75N91020C00038,75N91020C00038,Praduman Jain,Rapid Acceleration of Diagnostics - Digital Health (RADX-DH): Digital Health Solutions for COVID-19: COVID Community Action and Research Engagement (COVID-CARE)
18,RADx-DHT,phs002537.v1.p1,75N91020C00034,75N91020C00034,75N91020C00034,Ernesto Ramirez,Rapid Acceleration of Diagnostics - Digital Health (RADx-DH): COVID-19 Experience Study (C19EX) Survey
