# get_core_project_data
This notebook retrieves up-to-date and manually cleaned up RADx core data obtained from the dbGaP database dump received on 2023-08-24.
All other notebooks in this repository depend on the data file generated by this notebook: "../derived_data/radx-projects.csv"

In [1]:
import os
import grant_query
import publication_query
import pandas as pd
# Notebook display settings: lift pandas' truncation limits so that displayed
# DataFrames show all of their columns and rows.
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", None,
)

In [2]:
# Directory that receives the files written by this notebook. It is created
# when missing; an already existing directory is left as it is.
DERIVED_DATA_PATH = "../derived_data"
os.makedirs(name=DERIVED_DATA_PATH, exist_ok=True)

In [3]:
# CSV export endpoint of the Google Sheet that holds the project table.
projects_sheet_url = "https://docs.google.com/spreadsheets/d/1ZYUmUJEED3X6Mdo_sTMzOtIRW8IjsqWkAeLBKWdb31k/export?format=csv"

# keep_default_na=False stops pandas from turning its default NA markers
# (such as empty cells or "NA") into NaN while parsing.
projects = pd.read_csv(projects_sheet_url, keep_default_na=False)

In [4]:
# Derive the serial-number column by passing every core project number
# through grant_query.extract_project_serial_num.
projects["project_serial_num"] = projects["core_project_num"].apply(
    grant_query.extract_project_serial_num
)

## Rename, select, and reorder columns

In [5]:
# Source column name -> output column name. The mapping is handed to
# publication_query.rename_and_reorder_columns; entry order is kept exactly as
# before because that helper presumably uses it for column order (verify in
# publication_query).
projects_map = {
    "updated_radx_project": "researchInitiative",
    "Accession updated": "dbgapAccession",
    "Grant ID updated": "projectNum",
    "core_project_num": "coreProjectNum",
    "project_serial_num": "projectSerialNum",
    "PI updated": "studyInvestigator",
    "Study Name": "studyTitle",
    "sub_project": "subProject",
}
projects = publication_query.rename_and_reorder_columns(projects, projects_map)

In [6]:
# Preview the first five rows of the renamed table.
projects.head(n=5)

Unnamed: 0,researchInitiative,dbgapAccession,projectNum,coreProjectNum,projectSerialNum,studyInvestigator,studyTitle,subProject
0,RADx-rad,phs002744.v1.p1,U01AA029345,U01AA029345,AA029345,Khalid Salaita,Rapid Acceleration of Diagnostics - Radical (R...,Automatic Detection & Tracing
1,RADx-UP,phs002775.v1.p1,U54GM115677,U54GM115677,GM115677,Sharon Rounds,Rapid Acceleration of Diagnostics - Underserve...,
2,RADx-UP,phs002761.v1.p1,3UL1TR003167-02S1,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserve...,
3,RADx-UP,phs002761.v1.p1,3UL1TR003167-03S3,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserve...,
4,RADx-UP,phs002759.v1.p1,1OT2HD107557,OT2HD107557,HD107557,Jason Newland,Rapid Acceleration of Diagnostics - Underserve...,


In [7]:
# Remove the rows of the two dbGaP accessions that are treated as duplicates.
# Each filter keeps only the rows that satisfy it and is applied to `projects`
# in place.
for accession_filter in (
    "dbgapAccession != 'phs002574.v1.p1'",
    "dbgapAccession != 'phs002516.v1.p1'",
):
    projects.query(accession_filter, inplace=True)

In [8]:
# Write the filtered project table to the derived-data directory, leaving the
# DataFrame index out of the file.
projects_csv_path = os.path.join(DERIVED_DATA_PATH, "radx-projects.csv")
projects.to_csv(projects_csv_path, index=False)

In [9]:
# Number of project rows left after filtering, followed by a five-row preview.
print(len(projects))
projects.head(n=5)

182


Unnamed: 0,researchInitiative,dbgapAccession,projectNum,coreProjectNum,projectSerialNum,studyInvestigator,studyTitle,subProject
0,RADx-rad,phs002744.v1.p1,U01AA029345,U01AA029345,AA029345,Khalid Salaita,Rapid Acceleration of Diagnostics - Radical (R...,Automatic Detection & Tracing
1,RADx-UP,phs002775.v1.p1,U54GM115677,U54GM115677,GM115677,Sharon Rounds,Rapid Acceleration of Diagnostics - Underserve...,
2,RADx-UP,phs002761.v1.p1,3UL1TR003167-02S1,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserve...,
3,RADx-UP,phs002761.v1.p1,3UL1TR003167-03S3,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserve...,
4,RADx-UP,phs002759.v1.p1,1OT2HD107557,OT2HD107557,HD107557,Jason Newland,Rapid Acceleration of Diagnostics - Underserve...,


In [10]:
# RADx-rad subset reduced to the three columns compared against the tracker.
# "PHS Number" is the dbGaP accession with a trailing ".v1.p1" removed;
# accessions that do not end in that suffix are left unchanged.
radx_rad = (
    projects.loc[projects["researchInitiative"] == "RADx-rad"]
    .assign(
        **{
            "PHS Number": lambda frame: frame["dbgapAccession"].str.removesuffix(".v1.p1")
        }
    )
    .loc[:, ["PHS Number", "studyInvestigator", "projectNum"]]
    .copy()
)
print(len(radx_rad))
radx_rad

50


Unnamed: 0,PHS Number,studyInvestigator,projectNum
0,phs002744,Khalid Salaita,U01AA029345
12,phs002778,Shannon Stott,1U18TR003793-01
27,phs002604,Anne-Catrin Uhlemann,1U01DA053949-01
28,phs002583,Edward P DeMauro,5U01HL150852-02
29,phs002603,Audrey Odom-John,3R33HD105594-03S1
30,phs002642,Susan Travers,R01DC016112
31,phs002702,Jeffrey Ly,1R44DE030842-01
32,phs002685,Xiaohu Yao,1R44DE030852-01
33,phs003124,Samarjit Das,1U18TR003780-01
34,phs002729,Rachel Noble,1U01DA053899-01


In [14]:
# Load the RADx-rad funding numbers / PI workbook into `df`.
#
# Provenance: earlier revisions of this cell read Google Sheets CSV exports
# (https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&gid={grid_id}):
#   - RADx Data Submission Project Tracker - RADx-rad sheet:
#       doc_id = "1zBXU_bSPZBJuUFyzmmPgGTqSR0Xdk3IrQOGZ8tv5k-M", grid_id = "852835732"
#   - 2024-03-04:
#       doc_id = "1vgCyfi-9Pvlm5-yJdILxws6b1FRtnegO", grid_id = "1628495610"
# The current input is a local Excel workbook dated 2024-03-04.
#
# The workbook is not part of the repository, so its location is configurable:
# set the RADX_RAD_TRACKER_XLSX environment variable to point at the file.
# When the variable is unset, the original author's local path is used, which
# keeps the previous behavior on that machine.
RADX_RAD_TRACKER_XLSX = os.environ.get(
    "RADX_RAD_TRACKER_XLSX",
    "/Users/Peter/work/RADx-rad/RADx-rad Funding Numbers and PIs_2024-03-04_pr.xlsx",
)

try:
    df = pd.read_excel(RADX_RAD_TRACKER_XLSX)
except FileNotFoundError as err:
    # Same exception type as before, but with a message that says how to fix it.
    raise FileNotFoundError(
        f"RADx-rad workbook not found at {RADX_RAD_TRACKER_XLSX!r}; "
        "set the RADX_RAD_TRACKER_XLSX environment variable to its location."
    ) from err

print(df.shape[0])
df.head()

48


Unnamed: 0,DCC,phs,NIH RePORTER Grant Number,NIH RePORTER Contact PI/Project Leader,Data_Owner,Co-Investigators (from RePORTER),Additional PI from RADx-rad Website,Notes,Peter's Notes
0,RADx-rad,phs002522,1-U01HL152410-01,"FAY, WILLIAM P","Huang, Jie","GRANT, SHEILA ANN; TURPIN, WILLIAM MONROE",,,
1,RADx-rad,phs002523,1R01NR020105-01,"SNYDER, MICHAEL P.","SNYDER, MICHAEL P",,,,
2,RADx-rad,phs002524,1R01DE031114-01,"JOKERST, JESSE VINCENT","JOKERST, JESSE VINCENT",,,,
3,RADx-rad,phs002525,1U01DA053941-01,"SOLO-GABRIELE, HELENA; MASON, CHRISTOPHER EDWA...","SOLO-GABRIELE, HELENA","MASON, CHRISTOPHER EDWARD; SCHURER STEPHAN C","Vidovic, Dusica",,
4,RADx-rad,phs002527,3U01LM013129-02S1,"SCOTCH, MATTHEW;\nHALDEN, ROLF U;\nVARSANI, AR...","SCOTCH, MATTHEW","HALDEN, ROLF U;\nVARSANI, ARVIND",,different supplement year on website (S2 versu...,


In [15]:
# Outer join of the tracker rows (key "phs") with the RADx-rad project rows
# (key "PHS Number"): rows without a partner on the other side are kept, and
# the resulting gaps are filled with empty strings.
df_merge = df.merge(
    radx_rad,
    how="outer",
    left_on="phs",
    right_on="PHS Number",
).fillna("")
print(len(df_merge))
df_merge

50


Unnamed: 0,DCC,phs,NIH RePORTER Grant Number,NIH RePORTER Contact PI/Project Leader,Data_Owner,Co-Investigators (from RePORTER),Additional PI from RADx-rad Website,Notes,Peter's Notes,PHS Number,studyInvestigator,projectNum
0,,,,,,,,,,,Lucila Ohno-Machado,1U24LM013755-01
1,,,,,,,,,,PENDING TRIBAL DATA TRANSFER AND USE AGREEMENT...,Otakuye Conroy-Ben,1U01DA053976-01
2,RADx-rad,phs002522,1-U01HL152410-01,"FAY, WILLIAM P","Huang, Jie","GRANT, SHEILA ANN; TURPIN, WILLIAM MONROE",,,,phs002522,Jie Huang,U01HL152410
3,RADx-rad,phs002523,1R01NR020105-01,"SNYDER, MICHAEL P.","SNYDER, MICHAEL P",,,,,phs002523,Michael Snyder,1R01NR020105
4,RADx-rad,phs002524,1R01DE031114-01,"JOKERST, JESSE VINCENT","JOKERST, JESSE VINCENT",,,,,phs002524,Jesse Jokerst,R01DE031114
5,RADx-rad,phs002525,1U01DA053941-01,"SOLO-GABRIELE, HELENA; MASON, CHRISTOPHER EDWA...","SOLO-GABRIELE, HELENA","MASON, CHRISTOPHER EDWARD; SCHURER STEPHAN C","Vidovic, Dusica",,,phs002525,Helena Solo-Gabriele,1U01DA053941-01
6,RADx-rad,phs002527,3U01LM013129-02S1,"SCOTCH, MATTHEW;\nHALDEN, ROLF U;\nVARSANI, AR...","SCOTCH, MATTHEW","HALDEN, ROLF U;\nVARSANI, ARVIND",,different supplement year on website (S2 versu...,,phs002527,Matthew Scotch,U01LM013129
7,RADx-rad,phs002542,1U01DA053903-01,"BERRY, SCOTT M","Keck, James","BERRY, SCOTT M",,,,phs002542,James W Keck,U01DA053903
8,RADx-rad,phs002543,1U01AA029324-01,"POTYRAILO, RADISLAV A","POTYRAILO, RADISLAV A",,,,,phs002543,Radislav Potyrailo,1U01AA029324-01
9,RADx-rad,phs002544,1U18TR003778-01,"WONG, DAVID T; HUANG, TONY JUN; KIM YONG; XIE,...","WONG, DAVID T","HUANG, TONY JUN; KIM YONG; XIE, YA-HONG",,,,phs002544,David Wong,U18TR003778
