# get_core_project_data
This notebook retrieves up-to-date and manually cleaned up RADx core data obtained from the dbGaP database dump received on 2023-08-24.
All other notebooks in this repository depend on the data file generated by this notebook: "../data/radx-projects.csv"

In [1]:
import os
import grant_query
import publication_query
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
DERIVED_DATA_PATH = "../derived_data"

In [3]:
projects = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/1ZYUmUJEED3X6Mdo_sTMzOtIRW8IjsqWkAeLBKWdb31k/export?format=csv",
        keep_default_na=False,
       )

In [4]:
projects["project_serial_num"] = projects["core_project_num"].apply(lambda x: grant_query.extract_project_serial_num(x))

## Rename, select, and reorder columns

In [5]:
projects_map = {"updated_radx_project": "researchInitiative", "Accession updated": "dbgapAccession",
                "Grant ID updated": "projectNum", "core_project_num": "coreProjectNum", 
                "project_serial_num": "projectSerialNum", "PI updated": "studyInvestigator", "Study Name": "studyTitle", "sub_project": "subProject"}
projects = publication_query.rename_and_reorder_columns(projects, projects_map)

In [6]:
projects.head()

Unnamed: 0,researchInitiative,dbgapAccession,projectNum,coreProjectNum,projectSerialNum,studyInvestigator,studyTitle,subProject
0,RADx-rad,phs002744.v1.p1,U01AA029345,U01AA029345,AA029345,Khalid Salaita,Rapid Acceleration of Diagnostics - Radical (R...,Automatic Detection & Tracing
1,RADx-UP,phs002775.v1.p1,U54GM115677,U54GM115677,GM115677,Sharon Rounds,Rapid Acceleration of Diagnostics - Underserve...,
2,RADx-UP,phs002761.v1.p1,3UL1TR003167-02S1,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserve...,
3,RADx-UP,phs002761.v1.p1,3UL1TR003167-03S3,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserve...,
4,RADx-UP,phs002759.v1.p1,1OT2HD107557,OT2HD107557,HD107557,Jason Newland,Rapid Acceleration of Diagnostics - Underserve...,


In [7]:
# remove dbgap_accession duplicates
projects.query("dbgapAccession != 'phs002574.v1.p1'", inplace=True)
projects.query("dbgapAccession != 'phs002516.v1.p1'", inplace=True)

In [8]:
projects.to_csv(os.path.join(DERIVED_DATA_PATH, "radx-projects.csv"), index=False)

In [9]:
print(projects.shape[0])
projects.head()

182


Unnamed: 0,researchInitiative,dbgapAccession,projectNum,coreProjectNum,projectSerialNum,studyInvestigator,studyTitle,subProject
0,RADx-rad,phs002744.v1.p1,U01AA029345,U01AA029345,AA029345,Khalid Salaita,Rapid Acceleration of Diagnostics - Radical (R...,Automatic Detection & Tracing
1,RADx-UP,phs002775.v1.p1,U54GM115677,U54GM115677,GM115677,Sharon Rounds,Rapid Acceleration of Diagnostics - Underserve...,
2,RADx-UP,phs002761.v1.p1,3UL1TR003167-02S1,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserve...,
3,RADx-UP,phs002761.v1.p1,3UL1TR003167-03S3,UL1TR003167,TR003167,David McPherson,Rapid Acceleration of Diagnostics - Underserve...,
4,RADx-UP,phs002759.v1.p1,1OT2HD107557,OT2HD107557,HD107557,Jason Newland,Rapid Acceleration of Diagnostics - Underserve...,


In [10]:
radx_rad = projects.query("researchInitiative == 'RADx-rad'").copy()
radx_rad["PHS Number"] = radx_rad["dbgapAccession"].str.removesuffix(".v1.p1")
radx_rad = radx_rad[["PHS Number", "studyInvestigator", "projectNum"]].copy()
print(radx_rad.shape[0])
radx_rad

50


Unnamed: 0,PHS Number,studyInvestigator,projectNum
0,phs002744,Khalid Salaita,U01AA029345
12,phs002778,Shannon Stott,1U18TR003793-01
27,phs002604,Anne-Catrin Uhlemann,1U01DA053949-01
28,phs002583,Edward P DeMauro,5U01HL150852-02
29,phs002603,Audrey Odom-John,3R33HD105594-03S1
30,phs002642,Susan Travers,R01DC016112
31,phs002702,Jeffrey Ly,1R44DE030842-01
32,phs002685,Xiaohu Yao,1R44DE030852-01
33,phs003124,Samarjit Das,1U18TR003780-01
34,phs002729,Rachel Noble,1U01DA053899-01


In [11]:
# RADx Data Submission Project Tracker - RADx-rad sheet
doc_id = "1zBXU_bSPZBJuUFyzmmPgGTqSR0Xdk3IrQOGZ8tv5k-M"
grid_id = "852835732"
df = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&gid={grid_id}", skiprows=1, keep_default_na=False)
df = df[["PHS Number", "Principal Investigator (PI) Name", "Internal Program Project Number"]].copy()
print(df.shape[0])
df

49


Unnamed: 0,PHS Number,Principal Investigator (PI) Name,Internal Program Project Number
0,phs002642,"TRAVERS, SUSAN P",3R01DC016112-04S1
1,phs002527,"SCOTCH, MATTHEW",3R01LM013129-02W1
2,phs002700,"CIRRITO, JOHN R",1U01AA029331-01
3,phs002573,"PENG, LU",1U01AA029348-01
4,phs002744,"SALAITA, KHALID S",1U01AA029345-01
5,phs002543,"POTYRAILO, RADISLAV A",1U01AA029324-01
6,phs002565,"PUN, SUZIE H.",1U01AA029316-01
7,phs002546,"VANEGAS, DIANA",1U01AA029328-01
8,phs002525,"SOLO-GABRIELE, HELENA",1U01DA053941-01
9,,"CONROY-BEN, OTAKUYE",1U01DA053976-01


In [12]:
df_merge = df.merge(radx_rad, on="PHS Number", how="outer")
df_merge.fillna("", inplace=True)
print(df_merge.shape[0])
df_merge

50


Unnamed: 0,PHS Number,Principal Investigator (PI) Name,Internal Program Project Number,studyInvestigator,projectNum
0,phs002642,"TRAVERS, SUSAN P",3R01DC016112-04S1,Susan Travers,R01DC016112
1,phs002527,"SCOTCH, MATTHEW",3R01LM013129-02W1,Matthew Scotch,U01LM013129
2,phs002700,"CIRRITO, JOHN R",1U01AA029331-01,John Cirrito,U01AA029331
3,phs002573,"PENG, LU",1U01AA029348-01,Lu Peng,1U01AA029348-01
4,phs002744,"SALAITA, KHALID S",1U01AA029345-01,Khalid Salaita,U01AA029345
5,phs002543,"POTYRAILO, RADISLAV A",1U01AA029324-01,Radislav Potyrailo,1U01AA029324-01
6,phs002565,"PUN, SUZIE H.",1U01AA029316-01,Suzie Pun,1U01AA029316
7,phs002546,"VANEGAS, DIANA",1U01AA029328-01,Diana Vanegas,1U01AA029328
8,phs002525,"SOLO-GABRIELE, HELENA",1U01DA053941-01,Helena Solo-Gabriele,1U01DA053941-01
9,,"CONROY-BEN, OTAKUYE",1U01DA053976-01,Lucila Ohno-Machado,1U24LM013755-01
