# clean-project-info
This notebook extracts and cleans RADx project, dbGaP accession, PI, and funding source from a file provided by dbGaP on Aug. 24, 2023.

In [1]:
import grant_query
import pandas as pd
# Lift pandas' display limits so wide/long frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Load the dbGaP export of registered RADx datasets.
registered_datasets_csv = "../data/RADx_Registered_Datasets 08242023-vow.csv"
projects = pd.read_csv(registered_datasets_csv, low_memory=False)

### Use only the relevant fields

In [3]:
# Copy the dbGaP source columns into snake_case working columns. The source
# columns are kept because a later cell still exports them under their
# original names.
source_to_working = {
    "Study Name": "long_study_name",
    "Accession": "dbgap_accession",
    "NIH funding": "nih_funding",
    "PI": "orig_principal_investigator",
    "Grant ID": "orig_project_number",
}
for source_col, working_col in source_to_working.items():
    projects[working_col] = projects[source_col]

### Clean up data

In [6]:
# extract subfields
# Raw strings keep the regex backslashes literal ("\(" and "\s" are not valid
# Python string escapes and trigger a warning in non-raw strings).
# orig_radx_project: text inside the first pair of parentheses of the study name.
projects["orig_radx_project"] = projects["long_study_name"].str.extract(r"\((.*?)\)", expand=True)
# study_title: everything after the first colon, skipping whitespace that follows it.
projects["study_title"] = projects["long_study_name"].str.extract(r":\s*(.*)", expand=True)

In [7]:
# remove secondary dbgap_accession duplicates
secondary_accessions = ["phs002574.v1.p1", "phs002516.v1.p1"]
is_secondary = projects["dbgap_accession"].isin(secondary_accessions)
# .copy() so later column assignments work on an independent frame.
projects = projects.loc[~is_secondary].copy()

In [8]:
# expand one-to-many relationship for project number
# A grant field may list several project numbers joined by "&": split them and
# give each project number its own row (the other columns are repeated).
projects = projects.assign(
    project_num=projects["orig_project_number"].str.split("&")
).explode("project_num")
projects["project_num"] = (
    projects["project_num"]
    .str.replace(" ", "", regex=False)  # drop embedded spaces
    .str.replace(r'^\W*', '', regex=True)  # strip leading non-word characters (e.g. non-printable whitespace)
)

In [9]:
# fix inconsistencies and typos
# (value as found, corrected value) pairs, applied in order as literal substring
# replacements. Order matters: "3-U-1HL152401-02S1" must be fixed before the
# shorter "U-1HL152401-02S1", which is a substring of it.
PROJECT_NUM_FIXES = [
    ("N75N91020C00040", "75N91020C00040"),
    ("75N91020C0034", "75N91020C00034"),
    ("4U54HL119145-08subawardno.124778", "4U54HL119145-08"),
    ("3-U-1HL152401-02S1", "3U01HL152401-02S1"),
    ("R01-HL151292-01S1", "R01HL151292-01S1"),
    ("R01-MD012767-04S1", "R01MD012767-04S1"),
    ("OTsHD108105-01", "OT2HD108105-01"),
    ("U-1HL152401-02S1", "3U01HL152401-02S1"),
    ("3P#0DA011041-23S1", "3P30DA011041-23S1"),
    ("903-9015Z", "3UG1DA050071-04S1"),
    ("3Ro1MD013852-20S3", "3R01MD013852-03S2"),
    ("RO1DC016112", "R01DC016112"),
    ("U01-MD017436", "U01MD017436"),
    ("MISSING", "5U01MD017437-02"),
    ("R61DH105594", "3R33HD105594-03S1"),
    ("1RO1MD016526-01", "1R01MD016526-01"),
    ("1RO1NR020105-01", "1R01NR020105"),
    ("Z01-ES100475", "1ZIAES103366-02"),  # updated grant number for D. Bell
]

fixed_project_num = projects["project_num"]
for found_value, corrected_value in PROJECT_NUM_FIXES:
    fixed_project_num = fixed_project_num.str.replace(found_value, corrected_value, regex=False)

# Remove a hyphen that directly follows a single leading digit
# (e.g. "3-U01..." -> "3U01..."); strings that do not start with a digit are unchanged.
projects["project_num"] = fixed_project_num.str.replace(r"^(\d)-", r"\1", regex=True)

In [10]:
# TODO: add the coordination centers (CDCCs); this cell currently only lists them
# 7U24LM013755-03 Lucila Ohno-Machado (RADx-Rad Discoveries & Data: Consortium Coordination Center Program Organization)
# 1U24MD016258-01 Michael Cohen-Wolkowiez (RADx-UP CDCC)

In [11]:
# Normalise each PI name with the project-local helper grant_query.standardize_full_name.
# NOTE(review): the helper is not shown here; judging by the notebook output it
# fixes casing (e.g. "JOHN FOXE" -> "John Foxe") — confirm its exact rules in grant_query.
projects["principal_investigator"] = projects["orig_principal_investigator"].apply(grant_query.standardize_full_name)

MacKensie


In [12]:
# Write the table as it stands at this point (index omitted). When the cells run
# top to bottom, columns added by later cells (core_project_num,
# project_serial_num, radx_project, project_num_typo) are not in this file.
projects.to_csv("../data/radx-all-projects.csv", index=False)

In [13]:
# Derive the core project number by stripping the leading digit (and hyphen) and
# any hyphen-separated suffix, e.g. "3U54MD007595-12S4" -> "U54MD007595".
# Numbers containing "75N9" start with digits that must be kept, so they are
# masked while leading digits are stripped and unmasked afterwards.
# Regex patterns are raw strings so "\d" is not treated as a (invalid) string escape.
core_project_num = projects["project_num"].str.replace("75N9", "special_75N9", regex=False)  # protect this project number from the next steps
core_project_num = core_project_num.str.replace(r"^\d+-", "", regex=True)  # remove leading digits and hyphen
core_project_num = core_project_num.str.replace(r"^\d?", "", regex=True)  # remove a single leading digit
core_project_num = core_project_num.str.replace("special_75N9", "75N9", regex=False)  # remove the protection
projects["core_project_num"] = core_project_num.str.extract(r"^([^-\n]+)", expand=True)  # remove hyphen-separated postfix

In [14]:
# The serial number is the last eight characters of the core project number
# (e.g. "U54MD007595" -> "MD007595").
projects["project_serial_num"] = projects["core_project_num"].str.slice(start=-8)

In [15]:
# fix radx_project
# Normalise the spelling variants of the program name.
# The DHT rule is one idempotent pattern: it maps "RADX-DH", "RADx-DH" and an
# already-correct "RADx-DHT" to "RADx-DHT". Replacing the literal "RADx-DH"
# separately would also match inside "RADx-DHT" and produce "RADx-DHTT".
projects["radx_project"] = (
    projects["orig_radx_project"]
    .str.replace(r"RAD[Xx]-DHT?", "RADx-DHT", regex=True)
    .str.replace(r"RADx$", "RADx-UP", regex=True)  # a bare trailing "RADx" becomes "RADx-UP"
    .str.replace("RADx - UP", "RADx-UP", regex=False)  # drop the spaces around the hyphen
)

In [16]:
# Flag rows whose cleaned project number differs from the value originally supplied.
projects["project_num_typo"] = projects["orig_project_number"].ne(projects["project_num"])

In [17]:
# Display the column index: the original dbGaP columns followed by the derived ones.
projects.columns

Index(['ID', 'Study Name', 'Accession', 'Admin. IC', 'Primary GPA',
       'NIH funding', 'External Data Source', 'External Data Source Name',
       'PI', 'Grant ID', 'DAC', 'Study level access',
       'Submission certification', 'Association Analysis', 'Documents',
       'Images', 'Links to public NCBI databases', 'Molecular (not NGC)',
       'Other study date type', 'Phenotype', 'Sequence',
       'Subjects and Samples', 'SRA', 'SRA to NCBI', 'SRA to Google Cloud',
       'SRA to Amazon Cloud', 'Target data delivery date',
       'Target public release date', 'Study participants',
       'Date of initial contact', 'Preview link', 'Comments',
       'Research statement', 'Public summary', 'DAR appendices',
       'Publication embargo interval', 'Period of release',
       'Years until renewal', 'Weeks to cancel request', 'Missing Items',
       'Missing SRA accession', 'Registration date', 'GSR access',
       'Universal DUC', 'Pheno Curator', 'Geno Curator', 'long_study_name',
  

In [18]:
# Export the original dbGaP fields side by side with their cleaned counterparts.
update_columns = [
    "ID",
    "Study Name",
    "orig_radx_project",
    "radx_project",
    "Accession",
    "PI",
    "principal_investigator",
    "Grant ID",
    "project_num",
    "project_num_typo",
    "core_project_num",
]
projects[update_columns].to_csv("../data/RADx_Registered_Datasets_updates_08242023.csv", index=False)

In [20]:
# Preview the first rows only: a bare `projects` renders the whole table when
# the display.max_rows limit has been removed.
projects.head(10)

Unnamed: 0,ID,Study Name,Accession,Admin. IC,Primary GPA,NIH funding,External Data Source,External Data Source Name,PI,Grant ID,DAC,Study level access,Submission certification,Association Analysis,Documents,Images,Links to public NCBI databases,Molecular (not NGC),Other study date type,Phenotype,Sequence,Subjects and Samples,SRA,SRA to NCBI,SRA to Google Cloud,SRA to Amazon Cloud,Target data delivery date,Target public release date,Study participants,Date of initial contact,Preview link,Comments,Research statement,Public summary,DAR appendices,Publication embargo interval,Period of release,Years until renewal,Weeks to cancel request,Missing Items,Missing SRA accession,Registration date,GSR access,Universal DUC,Pheno Curator,Geno Curator,long_study_name,dbgap_accession,nih_funding,orig_principal_investigator,orig_project_number,orig_radx_project,study_title,project_num,principal_investigator,core_project_num,project_serial_num,radx_project,project_num_typo
0,45548,Rapid Acceleration of Diagnostics - Radical (R...,phs002744.v1.p1,OD,Vivian Ota Wang,"OD, NIAAA",Yes,RADx Data Hub,Khalid Salaita,U01AA029345,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,12/1/21,,0.0,,,,Display,Display,,none,,1,8,,No,12/16/21,GSR does not apply (ONLY for studies that will...,No,Lee Kapp,Zhen Wang,Rapid Acceleration of Diagnostics - Radical (R...,phs002744.v1.p1,"OD, NIAAA",Khalid Salaita,U01AA029345,RADx-rad,Rolosense: An Innovative Platform for Automati...,U01AA029345,Khalid Salaita,U01AA029345,AA029345,RADx-rad,False
1,45882,Rapid Acceleration of Diagnostics - Underserve...,phs002775.v1.p1,OD,Vivian Ota Wang,"OD, NIGMS",Yes,RADx Data Hub,Sharon Rounds,U54GM115677,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,12/1/21,,100000.0,,,,Display,Display,,none,,1,8,,No,1/19/22,GSR does not apply (ONLY for studies that will...,No,Wenyu Wu,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002775.v1.p1,"OD, NIGMS",Sharon Rounds,U54GM115677,RADx-UP,Developing a Realtime Monitoring System and Pr...,U54GM115677,Sharon Rounds,U54GM115677,GM115677,RADx-UP,False
2,45790,Rapid Acceleration of Diagnostics - Underserve...,phs002761.v1.p1,OD,Vivian Ota Wang,"OD, NCATS",Yes,RADx Data Hub,David McPherson,3UL1TR003167 - 02S1 & 3UL1TR003167 - 03S3,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,8/31/21,,980.0,,,,Display,Display,,none,,1,8,,No,1/10/22,GSR does not apply (ONLY for studies that will...,No,Wenyu Wu,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002761.v1.p1,"OD, NCATS",David McPherson,3UL1TR003167 - 02S1 & 3UL1TR003167 - 03S3,RADx-UP,Addressing COVID-19 Testing Disparities in Vul...,3UL1TR003167-02S1,David McPherson,UL1TR003167,TR003167,RADx-UP,True
2,45790,Rapid Acceleration of Diagnostics - Underserve...,phs002761.v1.p1,OD,Vivian Ota Wang,"OD, NCATS",Yes,RADx Data Hub,David McPherson,3UL1TR003167 - 02S1 & 3UL1TR003167 - 03S3,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,8/31/21,,980.0,,,,Display,Display,,none,,1,8,,No,1/10/22,GSR does not apply (ONLY for studies that will...,No,Wenyu Wu,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002761.v1.p1,"OD, NCATS",David McPherson,3UL1TR003167 - 02S1 & 3UL1TR003167 - 03S3,RADx-UP,Addressing COVID-19 Testing Disparities in Vul...,3UL1TR003167-03S3,David McPherson,UL1TR003167,TR003167,RADx-UP,True
3,45776,Rapid Acceleration of Diagnostics - Underserve...,phs002759.v1.p1,OD,Vivian Ota Wang,"OD, NICHD",Yes,RADx Data Hub,Jason Newland,1T OT2 HD107557,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,1/15/22,,6200.0,,,,Display,Display,,none,,1,8,,No,1/10/22,GSR does not apply (ONLY for studies that will...,No,Wenyu Wu,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002759.v1.p1,"OD, NICHD",Jason Newland,1T OT2 HD107557,RADx-UP,Assessing Testing Strategies for Safe Return t...,1TOT2HD107557,Jason Newland,TOT2HD107557,HD107557,RADx-UP,True
4,46856,Rapid Acceleration of Diagnostics - Underserve...,phs002875.v1.p1,OD,Vivian Ota Wang,"OD, NIMHD",Yes,RADx Data Hub,Sara Al-Dahir,3U54MD007595-12S4,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,8/17/21,,487.0,,,,Display,Display,,none,,1,8,,No,4/6/22,GSR does not apply (ONLY for studies that will...,No,Lee Kapp,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002875.v1.p1,"OD, NIMHD",Sara Al-Dahir,3U54MD007595-12S4,RADx-UP,Assessing Vaccine Hesitancy and a Pharmacist L...,3U54MD007595-12S4,Sara Al-Dahir,U54MD007595,MD007595,RADx-UP,False
5,46051,Rapid Acceleration of Diagnostics - Underserve...,phs002800.v1.p1,OD,Vivian Ota Wang,"OD, NICHD",Yes,RADx Data Hub,JOHN FOXE,1 OT2 HD107553-01,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,10/22/21,,450.0,,,,Display,Display,,none,,1,8,,No,2/2/22,GSR does not apply (ONLY for studies that will...,No,Wenyu Wu,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002800.v1.p1,"OD, NICHD",JOHN FOXE,1 OT2 HD107553-01,RADx-UP,COV-IDD: Testing for COVID-19 in High Risk Chi...,1OT2HD107553-01,John Foxe,OT2HD107553,HD107553,RADx-UP,True
6,45783,Rapid Acceleration of Diagnostics - Underserve...,phs002760.v1.p1,OD,Vivian Ota Wang,OD,Yes,RADx Data Hub,ROBERT GROSS,R01-HL151292-01S1,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,11/1/21,,1000.0,,,,Display,Display,,none,,1,8,,No,1/10/22,GSR does not apply (ONLY for studies that will...,No,Wenyu Wu,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002760.v1.p1,OD,ROBERT GROSS,R01-HL151292-01S1,RADx-UP,COVID Self-Testing Through Rapid Network Distr...,R01HL151292-01S1,Robert Gross,R01HL151292,HL151292,RADx-UP,True
7,46812,Rapid Acceleration of Diagnostics - Underserve...,phs002869.v1.p1,OD,Vivian Ota Wang,"OD, NCI",Yes,RADx Data Hub,Martinez Maria,3UH3CA233314-02S1,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,11/1/21,,6000.0,,,,Display,Display,,none,,1,8,,No,4/6/22,GSR does not apply (ONLY for studies that will...,No,Wenyu Wu,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002869.v1.p1,"OD, NCI",Martinez Maria,3UH3CA233314-02S1,RADx-UP,COVID-19 Testing in Underserved and Vulnerable...,3UH3CA233314-02S1,Martinez Maria,UH3CA233314,CA233314,RADx-UP,False
8,46924,Rapid Acceleration of Diagnostics - Underserve...,phs002878.v1.p1,OD,Vivian Ota Wang,"OD, NICHD",Yes,RADx Data Hub,SUSAN KIENE,1OT2HD108112-01,RADx;,Controlled Access,Submission Certification verified by IC,,,,,,,,,,No,,,,1/18/22,,9000.0,,,,Display,Display,,none,,1,8,,No,4/11/22,GSR does not apply (ONLY for studies that will...,No,Lee Kapp,Zhen Wang,Rapid Acceleration of Diagnostics - Underserve...,phs002878.v1.p1,"OD, NICHD",SUSAN KIENE,1OT2HD108112-01,RADx-UP,Communities Fighting COVID!: Returning Our Kid...,1OT2HD108112-01,Susan Kiene,OT2HD108112,HD108112,RADx-UP,False


In [21]:
# Order rows by RADx program, then by dbGaP accession.
projects = projects.sort_values(by=["radx_project", "dbgap_accession"])

In [22]:
# Keep only the cleaned core fields.
core_columns = [
    "radx_project",
    "dbgap_accession",
    "project_num",
    "core_project_num",
    "project_serial_num",
    "principal_investigator",
    "study_title",
]
project_core_data = projects[core_columns]

In [23]:
# Spot-check the rows that belong to RADx-rad.
project_core_data.loc[lambda frame: frame["radx_project"] == "RADx-rad"]

Unnamed: 0,radx_project,dbgap_accession,project_num,core_project_num,project_serial_num,principal_investigator,study_title
71,RADx-rad,PENDING TRIBAL DATA TRANSFER AND USE AGREEMENT...,1U01DA053976-01,U01DA053976,DA053976,Otakuye Conroy-Ben,Wastewater Analysis of SARS-CoV-2 in Tribal Co...
57,RADx-rad,phs002522.v1.p1,U01HL152410,U01HL152410,HL152410,Jie Huang,MOF-SCENT: Metal-organic Frameworks for Screen...
60,RADx-rad,phs002523.v1.p1,1R01NR020105,R01NR020105,NR020105,Michael Snyder,Multi-Modal Wireless COVID Monitoring & Infect...
69,RADx-rad,phs002524.v1.p1,R01DE031114,R01DE031114,DE031114,Jesse Jokerst,Validation of Smart Masks for Surveillance of ...
66,RADx-rad,phs002525.v1.p1,1U01DA053941-01,U01DA053941,DA053941,Helena Solo-Gabriele,SF-RAD: Development and Proof-of-Concept Imple...
45,RADx-rad,phs002527.v1.p1,U01LM013129,U01LM013129,LM013129,Matthew Scotch,Bioinformatics Framework for Wastewater-based ...
72,RADx-rad,phs002542.v1.p1,U01DA053903,U01DA053903,DA053903,James Keck,Wastewater Assessment for Coronavirus in Kentu...
59,RADx-rad,phs002543.v1.p1,1U01AA029324-01,U01AA029324,AA029324,Radislav Potyrailo,Minimal False-Alarm Touch-Based Detection of S...
43,RADx-rad,phs002544.v1.p1,U18TR003778,U18TR003778,TR003778,David Wong,AFS/SERS Saliva-based SARS-CoV-2 Earliest Infe...
51,RADx-rad,phs002546.v1.p1,1U01AA029328,U01AA029328,AA029328,Diana Vanegas,Development of an Automated Diagnostic Platfor...


In [24]:
# Write the core project table to CSV without the index column.
core_data_csv = "../data/RADx_Registered_Datasets_08242023.csv"
project_core_data.to_csv(core_data_csv, index=False)