# get_dataset_info
This notebook creates dataset-related node and relationship files for the RADx-KG:
* Dataset.csv
* Researcher-CREATED-Dataset.csv

In [1]:
import os
import pandas as pd
import utils
import publication_query

# Lift pandas' display limits so the frames shown below are rendered in full
# (every column, every row, untruncated cell text).
for display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [2]:
# Root of the knowledge-graph data; this notebook reads and writes files under
# its "nodes" and "relationships" subdirectories.
KG_PATH = "../kg/data"
# Directory holding the derived input table (radx-projects.csv).
DERIVED_DATA_PATH = "../derived_data"

In [3]:
# NOTE(review): this constant is not referenced by any other cell visible in
# this notebook - confirm it is still needed.
START_FISCAL_YEAR = 2021 # starts Oct. 1, 2020

In [4]:
# Load the derived project table with every column as a string (blank cells are
# kept as "" instead of NaN), then narrow it in two steps:
#   1. keep only rows of the RADx-rad research initiative;
#   2. keep only rows with a dbGaP accession, i.e. a value starting with "phs".
projects = (
    pd.read_csv(
        os.path.join(DERIVED_DATA_PATH, "radx-projects.csv"),
        dtype=str,
        keep_default_na=False,
    )
    .loc[lambda frame: frame["researchInitiative"] == "RADx-rad"]
    .loc[lambda frame: frame["dbgapAccession"].str.startswith("phs")]
    .copy()
)

In [5]:
# Add project URL
projects["url"] = "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=" + projects["dbgapAccession"]
# Add CURIE for dbgap identifiers
projects["dbgapAccession"] = "dbgap:" + projects["dbgapAccession"]
# Remove prefix(Rapid Acceleration of Diagnostics - Radical (RADx-rad):) from study title
projects["studyTitle"] = projects["studyTitle"].apply(lambda x: x.split(":", maxsplit=1)[1])

In [6]:
# Report how many RADx-rad projects with a dbGaP accession remain, then display them.
print(f"Number of datasets: {projects.shape[0]}")
projects

Number of datasets: 48


Unnamed: 0,researchInitiative,dbgapAccession,projectNum,coreProjectNum,projectSerialNum,studyInvestigator,studyTitle,subProject,url
0,RADx-rad,dbgap:phs002744.v1.p1,U01AA029345,U01AA029345,AA029345,Khalid Salaita,Rolosense: An Innovative Platform for Automatic Mobile Phone Readout of Active SARS-CoV-2,Automatic Detection & Tracing,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002744.v1.p1
12,RADx-rad,dbgap:phs002778.v1.p1,1U18TR003793-01,U18TR003793,TR003793,Shannon Stott,Microfluidic Isolation and Characterization of SARS-CoV-2 and Virus Related Exosomes,Exosome,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002778.v1.p1
26,RADx-rad,dbgap:phs002604.v1.p1,1U01DA053949-01,U01DA053949,DA053949,Anne-Catrin Uhlemann,Tracking the COVID-19 Epidemic in Sewage (TRACES),Wastewater,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002604.v1.p1
27,RADx-rad,dbgap:phs002583.v1.p1,5U01HL150852-02,U01HL150852,HL150852,Edward P DeMauro,A Rapid Breathalyzer Diagnostics Platform for COVID-19,Novel Biosensing and VOC,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002583.v1.p1
28,RADx-rad,dbgap:phs002603.v1.p1,3R33HD105594-03S1,R33HD105594,HD105594,Audrey Odom-John,Diagnosis of MIS-C in Febrile Children,PreVAIL kIds,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002603.v1.p1
29,RADx-rad,dbgap:phs002642.v1.p1,R01DC016112,R01DC016112,DC016112,Susan Travers,A Confectionary-based Screening Tool for Assessing Chemosensory Loss in COVID-19 Patients,Chemosensory Testing,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002642.v1.p1
30,RADx-rad,dbgap:phs002702.v1.p1,1R44DE030842-01,R44DE030842,DE030842,Jeffrey Ly,A Scalable Aptamer-based Electrochemical Biosensor For Rapid Detection of SARS-CoV-2 From Saliva,Novel Biosensing and VOC,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002702.v1.p1
31,RADx-rad,dbgap:phs002685.v1.p1,1R44DE030852-01,R44DE030852,DE030852,Xiaohu Yao,DNA Star SAS-CoV-2 Rapid Test,Novel Biosensing and VOC,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002685.v1.p1
32,RADx-rad,dbgap:phs003124.v1.p1,1U18TR003780-01,U18TR003780,TR003780,Samarjit Das,Exosome-based Non-traditional Technologies Towards Multi-Parametric and Integrated Approaches for SARS-CoV-2,Exosome,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs003124.v1.p1
33,RADx-rad,dbgap:phs002729.v1.p1,1U01DA053899-01,U01DA053899,DA053899,Rachel Noble,"Improved Scalability, Sensitivity, and Interpretability of Pathogen Detection, Including SARS-CoV-2, in Wastewater using High-Throughput, Highly Multiplexed Digital Array PCR Technology",Wastewater,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002729.v1.p1


### Create Dataset Nodes

In [7]:
# Mapping from columns of `projects` to the column names used in the Dataset
# node file (source column -> node column).
dataset_map = {"dbgapAccession": "id", "studyTitle": "name", "url": "url"}

In [8]:
# Build the Dataset node table: take the mapped columns, rename them to the
# node column names, and remove rows with missing values and exact duplicates.
datasets = (
    projects[list(dataset_map)]
    .rename(columns=dataset_map)
    .dropna()
    .drop_duplicates()
)

In [9]:
# Write the Dataset node file under KG_PATH/nodes, without the DataFrame index.
datasets.to_csv(os.path.join(KG_PATH, "nodes", "Dataset.csv"), index=False)

In [10]:
# Report the number of Dataset nodes written, then display them.
print(f"Number of datasets: {datasets.shape[0]}")
datasets

Number of datasets: 48


Unnamed: 0,id,name,url
0,dbgap:phs002744.v1.p1,Rolosense: An Innovative Platform for Automatic Mobile Phone Readout of Active SARS-CoV-2,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002744.v1.p1
12,dbgap:phs002778.v1.p1,Microfluidic Isolation and Characterization of SARS-CoV-2 and Virus Related Exosomes,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002778.v1.p1
26,dbgap:phs002604.v1.p1,Tracking the COVID-19 Epidemic in Sewage (TRACES),https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002604.v1.p1
27,dbgap:phs002583.v1.p1,A Rapid Breathalyzer Diagnostics Platform for COVID-19,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002583.v1.p1
28,dbgap:phs002603.v1.p1,Diagnosis of MIS-C in Febrile Children,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002603.v1.p1
29,dbgap:phs002642.v1.p1,A Confectionary-based Screening Tool for Assessing Chemosensory Loss in COVID-19 Patients,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002642.v1.p1
30,dbgap:phs002702.v1.p1,A Scalable Aptamer-based Electrochemical Biosensor For Rapid Detection of SARS-CoV-2 From Saliva,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002702.v1.p1
31,dbgap:phs002685.v1.p1,DNA Star SAS-CoV-2 Rapid Test,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002685.v1.p1
32,dbgap:phs003124.v1.p1,Exosome-based Non-traditional Technologies Towards Multi-Parametric and Integrated Approaches for SARS-CoV-2,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs003124.v1.p1
33,dbgap:phs002729.v1.p1,"Improved Scalability, Sensitivity, and Interpretability of Pathogen Detection, Including SARS-CoV-2, in Wastewater using High-Throughput, Highly Multiplexed Digital Array PCR Technology",https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs002729.v1.p1


In [11]:
# Pair each dataset accession with its study investigator; work on a copy so
# later edits do not modify `projects`.
deposited = projects[["dbgapAccession", "studyInvestigator"]].copy()

In [12]:
# Display the accession / investigator pairs before the names are reformatted.
deposited

Unnamed: 0,dbgapAccession,studyInvestigator
0,dbgap:phs002744.v1.p1,Khalid Salaita
12,dbgap:phs002778.v1.p1,Shannon Stott
26,dbgap:phs002604.v1.p1,Anne-Catrin Uhlemann
27,dbgap:phs002583.v1.p1,Edward P DeMauro
28,dbgap:phs002603.v1.p1,Audrey Odom-John
29,dbgap:phs002642.v1.p1,Susan Travers
30,dbgap:phs002702.v1.p1,Jeffrey Ly
31,dbgap:phs002685.v1.p1,Xiaohu Yao
32,dbgap:phs003124.v1.p1,Samarjit Das
33,dbgap:phs002729.v1.p1,Rachel Noble


In [13]:
# The helper is called for its side effect on `deposited`: the "name" column it
# leaves behind is read right below. In the displayed output that column holds
# the "Lastname Initials" form (e.g. "Salaita K").
publication_query.expand_name_column(deposited, "studyInvestigator")

# Replace the full investigator name with the abbreviated one and keep only the
# two columns needed for matching against the researcher nodes.
deposited = (
    deposited
    .assign(studyInvestigator=lambda frame: frame["name"])
    .loc[:, ["dbgapAccession", "studyInvestigator"]]
    .copy()
)

In [14]:
# Display the pairs again, now with abbreviated investigator names.
deposited

Unnamed: 0,dbgapAccession,studyInvestigator
0,dbgap:phs002744.v1.p1,Salaita K
12,dbgap:phs002778.v1.p1,Stott S
26,dbgap:phs002604.v1.p1,Uhlemann A
27,dbgap:phs002583.v1.p1,DeMauro EP
28,dbgap:phs002603.v1.p1,Odom-John A
29,dbgap:phs002642.v1.p1,Travers S
30,dbgap:phs002702.v1.p1,Ly J
31,dbgap:phs002685.v1.p1,Yao X
32,dbgap:phs003124.v1.p1,Das S
33,dbgap:phs002729.v1.p1,Noble R


In [15]:
# Load the researcher (investigator) node table from KG_PATH/nodes, with all
# columns as strings and blank cells kept as "" instead of NaN.
investigators = pd.read_csv(os.path.join(KG_PATH, "nodes", "Researcher_investigators.csv"), dtype=str, keep_default_na=False)
print(f"Number of investigators: {investigators.shape[0]}")
# Preview the first rows only.
investigators.head()

Number of investigators: 107


Unnamed: 0,id,name,fullName,firstName,middleName,lastName,orcid,profileId
0,profileid:2563052,Allen CE,Carl E Allen,Carl,E,Allen,orcid:0000-0002-6625-739X,profileid:2563052
1,profileid:7039414,Annapragada AV,Ananth V Annapragada,Ananth,V,Annapragada,orcid:0000-0002-3156-9617,profileid:7039414
2,profileid:10320851,Bassiri H,Hamid Bassiri,Hamid,,Bassiri,orcid:0000-0001-6532-8478,profileid:10320851
3,profileid:7989301,Burns JC,Jane C Burns,Jane,C,Burns,orcid:0000-0001-5679-1217,profileid:7989301
4,profileid:8667619,Chiu CY,Charles Yen Chiu,Charles,Yen,Chiu,orcid:0000-0003-2915-2094,profileid:8667619


In [16]:
# Attach researcher node columns by fuzzy-matching the abbreviated investigator
# name ("studyInvestigator") against the "name" column of the investigators table.
# NOTE(review): threshold=0.90 is presumably the minimum similarity score for a
# match, and how="left" presumably keeps datasets without a match while leaving
# their researcher columns empty - verify against utils.fuzzy_merge.
deposited = utils.fuzzy_merge(deposited, investigators, left_fuzzy_on="studyInvestigator", right_fuzzy_on="name", how="left", threshold=0.90)
print(f"Number of study investigators: {deposited.shape[0]}")
deposited

Number of study investigators: 48


Unnamed: 0,dbgapAccession,studyInvestigator,match,score,id,name,fullName,firstName,middleName,lastName,orcid,profileId
0,dbgap:phs002744.v1.p1,Salaita K,Salaita KS,0.98,profileid:8668731,Salaita KS,Khalid S Salaita,Khalid,S,Salaita,orcid:0000-0003-4138-3477,profileid:8668731
1,dbgap:phs002778.v1.p1,Stott S,Stott SL,0.975,profileid:8956784,Stott SL,Shannon L Stott,Shannon,L,Stott,orcid:0000-0002-0349-0522,profileid:8956784
2,dbgap:phs002604.v1.p1,Uhlemann A,Uhlemann A,1.0,profileid:9733196,Uhlemann A,Anne-Catrin Uhlemann,Anne-Catrin,,Uhlemann,orcid:0000-0002-9798-4768,profileid:9733196
3,dbgap:phs002583.v1.p1,DeMauro EP,DeMauro EP,1.0,orcid:0000-0002-3793-8014,DeMauro EP,Edward P DeMauro,Edward,P,DeMauro,orcid:0000-0002-3793-8014,
4,dbgap:phs002603.v1.p1,Odom-John A,Odom John AR,0.908485,profileid:9198778,Odom John AR,Audrey Ragan Odom John,Audrey,Ragan,Odom John,,profileid:9198778
5,dbgap:phs002642.v1.p1,Travers S,Travers SP,0.98,profileid:1896926,Travers SP,Susan P Travers,Susan,P,Travers,orcid:0000-0001-8730-3618,profileid:1896926
6,dbgap:phs002702.v1.p1,Ly J,Ly J,1.0,profileid:78421272,Ly J,Jeffrey Ly,Jeffrey,,Ly,,profileid:78421272
7,dbgap:phs002685.v1.p1,Yao X,Yao X,1.0,profileid:77861758,Yao X,Xiaohu Yao,Xiaohu,,Yao,,profileid:77861758
8,dbgap:phs003124.v1.p1,Das S,Das S,1.0,profileid:10349485,Das S,Samarjit Das,Samarjit,,Das,,profileid:10349485
9,dbgap:phs002729.v1.p1,Noble R,Noble RT,0.975,profileid:10129440,Noble RT,Rachel Todd Noble,Rachel,Todd,Noble,orcid:0000-0001-9071-8312,profileid:10129440


### Create Researcher-CREATED-Dataset relationships

In [17]:
# Map the merged columns onto the relationship columns: the researcher node id
# becomes "from" and the dataset CURIE becomes "to".
deposited_map = {"id": "from", "dbgapAccession": "to"}
deposited = utils.rename_and_reorder_columns(deposited, deposited_map)

# The investigator lookup is a left merge, so a dataset whose investigator was
# not matched can arrive here without a researcher id. Such a row would be
# written as an edge with no source node, so report it and leave it out.
# NOTE(review): assumes an unmatched row has NaN or a blank string in "from" -
# confirm against utils.fuzzy_merge.
missing_source = deposited["from"].isna() | deposited["from"].astype(str).str.strip().eq("")
if missing_source.any():
    print(
        f"Skipping {int(missing_source.sum())} dataset(s) without a matched researcher: "
        f"{deposited.loc[missing_source, 'to'].tolist()}"
    )

# Keep only complete edges and remove exact duplicates.
deposited = deposited.loc[~missing_source].drop_duplicates()

In [18]:
# Write the relationship file under KG_PATH/relationships, without the DataFrame index.
deposited.to_csv(os.path.join(KG_PATH, "relationships", "Researcher-CREATED-Dataset.csv"), index=False)

In [19]:
# Report the number of relationships written, then display them.
print(f"Number of Researcher-CREATED-Dataset relationships: {deposited.shape[0]}")
deposited

Number of Researcher-CREATED-Dataset relationships: 48


Unnamed: 0,from,to
0,profileid:8668731,dbgap:phs002744.v1.p1
1,profileid:8956784,dbgap:phs002778.v1.p1
2,profileid:9733196,dbgap:phs002604.v1.p1
3,orcid:0000-0002-3793-8014,dbgap:phs002583.v1.p1
4,profileid:9198778,dbgap:phs002603.v1.p1
5,profileid:1896926,dbgap:phs002642.v1.p1
6,profileid:78421272,dbgap:phs002702.v1.p1
7,profileid:77861758,dbgap:phs002685.v1.p1
8,profileid:10349485,dbgap:phs003124.v1.p1
9,profileid:10129440,dbgap:phs002729.v1.p1
