# Extract Data Elements from the RADx-rad Studies

This notebook collects the data elements (variable/field names and their descriptions) from the downloaded RADx-rad studies.
    
It assigns data elements to the following tiers:
* Tier1: Minimum Common Data Elements (patient demographics, symptoms, medical history).
* Tier2: Harmonized Data Elements developed for RADx-rad diagnostic method development projects.
* Tier3: Study-specific, non-harmonized Data Elements.

It also assigns a RADx-rad subproject name, using the information from the metadata files.

Author: Peter W Rose, UC San Diego (pwrose.ucsd@gmail.com)

Data last updated: 2025-06-09

In [1]:
import os
from pathlib import Path
import pandas as pd
import utils

In [2]:
# Directory that receives the files written by this notebook
result_dir = "../../results"

# Glob pattern for the study directories: every study lives in its own
# `phsxxxxxx` directory, named after the study's dbGaP accession number.
data_path = "../../phs*/"

# Harmonized RADx-rad data element dictionaries (tier 1 and tier 2)
tier1_harmonized_dict = (
    "https://github.com/radxrad/common-data-elements/raw/refs/heads/main/cdes/RADx-rad_tier1_dict_2025-03-19.csv"
)
tier2_harmonized_dict = (
    "https://github.com/radxrad/common-data-elements/raw/refs/heads/main/cdes/RADx-rad_tier2_dict_2025-03-19.csv"
)

In [3]:
def get_data_elements(dict_file_path):
    """Collect the data dictionaries of all studies into a single DataFrame.

    For every directory matching ``dict_file_path``, each
    ``*DICT_origcopy*.csv`` file is read as text and annotated with fields
    extracted from its companion ``<prefix>_META_origcopy*.json`` file and
    with the path of its companion ``<prefix>_DATA_origcopy*.csv`` file.

    Directories, dictionary files, and companion-file candidates are all
    processed in sorted order, so the row order of the result (and the choice
    among several matching companion files) does not depend on the order in
    which the file system happens to list them.

    Parameters
    ----------
    dict_file_path : str
        Relative glob pattern for the study directories
        (e.g. ``"../../phs*/"``), evaluated from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The concatenated, de-duplicated dictionary rows with the added columns
        ``subproject``, ``phs_id``, ``study_id``, ``project_num``, ``radx_id``
        and ``filename``, as returned by ``utils.assign_data_element_tier``.
        An empty DataFrame without columns if no dictionary file was found.

    Raises
    ------
    FileNotFoundError
        If a dictionary file has no matching META JSON or DATA CSV file.
    """
    frames = []
    for study_dir in sorted(Path().glob(dict_file_path)):
        # glob results come in no guaranteed order; sort them so that the
        # output is identical on every run and on every machine.
        dict_files = sorted(study_dir.glob("*DICT_origcopy*.csv"))
        print(f"{study_dir}: Number of dictionary files: {len(dict_files)}")

        for dict_file in dict_files:
            # Read every column as text and keep empty cells as "" (not NaN).
            dictionary = pd.read_csv(str(dict_file), dtype=str, keep_default_na=False)

            # The file name up to "_DICT_origcopy" is shared by the companion
            # META and DATA files.
            prefix = dict_file.name.split("_DICT_origcopy")[0]

            # Companion metadata file (first match in sorted order)
            meta_candidates = sorted(study_dir.glob(f"{prefix}_META_origcopy*.json"))
            if not meta_candidates:
                raise FileNotFoundError(f"No META JSON for {dict_file.name}")
            meta_file_path = str(meta_candidates[0])

            # Companion data file (first match in sorted order). The local name
            # deliberately differs from the notebook-level `data_path`.
            data_candidates = sorted(study_dir.glob(f"{prefix}_DATA_origcopy*.csv"))
            if not data_candidates:
                raise FileNotFoundError(f"No DATA CSV for {dict_file.name}")
            data_file_path = str(data_candidates[0])

            # Study-level annotations taken from the metadata file
            subproject, phs_id, project_num, _ = utils.extract_fields_from_metadata(meta_file_path)
            radx_id = utils.extract_radx_id(meta_file_path)

            frames.append(
                dictionary.assign(
                    subproject=subproject,
                    phs_id=phs_id,
                    study_id=study_dir.name,
                    project_num=project_num,
                    radx_id=radx_id,
                    filename=data_file_path,
                )
            )

    if not frames:
        return pd.DataFrame()

    data = pd.concat(frames, ignore_index=True).drop_duplicates()
    # Tier assignment is delegated to the project helper.
    return utils.assign_data_element_tier(data, tier1_harmonized_dict, tier2_harmonized_dict)

In [4]:
# Collect the data elements of all studies found under `data_path`
dicts = get_data_elements(dict_file_path=data_path)
dicts.head()

../../phs002522: Number of dictionary files: 67
../../phs002523: Number of dictionary files: 951
../../phs002524: Number of dictionary files: 6
../../phs002525: Number of dictionary files: 13
../../phs002527: Number of dictionary files: 33
../../phs002542: Number of dictionary files: 1
../../phs002543: Number of dictionary files: 1
../../phs002544: Number of dictionary files: 38
../../phs002546: Number of dictionary files: 5
../../phs002549: Number of dictionary files: 14
../../phs002550: Number of dictionary files: 52
../../phs002551: Number of dictionary files: 2
../../phs002553: Number of dictionary files: 1
../../phs002561: Number of dictionary files: 2
../../phs002563: Number of dictionary files: 1
../../phs002565: Number of dictionary files: 6
../../phs002569: Number of dictionary files: 1
../../phs002570: Number of dictionary files: 1
../../phs002573: Number of dictionary files: 2
../../phs002583: Number of dictionary files: 8
../../phs002585: Number of dictionary files: 1
../..

Unnamed: 0,Id,Label,Examples,Section,Cardinality,Terms,Datatype,Unit,Enumeration,Notes,MissingValueCodes,Provenance,SeeAlso,subproject,phs_id,study_id,project_num,radx_id,filename,tier
0,study_id,RADx-rad Study ID; Subject ID; Datavent ID,,Identity,single,NCIT:C164337,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad Minimum CDE,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier1
1,protocol_id,Unique name or identifier for biosensor protocol,ddPCR_SARS-CoV-2,Technology Metadata,single,http://edamontology.org/data:2531 NCIT:C25364,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier2
2,run_id,"Unique identifier for a run, e.g., a replicate...",,Sample,single,http://purl.allotrope.org/ontologies/result#AF...,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier2
3,specimen_type,Specimen type used in assay,saliva,Assay Results,single,http://doe-generated-ontology.com/OntoAD#C0370...,string,,"""breath""=[breath] | ""hand odor""=[hand odor] | ...",,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier2
4,breathing_type,"Type of breathing (tidal - normal breathing, r...",tidal,Results/Breath Analyzer,single,http://sweetontology.net/phenBiol/Breathing NC...,string,,"""tidal""=[tidal] | ""rapid""=[rapid]",,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier2


In [5]:
# Summary counts for the collected data elements
n_studies = dicts["study_id"].nunique()
n_data_elements = len(dicts)
n_unique_data_elements = dicts["Id"].nunique()

print(f"Number of studies              : {n_studies}")
print(f"Number of data elements        : {n_data_elements}")
print(f"Number of unique data elements : {n_unique_data_elements}")
dicts.head()

Number of studies              : 43
Number of data elements        : 13608
Number of unique data elements : 6048


Unnamed: 0,Id,Label,Examples,Section,Cardinality,Terms,Datatype,Unit,Enumeration,Notes,MissingValueCodes,Provenance,SeeAlso,subproject,phs_id,study_id,project_num,radx_id,filename,tier
0,study_id,RADx-rad Study ID; Subject ID; Datavent ID,,Identity,single,NCIT:C164337,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad Minimum CDE,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier1
1,protocol_id,Unique name or identifier for biosensor protocol,ddPCR_SARS-CoV-2,Technology Metadata,single,http://edamontology.org/data:2531 NCIT:C25364,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier2
2,run_id,"Unique identifier for a run, e.g., a replicate...",,Sample,single,http://purl.allotrope.org/ontologies/result#AF...,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier2
3,specimen_type,Specimen type used in assay,saliva,Assay Results,single,http://doe-generated-ontology.com/OntoAD#C0370...,string,,"""breath""=[breath] | ""hand odor""=[hand odor] | ...",,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier2
4,breathing_type,"Type of breathing (tidal - normal breathing, r...",tidal,Results/Breath Analyzer,single,http://sweetontology.net/phenBiol/Breathing NC...,string,,"""tidal""=[tidal] | ""rapid""=[rapid]",,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Novel Biosensing and VOC,phs002522.v1.p1,phs002522,1U01HL152410-01,rad_035_410-01,../../phs002522/rad_035_410-01_11845_Rapid1_DA...,tier2


In [6]:
# Create the results directory (including missing parents) if it does not exist yet
Path(result_dir).mkdir(parents=True, exist_ok=True)

In [7]:
# Save the collected data elements, without the DataFrame index
data_elements_file = Path(result_dir) / "data_elements.csv"
dicts.to_csv(data_elements_file, index=False)