In [1]:
import glob
import os
import pandas as pd
import utils

In [2]:
# Data location: a glob pattern selecting the phs* study directories.
# Keep exactly one `data_path` assignment active and switch environments by
# (un)commenting, so an earlier value is never silently overwritten.

# NIH Data Hub path (if used)
# data_path = "../../phs*/"

# Local data path
data_path = "../data/phs*/"

# Results directory (relative to the notebook's working directory)
result_dir = "../../results"

# RADx-rad harmonized data element dictionaries (tier 1 and tier 2),
# pinned to the 2025-03-19 release files
tier1_harmonized_dict = "https://github.com/radxrad/common-data-elements/raw/refs/heads/main/cdes/RADx-rad_tier1_dict_2025-03-19.csv"
tier2_harmonized_dict = "https://github.com/radxrad/common-data-elements/raw/refs/heads/main/cdes/RADx-rad_tier2_dict_2025-03-19.csv"

In [3]:
def get_data_elements(dict_file_path):
    """Combine all data dictionaries found under the matching directories.

    Every ``*DICT_origcopy.csv`` file inside a directory matching
    ``dict_file_path`` is read with all columns as strings and annotated with
    fields taken from its sibling ``*_META_origcopy.json`` file and from the
    file's location on disk.

    Parameters
    ----------
    dict_file_path : str
        Glob pattern selecting the study directories to scan.

    Returns
    -------
    pandas.DataFrame
        The concatenated, de-duplicated dictionary rows with the added columns
        ``subproject``, ``phs_id``, ``study_id``, ``project_num``, ``radx_id``
        and ``filename``, after being passed through
        ``utils.assign_data_element_tier`` together with the module-level
        ``tier1_harmonized_dict`` and ``tier2_harmonized_dict``.

    Raises
    ------
    ValueError
        If no dictionary file matches ``dict_file_path``.
    """
    frames = []

    # glob returns matches in arbitrary (filesystem-dependent) order; sorting
    # keeps the row order of the result reproducible across machines and runs.
    for directory in sorted(glob.glob(dict_file_path)):
        for dict_file in sorted(glob.glob(os.path.join(directory, "*DICT_origcopy.csv"))):
            # Read everything as text and keep empty cells as "" rather than NaN
            df = pd.read_csv(dict_file, dtype=str, keep_default_na=False)

            # The metadata file sits next to the dictionary and shares its prefix
            meta_file = dict_file.replace("_DICT_origcopy.csv", "_META_origcopy.json")
            subproject, phs_identifier, project_num, _ = utils.extract_fields_from_metadata(meta_file)
            df["subproject"] = subproject
            df["phs_id"] = phs_identifier
            # The study id is the name of the directory that contains the files
            df["study_id"] = os.path.basename(os.path.dirname(meta_file))
            df["project_num"] = project_num
            df["radx_id"] = utils.extract_radx_id(meta_file)
            df["filename"] = os.path.basename(dict_file)

            frames.append(df)

    # pd.concat([]) fails with an opaque "No objects to concatenate"; report
    # the pattern that matched nothing instead (same exception type).
    if not frames:
        raise ValueError(
            f"No '*DICT_origcopy.csv' files found for pattern {dict_file_path!r}"
        )

    data = pd.concat(frames).drop_duplicates()
    return utils.assign_data_element_tier(data, tier1_harmonized_dict, tier2_harmonized_dict)

In [4]:
# Build the combined data-element table from every dictionary matched by data_path
dicts = get_data_elements(data_path)
# Preview the first rows of the combined table
dicts.head()

Unnamed: 0,Id,Label,Examples,Section,Cardinality,Terms,Datatype,Unit,Enumeration,Notes,MissingValueCodes,Provenance,SeeAlso,subproject,phs_id,study_id,project_num,radx_id,filename,tier
0,study_id,RADx-rad Study ID; Subject ID; Datavent ID,,Identity,single,NCIT:C164337,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad Minimum CDE,,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier1
1,cohort_id,Identifier for groups of patients,,Clinical,multiple,NCIT:C183331,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier2
2,study_population,The subjects/patients (and specimen types) inc...,,Clinical Performance,multiple,NCIT:C70833,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,https://www.fda.gov/files/medical%20devices/pu...,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier2
3,protocol_id,Unique name or identifier for biosensor protocol,ddPCR_SARS-CoV-2,Technology Metadata,single,http://edamontology.org/data:2531 NCIT:C25364,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier2
4,technology_platform,Abbreviation or short label for technology,ddPCR,Technology Metadata,single,NCIT:C45378,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier2


In [5]:
# Summary counts for the combined table
# Distinct values in the study_id column
print(f"Number of studies              : {dicts['study_id'].nunique()}")
# Total number of rows in the table
print(f"Number of data elements        : {dicts.shape[0]}")
# Distinct values in the Id column
print(f"Number of unique data elements : {dicts['Id'].nunique()}")
# Preview the first rows again below the counts
dicts.head()

Number of studies              : 1
Number of data elements        : 723
Number of unique data elements : 272


Unnamed: 0,Id,Label,Examples,Section,Cardinality,Terms,Datatype,Unit,Enumeration,Notes,MissingValueCodes,Provenance,SeeAlso,subproject,phs_id,study_id,project_num,radx_id,filename,tier
0,study_id,RADx-rad Study ID; Subject ID; Datavent ID,,Identity,single,NCIT:C164337,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad Minimum CDE,,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier1
1,cohort_id,Identifier for groups of patients,,Clinical,multiple,NCIT:C183331,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier2
2,study_population,The subjects/patients (and specimen types) inc...,,Clinical Performance,multiple,NCIT:C70833,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,https://www.fda.gov/files/medical%20devices/pu...,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier2
3,protocol_id,Unique name or identifier for biosensor protocol,ddPCR_SARS-CoV-2,Technology Metadata,single,http://edamontology.org/data:2531 NCIT:C25364,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier2
4,technology_platform,Abbreviation or short label for technology,ddPCR,Technology Metadata,single,NCIT:C45378,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Exosome,phs002544.v1.p1,phs002544,1U18TR003778-01,rad_018_778-01,rad_018_778-01_NAB_CLINICALSAMPLERESULTS_DICT_...,tier2


In [6]:
# Create the results directory (and any missing parents); exist_ok=True avoids an error if it already exists
os.makedirs(result_dir, exist_ok=True)

In [7]:
# Write the combined table to data_elements.csv in result_dir; index=False leaves the DataFrame index out of the file
dicts.to_csv(os.path.join(result_dir, "data_elements.csv"), index=False)