# This notebook converts data dictionaries in the RADx-rad Data Dictionary format to the harmonized RADx Data Dictionary format.

### Setup

In [1]:
import os
import glob
import pandas as pd
import re
from utils import *

In [2]:
# Show complete frames in cell output: no row limit, no column limit,
# and no truncation of wide cell contents.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Patterns for "code, label" choices separated by "|". Each pattern has exactly
# two capture groups (code, label), so re.findall yields (code, label) tuples.
# A code is only recognized at the start of a choice (beginning of the text or
# directly after a "|"); "UPPERCASE," or "digits," text inside a label is
# therefore not mistaken for an additional choice. The label is captured
# without surrounding whitespace and the closing "|" is not consumed, so the
# next choice can anchor on it.
# Integer codes, optionally negative. Example: 1, Male | 2, Female | 3, Intersex | 4, None of these describe me
enum_pattern_int = r"(?:^|\|)\s*(-?\d+),\s*([^|]+?)\s*(?=\||$)"
# Upper-case letter codes. Example: AL, Alabama | AK, Alaska | AS, American Samoa
enum_pattern_str = r"(?:^|\|)\s*([A-Z]+),\s*([^|]+?)\s*(?=\||$)"

In [4]:
# Column headers named as required for a RADx-rad data dictionary.
required_fields = {
    "Variable / Field Name",
    "Field Label",
    "Section Header",
    "Field Type",
    "Unit",
    "Choices, Calculations, OR Slider Labels",
    "Field Note",
    "CDE Reference",
}

In [5]:
# Maps RADx-rad column headers to the column names used in the harmonized
# dictionary. "CDE Reference" is not listed, so it keeps its name on rename.
column_map = {
    "Variable / Field Name": "Id",
    "Field Label": "Label",
    "Section Header": "Section",
    "Field Type": "Datatype",
    "Unit": "Unit",
    "Choices, Calculations, OR Slider Labels": "Enumeration",
    "Field Note": "Notes",
}

## Load RADx-rad data elements

In [6]:
# Read every column as text and replace missing cells with empty strings,
# so later string handling never sees NaN.
file_path = "../data/dictionaries/RADx-rad_Dictionary_Template.csv"
data_elements = pd.read_csv(file_path, dtype=str).fillna("")

In [7]:
# Validate the columns of the loaded dictionary. check_required_fields is not
# defined in this notebook; presumably it comes from the `utils` star import.
# NOTE(review): the required_fields set defined above is not passed in here;
# confirm that the helper checks the same list of columns.
check_required_fields(data_elements)

In [8]:
# Check that the field types match the expected types.
# check_field_types is not defined in this notebook; presumably it comes from
# the `utils` star import, which is also where the expected types would live.
check_field_types(data_elements)

In [9]:
# The following columns must contain data; they cannot be empty.
non_empty_columns = [
    "Variable / Field Name",
    "Section Header",
    "Field Type",
    "Field Label",
    "CDE Reference",
]
for column_name in non_empty_columns:
    check_empty_field(column_name, data_elements)

In [10]:
# Switch from the RADx-rad column headers to the harmonized column names.
data_elements = data_elements.rename(columns=column_map)

In [11]:
# Show the column names after the rename.
data_elements.columns

Index(['Id', 'Section', 'Datatype', 'Label', 'Enumeration', 'Notes',
       'Text Validation Type OR Show Slider Number', 'Text Validation Min',
       'Text Validation Max', 'Branching Logic (Show field only if...)',
       'Unit', 'CDE Reference', 'template'],
      dtype='object')

In [12]:
# Keep only the columns that are carried over to the harmonized dictionary.
kept_columns = ["Id", "Label", "Section", "Datatype", "Unit", "Enumeration", "Notes", "CDE Reference"]
data_elements = data_elements[kept_columns]

In [13]:
# Remove rows that are exact duplicates, then report how many remain.
data_elements = data_elements.drop_duplicates()
print("Number of data elements", len(data_elements))

Number of data elements 840


In [14]:
# Preview the first rows of the de-duplicated data elements.
data_elements.head()

Unnamed: 0,Id,Label,Section,Datatype,Unit,Enumeration,Notes,CDE Reference
0,study_id,RADx-rad Study ID; Subject ID; Datavent ID,Identity,text,,,,RADx-rad Minimum CDE
1,race,What is your race? Mark one or more boxes.,Race,checkbox,,"1, American Indian or Alaska Native | 2, Asian | 3, Black or African American | 4, Native Hawaiian or Other Pacific Islander | 5, White | 6, Some other race",,RADx-rad Minimum CDE
2,ethnicity,Are you of Hispanic or Latino origin?,Ethnicity,radio,,"1, Yes, of Hispanic or Latino origin | 0, No, not of Hispanic or Latino origin",,RADx-rad Minimum CDE
3,age,What is your age?,Age,integer,,,"Age in years. For babies less than 1 year old, write 0 as the age",RADx-rad Minimum CDE
4,sex,What is your biological sex assigned at birth?,Sex,radio,,"1, Male | 2, Female | 3, Intersex | 4, None of these describe me",,RADx-rad Minimum CDE


In [15]:
# Count how many data elements use each RADx-rad field type (before conversion).
data_elements["Datatype"].value_counts()

text        403
float       181
category     84
integer      56
radio        46
url          24
sequence     19
list         14
date          6
time          4
dropdown      2
checkbox      1
Name: Datatype, dtype: int64

In [16]:
def set_cardinality(data_type):
    """Return the cardinality for a RADx-rad field type.

    Args:
        data_type: The field type taken from the "Datatype" column.

    Returns:
        "multiple" when the field type is "list", otherwise "single".

    NOTE(review): "checkbox" fields (e.g. "Mark one or more boxes") are also
    reported as "single"; confirm that this is intended.
    """
    return "multiple" if data_type == "list" else "single"

In [17]:
def parse_integer_enums(enum):
    """Extract integer-coded choices from an enumeration string.

    Example input: 1, Male | 2, Female | 3, Intersex | 4, None of these describe me

    Args:
        enum: The raw enumeration text.

    Returns:
        A list of (code, label) tuples with the code converted to int and the
        label stripped of surrounding whitespace; empty if nothing matches
        enum_pattern_int.
    """
    return [(int(code), label.strip()) for code, label in re.findall(enum_pattern_int, enum)]

In [18]:
def parse_string_enums(enum):
    """Extract string-coded choices from an enumeration string.

    Example input: AL, Alabama | AK, Alaska | AS, American Samoa

    Args:
        enum: The raw enumeration text.

    Returns:
        A list of (code, label) tuples, both stripped of surrounding
        whitespace; empty if nothing matches enum_pattern_str.
    """
    return [(code.strip(), label.strip()) for code, label in re.findall(enum_pattern_str, enum)]

In [19]:
def convert_data_type(row):
    """Derive the harmonized datatype for one data element.

    Rules, applied in order:
      1. Integer-coded choices in "Enumeration" -> "integer".
      2. String-coded choices, or any "|"-separated list -> "string".
      3. A text-like RADx-rad field type -> "string".
      4. Anything else keeps its original field type.

    Args:
        row: A row providing the "Datatype" and "Enumeration" values.

    Returns:
        The datatype name as a string.
    """
    source_type = row["Datatype"]
    choices = row["Enumeration"]

    # Integer-coded choices take precedence over every other rule.
    if parse_integer_enums(choices):
        return "integer"

    # String-coded choices and plain "|"-separated values are both strings.
    if parse_string_enums(choices) or "|" in choices:
        return "string"

    text_like_types = (
        "text", "list", "url", "sequence", "category",
        "yesno", "radio", "dropdown", "checkbox", "zipcode",
    )
    return "string" if source_type in text_like_types else source_type

In [20]:
def convert_enumeration(enum):
    """Rewrite an enumeration string in the "value"=[label] form.

    Coded choices ("1, Male | 2, Female" or "AL, Alabama | AK, Alaska") become
    '"1"=[Male] | "2"=[Female]'; integer-coded choices are listed before
    string-coded ones. A plain value list ("IgA | IgG | IgM") uses each value
    as its own label. Text without coded choices or "|" yields "".

    Args:
        enum: The raw enumeration text.

    Returns:
        The converted enumeration, or an empty string.
    """
    coded_choices = parse_integer_enums(enum) + parse_string_enums(enum)
    if coded_choices:
        return " | ".join(f'"{code}"=[{label}]' for code, label in coded_choices)

    # No coded choices and no separator: nothing to convert.
    if "|" not in enum:
        return ""

    # Simple value enumeration. Example: IgA | IgG | IgM
    plain_values = (piece.strip() for piece in enum.split("|"))
    return " | ".join(f'"{value}"=[{value}]' for value in plain_values)

In [21]:
# The order of these three statements matters.
# Cardinality is derived from the original RADx-rad field type, so it has to be
# computed before the Datatype column is overwritten.
data_elements["Cardinality"] = data_elements["Datatype"].apply(set_cardinality)
# convert_data_type reads the raw Enumeration text, so it has to run before the
# Enumeration column is rewritten on the next line.
data_elements["Datatype"] = data_elements.apply(convert_data_type, axis=1)
# Rewrite the choices in the "value"=[label] form.
# NOTE(review): this cell overwrites its own inputs, so running it a second
# time without reloading the data would re-derive the columns from already
# converted values; run it once per fresh load.
data_elements["Enumeration"] = data_elements["Enumeration"].apply(convert_enumeration)

In [22]:
#data_elements.drop(1, axis=0, inplace=True)

In [23]:
# Show the column names after the conversion step.
data_elements.columns

Index(['Id', 'Label', 'Section', 'Datatype', 'Unit', 'Enumeration', 'Notes',
       'CDE Reference', 'Cardinality'],
      dtype='object')

In [24]:
# Put the columns in their final order, with Cardinality right after Section.
harmonized_columns = ["Id", "Label", "Section", "Cardinality", "Datatype", "Unit", "Enumeration", "Notes", "CDE Reference"]
data_elements = data_elements[harmonized_columns]

In [25]:
# Display up to the first 1000 converted data elements for manual review.
data_elements.head(1000)

Unnamed: 0,Id,Label,Section,Cardinality,Datatype,Unit,Enumeration,Notes,CDE Reference
0,study_id,RADx-rad Study ID; Subject ID; Datavent ID,Identity,single,string,,,,RADx-rad Minimum CDE
1,race,What is your race? Mark one or more boxes.,Race,single,integer,,"""1""=[American Indian or Alaska Native] | ""2""=[Asian] | ""3""=[Black or African American] | ""4""=[Native Hawaiian or Other Pacific Islander] | ""5""=[White] | ""6""=[Some other race]",,RADx-rad Minimum CDE
2,ethnicity,Are you of Hispanic or Latino origin?,Ethnicity,single,integer,,"""1""=[Yes, of Hispanic or Latino origin] | ""0""=[No, not of Hispanic or Latino origin]",,RADx-rad Minimum CDE
3,age,What is your age?,Age,single,integer,,,"Age in years. For babies less than 1 year old, write 0 as the age",RADx-rad Minimum CDE
4,sex,What is your biological sex assigned at birth?,Sex,single,integer,,"""1""=[Male] | ""2""=[Female] | ""3""=[Intersex] | ""4""=[None of these describe me]",,RADx-rad Minimum CDE
5,education,How many years of education have you completed?,Education,single,integer,,,Years of education from 0 - 20+,RADx-rad Minimum CDE
6,zip,Zip or Postal Code:,Domicile,single,string,,,De-Identified zip code,RADx-rad Minimum CDE
7,employment,Are you employed?,Employment,single,integer,,"""1""=[Employed in a permanent position] | ""2""=[Employed in a temporary position] | ""3""=[Not currently employed]",,RADx-rad Minimum CDE
8,insurance,What kind of health insurance do you have?,Insurance Status,single,integer,,"""1""=[Private insurance] | ""2""=[Public insurance] | ""3""=[None]",,RADx-rad Minimum CDE
9,deaf,Are you deaf or do you have serious difficulty hearing?,Disability Status,single,integer,,"""1""=[Yes] | ""0""=[No]",,RADx-rad Minimum CDE


In [26]:
# Write the harmonized dictionary to the working directory, without the row index.
output_file = "RADx-rad_harmonized_dict_2023-08-18.csv"
data_elements.to_csv(output_file, index=False)