# This notebook converts data dictionaries in the RADx-rad Data Dictionary format to the harmonized RADx Data Dictionary format.

### Setup

In [1]:
import os
import glob
import pandas as pd
import re
from utils import *

In [2]:
# Show complete DataFrames in cell outputs: no limit on the number of rows or
# columns, and no truncation of long cell values.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Patterns for enumerations written as "code, label" pairs separated by "|".
# Group 1 captures the code and group 2 captures the label.
# The label group is lazy ("+?") so that whitespace in front of the next "|"
# (or the end of the string) is consumed by the trailing \s* instead of being
# captured as part of the label.

# Integer codes. Example: 1, Male | 2, Female | 3, Intersex | 4, None of these describe me
enum_pattern_int = r"(\d+),\s*([^|]+?)\s*(?:\||$)"
# Upper-case letter codes. Example: AL, Alabama | AK, Alaska | AS, American Samoa
enum_pattern_str = r"([A-Z]+),\s*([^|]+?)\s*(?:\||$)"

In [4]:
# Column names listed as required for a RADx-rad data dictionary.
required_fields = {
    "Variable / Field Name",
    "Field Label",
    "Section Header",
    "Field Type",
    "Unit",
    "Choices, Calculations, OR Slider Labels",
    "Field Note",
    "CDE Reference",
}

In [5]:
# Mapping from RADx-rad column names (keys) to the corresponding
# harmonized RADx Data Dictionary column names (values).
column_map = {
    "Variable / Field Name": "Id",
    "Field Label": "Label",
    "Section Header": "Section",
    "Field Type": "Datatype",
    "Unit": "Unit",
    "Choices, Calculations, OR Slider Labels": "Enumeration",
    "Field Note": "Notes",
}

## Load and check RADx-rad data dictionaries

In [6]:
# Collect all data dictionary CSV files. glob returns paths in arbitrary
# (filesystem-dependent) order, so sort them to make the processing order and
# the rows of the results table reproducible across runs and machines.
files = sorted(glob.glob("../data/dictionaries/*.csv"))

In [7]:
# Accumulator for the per-file check results (one dict per data dictionary file).
error_list = list()

In [8]:
# Start from an empty list so that re-running this cell does not append
# duplicate rows to results left over from a previous run.
error_list = []

# Columns that must contain data (they cannot be empty), paired with the key
# under which each check result is recorded.
non_empty_checks = [
    ("Variable / Field Name", "empty_field_name"),
    ("Section Header", "empty_section_header"),
    ("Field Type", "empty_field_type"),
    ("Field Label", "empty_field_label"),
    ("CDE Reference", "empty_cde_reference"),
]

for filename in files:
    # Read every column as text and replace missing cells with empty strings.
    data_elements = pd.read_csv(filename, dtype=str).fillna("")

    # Run data checks; the results are recorded in one dict per file.
    result = {"filename": filename}
    result["missing_fields"] = check_required_fields(data_elements)

    # Check that the following fields contain data, they cannot be empty.
    for column, result_key in non_empty_checks:
        result[result_key] = check_empty_field(column, data_elements)

    # Check that field types match the expected types
    result["invalid_field_types"] = check_field_types(data_elements)

    # Record results
    error_list.append(result)

ERROR: Data field missing: {'CDE Reference'}
INFO : Extra fields: {'Text Validation Type OR Show Slider Number', 'template', 'Text Validation Max', 'Branching Logic (Show field only if...)', 'Text Validation Min', 'Reference'}
ERROR: Data missing in field: Variable / Field Name
ERROR: Data missing in field: Section Header
ERROR: Data missing in field: Field Type
ERROR: Data missing in field: Field Label
ERROR: Data field missing: CDE Reference
ERROR: Invalid Field Type: {'', 'biginteger'}
INFO : Allowed Field Types: {'checkbox', 'float', 'url', 'integer', 'time', 'text', 'category', 'radio', 'timezone', 'sequence', 'zipcode', 'yesno', 'date', 'dropdown', 'list'}


In [9]:
# Turn the collected check results into a table with one row per file.
errors = pd.DataFrame(data=error_list)
# Show the first five rows as the cell output.
errors.head(n=5)

Unnamed: 0,filename,missing_fields,empty_field_name,empty_section_header,empty_field_type,empty_field_label,empty_cde_reference,invalid_field_types
0,../data/dictionaries/RADx-rad_Dictionary_Template.csv,{},False,False,False,False,False,{}
1,../data/dictionaries/test_dict_with_errors.csv,{CDE Reference},True,True,True,True,True,"{, biginteger}"


In [10]:
# TODO: save the converted data dictionary, e.g. data_elements.to_csv("<output path>", index=False)