# CheckDataElementTemplates
This notebook checks the RADx-rad Data Element template files for empty fields and performs consistency checks.

Author: Peter W. Rose, pwrose@ucsd.edu

In [1]:
import pandas as pd
from utils import download_data_element_templates, check_empty_field, check_whitespace, check_field_types

In [2]:
# Turn off pandas display truncation so check results can be inspected in full.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns
pd.options.display.max_colwidth = None  # display the full text of every cell

### Download the RADx-rad Data Element template files
The templates are concatenated into a single dataframe.

In [3]:
data_elements = download_data_element_templates()

# Check for empty columns
The following columns must contain text.

In [4]:
check_empty_field("Variable / Field Name", data_elements)

False

In [5]:
check_empty_field("Section Header", data_elements)

False

In [6]:
check_empty_field("Field Type", data_elements)

False

In [7]:
check_empty_field("Field Label", data_elements)

False

In [8]:
check_empty_field("CDE Reference", data_elements)

False

# Check for whitespace in Field Names
Field Names should not contain whitespace.

In [9]:
# data_elements

In [10]:
check_whitespace("Variable / Field Name", data_elements)

Variable / Field Name: ok


# Check Field Types
Check that field types match the defined field types.

In [11]:
check_field_types(data_elements)

''

In [12]:
data_elements["CDE Reference"].value_counts()

CDE Reference
RADx-rad DCC                                                                                                                                                                                                         610
NWSS_DCIPHER_Data_Dictionary_v2.0.0_20210319|https://www.cdc.gov/nwss/reporting.html                                                                                                                                  93
RADx-rad Minimum CDE                                                                                                                                                                                                  48
RADx-rad DCC|https://www.ncbi.nlm.nih.gov/taxonomy                                                                                                                                                                    44
RADx-rad DCC|https://www.medcalc.org/calc/diagnostic_test.php                                                         

## Spell check

In [13]:
import re
from spellchecker import SpellChecker

# Initialize spell checker
spell = SpellChecker()

# Function to check for typos in a string
def check_typos(text):
    """Return the words in ``text`` that the spell checker does not recognize.

    The text is split into words on whitespace and on the punctuation
    characters ``, . ; : | ( ) ? / _ -``; the resulting words are passed to
    the module-level ``spell`` checker.

    Args:
        text: The string to check. Non-string values (for example the NaN
            that pandas uses for an empty cell) are treated as having no
            words.

    Returns:
        The set of unknown words as reported by ``spell.unknown``; an empty
        set when ``text`` is not a string.
    """
    # Empty cells reach this function as NaN/None when it is used with
    # Series.apply; re.split would raise TypeError on them.
    if not isinstance(text, str):
        return set()

    # The hyphen must be the last character of the class to be a literal "-".
    # Written as "/-_" it is a range from "/" to "_", which also matches every
    # digit and uppercase letter (splitting words at capitals) and does not
    # match "-" itself.
    words = re.split(r'[,.\s;:|()?/_-]+', text)

    # re.split yields empty strings when the text starts or ends with a
    # delimiter; drop them so they are not handed to the spell checker.
    words = [word for word in words if word]

    return spell.unknown(words)

# Check for typos
# variable_typos = data_elements["Variable / Field Name"].apply(check_typos)
# variable_typos

# field_label_typos = data_elements["Field Label"].apply(check_typos)
# field_label_typos

# section_typos = data_elements["Section Header"].apply(check_typos)
# section_typos

# choices_typos = data_elements["Choices, Calculations, OR Slider Labels"].apply(check_typos)
# choices_typos

# notes_typos = data_elements["Field Note"].apply(check_typos)
# notes_typos