In [None]:
# If any of the modules imported below are not already installed, run the command below in your notebook to install them:
# !pip install NameOfYourModule (e.g. !pip install pandas)

import pandas as pd
import numpy as np
import datetime

# Pre-defined functions

In [None]:
def categorize_age(df: pd.DataFrame, age_column: str):
    '''categorize_age function takes a table containing a column with the age of the patient and assigns each row to an age group.

    Ages are bucketed on completed years: the fractional part is ignored rather than rounded, so a patient aged
    0.7 years is labelled '<1' (not '1-4') and a patient aged 65.6 years is labelled '56-65' (not '66+').
    Negative and missing ages are labelled 'unknown'.

    Args:
        df (pd.DataFrame): Any dataframe with an age column.
        age_column (str): Name of the column that contains the age of the patient in years. The column values should be int or float. This is often age of patient as of index event.

    Returns:
        np.ndarray: one age-group label per row of df, in row order. It can be directly assigned as a new column to df.
    '''
    age = df[age_column]

    # (inclusive lower bound, exclusive upper bound, label) for every bounded age group
    age_bands = [
        (0, 1, '<1'),
        (1, 5, '1-4'),
        (5, 10, '5-9'),
        (10, 16, '10-15'),
        (16, 21, '16-20'),
        (21, 36, '21-35'),
        (36, 46, '36-45'),
        (46, 56, '46-55'),
        (56, 66, '56-65'),
    ]

    conditions = [(age >= lower) & (age < upper) for lower, upper, _ in age_bands]
    labels = [label for _, _, label in age_bands]

    # the last group has no upper bound
    conditions.append(age >= 66)
    labels.append('66+')

    # comparisons against NaN are False, so missing ages fall through to the default
    return np.select(conditions, labels, default='unknown')

In [None]:
def clean_sex(df: pd.DataFrame, sex_column='sex'):
    '''clean_sex function swaps the PCORnet CDM sex codes for human-readable labels.

    'F' and 'M' become 'Female' and 'Male'; the codes 'A', 'NI', 'UN' and 'OT' are all collapsed into
    'Other/Missing/Unknown'. The replacement is applied in place on the dataframe that is passed in.

    Args:
        df (pd.DataFrame): Any dataframe with a sex column holding PCORnet CDM codes. This is often the standard DEMOGRAPHIC table.
        sex_column (str, optional): Name of the column containing the sex information. Defaults to 'sex'.

    Returns:
        pd.DataFrame: the same input dataframe (i.e. df) with the values of sex_column replaced accordingly.
    '''
    sex_labels = {'F': 'Female', 'M': 'Male'}

    # every remaining code shares one catch-all label
    catch_all_codes = ('A', 'NI', 'UN', 'OT')
    sex_labels.update(dict.fromkeys(catch_all_codes, 'Other/Missing/Unknown'))

    # nested dict form restricts the replacement to sex_column only
    df.replace({sex_column: sex_labels}, inplace=True)
    return df


In [None]:
def clean_race(df: pd.DataFrame, race_column='race'):
    '''clean_race function swaps the PCORnet CDM race codes for human-readable labels.

    Both the zero-padded codes ('01' .. '07') and their unpadded variants ('1' .. '7', plus '0' for unknown) are
    recognised; the unpadded variants are not standard reference terminology. The replacement is applied in place
    on the dataframe that is passed in.

    Args:
        df (pd.DataFrame): Any dataframe with a race column holding PCORnet CDM codes. This is often the standard DEMOGRAPHIC table.
        race_column (str, optional): Name of the column containing the race information. Defaults to 'race'.

    Returns:
        pd.DataFrame: the same input dataframe (i.e. df) with the values of race_column replaced accordingly.
    '''
    # labels for the numeric codes, listed in code order starting at 1
    numbered_labels = (
        'American Indian or Alaska Native',
        'Asian',
        'Black or African American',
        'Native Hawaiian or Other Pacific Islander',
        'White',
        'Multiple race',
        'Refuse to answer',
    )

    race_labels = {}
    for code, label in enumerate(numbered_labels, start=1):
        race_labels[f'{code:02d}'] = label  # standard zero-padded code
        race_labels[str(code)] = label  # not a standard reference terminology

    race_labels['NI'] = 'No race information'
    race_labels['0'] = 'Unknown'  # not a standard reference terminology
    race_labels['UN'] = 'Unknown'
    race_labels['OT'] = 'Other'

    # nested dict form restricts the replacement to race_column only
    df.replace({race_column: race_labels}, inplace=True)
    return df


In [None]:
def clean_ethnicity(df: pd.DataFrame, ethnicity_column='hispanic'):
    '''clean_ethnicity function swaps the PCORnet CDM ethnicity codes for human-readable labels.

    The replacement is applied in place on the dataframe that is passed in.

    Args:
        df (pd.DataFrame): Any dataframe with an ethnicity column holding PCORnet CDM codes. This is often the standard DEMOGRAPHIC table.
        ethnicity_column (str, optional): Name of the column containing the ethnicity information. Defaults to 'hispanic'.

    Returns:
        pd.DataFrame: the same input dataframe (i.e. df) with the values of ethnicity_column replaced accordingly.
    '''
    ethnicity_labels = dict(
        Y='Hispanic',
        N='Not hispanic',
        R='Refuse to answer',
        NI='No ethnicity information',
        UN='Unknown',
        OT='Other',
    )

    # nested dict form restricts the replacement to ethnicity_column only
    column_mapping = {ethnicity_column: ethnicity_labels}
    df.replace(column_mapping, inplace=True)
    return df


In [None]:
def categorize_race_ethnicity(df: pd.DataFrame, ethnicity_column='hispanic', race_column='race'):
    '''categorize_race_ethnicity function uses the already processed race and ethnicity values to combine and categorize the patients per qtwg's categories.

    The columns are expected to hold the human-readable labels (e.g. 'Not hispanic', 'White'), not the raw codes.
    Rules are evaluated in order and the first match wins:
        1. Not hispanic + White                       -> 'Non-Hispanic white'
        2. Not hispanic + Black or African American   -> 'Non-Hispanic black'
        3. Hispanic (any race)                        -> 'Hispanic'
        4. Not hispanic + Asian                       -> 'Non-hispanic Asian'
        5. any other named race, or ethnicity 'Other' -> 'Other'
        6. unknown / refused / no information / blank / null in either column -> 'Missing/Unknown'
    Rows matching none of the rules (e.g. values that were never cleaned) get 'ISSUE WITH RACE OR ETHNICITY COLUMN'.

    Args:
        df (pd.DataFrame): Any dataframe with cleaned race and ethnicity columns. This is often the standard DEMOGRAPHIC table.
        ethnicity_column (str, optional): Name of the column containing the ethnicity information. Defaults to 'hispanic'.
        race_column (str, optional): Name of the column containing the race information. Defaults to 'race'.

    Returns:
        np.ndarray: one category label per row of df, in row order. It can be directly assigned as a new column to df.
    '''
    ethnicity = df[ethnicity_column]
    race = df[race_column]

    not_hispanic = ethnicity.isin(['Not hispanic'])

    other_race = race.isin([
        'Native Hawaiian or Other Pacific Islander',
        'American Indian or Alaska Native',
        'Other',
        'Multiple race',
    ])

    # null values (NaN/None) are treated like the empty string: the information is missing
    missing_ethnicity = (
        ethnicity.isin(['Unknown', 'Refuse to answer', 'No ethnicity information', ''])
        | ethnicity.isna()
    )
    missing_race = (
        race.isin(['Unknown', 'Refuse to answer', 'No race information', ''])
        | race.isna()
    )

    race_ethnicity = np.select(
        [
            not_hispanic & race.isin(['White']),
            not_hispanic & race.isin(['Black or African American']),
            ethnicity.isin(['Hispanic']),
            not_hispanic & race.isin(['Asian']),
            other_race | ethnicity.isin(['Other']),
            missing_ethnicity | missing_race,
        ], [
            'Non-Hispanic white',
            'Non-Hispanic black',
            'Hispanic',
            'Non-hispanic Asian',
            'Other',
            'Missing/Unknown'
        ], default='ISSUE WITH RACE OR ETHNICITY COLUMN'
    )

    return race_ethnicity


# Implementation
For this example, we will use previously queried demographic data containing several sites.

In [None]:
# Read a demographic table if the demographic table is not already loaded in your environment 
# demographic = pd.read_parquet("")
# demographic = pd.read_csv("")


In [None]:
# calculate age as of today then categorize the age
# please note, for CSC queries you may often need to calculate age at the time of index event unless stated otherwise
# The elapsed days are converted to years explicitly: recent pandas versions (2.x) reject
# np.timedelta64(1, 'Y') as an ambiguous unit, and pd.to_datetime lets birth_date be stored
# as dates, timestamps or date strings.
DAYS_PER_YEAR = 365.2425  # mean length of a Gregorian calendar year
today = pd.Timestamp(datetime.date.today())
birth_dates = pd.to_datetime(demographic['birth_date'])
demographic['age_as_of_today'] = (today - birth_dates).dt.days / DAYS_PER_YEAR
demographic['age_as_of_today_group'] = categorize_age(df=demographic, age_column='age_as_of_today')

# clean SEX column
demographic = clean_sex(df=demographic, sex_column='sex')

# clean RACE and HISPANIC column then categorize patients based on race and ethnicity combined 
demographic = clean_race(df=demographic, race_column='race')
demographic = clean_ethnicity(df=demographic, ethnicity_column='hispanic')
# NOTE: the column name 'race_ethinicity' is misspelled; it is kept as-is so that anything reading this column keeps working
demographic['race_ethinicity'] = categorize_race_ethnicity(df=demographic, race_column='race', ethnicity_column='hispanic')
