In [None]:
# if any of the modules imported below are not already installed, use the command below in your notebook to install them
# !pip install NameOfYourModule (e.g. !pip install pandas)

import pandas as pd
import numpy as np

import os

# Pre-defined functions

In [None]:
def get_pasc_category(diagnosis: pd.DataFrame, index: pd.DataFrame, PASC_definition_reference: pd.DataFrame, patid_column='syn_pt_id', category='ccsr_category'):
    '''Find the first instance of every PASC-like diagnosis category for each patient.

    The resulting dataframes are used to identify the date of PASC diagnosis and the subphenotypes.

    Args:
        diagnosis (pd.DataFrame): diagnosis table (PCORnet CDM style). The columns read here are
            patid_column, 'dx' (ICD-10 code) and 'admit_date'; a 'site' column is dropped if present.
        index (pd.DataFrame): custom index table with patid_column and 'index_date' for each patient.
        PASC_definition_reference (pd.DataFrame): reference table with 'i10_code' and the category column.
        patid_column (str, optional): the column indicating the patient identifier. Defaults to 'syn_pt_id'.
        category (str, optional): diagnosis category column in PASC_definition_reference. Defaults to 'ccsr_category'.

    Returns:
        A tuple of two pandas dataframes. Both have one row per patient and one column per diagnosis category.
        categorized_diff: smallest difference (in days) between the index date and a diagnosis in that category.
            Negative means the diagnosis pre-dates the index date, positive means it was recorded afterwards,
            NaN means the patient has no retained diagnosis in that category.
        categorized_date: earliest admit_date of a retained diagnosis in that category (NaT/NaN if none).
    '''

    # attach the index date to every diagnosis; patients without an index date are dropped by the inner join
    dx = pd.merge(
        diagnosis,
        index[[patid_column, 'index_date']],
        on=patid_column, how='inner'
    ).drop_duplicates()

    # difference in days between the diagnosis date and the index date
    # < 0: recorded before the index date, > 0: recorded after the index date
    dx['date_diff_from_index'] = (
        dx['admit_date'] - dx['index_date']) / np.timedelta64(1, 'D')

    # 'site' is not needed downstream; errors='ignore' lets single-site tables without that column through
    dx = dx.drop(columns=['site'], errors='ignore').drop_duplicates()

    # join to the PASC definition to get the category of each PASC-like diagnosis
    dx = pd.merge(
        dx,
        PASC_definition_reference[['i10_code', category]],
        left_on='dx',
        right_on='i10_code',
        how='inner'
    )

    # discard diagnoses inside the blackout period (strictly between 7 days before and
    # 30 days after the index date) and anything more than 180 days after the index date
    in_blackout = dx['date_diff_from_index'].between(-7, 30, inclusive='neither')
    within_followup = dx['date_diff_from_index'] <= 180
    dx = dx[~in_blackout & within_followup]

    # keep the category but not the ICD-10 code, so that several codes in the same
    # category are counted as the same diagnosis
    dx = (
        dx[[patid_column, 'date_diff_from_index', category, 'admit_date']]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # one row per patient, one column per category, value = smallest day difference.
    # NOTE: no drop_duplicates() on the pivoted frame -- the patient id lives in the index there,
    # so de-duplicating would silently discard patients who share identical values.
    categorized_diff = dx.pivot_table(
        index=[patid_column],
        columns=[category],
        values='date_diff_from_index',
        aggfunc='min')

    # one row per patient, one column per category, value = earliest admit_date in that category.
    # De-duplicate per (patient, category) so every category keeps its own first date.
    categorized_date = (
        dx.sort_values([patid_column, category, 'admit_date'])
        .drop_duplicates([patid_column, category])
        .pivot(index=[patid_column], columns=[category], values='admit_date')
    )

    # expose the patient identifier as a regular column in both results
    categorized_date = categorized_date.reset_index()
    categorized_diff = categorized_diff.reset_index(level=patid_column)

    return categorized_diff, categorized_date


In [None]:
def get_pasc_subphenotype(pasc_diff: pd.DataFrame, patid_column='syn_pt_id'):
    '''Identify one PASC subphenotype per patient: the category with the earliest non-negative day difference.

    Args:
        pasc_diff (pd.DataFrame): the first returned result (i.e. categorized_diff) from get_pasc_category:
            patid_column plus one column of day differences per diagnosis category.
        patid_column (str, optional): the column indicating the patient identifier. Defaults to 'syn_pt_id'.

    Returns:
        pd.DataFrame: one row per patient that has at least one non-negative value, with columns
            patid_column, 'subphenotype_name' (category holding the smallest value),
            'subphenotype_days' (that smallest value) and 'subphenotype_interval'
            ('30-59', '60-89', '90-119', '120-149', '150+', or NaN when the value is below 30).
            The input dataframe is not modified.
    '''

    # work on a patient-indexed copy so the caller's frame is left untouched
    day_diffs = pasc_diff.set_index(patid_column)

    # negative values are pre-existing diagnoses, not PASC: blank them out, then drop
    # patients with nothing left (idxmin on an all-NaN row is an error in recent pandas)
    day_diffs = day_diffs.mask(day_diffs < 0).dropna(how='all')

    result_columns = [patid_column, 'subphenotype_name',
                      'subphenotype_days', 'subphenotype_interval']
    if day_diffs.empty:
        return pd.DataFrame(columns=result_columns)

    # column NAME of the smallest value -> subphenotype; the smallest VALUE -> days from index date
    pasc_subphenotype = pd.DataFrame({
        'subphenotype_name': day_diffs.idxmin(axis=1),
        'subphenotype_days': day_diffs.min(axis=1),
    }).reset_index()

    # categorize the interval with half-open bins [30, 60), [60, 90), ... so that day 149
    # (and any fractional day) lands in a bin; values below 30 stay missing
    interval_edges = [30, 60, 90, 120, 150, np.inf]
    interval_labels = ['30-59', '60-89', '90-119', '120-149', '150+']
    pasc_subphenotype['subphenotype_interval'] = pd.cut(
        pasc_subphenotype['subphenotype_days'],
        bins=interval_edges,
        labels=interval_labels,
        right=False
    ).astype(object)

    return pasc_subphenotype


In [None]:
def get_pasc_pts(index:pd.DataFrame, pasc_yn:pd.DataFrame, pasc_subphenotype:pd.DataFrame, patid_column='syn_pt_id'):
    '''Combine the index table, PASC flags and subphenotype into one patient-level table.

    Please note this function only works when each patient has at most one subphenotype row.

    Args:
        index (pd.DataFrame): index table containing patid_column and 'index_date'.
        pasc_yn (pd.DataFrame): patid_column plus one column per diagnosis category, where a value
            of 1 marks a PASC diagnosis. This dataframe is not modified.
        pasc_subphenotype (pd.DataFrame): dataframe generated by get_pasc_subphenotype function.
        patid_column (str, optional): the column indicating the patient identifier. Defaults to 'syn_pt_id'.

    Returns:
        pd.DataFrame: one row per patient in the index table with 'index_date', the dichotomous
            'pasc_yn' flag (1 if any category equals 1, else 0) and the subphenotype columns
            (missing for patients without a subphenotype).
    '''

    # list of all patients with an index date
    pasc_pts = index[[patid_column, 'index_date']].copy()

    # move the identifier into the index of a new frame so it is not compared against 1
    # together with the category flags (and the caller's pasc_yn stays untouched)
    category_flags = pasc_yn.set_index(patid_column)
    has_pasc = (category_flags == 1).any(axis=1).to_numpy()
    pasc_patient_ids = category_flags.index[has_pasc]

    # dichotomous variable indicating PASC status
    pasc_pts['pasc_yn'] = pasc_pts[patid_column].isin(pasc_patient_ids).astype(int)

    # left join keeps every patient with an index date, with or without a subphenotype
    pasc_pts = pd.merge(
        pasc_pts,
        pasc_subphenotype,
        on=patid_column,
        how='left'
    )

    return pasc_pts

# Implementation
For this example, we will use a series of previously queried Parquet files containing data from several sites.

In [None]:
# main folder of the query
# NOTE: when left empty, the folders below are created relative to the current working directory
main_path = ""

# where the raw data will be saved (created if it does not exist yet)
source_data_path = os.path.join(main_path, "source data")
os.makedirs(source_data_path, exist_ok=True)

# where the results will be saved (created if it does not exist yet)
result_path = os.path.join(main_path, "result")
os.makedirs(result_path, exist_ok=True)

# where all external data needed for analysis is already saved (e.g. PASC definition spreadsheet)
external_source_path = ""


In [None]:
# Sites included in the analysis.
# Each name must exactly match the corresponding schema name in the live database.
site_names = [
    'mshs',
    'wcm',
    'nyu',
    'montefiore',
    'columbia',
]

# Study period boundaries, formatted as YYYY-MM-DD: (start, end)
study_start_date, study_end_date = '2020-03-01', '2022-07-30'


In [None]:
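# Read the necessary tables if they are not already loaded in your environment.
# The cells below expect two dataframes to exist: `index` (index date per patient) and `diagnosis_all` (all diagnoses).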
# index = pd.read_csv("")
# index = pd.read_parquet("")

# diagnosis_all = pd.read_csv("")
# diagnosis_all = pd.read_parquet("")

## PASC definition reference spreadsheet
_PASC_subphenotype.xlsx_ is a spreadsheet that contains at least the following information about the PASC-like diagnoses:
* **i10_code**: ICD-10 code of the diagnosis
* **ccsr_category**: diagnosis category of each ICD-10 code
* **pasc_name_simple**: simplified name of PASC category
* additional columns providing further information about the diagnosis but not being used

Please note that the PASC definition reference spreadsheet you are using must have at least the three columns mentioned above. Please ensure the column names match the expected column names.

The data analysis pipeline uses the list of diagnoses in this spreadsheet to identify PASC. Please remove any diagnosis that should not be part of the reference list.


In [None]:
# load the PASC definition reference spreadsheet
PASC_definition = pd.read_excel(
    f'{external_source_path}/PASC_subphenotype.xlsx')

# standardize the column names used by the rest of the pipeline
# please reference the correct column names in your spreadsheet if using a different one
PASC_definition = PASC_definition.rename(columns={
    'ICD-10-CM Code_clean': 'i10_code',
    'pasc': 'ccsr_category',
    'PASC Name Simple': 'pasc_name_simple'
})

# a flag to filter any diagnosis that does not meet the stringent definition
# you may comment this line out if the spreadsheet you are using already contains the diagnoses of interest
# .copy() makes the filtered table an independent dataframe, so adding columns to it later
# does not raise SettingWithCopyWarning
PASC_definition = PASC_definition[PASC_definition['selected stringent'] == 1].copy()


In [None]:
# Crosswalk from diagnosis category (ccsr_category) to organ system.
# NOTE(review): the "neurological" label is lower-case while the other systems are capitalized;
# it is left unchanged here because later cells may match on the exact string -- confirm before normalizing.
system_crosswalk = {
    "Abdominal pain and other digestive/abdomen signs and symptoms": "Digestive",
    "Acute phlebitis; thrombophlebitis and thromboembolism": "Circulatory",
    "Acute pulmonary embolism": "Circulatory",
    "Anemia": "Blood",
    "Circulatory signs and symptoms": "Circulatory",
    "Diabetes mellitus with complication": "Endocrine",
    "Fever": "General",
    "Fluid and electrolyte disorders": "Endocrine",
    "Headache; including migraine": "neurological",
    "Malaise and fatigue": "neurological",
    "Malnutrition": "Endocrine",
    "Musculoskeletal pain, not low back pain": "Musculoskeletal",
    "Nervous system signs and symptoms": "neurological",
    "Neurocognitive disorders": "neurological",
    "Nonspecific chest pain": "Circulatory",
    "Other general signs and symptoms": "Endocrine",
    "Other nervous system disorders (neither hereditary nor degenerative)": "neurological",
    "Other specified and unspecified gastrointestinal disorders": "Digestive",
    "Other specified and unspecified lower respiratory disease": "Respiratory",
    "Other specified and unspecified skin disorders": "Skin",
    "Other specified upper respiratory infections": "Respiratory",
    "PASC-General": "PASC Diagnosis",
    "Pressure ulcer of skin": "Skin",
    "Respiratory signs and symptoms": "Respiratory",
    "Sleep wake disorders": "neurological",
}

In [None]:
# Crosswalk from diagnosis category (ccsr_category) to the color used for it in visualizations.
color_crosswalk = {
    "Abdominal pain and other digestive/abdomen signs and symptoms": "brown",
    "Acute phlebitis; thrombophlebitis and thromboembolism": "crimson",
    "Acute pulmonary embolism": "crimson",
    "Anemia": "orange",
    "Circulatory signs and symptoms": "crimson",
    "Diabetes mellitus with complication": "lightgreen",
    "Fever": "lightgrey",
    "Fluid and electrolyte disorders": "lightgreen",
    "Headache; including migraine": "skyblue",
    "Malaise and fatigue": "skyblue",
    "Malnutrition": "lightgreen",
    "Musculoskeletal pain, not low back pain": "pink",
    "Nervous system signs and symptoms": "skyblue",
    "Neurocognitive disorders": "skyblue",
    "Nonspecific chest pain": "crimson",
    "Other general signs and symptoms": "lightgreen",
    "Other nervous system disorders (neither hereditary nor degenerative)": "skyblue",
    "Other specified and unspecified gastrointestinal disorders": "brown",
    "Other specified and unspecified lower respiratory disease": "tan",
    "Other specified and unspecified skin disorders": "thistle",
    "Other specified upper respiratory infections": "tan",
    "PASC-General": "black",
    "Pressure ulcer of skin": "thistle",
    "Respiratory signs and symptoms": "tan",
    "Sleep wake disorders": "skyblue",
}

In [None]:
# assign an organ system and a color (for visualizations) based on the crosswalks created earlier
# .assign builds a new dataframe instead of writing columns into the filtered reference table,
# which avoids SettingWithCopyWarning; categories missing from a crosswalk map to NaN
PASC_definition = PASC_definition.assign(
    system=PASC_definition['ccsr_category'].map(system_crosswalk),
    color=PASC_definition['ccsr_category'].map(color_crosswalk),
)

## PASC diagnoses
The first step is to prepare the standard DIAGNOSIS table from PCORnet CDM to only include PASC like diagnoses of interest

In [None]:
# keep only the rows of the table with ALL diagnoses whose code appears in the PASC definition
# reference spreadsheet, creating a smaller subset containing only the PASC like diagnoses
# this step may not be necessary, but will help to optimize the query and its later functions
# an isin filter is used instead of an inner merge so that a code listed more than once in the
# reference does not duplicate diagnosis rows, and no helper column has to be dropped afterwards
pasc_diagnoses = diagnosis_all[
    diagnosis_all['dx'].isin(PASC_definition['i10_code'])
].reset_index(drop=True)


## PASC category
In this step, run the main algorithm in the get_pasc_category function, which identifies, for each patient, whether a recorded diagnosis was pre-existing or was developed after the index date.

In [None]:
# Classify the PASC-like diagnoses relative to each patient's index date.
# `index` is the custom index table (created using a pre-defined function) holding the index dates for each patient.
pasc_diff, pasc_date = get_pasc_category(
    pasc_diagnoses,
    index,
    PASC_definition,
    patid_column='syn_pt_id',
    category='ccsr_category',
)

## PASC subphenotype
In this step you can assign one PASC subphenotype per patient

In [None]:
# Derive one subphenotype per patient from the day differences computed by get_pasc_category.
pasc_subphenotype = get_pasc_subphenotype(pasc_diff, patid_column='syn_pt_id')

## PASC status
Up to this point, you have all the information needed for you to make a determination regarding whether the patient has developed PASC or not. 

The code snippet below takes the first returned result (i.e. pasc_diff) of the get_pasc_category function and replaces the positive values with 1, the negative values with -1, and the remaining NULL values with 0. This allows us to quickly identify whether a recorded diagnosis counts as PASC (value 1) or is a pre-existing diagnosis recorded prior to the index date (value -1). A value of 0 in a column means the patient has never been diagnosed with this diagnosis.

In [None]:
pasc_yn = pasc_diff.copy()

# all diagnosis category columns, i.e. every column except the patient identifier
col_list_ccsr = pasc_yn.columns.drop('syn_pt_id')

# recode the day differences:
#   +1 -> PASC dx (recorded 30 or more days after the index date)
#   -1 -> existing dx (recorded 7 or more days before the index date)
#    0 -> never diagnosed (NaN) or inside the blackout period
# the boundaries are inclusive because get_pasc_category keeps diagnoses on exactly day -7 and
# day 30 (its blackout excludes only the days strictly in between) and get_pasc_subphenotype
# bins day 30 as '30-59'
day_diffs = pasc_yn[col_list_ccsr].to_numpy(dtype=float)
pasc_yn[col_list_ccsr] = np.select(
    [day_diffs >= 30, day_diffs <= -7],
    [1, -1],
    default=0
)