In [1]:
# if any of the modules below are not already installed, please use the command below in your notebook to install the module
# !pip install NameOfYourModule (e.g. !pip install pandas)

import pandas as pd
import numpy as np

import os

# INDEX_ALL table

## Definition
The index_all table is a dataframe that contains all instances of COVID-19 indications. This table is meant to facilitate and optimize the query process by pre-compiling a list of patients who have any of the COVID-19 indications in the database. You may point all of your queries to the list of patients in this table for each site to reduce the initial cohort of your analysis and focus only on the relevant patients, i.e. those with at least one of the COVID-19 indications described below.

Be aware that if you choose not to use this pre-compiled list of patients as your starting point, you may significantly increase the size of the initial patient cohort of your analysis and, subsequently, the size of your data in memory. You may be required to pull all patients' data into memory to find the COVID-19 indications from various tables such as diagnosis and lab_result_cm.

The current COVID-19 indications included in this table are:
1. COVID-19 diagnosis and the setting it was recorded in
2. COVID-19 PCR or antigen test
3. Paxlovid prescription

## Format
site | syn_pt_id |index_date |index_type | index_result | enc_type
-----|-----|-----|-----|-----|-----

## Dictionary
* site: site's name
* syn_pt_id: unique patient identifier. Combination of site's name and patid column in PCORnet CDM. 
    * [site]_[patid]. e.g. WCM_123456 
* index_date: the date when COVID-19 indication was recorded
* index_type: the type of COVID-19 indication as stated above. Possible values are
    * **covid_dx** to represent COVID-19 diagnosis (U07.1)
    * **lab** to represent COVID-19 PCR or antigen test
    * **paxlovid** to represent Paxlovid prescription
* index_result: this is only populated if the index_type is lab and NULL for others. Possible values are **positive** or **negative** 
* enc_type: this is only populated if the index_type is covid_dx and NULL for others. Possible values match the enc_type values in the DIAGNOSIS table in the PCORnet CDM 


## How to create

### In database using SQL query
This table is already available for the majority of the sites live in the database under the **qtwg** schema. The script below generates the SQL query to create this table in the database for each site (i.e. qtwg.[SITE]_index_all). For example, WCM's index_all table is in **qtwg.wcm_index_all**. 

In [None]:
# Dynamically builds one SQL script that (re)creates a permanent qtwg.<site>_index_all table
# containing the index events for every site listed in `site_names`.
# The printed result should be run in a PostgreSQL client (pgAdmin):
# open it in a text editor, then copy it into pgAdmin to run.
#
# NOTE: `site_names` is not defined in this cell; it is assigned in the configuration cell of the
# "Implementation" section further down, so that cell must be executed before this one.

# running number of the site being processed; only used in the progress notice of the generated SQL
counter = 1

# header of the generated script; one SQL section per site is appended to it in the loop below
final_query = """
--=====================--
--===== index_all =====--
--=====================--
"""

for i in site_names:

    # for the 'ochsner' schema rx_start_date is wrapped in to_date(...) before it is selected/compared
    # (presumably the column is stored as text at that site -- TODO confirm)
    if i == 'ochsner':
        rx_start_date = "to_date(rx_start_date, 'YYYY-MM-DD')"
    else:
        rx_start_date = "rx_start_date"

    # Per-site SQL: drop and recreate qtwg.<site>_index_all, then insert three kinds of rows:
    #   1. 'lab'      from <site>.lab_result_cm joined to public.covid_tests on the LOINC code
    #   2. 'covid_dx' from <site>.diagnosis where dx is U07.1
    #   3. 'paxlovid' from <site>.prescribing for the listed rxnorm_cui values on/after 2020-03-01
    # NOTE(review): the lab join keeps only loinc.type = 'Ag' rows, while the notebook text describes
    #   "PCR or antigen" tests -- confirm whether PCR tests are meant to be included here.
    # NOTE(review): several rxnorm_cui values appear twice in the IN list; harmless for IN, but worth de-duplicating.
    # NOTE(review): the table is created with a `patid` column and no `site` / `syn_pt_id` columns, unlike the
    #   format documented at the top of the notebook -- confirm how those columns are derived downstream.
    query = f"""
--==============================================================================================================--
--============================================== {i} ===================================================--
--==============================================================================================================--
DO $$ begin raise notice '{counter}.processing {i}'; end; $$;


DROP TABLE IF EXISTS qtwg.{i}_index_all;

CREATE TABLE qtwg.{i}_index_all
(
    patid varchar(200),
    enc_type varchar(50),
    index_date date,
    index_type varchar(50),
    index_result varchar(50)
);



INSERT INTO qtwg.{i}_index_all (patid, enc_type, index_date, index_type, index_result)
select labs.patid, NULL as enc_type
    , result_date as index_date
    , 'lab' as index_type
    , case when result_qual in ('POSITIVE', 'DETECTED', 'PRESUMPTIVE POSITIVE') then 'positive' else 'negative' end as index_result
from {i}.lab_result_cm labs
join public.covid_tests loinc
    on loinc.concept_code = labs.lab_loinc
    and loinc.type = 'Ag';



INSERT INTO qtwg.{i}_index_all (patid, enc_type, index_date, index_type, index_result)
select dx.patid, enc_type
    , admit_date as index_date
    , 'covid_dx' as index_type
    , null as index_result
from {i}.diagnosis dx
where dx in ('U07.1');



INSERT INTO qtwg.{i}_index_all (patid, enc_type, index_date, index_type, index_result)
select patid, NULL as enc_type, {rx_start_date} as rx_start_date
, 'paxlovid' as index_type, NULL as index_result
from {i}.prescribing
where {rx_start_date} >= '2020-03-01' 
and rxnorm_cui in (
  '2587899', '2587898', '2587892', '2587897', '2587893', '2587896', '2587894', '2587895'
, '69108530', '69108506', '2587899', '2587898', '2587892', '2587897', '2587893', '2587896'
, '2587894', '2587895', '85762', '199249', '316647', '317488', '331537', '597728', '746644'
, '1926065', '1926066', '317150', '332450', '900575', '900574', '1802209', '373795', '373796');


DO $$ begin raise notice '{i} finished'; end; $$;
    """

    # append this site's section to the overall script and advance the progress counter
    final_query = final_query + "\n" + query
    counter += 1

print(final_query)  # open in a text editor then copy in PgAdmin to run


## In memory using python
If you choose to create your own INDEX_ALL table, please make sure you structure the data per the format described earlier. The `create_index_all` function uses in-memory data to create the table.

In [None]:
def create_index_all(indications:list, diagnosis=None, lab_result_cm=None, prescribing=None):
    '''create_index_all function is the alternative (in-memory) method to create the index_all table,
    i.e. to compile a list of patients with at least one COVID-19 indication.

    Args:
        indications (list): list of COVID-19 indication types. Allowed choices are 'diagnosis', 'lab', 'paxlovid'.
            NOTE: 'paxlovid' is accepted but not implemented yet; a warning is issued and no paxlovid rows are added.
        diagnosis (pd.DataFrame, optional): the DIAGNOSIS table from PCORnet CDM. Must be provided if 'diagnosis' is in the indications list.
            Columns used: site, syn_pt_id, dx, admit_date, enc_type. Defaults to None.
        lab_result_cm (pd.DataFrame, optional): the LAB_RESULT_CM table from PCORnet CDM. Must be provided if 'lab' is in the indications list.
            Columns used: site, syn_pt_id, lab_loinc, result_qual, result_date. Defaults to None.
        prescribing (pd.DataFrame, optional): the PRESCRIBING table from PCORnet CDM. Currently unused (see the note on 'paxlovid'). Defaults to None.

    Returns:
        pd.DataFrame: one row per COVID-19 indication with the columns
            site, syn_pt_id, index_date (datetime.date), index_type, index_result, enc_type.
            An empty dataframe with these columns is returned when nothing was requested or found.

    Raises:
        AssertionError: if an indication is not one of the allowed choices, or a required table is not a pandas dataframe.
    '''
    import warnings

    # ensure all input choices for indications are correct
    for i in indications:
        assert i in ['diagnosis', 'lab', 'paxlovid'], f"{i} is not in the acceptable indications choicse of 'diagnosis', 'lab', or 'paxlovid'"

    index_columns = ['site', 'syn_pt_id', 'index_date', 'index_type', 'index_result', 'enc_type']
    # per-indication frames are collected here and concatenated once at the end
    frames = []

    ############################
    # pts with U07.1 diagnosis #
    ############################
    if 'diagnosis' in indications:
        # ensure the optional argument for diagnosis is passed correctly
        assert isinstance(diagnosis, pd.DataFrame), "the input for the diagnosis argument is not a pandas dataframe"

        # regex=False: the '.' in 'U07.1' must be a literal dot, not "any character"
        # na=False: rows without a dx code are simply not COVID-19 diagnoses
        is_covid_dx = diagnosis['dx'].str.contains('U07.1', regex=False, na=False)

        temp_dx = diagnosis[is_covid_dx].rename(columns={'admit_date': 'index_date'})
        temp_dx['index_type'] = 'covid_dx'
        temp_dx['index_result'] = np.nan

        frames.append(temp_dx[index_columns])

    ################
    # pts with lab #
    ################
    if 'lab' in indications:
        # ensure the optional argument for lab_result is passed correctly
        assert isinstance(lab_result_cm, pd.DataFrame), "the input for the lab_result argument is not a pandas dataframe"

        lab_loinc_values = [
            "94306-8", "94307-6", "94308-4", "94309-2", "94310-0"
            , "94311-8", "94312-6", "94313-4", "94314-2", "94315-9"
            , "94316-7", "94500-6", "94502-2", "94509-7", "94510-5"
            , "94511-3", "94531-1", "94532-9", "94533-7", "94534-5"
            , "94558-4", "94559-2", "94565-9", "94640-0", "94641-8"
            , "94642-6", "94643-4", "94645-9", "94646-7"
        ]
        lab_positive = ['POSITIVE', 'DETECTED', 'PRESUMPTIVE POSITIVE']

        # keep only the COVID-19 tests; both positive and negative results are retained
        temp_lab = lab_result_cm[lab_result_cm['lab_loinc'].isin(lab_loinc_values)].copy()
        temp_lab['index_type'] = 'lab'
        # exact match (same rule as the SQL version of this table): a substring match would
        # classify 'NOT DETECTED' as positive, and a missing result must not count as positive
        temp_lab['index_result'] = np.where(temp_lab['result_qual'].isin(lab_positive), 'positive', 'negative')
        temp_lab['enc_type'] = np.nan

        temp_lab = temp_lab.rename(columns={'result_date': 'index_date'})

        frames.append(temp_lab[index_columns])

    #####################
    # pts with paxlovid #
    #####################
    # TODO: add a search criterion to find paxlovid as a COVID-19 indication using the prescribing table
    if 'paxlovid' in indications:
        # make the gap visible instead of silently returning a table without paxlovid rows
        warnings.warn(
            "the 'paxlovid' indication is not implemented in create_index_all yet; no paxlovid rows were added",
            stacklevel=2
        )

    if frames:
        index_all = pd.concat(frames, ignore_index=True)
    else:
        # nothing requested/implemented: still return the documented columns
        index_all = pd.DataFrame(columns=index_columns)

    # convert data type
    index_all["index_date"] = pd.to_datetime(index_all["index_date"]).dt.date

    return index_all

# Pre-defined functions

In [None]:
def get_lab_pts(index_all: pd.DataFrame, patid_column='syn_pt_id'):
    '''get_lab_pts collects the patients who have at least one positive COVID-19 PCR or antigen lab

    Args:
        index_all (pd.DataFrame): a dataframe containing all COVID-19 indications.
        patid_column (str, optional): the column in the dataframe indicating the patient identifier. Defaults to 'syn_pt_id'.

    Returns:
        list: unique identifiers of all patients with at least one positive lab row (in no particular order)
    '''

    # a qualifying row is a lab indication whose result is positive
    is_lab = index_all['index_type'] == 'lab'
    is_positive = index_all['index_result'] == 'positive'
    positive_lab_rows = index_all[is_lab & is_positive]

    # de-duplicate the patient identifiers
    return list(set(positive_lab_rows[patid_column]))


In [None]:
def get_ip_dx_pts(index_all: pd.DataFrame, patid_column='syn_pt_id'):
    '''get_ip_dx_pts collects the patients who have at least one COVID-19 dx recorded in an inpatient setting

    Args:
        index_all (pd.DataFrame): a dataframe containing all COVID-19 indications.
        patid_column (str, optional): the column in the dataframe indicating the patient identifier. Defaults to 'syn_pt_id'.

    Returns:
        list: unique identifiers of all patients with at least one dx whose enc_type is 'IP' or 'EI' (in no particular order)
    '''

    inpatient_enc_types = ['IP', 'EI']

    is_covid_dx = index_all['index_type'].eq('covid_dx')
    is_inpatient = index_all['enc_type'].isin(inpatient_enc_types)

    # de-duplicate the patient identifiers of the qualifying rows
    return list(set(index_all.loc[is_covid_dx & is_inpatient, patid_column]))


In [None]:
def get_av_dx_pts(index_all: pd.DataFrame, patid_column='syn_pt_id'):
    '''get_av_dx_pts collects the patients who have at least one COVID-19 dx recorded in an outpatient setting

    Args:
        index_all (pd.DataFrame): a dataframe containing all COVID-19 indications.
        patid_column (str, optional): the column in the dataframe indicating the patient identifier. Defaults to 'syn_pt_id'.

    Returns:
        list: unique identifiers of all patients with at least one dx whose enc_type is 'AV', 'ED', 'TH' or 'OA' (in no particular order)
    '''

    outpatient_enc_types = ['AV', 'ED', 'TH', 'OA']

    is_covid_dx = index_all['index_type'].eq('covid_dx')
    is_outpatient = index_all['enc_type'].isin(outpatient_enc_types)

    # de-duplicate the patient identifiers of the qualifying rows
    return list(set(index_all.loc[is_covid_dx & is_outpatient, patid_column]))


In [None]:
def get_two_av_dx_pts(index_all: pd.DataFrame, patid_column='syn_pt_id'):
    '''get_two_av_dx_pts collects the patients who have COVID-19 dx in an outpatient setting on at least two different dates

    Args:
        index_all (pd.DataFrame): a dataframe containing all COVID-19 indications.
        patid_column (str, optional): the column in the dataframe indicating the patient identifier. Defaults to 'syn_pt_id'.

    Returns:
        list: unique identifiers of all patients whose outpatient dx rows ('AV', 'ED', 'TH', 'OA') cover
            at least two distinct index_date values (in no particular order)
    '''

    outpatient_enc_types = ['AV', 'ED', 'TH', 'OA']

    is_covid_dx = index_all['index_type'].eq('covid_dx')
    is_outpatient = index_all['enc_type'].isin(outpatient_enc_types)
    outpatient_dx = index_all[is_covid_dx & is_outpatient]

    # number of distinct outpatient dx dates per patient (several dx on the same date count once)
    distinct_dates = outpatient_dx.groupby(patid_column)['index_date'].nunique()

    # patients with at least 2 distinct outpatient dx dates
    return list(set(distinct_dates[distinct_dates >= 2].index))


In [None]:
def get_paxlovid_pts(index_all:pd.DataFrame, patid_column='syn_pt_id'):
    '''get_paxlovid_pts collects the patients who have at least one paxlovid prescription

    Args:
        index_all (pd.DataFrame): a dataframe containing all COVID-19 indications.
        patid_column (str, optional): the column in the dataframe indicating the patient identifier. Defaults to 'syn_pt_id'.

    Returns:
        list: unique identifiers of all patients with at least one paxlovid row (in no particular order)
    '''

    is_paxlovid = index_all['index_type'] == 'paxlovid'

    # de-duplicate the patient identifiers of the paxlovid rows
    return list(set(index_all.loc[is_paxlovid, patid_column]))

In [None]:
def get_index_event(df: pd.DataFrame, index_date_column='index_date', patid_column='syn_pt_id', start_date=None, end_date=None):
    '''get_index_event function finds the first instance of an index event per patient

    Args:
        df (pd.DataFrame): a dataframe with all instances of covid indication for all patients (i.e. positive lab, dx, and etc.)
        index_date_column (str, optional): the column in the dataframe indicating the date. Defaults to 'index_date'.
        patid_column (str, optional): the column in the dataframe indicating the patient identifier. Defaults to 'syn_pt_id'.
        start_date (str, optional): start of the study period (inclusive). Defaults to the study_start_date variable,
            looked up when the function is called.
        end_date (str, optional): end of the study period (inclusive). Defaults to the study_end_date variable,
            looked up when the function is called.

    Returns:
        pd.DataFrame: returns a dataframe with one row per patient indicating the first instance of the index event
            within the study period; rows without a valid date are dropped and the index is reset.
    '''

    # The notebook-level defaults are resolved at call time rather than in the signature, so this
    # function can be defined before the configuration cell has been run and always sees the
    # current values of study_start_date / study_end_date.
    if start_date is None:
        start_date = study_start_date
    if end_date is None:
        end_date = study_end_date

    # compare on whole days, independent of how the dates are stored (date objects, strings or timestamps)
    window_start = pd.to_datetime(start_date).normalize()
    window_end = pd.to_datetime(end_date).normalize()
    event_days = pd.to_datetime(df[index_date_column]).dt.normalize()

    # both bounds are inclusive; missing dates never fall inside the window
    in_window = event_days.between(window_start, window_end)

    # earliest event per patient; a stable sort keeps the input order among same-day events,
    # which makes the selected row deterministic
    index = (
        df[in_window]
        .sort_values(index_date_column, kind='stable')
        .drop_duplicates(patid_column)
        .reset_index(drop=True)
    )

    return index


# Implementation

In [None]:
# main folder of the query
# (left empty, the folders below are created relative to the current working directory)
main_path = ""

# where the raw data will be saved
# os.path.join is used instead of f"{main_path}/..." so that an empty main_path does not
# turn into a path at the root of the filesystem ("/source data")
source_data_path = os.path.join(main_path, "source data")
# create the folder if it does not exist
os.makedirs(source_data_path, exist_ok=True)

# where the results will be saved
result_path = os.path.join(main_path, "result")
# create the folder if it does not exist
os.makedirs(result_path, exist_ok=True)

# where all external data needed for analysis is already saved (e.g. PASC definition spreadsheet)
external_source_path = ""


In [None]:
# a list of site names used in the analysis
# the site names should exactly match the schema names live in the database
# (the SQL-generation cell interpolates each name into schema/table names such as
#  <site>.lab_result_cm and qtwg.<site>_index_all)
# NOTE: cells earlier in the notebook read the names defined here (e.g. the SQL generator loops
# over site_names), so run this cell before them when starting from a fresh kernel.
site_names = ['site1', 'site2', 'site3', 'site4', 'site5']

# Study period start and end date (YYYY-MM-DD)
# both dates are treated as inclusive bounds by the study-period filters in this notebook
study_start_date = '2020-03-01'
study_end_date = '2022-07-30'


In [None]:
# Read the necessary tables if the tables are not already loaded in your environment 
# covid_index_all = pd.read_csv("")
# covid_index_all = pd.read_parquet("")

In [None]:
# Create the table in memory. You must load diagnosis and lab_result_cm tables.
# covid_index_all = create_index_all(
#     indications=['diagnosis', 'lab'],
#     diagnosis=diagnosis,
#     lab_result_cm=lab_result_cm
# )

Using the pre-defined functions in this notebook, you can find the list of patients who are identified as COVID-19 patients, and subsequently the first instance of COVID-19 infection (i.e. index date).

Please note that the definition of a COVID-19 patient may vary depending on the query specification at hand. A COVID-19 definition will often utilize a combination of lab, diagnosis, and/or Paxlovid prescription.

In this example, we assume the COVID-19 patient is defined as anyone with:
* at least one positive PCR or antigen COVID-19 lab test
**OR**
* at least one COVID-19 diagnosis of U07.1 in an inpatient setting
**OR**
* at least two COVID-19 diagnoses of U07.1 in an outpatient setting on two different days

In [None]:
# exclude any COVID-19 indication outside of the study period time interval
# keep only the COVID-19 indications dated within the study period (both bounds inclusive)
covid_index_all = covid_index_all.loc[
    covid_index_all['index_date'].between(
        pd.to_datetime(study_start_date).date(),
        pd.to_datetime(study_end_date).date()
    )
]

In [None]:
# Patient lists for each part of the COVID-19 definition used in this example.
# The three lookups are independent of each other.

# patients with at least one positive covid lab
covid_lab = get_lab_pts(covid_index_all, patid_column='syn_pt_id')

# patients with at least one inpatient covid dx
covid_ip = get_ip_dx_pts(covid_index_all, patid_column='syn_pt_id')

# patients with outpatient covid dx on at least two different dates
covid_av_two = get_two_av_dx_pts(covid_index_all, patid_column='syn_pt_id')


In [None]:
# Restrict the indications to the diagnoses and positive labs of the patients
# who meet the COVID-19 definition described above.
input_index_all_df = covid_index_all[
    # the patient satisfies at least one part of the COVID-19 definition
    covid_index_all.syn_pt_id.isin([*covid_av_two, *covid_lab, *covid_ip])
    # paxlovid rows are not used as index events
    & (covid_index_all.index_type != 'paxlovid')
    # a lab row is only kept when its result is not negative
    & ((covid_index_all.index_type != 'lab') | (covid_index_all.index_result != 'negative'))
]

# first COVID-19 indication of each patient within the study period
index = get_index_event(
    input_index_all_df,
    index_date_column='index_date',
    patid_column='syn_pt_id',
    start_date=study_start_date,
    end_date=study_end_date
)

# the intermediate dataframe is no longer needed
del input_index_all_df
