In [None]:
# if any of the modules below are not already installed, use the command below in your notebook to install them
# !pip install NameOfYourModule (e.g. !pip install pandas)

import pandas as pd

import os

# INDEX_ALL table

## Definition
The index_all table is a dataframe that contains all instances of COVID-19 indications. The intended COVID-19 indications are:
1. COVID-19 diagnosis and the setting it was recorded in
2. COVID-19 PCR or antigen test
3. Paxlovid prescription

## Location
This table is already available for the majority of the sites live in the database under the **qtwg** schema. The script below generates the SQL query to create this table in the database for each site (i.e. qtwg.\<SITE\>_index_all). For example, WCM's index_all table is in **qtwg.wcm_index_all**. 

If you choose to create your own INDEX_ALL table please make sure you structure the data per format below.

## Format
site | syn_pt_id |index_date |index_type | index_result | enc_type
-----|-----|-----|-----|-----|-----

## Dictionary
* site: site's name
* syn_pt_id: unique patient identifier. Combination of site's name and patid column in PCORnet CDM. 
    * \<site\>_\<patid\>. e.g. WCM_123456 
* index_date: the date when COVID-19 indication was recorded
* index_type: the type of COVID-19 indication as stated above. Possible values are
    * **covid_dx** to represent COVID-19 diagnosis (U07.1)
    * **lab** to represent COVID-19 PCR or antigen test
    * **paxlovid** to represent Paxlovid prescription
* index_result: this is only populated if the index_type is lab and NULL for others. Possible values are **positive** or **negative** 
* enc_type: this is only populated if the index_type is covid_dx and NULL for others. Possible values match the enc_type values in the DIAGNOSIS table in the PCORnet CDM 


## SQL query

In [2]:
# Dynamically build one SQL script that (re)creates a permanent table of index events
# (qtwg.<site>_index_all) for every site listed in `site_names`.
# The printed script is meant to be run in a PostgreSQL client (pgAdmin):
# open the output in a text editor, then copy it into pgAdmin to run.
# NOTE(review): `site_names` is assigned in a later cell of this notebook (Implementation
# section); on a fresh kernel that cell has to be executed before this one.

# 1-based position of the current site; only used in the "processing" progress notice
counter = 1

# script header; one section per site is appended to it in the loop below
final_query = """
--=====================--
--===== index_all =====--
--=====================--
"""

for i in site_names:

    # For the 'ochsner' schema rx_start_date is wrapped in to_date(..., 'YYYY-MM-DD');
    # presumably the column is stored as text there — TODO confirm.
    if i == 'ochsner':
        rx_start_date = "to_date(rx_start_date, 'YYYY-MM-DD')"
    else:
        rx_start_date = "rx_start_date"

    # Per-site section of the script. It:
    #   1. drops and recreates qtwg.<site>_index_all
    #      (columns: patid, enc_type, index_date, index_type, index_result);
    #   2. inserts 'lab' rows: lab_result_cm joined to public.covid_tests on the LOINC code,
    #      dated by result_date; result_qual POSITIVE / DETECTED / PRESUMPTIVE POSITIVE maps to
    #      'positive', every other value (including NULL) falls into the ELSE branch -> 'negative';
    #   3. inserts 'covid_dx' rows: diagnosis rows with dx = 'U07.1', dated by admit_date;
    #   4. inserts 'paxlovid' rows: prescribing rows on/after 2020-03-01 whose rxnorm_cui is in
    #      the hard-coded list (the list repeats some codes, which is harmless inside IN).
    # NOTE(review): the lab join keeps only covid_tests rows with type = 'Ag', while the table
    #   definition in this notebook speaks of PCR or antigen tests — confirm this is intended.
    # NOTE(review): the table created here has `patid` and no `site` / `syn_pt_id` columns,
    #   unlike the format described in the markdown above — presumably added downstream; verify.
    query = f"""
--==============================================================================================================--
--============================================== {i} ===================================================--
--==============================================================================================================--
DO $$ begin raise notice '{counter}.processing {i}'; end; $$;


DROP TABLE IF EXISTS qtwg.{i}_index_all;

CREATE TABLE qtwg.{i}_index_all
(
    patid varchar(200),
    enc_type varchar(50),
    index_date date,
    index_type varchar(50),
    index_result varchar(50)
);



INSERT INTO qtwg.{i}_index_all (patid, enc_type, index_date, index_type, index_result)
select labs.patid, NULL as enc_type
    , result_date as index_date
    , 'lab' as index_type
    , case when result_qual in ('POSITIVE', 'DETECTED', 'PRESUMPTIVE POSITIVE') then 'positive' else 'negative' end as index_result
from {i}.lab_result_cm labs
join public.covid_tests loinc
    on loinc.concept_code = labs.lab_loinc
    and loinc.type = 'Ag';



INSERT INTO qtwg.{i}_index_all (patid, enc_type, index_date, index_type, index_result)
select dx.patid, enc_type
    , admit_date as index_date
    , 'covid_dx' as index_type
    , null as index_result
from {i}.diagnosis dx
where dx in ('U07.1');



INSERT INTO qtwg.{i}_index_all (patid, enc_type, index_date, index_type, index_result)
select patid, NULL as enc_type, {rx_start_date} as rx_start_date
, 'paxlovid' as index_type, NULL as index_result
from {i}.prescribing
where {rx_start_date} >= '2020-03-01' 
and rxnorm_cui in (
  '2587899', '2587898', '2587892', '2587897', '2587893', '2587896', '2587894', '2587895'
, '69108530', '69108506', '2587899', '2587898', '2587892', '2587897', '2587893', '2587896'
, '2587894', '2587895', '85762', '199249', '316647', '317488', '331537', '597728', '746644'
, '1926065', '1926066', '317150', '332450', '900575', '900574', '1802209', '373795', '373796');


DO $$ begin raise notice '{i} finished'; end; $$;
    """

    # append this site's section to the overall script and advance the progress counter
    final_query = final_query + "\n" + query
    counter += 1

print(final_query)  # open in a text editor then copy in PgAdmin to run



--===== index_all =====--


DO $$ begin raise notice '1.processing vumc'; end; $$;


DROP TABLE IF EXISTS qtwg.vumc_index_all;

CREATE TABLE qtwg.vumc_index_all
(
    patid varchar(200),
    enc_type varchar(50),
    index_date date,
    index_type varchar(50),
    index_result varchar(50)
);



INSERT INTO qtwg.vumc_index_all (patid, enc_type, index_date, index_type, index_result)
select labs.patid, NULL as enc_type
    , result_date as index_date
    , 'lab' as index_type
    , case when result_qual in ('POSITIVE', 'DETECTED', 'PRESUMPTIVE POSITIVE') then 'positive' else 'negative' end as index_result
from vumc.lab_result_cm labs
join public.covid_tests loinc
    on loinc.concept_code = labs.lab_loinc
    and loinc.type = 'Ag';



INSERT INTO qtwg.vumc_index_all (patid, enc_type, index_date, index_type, index_result)
select dx.patid, enc_type
    , admit_date as index_date
    , 'covid_dx' as index_type
    , null as index_result
from vumc.diagnosis dx
where dx in ('U07.1');



INSE

# Pre-defined functions

In [None]:
def get_lab_pts(index_all: pd.DataFrame, patid_column='syn_pt_id'):
    '''Collect the patients that have a positive COVID-19 lab record.

    Args:
        index_all (pd.DataFrame): dataframe of COVID-19 indications; the columns
            'index_type' and 'index_result' are read in addition to ``patid_column``.
        patid_column (str, optional): name of the patient identifier column. Defaults to 'syn_pt_id'.

    Returns:
        list: de-duplicated patient identifiers (in no particular order) with at least
            one row whose index_type is 'lab' and whose index_result is 'positive'
    '''

    is_lab = index_all['index_type'] == 'lab'
    is_positive = index_all['index_result'] == 'positive'

    # keep only positive lab rows, then collapse to unique patients
    positive_lab_rows = index_all[is_lab & is_positive]

    return list(set(positive_lab_rows[patid_column]))


In [None]:
def get_ip_dx_pts(index_all: pd.DataFrame, patid_column='syn_pt_id'):
    '''Collect the patients that have a COVID-19 diagnosis recorded in an inpatient setting.

    Args:
        index_all (pd.DataFrame): dataframe of COVID-19 indications; the columns
            'index_type' and 'enc_type' are read in addition to ``patid_column``.
        patid_column (str, optional): name of the patient identifier column. Defaults to 'syn_pt_id'.

    Returns:
        list: de-duplicated patient identifiers (in no particular order) with at least
            one 'covid_dx' row whose enc_type is 'IP' or 'EI'
    '''

    inpatient_enc_types = ['IP', 'EI']

    is_dx = index_all['index_type'] == 'covid_dx'
    is_inpatient = index_all['enc_type'].isin(inpatient_enc_types)

    inpatient_dx_ids = index_all.loc[is_dx & is_inpatient, patid_column]

    return list(set(inpatient_dx_ids))


In [None]:
def get_av_dx_pts(index_all: pd.DataFrame, patid_column='syn_pt_id'):
    '''Collect the patients that have a COVID-19 diagnosis recorded in an outpatient setting.

    Args:
        index_all (pd.DataFrame): dataframe of COVID-19 indications; the columns
            'index_type' and 'enc_type' are read in addition to ``patid_column``.
        patid_column (str, optional): name of the patient identifier column. Defaults to 'syn_pt_id'.

    Returns:
        list: de-duplicated patient identifiers (in no particular order) with at least
            one 'covid_dx' row whose enc_type is 'AV', 'ED', 'TH' or 'OA'
    '''

    outpatient_enc_types = ['AV', 'ED', 'TH', 'OA']

    is_dx = index_all['index_type'] == 'covid_dx'
    is_outpatient = index_all['enc_type'].isin(outpatient_enc_types)

    outpatient_dx_ids = index_all.loc[is_dx & is_outpatient, patid_column]

    return list(set(outpatient_dx_ids))


In [None]:
def get_two_av_dx_pts(index_all: pd.DataFrame, patid_column='syn_pt_id'):
    '''Collect the patients with outpatient COVID-19 diagnoses on at least two distinct dates.

    Args:
        index_all (pd.DataFrame): dataframe of COVID-19 indications; the columns
            'index_type', 'enc_type' and 'index_date' are read in addition to ``patid_column``.
        patid_column (str, optional): name of the patient identifier column. Defaults to 'syn_pt_id'.

    Returns:
        list: de-duplicated patient identifiers (in no particular order) whose 'covid_dx'
            rows with enc_type 'AV', 'ED', 'TH' or 'OA' span two or more distinct index_date values
    '''

    is_dx = index_all['index_type'] == 'covid_dx'
    is_outpatient = index_all['enc_type'].isin(['AV', 'ED', 'TH', 'OA'])
    outpatient_dx = index_all.loc[is_dx & is_outpatient, [patid_column, 'index_date']]

    # number of distinct diagnosis dates per patient (repeats on the same day count once)
    distinct_days = outpatient_dx.groupby(patid_column)['index_date'].nunique()

    # the group keys of the remaining rows are the qualifying patients
    qualifying = distinct_days[distinct_days >= 2]

    return list(set(qualifying.index))


In [None]:
def get_paxlovid_pts(index_all:pd.DataFrame, patid_column='syn_pt_id'):
    '''Collect the patients that have a paxlovid prescription record.

    Args:
        index_all (pd.DataFrame): dataframe of COVID-19 indications; the column
            'index_type' is read in addition to ``patid_column``.
        patid_column (str, optional): name of the patient identifier column. Defaults to 'syn_pt_id'.

    Returns:
        list: de-duplicated patient identifiers (in no particular order) with at least
            one row whose index_type is 'paxlovid'
    '''

    is_paxlovid = index_all['index_type'] == 'paxlovid'
    paxlovid_ids = index_all.loc[is_paxlovid, patid_column]

    return list(set(paxlovid_ids))

In [None]:
def get_index_event(df: pd.DataFrame, index_date_column='index_date', patid_column='syn_pt_id', start_date=None, end_date=None):
    '''get_index_event function finds the first instance of an index event per patient

    Args:
        df (pd.DataFrame): a dataframe with all instances of covid indication for all patients (i.e. positive lab, dx, and etc.)
        index_date_column (str, optional): the column in the dataframe indicating the date. Defaults to 'index_date'.
            The values are compared against ``datetime.date`` bounds, so the column is expected to hold date objects.
        patid_column (str, optional): the column in the dataframe indicating the patient identifier. Defaults to 'syn_pt_id'.
        start_date (str, optional): start of the study period (inclusive). Defaults to None, in which case the
            notebook-level study_start_date variable is used.
        end_date (str, optional): end of the study period (inclusive). Defaults to None, in which case the
            notebook-level study_end_date variable is used.

    Returns:
        pd.DataFrame: a new dataframe with one row per patient indicating the first instance of the index event
            within the study period, sorted by date, with a fresh 0..n-1 index

    Raises:
        NameError: if a bound is omitted and the corresponding study_* variable has not been defined yet
    '''

    # The study period variables are looked up at call time rather than in the signature:
    # a default in the signature is evaluated once, when the def runs, which fails if the
    # variables are defined later in the notebook and ignores any later change to them.
    if start_date is None:
        start_date = study_start_date
    if end_date is None:
        end_date = study_end_date

    start_date = pd.to_datetime(start_date).date()
    end_date = pd.to_datetime(end_date).date()

    in_period = (df[index_date_column] >= start_date) & (df[index_date_column] <= end_date)

    # earliest event first, then keep the first row seen for every patient
    return (
        df[in_period]
        .sort_values(index_date_column)
        .drop_duplicates(patid_column)
        .reset_index(drop=True)
    )


# Implementation

In [None]:
# main folder of the query
# leave empty to work relative to the notebook's current working directory
main_path = ""

# where the raw data will be saved
# os.path.join keeps the path relative when main_path is empty; building it by hand as
# "<main_path>/source data" would point at "/source data" in the filesystem root
source_data_path = os.path.join(main_path, "source data")
# create the folder if it does not exist (no error when it is already there)
os.makedirs(source_data_path, exist_ok=True)

# where the results will be saved
result_path = os.path.join(main_path, "result")
# create the folder if it does not exist (no error when it is already there)
os.makedirs(result_path, exist_ok=True)

# where all external data needed for analysis is already saved (e.g. PASC definition spreadsheet)
external_source_path = ""


In [None]:
# a list of site names used in the analysis
# the site names should exactly match the schema names live in the database
# (each name is interpolated into the generated SQL as a schema name and as part of the table name)
# NOTE: the SQL-generation cell near the top of this notebook loops over site_names,
# so this cell has to be run before that one on a fresh kernel
site_names = ['site1', 'site2', 'site3', 'site4', 'site5']

# Study period start and end date (YYYY-MM-DD)
# used by the cells below to restrict COVID-19 indications to the study period
# and as the default study period of get_index_event
study_start_date = '2020-03-01'
study_end_date = '2022-07-30'


In [None]:
# Load the INDEX_ALL data into `covid_index_all` if it is not already loaded in your environment (the cells below require it)
# covid_index_all = pd.read_csv("")
# covid_index_all = pd.read_parquet("")


Using the pre-defined functions in this notebook, you can find the list of patients who are identified as COVID-19 patients, and subsequently the first instance of COVID-19 infection (i.e. index date).

Please note that the definition of a COVID-19 patient may vary depending on the query specification at hand. The COVID-19 definition often utilizes a combination of lab, diagnosis, and/or paxlovid prescription.

In this example, we assume the COVID-19 patient is defined as anyone with:
* at least one positive PCR or antigen COVID-19 lab test
**OR**
* at least one COVID-19 diagnosis of U07.1 in an inpatient setting
**OR**
* at least two COVID-19 diagnoses of U07.1 in an outpatient setting on two different days

In [None]:
# keep only the COVID-19 indications dated within the study period (both bounds inclusive)
covid_index_all = covid_index_all.loc[
    covid_index_all['index_date'].between(
        pd.to_datetime(study_start_date).date(),
        pd.to_datetime(study_end_date).date(),
    )
]

In [None]:
# Build the three patient lists used by the COVID-19 definition above.
# The calls are independent of each other; 'syn_pt_id' is the patient key throughout.

# patients with at least one positive covid lab
covid_lab = get_lab_pts(index_all=covid_index_all, patid_column='syn_pt_id')

# patients with at least one inpatient covid dx
covid_ip = get_ip_dx_pts(index_all=covid_index_all, patid_column='syn_pt_id')

# patients with outpatient covid dx on at least two different dates
covid_av_two = get_two_av_dx_pts(index_all=covid_index_all, patid_column='syn_pt_id')


In [None]:
# Restrict the indications to diagnoses and positive labs of the patients
# who satisfy the COVID-19 definition described above.
input_index_all_df = covid_index_all.loc[
    # patient meets at least one of the three criteria
    covid_index_all['syn_pt_id'].isin(covid_av_two + covid_lab + covid_ip)
    # ...and the row is neither a paxlovid prescription nor a negative lab result
    & ~(
        (covid_index_all['index_type'] == 'paxlovid')
        | ((covid_index_all['index_type'] == 'lab') & (covid_index_all['index_result'] == 'negative'))
    )
]

# earliest remaining COVID-19 indication per patient within the study period
index = get_index_event(
    input_index_all_df,
    index_date_column='index_date',
    patid_column='syn_pt_id',
    start_date=study_start_date,
    end_date=study_end_date,
)

# the intermediate frame is no longer needed
del input_index_all_df
