In [None]:
# if any of the modules imported below are not already installed, run the command below in your notebook to install them
# !pip install NameOfYourModule (e.g. !pip install pandas)

import pandas as pd
import numpy as np
import datetime

from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from sqlalchemy import text as sqlalchemy_text

import pyarrow.parquet as pq
import pyarrow as pa

import os
import requests

In [None]:
# please make sure the recover_analysis.py is downloaded in your working directory (i.e. main_path) 
# you may download the latest version here: https://raw.githubusercontent.com/recoverEHRinformatics/data-analysis-pipeline/main/recover_analysis.py
url = 'https://raw.githubusercontent.com/recoverEHRinformatics/data-analysis-pipeline/main/recover_analysis.py'
r = requests.get(url, allow_redirects=True)

open(f"{main_path}/recover_analysis.py", 'wb').write(r.content)

import recover_analysis as rp

In [None]:
# a list of site names used in the analysis
# the site names should exactly match the schema names live in the database
site_names = ['site1', 'site2', 'site3', 'site4', 'site5']

# Study period start and end date (YYYY-MM-DD)
study_start_date = '2020-03-01'
study_end_date = '2022-07-30'


In [None]:
# main folder of the query
main_path = ""

# where the raw data will be saved
source_data_path = f"{main_path}/source data"
# create an empty folder if it does not exist
if os.path.exists(source_data_path) != True:
    os.makedirs(source_data_path)

# where the results will be saved
result_path = f"{main_path}/result"
# create an empty folder if it does not exist
if os.path.exists(result_path) != True:
    os.makedirs(result_path)

# where all external data needed for analysis is already saved (e.g. PASC definition spreadsheet)
external_source_path = ""


# Raw data extraction

In [None]:
# replace the empty strings below with the correct server/database information
# NOTE: avoid committing real credentials with the notebook
server = ""
database = ""
username = ""
password = ""
port = ''

# the dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts the "postgres" alias
# NOTE: special characters in the username/password (e.g. "@", "/", ":") must be URL-escaped
database_string = f"postgresql+psycopg2://{username}:{password}@{server}:{port}/{database}"

database_engine = create_engine(database_string)

In [None]:
# demographic rows for patients whose index_date falls inside the study period
# 'SiteSchema' is a placeholder written literally into the query text
# (presumably replaced per site by rp.extract_raw_data -- confirm in recover_analysis.py)
query_demographic = f"""
SELECT
CONCAT('SiteSchema', '_', patid) AS syn_pt_id
, t1.patid, t1.birth_date, t1.race, t1.hispanic, t1.sex
FROM SiteSchema.demographic t1
WHERE t1.patid in (
    SELECT patid FROM qtwg.SiteSchema_index_all 
    WHERE index_date >= '{study_start_date}' AND index_date <= '{study_end_date}'
);
"""

demographic = rp.extract_raw_data(
    data_name='demographic',
    query=query_demographic,
    site_names=site_names,
    source_data_path=source_data_path,
    database_engine=database_engine,
)

# the query text is no longer needed once the data is extracted
del query_demographic


In [None]:
# diagnosis rows for patients whose index_date falls inside the study period
# please note we are limiting the records in this table up to the end of the study period
# 'SiteSchema' is a placeholder written literally into the query text
query_diagnosis = f"""
SELECT
CONCAT('SiteSchema', '_', patid) AS syn_pt_id
, t1.patid, t1.admit_date, t1.dx
FROM SiteSchema.diagnosis t1
WHERE t1.patid in (
        SELECT patid FROM qtwg.SiteSchema_index_all 
        WHERE index_date >= '{study_start_date}' AND index_date <= '{study_end_date}'
        )
    AND t1.admit_date <= '{study_end_date}';
"""

diagnosis = rp.extract_raw_data(
    data_name='diagnosis',
    query=query_diagnosis,
    site_names=site_names,
    source_data_path=source_data_path,
    database_engine=database_engine,
)

# the query text is no longer needed once the data is extracted
del query_diagnosis


In [None]:
# index (COVID-19 indication) rows whose index_date falls inside the study period
# 'SiteSchema' is a placeholder written literally into the query text
query_index_all = f"""
SELECT
CONCAT('SiteSchema', '_', patid) AS syn_pt_id
, t1.patid, t1.index_date, t1.index_type, t1.index_result, t1.enc_type
FROM qtwg.SiteSchema_index_all t1
WHERE t1.index_date >= '{study_start_date}' AND t1.index_date <= '{study_end_date}';
"""

index_all = rp.extract_raw_data(
    data_name='index_all',
    query=query_index_all,
    site_names=site_names,
    source_data_path=source_data_path,
    database_engine=database_engine,
)

# the query text is no longer needed once the data is extracted
del query_index_all


In [None]:
# exclude any COVID-19 indication outside of the study period time interval
# both bounds are inclusive; index_date is compared against plain date objects
index_all = index_all[
    index_all['index_date'].between(
        pd.to_datetime(study_start_date).date(),
        pd.to_datetime(study_end_date).date(),
    )
]

# identify covid patients

In [None]:
# every cohort helper receives the same table and patient id column
cohort_inputs = dict(index_all=index_all, patid_column='syn_pt_id')

# find patients with at least two outpatient covid dx
covid_av_two = rp.get_two_av_dx_pts(**cohort_inputs)

# find patients with at least one inpatient covid dx
covid_ip = rp.get_ip_dx_pts(**cohort_inputs)

# find patients with at least one covid lab
covid_lab = rp.get_lab_pts(**cohort_inputs)


In [None]:
# Filtering the dataframe to include only diagnoses and positive lab instances

# patients who meet any of the COVID-19 definitions described above
# (the three cohort results are concatenated with "+")
meets_covid_definition = index_all.syn_pt_id.isin(covid_av_two + covid_lab + covid_ip)
# index events where the COVID-19 indication is paxlovid
is_paxlovid = index_all.index_type == 'paxlovid'
# index events where the lab result is negative
is_negative_lab = (index_all.index_type == 'lab') & (index_all.index_result == 'negative')

temp_index_all = index_all[meets_covid_definition & ~is_paxlovid & ~is_negative_lab]

# using get_index_event to find the first instance of COVID-19 indication for each patient
index = rp.get_index_event(
    df=temp_index_all,
    patid_column='syn_pt_id',
    index_date_column='index_date',
    start_date=study_start_date,
    end_date=study_end_date,
)

# drop the intermediates that are not used after this cell
del temp_index_all, meets_covid_definition, is_paxlovid, is_negative_lab


# identify pasc patients

In [None]:
# PASC definition reference spreadsheet, read straight from the project repository
pasc_definition_url = "https://github.com/recoverEHRinformatics/data-analysis-pipeline/blob/main/external%20data%20sources/PASC_subphenotype.xlsx?raw=true"

# please reference the correct column names in your spreadsheet if using a different one
pasc_column_aliases = {
    'ICD-10-CM Code_clean': 'i10_code',
    'pasc': 'ccsr_category',
    'PASC Name Simple': 'pasc_name_simple',
}

PASC_definition = pd.read_excel(pasc_definition_url).rename(columns=pasc_column_aliases)

# a flag to filter any diagnosis that does not meet the stringent definition
# you may comment this line out if the spreadsheet you are using already contains the diagnoses of interest
is_stringent = PASC_definition['selected stringent'] == 1
PASC_definition = PASC_definition[is_stringent]


In [None]:
# crosswalk from diagnosis category name to organ system label
system_crosswalk = {
    "Abdominal pain and other digestive/abdomen signs and symptoms": "Digestive",
    "Acute phlebitis; thrombophlebitis and thromboembolism": "Circulatory",
    "Acute pulmonary embolism": "Circulatory",
    "Anemia": "Blood",
    "Circulatory signs and symptoms": "Circulatory",
    "Diabetes mellitus with complication": "Endocrine",
    "Fever": "General",
    "Fluid and electrolyte disorders": "Endocrine",
    "Headache; including migraine": "neurological",
    "Malaise and fatigue": "neurological",
    "Malnutrition": "Endocrine",
    "Musculoskeletal pain, not low back pain": "Musculoskeletal",
    "Nervous system signs and symptoms": "neurological",
    "Neurocognitive disorders": "neurological",
    "Nonspecific chest pain": "Circulatory",
    "Other general signs and symptoms": "Endocrine",
    "Other nervous system disorders (neither hereditary nor degenerative)": "neurological",
    "Other specified and unspecified gastrointestinal disorders": "Digestive",
    "Other specified and unspecified lower respiratory disease": "Respiratory",
    "Other specified and unspecified skin disorders": "Skin",
    "Other specified upper respiratory infections": "Respiratory",
    "PASC-General": "PASC Diagnosis",
    "Pressure ulcer of skin": "Skin",
    "Respiratory signs and symptoms": "Respiratory",
    "Sleep wake disorders": "neurological",
}

In [None]:
# crosswalk from diagnosis category name to the color used in visualizations
color_crosswalk = {
    "Abdominal pain and other digestive/abdomen signs and symptoms": "brown",
    "Acute phlebitis; thrombophlebitis and thromboembolism": "crimson",
    "Acute pulmonary embolism": "crimson",
    "Anemia": "orange",
    "Circulatory signs and symptoms": "crimson",
    "Diabetes mellitus with complication": "lightgreen",
    "Fever": "lightgrey",
    "Fluid and electrolyte disorders": "lightgreen",
    "Headache; including migraine": "skyblue",
    "Malaise and fatigue": "skyblue",
    "Malnutrition": "lightgreen",
    "Musculoskeletal pain, not low back pain": "pink",
    "Nervous system signs and symptoms": "skyblue",
    "Neurocognitive disorders": "skyblue",
    "Nonspecific chest pain": "crimson",
    "Other general signs and symptoms": "lightgreen",
    "Other nervous system disorders (neither hereditary nor degenerative)": "skyblue",
    "Other specified and unspecified gastrointestinal disorders": "brown",
    "Other specified and unspecified lower respiratory disease": "tan",
    "Other specified and unspecified skin disorders": "thistle",
    "Other specified upper respiratory infections": "tan",
    "PASC-General": "black",
    "Pressure ulcer of skin": "thistle",
    "Respiratory signs and symptoms": "tan",
    "Sleep wake disorders": "skyblue",
}

In [None]:
# assign an organ system and a color (for visualizations) based on the crosswalks created earlier
# .assign returns a new frame, so no columns are written onto the filtered PASC_definition slice
# (which would raise SettingWithCopyWarning); categories missing from a crosswalk map to NaN
PASC_definition = PASC_definition.assign(
    system=PASC_definition['ccsr_category'].map(system_crosswalk),
    color=PASC_definition['ccsr_category'].map(color_crosswalk),
)

In [None]:
# keep only the diagnosis rows whose dx code appears in the PASC definition reference spreadsheet
# this step may not be necessary, but the smaller table helps to optimize the later functions
# NOTE(review): a code listed more than once in PASC_definition would repeat the matching rows -- confirm
pasc_diagnoses = (
    diagnosis
    .merge(
        PASC_definition[['i10_code']],
        how='inner',
        left_on='dx',
        right_on='i10_code',
    )
    # i10_code duplicates dx after the join
    .drop(columns='i10_code')
)


In [None]:
# rp.get_pasc_category returns two objects, unpacked here as pasc_diff and pasc_date
# diagnoses are grouped by the 'ccsr_category' column of the PASC definition reference
pasc_category_result = rp.get_pasc_category(
    diagnosis=pasc_diagnoses,
    index=index,
    PASC_definition_reference=PASC_definition,
    category='ccsr_category',
    patid_column='syn_pt_id',
)
pasc_diff, pasc_date = pasc_category_result
del pasc_category_result

In [None]:
# build the subphenotype table from pasc_diff, keyed by the synthetic patient id
pasc_subphenotype = rp.get_pasc_subphenotype(patid_column='syn_pt_id', pasc_diff=pasc_diff)

In [None]:
def _classify_offset(offset):
    """Turn one pasc_diff value into a flag: 1 (> 30), -1 (< -7), otherwise 0."""
    if offset > 30:
        return 1
    if offset < -7:
        return -1
    # values from -7 to 30, and anything that fails both comparisons (e.g. NaN)
    return 0


# work on a copy so pasc_diff keeps its original values
pasc_yn = pasc_diff.copy()

# get a list of all columns (diagnosis categories) avoiding patid column in the first position
col_list_ccsr = pasc_yn.columns[1:]

# +1 means it's pasc dx
# -1 means it's existing dx
#  0 means never been diagnosed
pasc_yn[col_list_ccsr] = pasc_yn[col_list_ccsr].apply(
    lambda category_values: [_classify_offset(value) for value in category_values]
)

In [None]:
# combine the index events with the PASC flags and subphenotypes into one patient-level table
pasc_pts = rp.get_pasc_pts(
    patid_column='syn_pt_id',
    index=index,
    pasc_yn=pasc_yn,
    pasc_subphenotype=pasc_subphenotype,
)

# demographic clean up

In [None]:
# calculate age as of today then categorize the age
# please note, for CSC queries you may often need to calculate age at the time of index event unless stated otherwise
# pandas >= 2.0 no longer supports the 'Y' unit of np.timedelta64(1, 'Y'), so the year length is spelled out:
# 365.2425 days is the same average year numpy uses for 'Y'
# pd.to_datetime makes the subtraction work whether birth_date holds date objects or datetimes
time_since_birth = pd.Timestamp(datetime.date.today()) - pd.to_datetime(demographic['birth_date'])
demographic['age_as_of_today'] = time_since_birth / pd.Timedelta(days=365.2425)
del time_since_birth
demographic['age_as_of_today_group'] = rp.categorize_age(df=demographic, age_column='age_as_of_today')

# clean SEX column
demographic = rp.clean_sex(df=demographic, sex_column='sex')

# clean RACE and HISPANIC column then categorize patients based on race and ethnicity combined 
demographic = rp.clean_race(df=demographic, race_column='race')
demographic = rp.clean_ethnicity(df=demographic, ethnicity_column='hispanic')
demographic['race_ethnicity'] = rp.categorize_race_ethnicity(df=demographic, race_column='race', ethnicity_column='hispanic')


# analysis

In [None]:
# column names reused throughout the analysis cells
patid_column, sex_column = 'syn_pt_id', 'sex'

In [None]:
# initiate the flat file with the index table to ensure every patient will have an index date
flat = index[['site', patid_column, 'index_date']].copy()

# inner joining to demographic table to collect demographic information
flat = flat.merge(
    demographic[[patid_column, sex_column, 'race_ethnicity', 'birth_date', 'age_as_of_today', 'age_as_of_today_group']],
    on=patid_column,
    how='inner'
)

# calculate and categorize age at the time of index event
# pandas >= 2.0 no longer supports the 'Y' unit of np.timedelta64(1, 'Y'), so the year length is spelled out:
# 365.2425 days is the same average year numpy uses for 'Y'
# pd.to_datetime makes the subtraction work whether the columns hold date objects or datetimes
time_from_birth_to_index = pd.to_datetime(flat['index_date']) - pd.to_datetime(flat['birth_date'])
flat['age_as_of_index'] = time_from_birth_to_index / pd.Timedelta(days=365.2425)
del time_from_birth_to_index
flat['age_as_of_index_group'] = rp.categorize_age(df=flat, age_column='age_as_of_index')

# inner joining to pasc_pts table that contains PASC information for all patients regardless of their status
flat = flat.merge(
    pasc_pts,
    on=[patid_column, 'index_date'],
    how='inner'
)

In [None]:
# distinct patient counts by sex (rows) and pasc_yn (columns), with "All" margins
pd.pivot_table(
    flat,
    index=sex_column,
    columns='pasc_yn',
    values=patid_column,
    aggfunc='nunique',
    margins=True,
)