# Cohort identification
1. Encounter data up to the cutoff date
2. Patients with dementia or mild cognitive impairment (MCI)
3. Count the number of unique patients
4. Count the number of patients aged 50-100 at diagnosis
5. Count the number of patients who had opioid medication use before the diagnosis

## environment setting

In [None]:
## load libraries 
# Import standard Python libraries
import getpass  
import re 
import json 
import sys  

# Import data analysis and visualization libraries
import pandas as pd 
import numpy as np  
import seaborn as sns  
import matplotlib.pyplot as plt  

# Import datetime utilities
from datetime import datetime, timedelta  

# Import libraries for Google BigQuery
import pandas_gbq as pgbq  
from google.cloud import bigquery  

# Import operating system utilities
import os  

# Import SQLAlchemy for database connection
from sqlalchemy import create_engine  



In [None]:
# Configure BigQuery project and environment
project_id = ''  # BigQuery project ID
credentials_path = ''  # Path to the service-account key file used for authentication

# Export the environment settings BEFORE constructing the client: a client that is
# built without explicit credentials resolves them from the environment at
# construction time, so assignments made afterwards do not affect it.
if credentials_path:  # skip an empty placeholder so that no blank path is exported
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path  # Authentication credentials
os.environ['GCLOUD_PROJECT'] = project_id  # Set the project environment
client = bigquery.Client(project=project_id)  # Initialize BigQuery client

# Define database and datasets
db = ""  # Main database
stanford_ds = ""  # Stanford dataset
yh_ds = ""  # Custom dataset

In [None]:
## custom functions 
# Save a query result to a BigQuery table
def save_table(project_id, yh_ds, new_table_name, query):
    """Execute ``query`` and store its result as a BigQuery table.

    The destination is ``{project_id}.{yh_ds}.{new_table_name}``. An existing
    table at that path is overwritten (``WRITE_TRUNCATE``). The call goes
    through the module-level ``client`` and blocks until the job has finished.

    Args:
        project_id: Project part of the destination table path.
        yh_ds: Dataset part of the destination table path.
        new_table_name: Table part of the destination table path.
        query: SQL text to run.
    """
    destination = f"{project_id}.{yh_ds}.{new_table_name}"
    config = bigquery.QueryJobConfig(destination=destination)
    config.write_disposition = "WRITE_TRUNCATE"  # replace the table when it already exists
    client.query(query, job_config=config).result()  # result() waits for completion
    print(f"Query results loaded to the table {destination}")

# Load a table from BigQuery into a Pandas DataFrame
def load_pgbq(project_id, yh_ds, table_name):
    """Read every row of a BigQuery table into a pandas DataFrame.

    Args:
        project_id: Project part of the table path.
        yh_ds: Dataset part of the table path.
        table_name: Table part of the table path.

    Returns:
        The DataFrame produced by ``pandas_gbq.read_gbq`` for ``SELECT *`` on
        the table.
    """
    select_all = f"SELECT * FROM {project_id}.{yh_ds}.{table_name}"
    loaded = pgbq.read_gbq(select_all, dialect="standard")
    print(f"{project_id}.{yh_ds}.{table_name} is loaded")
    return loaded

# Upload a Pandas DataFrame to BigQuery
def upload_pgbq(project_id, yh_ds, table_name, df):
    """Write ``df`` to the BigQuery table ``{yh_ds}.{table_name}`` in ``project_id``.

    An existing table with the same name is replaced.

    Args:
        project_id: Project that receives the table.
        yh_ds: Destination dataset.
        table_name: Destination table name.
        df: DataFrame to upload.
    """
    pgbq.to_gbq(
        df,
        f"{yh_ds}.{table_name}",  # pandas_gbq takes "dataset.table"; the project is passed separately
        project_id=project_id,
        if_exists='replace',
    )
    print(f"DataFrame is uploaded as {project_id}.{yh_ds}.{table_name}")

# Remove a BigQuery table
def remove_table(project_id, yh_ds, table_name):
    """Delete the BigQuery table ``{project_id}.{yh_ds}.{table_name}``.

    A missing table is not an error (``not_found_ok=True``); the confirmation
    message is printed in either case. Uses the module-level ``client``.

    Args:
        project_id: Project part of the table path.
        yh_ds: Dataset part of the table path.
        table_name: Table part of the table path.
    """
    target = f"{project_id}.{yh_ds}.{table_name}"
    client.delete_table(target, not_found_ok=True)
    print(f"Deleted table '{target}'.")

## 1. Encounter table until 2024-07-31

In [None]:
# Set the cutoff date; it is reused by the later diagnosis and opioid queries
cut_off_date = '2024-07-31'
shc_encounter = 'shc_encounter'
# Copy every encounter row whose contact_date is on or before the cutoff
# from the source dataset into a table in `yh_ds`.
sql_query = f"""
SELECT *
FROM `{project_id}.{stanford_ds}.{shc_encounter}`
WHERE contact_date <= DATE('{cut_off_date}');
"""

# Save the cutoff-limited encounter table
table_name = "encounter_07312024"
save_table(project_id, yh_ds, table_name, sql_query)

## 2. patients with dementia / mild cognitive impairment 

In [None]:
## custom regex functions 
# Generate a regex pattern for a single ICD-10 code
def generate_regex_icd10(ancestor):
    """Build a regex fragment matching an ICD-10 code and its sub-codes.

    The fragment is unanchored; ``generate_sql_regex_icd10s`` adds ``^``/``$``.

    * A code without a decimal point (``'F03'``) matches itself and any code
      with a decimal extension (``'F03.90'``, ``'F03.A0'``).
    * A code with a decimal point (``'G31.84'``) matches itself and any code
      that appends further characters (``'G31.841'``).

    The extension accepts digits AND upper-case letters. A digits-only suffix
    cannot match alphanumeric sub-codes such as ``F03.A0``, so those rows
    would never be selected.

    Args:
        ancestor: ICD-10 code text, with or without a decimal point.

    Returns:
        The regex fragment as a string.
    """
    escaped = ancestor.replace('.', '\\.')  # a literal dot must be escaped in the regex
    if '.' in ancestor:
        suffix = '([0-9A-Z]+)?'  # optional extra characters after the minor part
    else:
        suffix = '(\\.[0-9A-Z]+)?'  # optional decimal point plus extension
    return escaped + suffix

# Generate a combined regex pattern for a list of ICD-10 codes
def generate_sql_regex_icd10s(icd10_list):
    """Combine several ICD-10 codes into one fully anchored regex.

    Each code is converted with ``generate_regex_icd10`` and the fragments are
    joined as alternatives: ``^(frag1|frag2|...)$``.

    Args:
        icd10_list: Iterable of ICD-10 code strings; must not be empty.

    Returns:
        The combined regex string.

    Raises:
        ValueError: If ``icd10_list`` is empty. An empty list used to yield the
            unterminated pattern ``'^('``, which is not a valid regex.
    """
    fragments = [generate_regex_icd10(code) for code in icd10_list]
    if not fragments:
        raise ValueError("icd10_list must contain at least one ICD-10 code")
    return '^(' + '|'.join(fragments) + ')$'

In [None]:
# ICD-10 codes used to select dementia / MCI diagnoses; the generated regex
# also accepts sub-codes of each entry.
# NOTE(review): G31.0 and G31.83 are tested by get_FTD / get_LBD later in the
# notebook but are not listed here, so rows carrying only those codes are
# presumably not selected by the ICD-10 filter - confirm this is intended.
icd10_list = ['F00', 'F01', 'F02', 'F03', 'G30', 'F05.1', 'G31.1', 'G31.84']

# Generate the combined regex string
regex_string = generate_sql_regex_icd10s(icd10_list)

In [None]:
# Select dementia / MCI diagnosis rows dated on or before `cut_off_date`
# (defined in the encounter cell above).
# Patient fields: age at diagnosis, sex, birth date, ethnic group, death date, deceased flag, race.
# Diagnosis fields: id, name, ICD-10 code, and start date.
# Death date is the first value that parses among death_date_ssa and death_date
# (patient table) and ext_death_date (external table), in that order.
# NOTE(review): the source tables are hard-coded below instead of being built from
# project_id / stanford_ds as in the encounter query - confirm this is intentional.
sql_query = f"""
    SELECT 
        pat.pat_deid,  -- Patient ID
        DATE_DIFF(DATE(diag.start_date), DATE(pat.birth_date), YEAR) AS age_at_diagnosis,  -- Calculate age at diagnosis
        pat.sex,  -- Patient's sex
        DATETIME(pat.birth_date) AS birth_date,  -- Patient's birth date
        pat.ethnic_group,  -- Patient's ethnic group
        COALESCE(
    SAFE.PARSE_DATETIME('%Y-%m-%d %H:%M:%S', CAST(pat.death_date_ssa AS STRING)),
    SAFE.PARSE_DATETIME('%Y-%m-%d %H:%M:%S', CAST(pat.death_date AS STRING)),
    SAFE.PARSE_DATETIME('%Y-%m-%d %H:%M:%S', CAST(ext.ext_death_date AS STRING))
) AS death_date,  -- Use external death date if internal is missing
        COALESCE(pat.deceased, pat.deceased_epic) AS deceased,  
        pat.race,  -- Patient's race
        diag.dx_id,  -- Diagnosis ID
        diag.dx_name,  -- Diagnosis name
        diag.icd10,  -- Diagnosis ICD-10 code
        DATETIME(diag.start_date) AS diagnosis_date  -- Diagnosis date
    FROM `som-nero-phi-boussard.stanfordmed_datalake.shc_diagnosis` AS diag
    JOIN `som-nero-phi-boussard.stanfordmed_datalake.shc_patient` AS pat 
        ON pat.pat_deid = diag.pat_deid  -- Join on patient ID
    LEFT JOIN `som-nero-phi-boussard.stanfordmed_datalake.external_death_dates` AS ext 
        ON pat.pat_deid = ext.pat_deid  -- Include external death dates
    WHERE 
        (
            REGEXP_CONTAINS(diag.icd10, r'{regex_string}')  -- Match ICD-10 codes with regex
            OR REGEXP_CONTAINS(diag.dx_name, r'^(dementia|Dementia)$')  -- Match diagnosis names with regex
        )
        AND DATE(diag.start_date) <= DATE('{cut_off_date}')  -- Filter diagnoses before or on the cutoff date
"""



# Save the raw dementia / MCI diagnosis table
table_name = "dementia_pat_dementia_diagnosis_07312024"
save_table(project_id, yh_ds, table_name, sql_query)

In [None]:
# Load the saved dementia / MCI diagnosis rows into a DataFrame
table_name = "dementia_pat_dementia_diagnosis_07312024"
df_dementia = load_pgbq(project_id, yh_ds, table_name)

In [None]:
## Sanity check: list the distinct codes and names that were pulled in
# Missing values are dropped first, then the distinct values are sorted for reading
unique_icd10 = sorted(df_dementia['icd10'].dropna().unique())
unique_dx_names = sorted(df_dementia['dx_name'].dropna().unique())

# # Print the results
# print("Unique ICD-10 Codes:")
# for code in unique_icd10:
#     print(f"- {code}")

# print("\nUnique Disease Names:")
# for name in unique_dx_names:
#     print(f"- {name}")

In [None]:
# Exclude diagnosis rows that should not count towards the cohort.
# df_dementia itself is left unmodified.

# Rows whose ICD-10 code contains 'Z' (family-history codes); missing codes are kept
has_z_code = df_dementia['icd10'].str.contains('Z', na=False)

# Rows whose diagnosis name contains 'Geriatric psychosis' (pseudodementia); missing names are kept
is_geriatric_psychosis = df_dementia['dx_name'].str.contains('Geriatric psychosis', na=False)

filtered_df = df_dementia[~has_z_code & ~is_geriatric_psychosis].copy()

# Report how many unique patients remain after the exclusions
print("Original unique patients:", df_dementia['pat_deid'].nunique())
print("Filtered unique patients:", filtered_df['pat_deid'].nunique())

## 3. patient aged 50-100

In [None]:
# Keep diagnosis rows where the patient was 50 to 100 years old (both inclusive)
age_at_dx = filtered_df['age_at_diagnosis']
in_age_range = (age_at_dx >= 50) & (age_at_dx <= 100)
df_dementia_elders = filtered_df[in_age_range]

In [None]:
# Report the number of distinct patients left after the 50-100 age filter
print ("number of unique elderly patients in dementia table is", df_dementia_elders['pat_deid'].nunique())

In [None]:
# Distinct patients in the age-filtered table that have a non-null death date
has_death_date = df_dementia_elders['death_date'].notnull()
num_patients_with_death_date = df_dementia_elders.loc[has_death_date, 'pat_deid'].nunique()

# Report the count
print("Number of unique elderly patients with a death date in the dementia table is", num_patients_with_death_date)

Get the dementia type:
1. AD (Alzheimer's disease)
2. Frontotemporal
3. Vascular
4. Lewy body
5. Other types

## 4. Collect type of dementia or MCI

In [None]:
def get_MCI(ICD10, dx_name):
    """Flag a diagnosis row as mild cognitive impairment (MCI).

    Args:
        ICD10: ICD-10 code text of the row. Non-string values (``None``, NaN)
            are treated as an empty code instead of raising.
        dx_name: Diagnosis name of the row. Non-string values are treated as
            an empty name instead of raising.

    Returns:
        1 if the code contains ``'G31.84'`` or the name contains
        ``'mild cognitive impairment'`` or ``'mci'`` (case-insensitive), else 0.
    """
    # The diagnosis table can hold nulls in either column; normalise them so
    # the substring tests below cannot fail on a missing value.
    code = ICD10 if isinstance(ICD10, str) else ''
    name = dx_name.lower() if isinstance(dx_name, str) else ''
    if 'G31.84' in code:
        return 1
    if 'mild cognitive impairment' in name or 'mci' in name:
        return 1
    return 0

def get_AD(ICD10, dx_name):
    """Flag a diagnosis row as Alzheimer's disease (AD).

    Args:
        ICD10: ICD-10 code text of the row. Non-string values (``None``, NaN)
            are treated as an empty code instead of raising.
        dx_name: Diagnosis name of the row. Non-string values are treated as
            an empty name instead of raising.

    Returns:
        1 if the code contains ``'G30'`` or the name contains ``'alzheimer'``
        (case-insensitive), else 0.
    """
    # Nulls can occur in either column; normalise them before substring tests.
    code = ICD10 if isinstance(ICD10, str) else ''
    name = dx_name.lower() if isinstance(dx_name, str) else ''
    if 'G30' in code or 'alzheimer' in name:
        return 1
    return 0
def get_FTD(ICD10, dx_name):
    """Flag a diagnosis row as frontotemporal dementia (FTD).

    Args:
        ICD10: ICD-10 code text of the row. Non-string values (``None``, NaN)
            are treated as an empty code instead of raising.
        dx_name: Diagnosis name of the row. Non-string values are treated as
            an empty name instead of raising.

    Returns:
        1 if the code contains ``'G31.0'`` or the name contains
        ``'frontotemporal'`` (case-insensitive; the hyphenated spelling
        ``'fronto-temporal'`` is accepted as well), else 0.
    """
    # Nulls can occur in either column; normalise them before substring tests.
    code = ICD10 if isinstance(ICD10, str) else ''
    name = dx_name.lower() if isinstance(dx_name, str) else ''
    # Dropping hyphens lets "fronto-temporal" match the same keyword.
    if 'G31.0' in code or 'frontotemporal' in name.replace('-', ''):
        return 1
    return 0
def get_VD(ICD10, dx_name):
    """Flag a diagnosis row as vascular dementia (VD).

    Args:
        ICD10: ICD-10 code text of the row. Non-string values (``None``, NaN)
            are treated as an empty code instead of raising.
        dx_name: Diagnosis name of the row. Non-string values are treated as
            an empty name instead of raising.

    Returns:
        1 if the code contains ``'F01'`` or the name contains
        ``'vascular dementia'`` (case-insensitive), else 0.
    """
    # Nulls can occur in either column; normalise them before substring tests.
    code = ICD10 if isinstance(ICD10, str) else ''
    name = dx_name.lower() if isinstance(dx_name, str) else ''
    if 'F01' in code or 'vascular dementia' in name:
        return 1
    return 0
def get_LBD(ICD10, dx_name):
    """Flag a diagnosis row as Lewy body dementia (LBD).

    Args:
        ICD10: ICD-10 code text of the row. Non-string values (``None``, NaN)
            are treated as an empty code instead of raising.
        dx_name: Diagnosis name of the row. Non-string values are treated as
            an empty name instead of raising.

    Returns:
        1 if the code contains ``'G31.83'`` or the name mentions Lewy
        body/bodies (case-insensitive), else 0.
    """
    # Nulls can occur in either column; normalise them before substring tests.
    code = ICD10 if isinstance(ICD10, str) else ''
    name = dx_name.lower() if isinstance(dx_name, str) else ''
    # 'lewy bod' covers both "Lewy body" and "Lewy bodies"; the previous
    # keyword 'lewy body' never matched "Dementia with Lewy bodies".
    if 'G31.83' in code or 'lewy bod' in name:
        return 1
    return 0

def get_dementia_type(df):
    """Add dementia-type flags and days-to-death to a copy of ``df``.

    Args:
        df: Diagnosis-level DataFrame with the columns ``icd10``, ``dx_name``,
            ``death_date`` and ``diagnosis_date``.

    Returns:
        A copy of ``df`` with these extra columns:

        * ``MCI``, ``AD``, ``FTD``, ``VD``, ``LBD`` - 0/1 flags from the
          matching ``get_*`` classifier applied to each row.
        * ``other_D`` - 1 when none of the five flags is set, else 0.
        * ``death_from_diagnosis`` - whole days from ``diagnosis_date`` to
          ``death_date``.
    """
    typed = df.copy()
    classifiers = {
        'MCI': get_MCI,
        'AD': get_AD,
        'FTD': get_FTD,
        'VD': get_VD,
        'LBD': get_LBD,
    }
    # Build each flag from the two input columns directly. This avoids one
    # row-wise DataFrame.apply pass per flag and also works on an empty frame.
    for flag, classify in classifiers.items():
        typed[flag] = [
            classify(code, name)
            for code, name in zip(typed['icd10'], typed['dx_name'])
        ]
    no_specific_type = (typed[list(classifiers)] == 0).all(axis=1)
    typed['other_D'] = np.where(no_specific_type, 1, 0)
    typed['death_from_diagnosis'] = (typed['death_date'] - typed['diagnosis_date']).dt.days
    return typed


In [None]:
# Add the dementia-type flags and days-to-death to the age-filtered diagnosis rows
df_dementia_elders_types = get_dementia_type(df_dementia_elders)

In [None]:
# Display the column names to confirm that the type flags were added
df_dementia_elders_types.columns

## 5. get patient-level dementia/MCI table 

In [None]:
# Column roles used to collapse diagnosis rows into one row per patient
id_columns = ['pat_deid']  # patient identifier
keep_columns = ['sex', 'birth_date', 'ethnic_group', 'death_date', 'deceased', 'race']  # carried over without aggregation
min_columns = ['diagnosis_date', 'age_at_diagnosis']  # earliest value = first diagnosis date and age at that time
max_columns = ['MCI', 'AD', 'FTD', 'VD', 'LBD', 'other_D', 'death_from_diagnosis']  # flag set on any row; longest interval = counted from the first diagnosis

# Map each column to its aggregation: 'min' entries first, then 'max' entries
agg_dict = {
    **dict.fromkeys(min_columns, 'min'),
    **dict.fromkeys(max_columns, 'max'),
}
agg_dict


In [None]:
# Collapse the typed diagnosis rows to one row per 'pat_deid', applying the
# per-column 'min' / 'max' rules held in agg_dict
df_dementia_elders_agg = df_dementia_elders_types.groupby('pat_deid', as_index=False).agg(agg_dict)

In [None]:
# Distinct demographic rows (patient ID plus the carried-over columns)
patient_columns = id_columns + keep_columns
df_dementia_pat = df_dementia_elders[patient_columns].drop_duplicates()

# Attach the per-patient aggregates; the left join keeps every demographic row
df_dementia_first_diagnosis = df_dementia_pat.merge(
    df_dementia_elders_agg,
    how='left',
    on='pat_deid',
)

# Report the size of the result and the number of nulls in each column
print("Shape of the merged DataFrame:", df_dementia_first_diagnosis.shape)
missing_values_count = df_dementia_first_diagnosis.isnull().sum()
print("Number of missing values in each column:")
print(missing_values_count)

In [None]:
# Upload the patient-level first-diagnosis table, replacing any existing copy.
# The shared upload_pgbq helper performs the same pandas_gbq.to_gbq call that
# was spelled out here before. No new BigQuery client is created: the upload
# does not use one, and rebinding the global `client` would silently change
# the client used by save_table / remove_table.
table_name = 'dementia_pat_first_diagnosis_07312024'
upload_pgbq(project_id, yh_ds, table_name, df_dementia_first_diagnosis)

In [None]:
# ## last encounter date and number of encounters 
# # Define dataset and table names
# stanford_ds = "stanfordmed_datalake"
# yh_ds = "YH_dementia"
# yh_encounter = "encounter_07312024"
# yh_cohort = "dementia_pat_first_diagnosis_07312024"
# yh_new_cohort = "dementia_pat_last_encounter_07312024"

# # SQL query to create or replace the new table with last encounter date and BMI
# sql_query = f"""
# CREATE OR REPLACE TABLE {project_id}.{yh_ds}.{yh_new_cohort} AS

# SELECT 
#     pat.*,  -- Include all columns from the patient cohort
#     vc.visits_before,  -- Number of visits before the diagnosis date
#     vc.visits_after,  -- Number of visits after the diagnosis date
#     vc.last_encounter_date,  -- last encounter date before the cut off date 
#     bmi_data.last_bmi_before_diagnosis  -- Most recent BMI before diagnosis
# FROM `{project_id}.{yh_ds}.{yh_cohort}` AS pat  -- Patient cohort as the main table

# LEFT JOIN (
#     -- Subquery to calculate visits before/after diagnosis and last encounter date
#     SELECT 
#         enc.pat_deid,  -- Patient ID
#         SUM(CASE WHEN TIMESTAMP(enc.contact_date) < TIMESTAMP(cohort.diagnosis_date) THEN 1 ELSE 0 END) AS visits_before,  -- Count visits before diagnosis
#         SUM(CASE WHEN TIMESTAMP(enc.contact_date) >= TIMESTAMP(cohort.diagnosis_date) THEN 1 ELSE 0 END) AS visits_after,   -- Count visits after diagnosis
#         MAX(CASE WHEN TIMESTAMP(enc.contact_date) <= TIMESTAMP('{cut_off_date}') THEN enc.contact_date END) AS last_encounter_date  -- Last encounter date before the cutoff
#     FROM `{project_id}.{yh_ds}.{yh_encounter}` AS enc  -- Encounter data
#     JOIN `{project_id}.{yh_ds}.{yh_cohort}` AS cohort  -- Join with patient cohort to access diagnosis date
#         ON enc.pat_deid = cohort.pat_deid
#     GROUP BY enc.pat_deid, cohort.diagnosis_date  -- Group by patient ID and diagnosis date
# ) AS vc ON pat.pat_deid = vc.pat_deid  -- Join the aggregated visit data to the patient table

# LEFT JOIN (
#     -- Subquery to get the last BMI value before diagnosis
#     SELECT 
#         ranked.pat_deid,  -- Patient ID
#         ranked.bmi AS last_bmi_before_diagnosis  -- Last BMI value before diagnosis
#     FROM (
#         -- Subquery to rank BMI records by contact date
#         SELECT 
#             enc_sub.pat_deid,  -- Patient ID
#             enc_sub.bmi,  -- BMI value
#             enc_sub.contact_date,  -- Contact date for the BMI record
#             ROW_NUMBER() OVER (PARTITION BY enc_sub.pat_deid ORDER BY TIMESTAMP(enc_sub.contact_date) DESC) AS rnk  -- Rank rows by most recent contact date
#         FROM `{project_id}.{yh_ds}.{yh_encounter}` AS enc_sub  -- Encounter data
#         WHERE enc_sub.bmi IS NOT NULL  -- Include only rows with BMI values
#         AND TIMESTAMP(enc_sub.contact_date) < (
#             SELECT MIN(cohort_sub.diagnosis_date)  -- Get the earliest diagnosis date for the patient
#             FROM `{project_id}.{yh_ds}.{yh_cohort}` AS cohort_sub
#             WHERE cohort_sub.pat_deid = enc_sub.pat_deid
#         )
#     ) AS ranked
#     WHERE ranked.rnk = 1  -- Select only the most recent BMI record
# ) AS bmi_data ON pat.pat_deid = bmi_data.pat_deid;  -- Join the last BMI data to the patient table
# """

# # Execute the query
# query_job = client.query(sql_query)
# query_job.result()


## 6. get opioid record 

In [None]:
# # get opioid rxnorm id 
# stanford_ds = "stanfordmed_datalake"
# yh_ds = "YH_dementia"
# shc_mapping = "shc_medication_rxnorm_map"
# yh_op_map = "opioid_rxnorm"
# sql_query = f"""
# SELECT 
#     map.med_id, 
#     op_rxnorm.RxNorm_Code, 
#     op_rxnorm.Name 
# FROM {db}.{stanford_ds}.{shc_mapping} map 
# JOIN {db}.{yh_ds}.{yh_op_map} op_rxnorm ON map.rxcui = op_rxnorm.RxNorm_Code
# """

# table_name = "opioid_med_id"
# save_table(project_id, yh_ds, table_name, sql_query)

In [None]:

## SQL query to extract opioid medication records for dementia patients
# Define datasets and table names
stanford_ds = "stanfordmed_datalake"
yh_ds = "YH_dementia"
op_med_id = "opioid_med_id"
dem_patients = "dementia_pat_first_diagnosis_07312024"
med_records = "shc_medication"

# Produces one row per (cohort patient, opioid order) with:
#   - the order start expressed in days relative to the diagnosis date
#     (negative = the order started before the diagnosis),
#   - days from a post-diagnosis order start to death,
#   - an opioid strength class (1 = weak, 2 = strong, 0 = other).
# The WHERE clause filters on op_med columns, so patients without a qualifying
# order are absent from the result despite the LEFT JOIN; they are re-attached
# later by the left merge onto the cohort table.
# The cutoff is compared as DATE <= DATE so that orders placed at any time on
# the cutoff day are kept, matching the encounter and diagnosis cutoff filters.
# Comparing the order timestamp with the bare 'YYYY-MM-DD' literal dropped
# orders placed during that day.

sql_query = f"""
SELECT 
    dm_pat.*,  -- All patient data from dementia_pat_last_encounter
    
    -- Days from opioid start and diagnosis (if negative, exposure before diagnosis) 
    CASE 
        WHEN op_med.order_start_time IS NULL THEN NULL
        ELSE DATE_DIFF(DATE(op_med.order_start_time), DATE(dm_pat.diagnosis_date), DAY)
    END AS op_exposure_from_diagnosis_days,

    -- Days from diagnosis to opioid start when order was started after the diagnosis 
    CASE 
        WHEN op_med.order_start_time IS NULL THEN NULL
        WHEN DATE(op_med.order_start_time) >= DATE(dm_pat.diagnosis_date) THEN DATE_DIFF(DATE(op_med.order_start_time), DATE(dm_pat.diagnosis_date), DAY)
        ELSE NULL
    END AS post_onset_op_exposure_days,

    -- Days between opioid exposure and death when order was started after the diagnosis 
    CASE 
        WHEN dm_pat.death_date IS NULL OR op_med.order_start_time IS NULL THEN NULL
        WHEN DATE(op_med.order_start_time) >= DATE(dm_pat.diagnosis_date) AND DATE(op_med.order_start_time) <= DATE(dm_pat.death_date) AND DATE(dm_pat.diagnosis_date) <= DATE(dm_pat.death_date) THEN DATE_DIFF(DATE(dm_pat.death_date), DATE(op_med.order_start_time), DAY)
        ELSE NULL
    END AS post_onset_post_opioid_death_days,

    -- Classify opioid strength
    CASE 
        WHEN LOWER(op_med.ingredient) IN ('hydrocodone', 'codeine', 'tramadol') THEN 1  -- Weak opioids
        WHEN LOWER(op_med.ingredient) IN ('fentanyl', 'hydromorphone', 'oxycodone', 'morphine', 'methadone', 'meperidine', 'buprenorphine') THEN 2  -- Strong opioids
        ELSE 0  -- Non-opioids or undefined
    END AS opioid_strength_classification,

    -- Medication details
    op_med.order_deid,
    op_med.pat_enc_csn_deid,
    op_med.med_id,
    op_med.ingredient, 
    op_med.medication_name,
    op_med.ordering_date, 
    op_med.order_start_time,
    op_med.order_end_time,
    op_med.med_route, 
    op_med.ordering_mode, 
    op_med.order_class,
    op_med.is_administered
FROM {db}.{yh_ds}.{dem_patients} dm_pat  -- Dementia patients 
LEFT JOIN (
    -- Join to extract opioid medication records
    SELECT 
        med.pat_deid,
        med_id.med_id, 
        med_id.Name AS ingredient, 
        med.medication_name AS medication_name,
        med.order_deid, 
        med.pat_enc_csn_deid, 
        med.ordering_date, 
        med.order_start_time,
        med.order_end_time,
        med.med_route, 
        med.ordering_mode, 
        med.order_class, 
        med.is_administered
    FROM {db}.{yh_ds}.{op_med_id} med_id
    INNER JOIN {db}.{stanford_ds}.{med_records} med ON med_id.med_id = med.medication_id  -- Link opioid ID to medication records
) op_med 
ON dm_pat.pat_deid = op_med.pat_deid
WHERE op_med.order_start_time >= DATE_SUB(DATE(dm_pat.diagnosis_date), INTERVAL 1 YEAR)  -- Opioids within 1 year before diagnosis
  AND DATE(op_med.order_start_time) <= DATE('{cut_off_date}');  -- Opioids on or before the cutoff date
"""



new_table_name = "dementia_op_med_07312024"
save_table(project_id, yh_ds, new_table_name, sql_query )

### medication record feature engineering

In [None]:
# Define dataset and table names
stanford_ds = "stanfordmed_datalake"
yh_ds = "YH_dementia"
sql_table = "dementia_op_med_07312024"

# Per-order feature engineering on the saved opioid order table:
#   - exposure_end_time: order_end_time, or order_start_time when the end is missing,
#   - exposure_within_1_year / exposure_after: 0/1 flags from the day offset to diagnosis,
#   - pre_/post_onset exposure start and end times: the order interval split at the diagnosis date.
# NOTE(review): exposure_within_1_year uses a fixed 365-day window, whereas the
# source table was filtered with INTERVAL 1 YEAR - confirm that the two are
# meant to agree when the year before diagnosis contains a leap day.
sql_query = f"""
SELECT *,
    -- Flag for opioid exposure within 1 year before diagnosis
    CASE 
        WHEN op_exposure_from_diagnosis_days >= -365 AND op_exposure_from_diagnosis_days < 0 THEN 1  -- Within 1 year before diagnosis
        ELSE 0  -- Outside the 1-year window
    END AS exposure_within_1_year, 

    -- Flag for opioid exposure after diagnosis
    CASE 
        WHEN op_exposure_from_diagnosis_days >= 0 THEN 1  -- Exposure on or after diagnosis date
        ELSE 0  -- No exposure after diagnosis
    END AS exposure_after,

    -- Start time for opioid exposure after diagnosis
    CASE 
        WHEN DATETIME(order_start_time) >= DATETIME(diagnosis_date) THEN DATETIME(order_start_time)  -- Exposure starts after diagnosis
        WHEN DATETIME(exposure_end_time) >= DATETIME(diagnosis_date) AND DATETIME(order_start_time) <= DATETIME(diagnosis_date) THEN DATETIME(diagnosis_date)  -- Ends after diagnosis but started before
        ELSE NULL  -- No exposure after diagnosis
    END AS post_onset_exposure_start_time,

    -- Start time for opioid exposure before diagnosis
    CASE 
        WHEN DATETIME(order_start_time) <= DATETIME(diagnosis_date) THEN DATETIME(order_start_time)  -- Exposure starts before diagnosis
        ELSE NULL  -- No exposure before diagnosis
    END AS pre_onset_exposure_start_time,

    -- End time for opioid exposure after diagnosis
    CASE 
        WHEN DATETIME(exposure_end_time) >= DATETIME(diagnosis_date) THEN DATETIME(exposure_end_time)  -- Exposure ends after diagnosis
        ELSE NULL  -- No exposure after diagnosis
    END AS post_onset_exposure_end_time,

    -- End time for opioid exposure before diagnosis
    CASE 
        WHEN DATETIME(exposure_end_time) < DATETIME(diagnosis_date) THEN DATETIME(exposure_end_time)  -- Ends before diagnosis
        WHEN DATETIME(exposure_end_time) >= DATETIME(diagnosis_date) AND DATETIME(order_start_time) < DATETIME(diagnosis_date) THEN DATETIME(diagnosis_date)  -- Ends after diagnosis but started before
        ELSE NULL  -- No exposure before diagnosis
    END AS pre_onset_exposure_end_time
FROM (
    -- Subquery to calculate exposure end time
    SELECT *,
        CASE 
            WHEN order_end_time IS NULL THEN DATETIME(order_start_time)  -- Use start time if end time is missing
            ELSE DATETIME(order_end_time)  -- Use end time if available
        END AS exposure_end_time
    FROM {db}.{yh_ds}.{sql_table}  -- Source table with opioid medication records
) AS subquery;
"""


# Run the query and load the result straight into a DataFrame (no table is saved)
op_med = pgbq.read_gbq(sql_query, dialect="standard")

In [None]:
# Display the column names of the per-order opioid table
op_med.columns

In [None]:
# Work on a copy and keep one row per patient / ingredient / start time / end time
dedup_keys = ['pat_deid', 'ingredient', 'order_start_time', 'exposure_end_time']
op_med2 = op_med.copy().drop_duplicates(subset=dedup_keys)

# Length of each exposure window after and before diagnosis.
# One extra day is added so that both the start and the end date are counted.
one_day = pd.to_timedelta(1, unit='D')
for duration_col, start_col, end_col in (
    ('post_onset_duration', 'post_onset_exposure_start_time', 'post_onset_exposure_end_time'),
    ('pre_onset_duration', 'pre_onset_exposure_start_time', 'pre_onset_exposure_end_time'),
):
    op_med2[duration_col] = op_med2[end_col] - op_med2[start_col] + one_day


In [None]:
# Column roles for collapsing opioid orders into one row per patient
id_columns = ['pat_deid']  # patient identifier
keep_columns = [
    'pat_deid', 'sex', 'birth_date', 'ethnic_group', 'death_date',
    'deceased', 'race', 'diagnosis_date', 'age_at_diagnosis', 'MCI', 'AD',
    'FTD', 'VD', 'LBD', 'other_D', 'death_from_diagnosis',
]  # cohort columns carried into the final table
min_columns = [
    'op_exposure_from_diagnosis_days', 'post_onset_op_exposure_days',
    'pre_onset_exposure_start_time', 'post_onset_exposure_start_time',
]  # earliest value per patient
max_columns = [
    'pre_onset_exposure_end_time', 'post_onset_exposure_end_time',
    'post_onset_post_opioid_death_days',
    'exposure_within_1_year',  # for exposure group
    'exposure_after',  # for exposure group
]  # latest / largest value per patient
sum_columns = ['post_onset_duration', 'pre_onset_duration']  # totals per patient

# Map every column to its aggregation, in the order min -> max -> sum
agg_dict = {
    column: how
    for how, columns in (('min', min_columns), ('max', max_columns), ('sum', sum_columns))
    for column in columns
}

# Display the aggregation dictionary
agg_dict

In [None]:
# Collapse the de-duplicated opioid orders to one row per 'pat_deid', applying
# the per-column 'min' / 'max' / 'sum' rules held in agg_dict
op_med_agg = op_med2.groupby('pat_deid', as_index=False).agg(agg_dict)

In [None]:
# Number of patients whose per-patient maximum of post_onset_post_opioid_death_days
# is at most 14 (rows without a value compare False and are not counted)
np.sum(op_med_agg.post_onset_post_opioid_death_days<=14)

In [None]:
# Name of the patient-level first-diagnosis table uploaded earlier
table_name = "dementia_pat_first_diagnosis_07312024"

# Reload it from BigQuery into a pandas DataFrame
df_dementia_first_diagnosis = load_pgbq(project_id, yh_ds, table_name)

In [None]:
# Display the column names of the reloaded patient-level table
df_dementia_first_diagnosis.columns

In [None]:
# Print the number of rows in the reloaded patient-level table
print(df_dementia_first_diagnosis.shape[0])

In [None]:
# Cohort columns only, with exact duplicate rows removed
keep_df = df_dementia_first_diagnosis[keep_columns].drop_duplicates()

# Left-join the per-patient opioid aggregates so that cohort patients without
# any opioid row are retained
merged_df = keep_df.merge(op_med_agg, how='left', on='pat_deid')

# Patients without opioid rows get 0 for the exposure flags and durations
zero_fill_columns = ['exposure_within_1_year', 'exposure_after', 'post_onset_duration', 'pre_onset_duration']
merged_df[zero_fill_columns] = merged_df[zero_fill_columns].fillna(0)

# Display all column names in the merged DataFrame
merged_df.columns


In [None]:
# Define a function to label exposure groups based on opioid exposure timing
def label_exposure_group(df):
    """Return the opioid exposure group for one patient row.

    Args:
        df: A single row (any mapping-like object) providing the keys
            ``'exposure_within_1_year'`` and ``'exposure_after'``. A value
            equal to 1 means exposed; anything else means not exposed.

    Returns:
        * ``'new user'`` - exposed after diagnosis only
        * ``'consistent user'`` - exposed within 1 year before and after diagnosis
        * ``'discontinued'`` - exposed within 1 year before diagnosis only
        * ``'control'`` - exposed in neither period
    """
    exposed_before = df['exposure_within_1_year'] == 1
    exposed_after = df['exposure_after'] == 1

    # The two flags give four combinations; each maps to exactly one label.
    if exposed_after:
        return 'consistent user' if exposed_before else 'new user'
    return 'discontinued' if exposed_before else 'control'

# Label every patient row and store the result in a new 'exposure_group' column
merged_df = merged_df.assign(
    exposure_group=merged_df.apply(label_exposure_group, axis=1)  # one label per row
)

    

In [None]:
# Show the number of patients in each exposure group, then the number of unlabeled rows
print (merged_df.exposure_group.value_counts())
missing_values = merged_df['exposure_group'].isnull().sum()

print(f"Number of missing values in 'exposure_group': {missing_values}")

In [None]:
# Among 'consistent user' and 'new user' patients, count those whose
# post_onset_post_opioid_death_days value is at most 14
np.sum(merged_df[merged_df.exposure_group.isin(['consistent user','new user'])].post_onset_post_opioid_death_days<=14)

In [None]:
# Convert the summed exposure durations to whole days. For each duration column:
#   1. coerce to timedelta (the zero-filled entries become a zero-length timedelta),
#   2. round up to the next full day,
#   3. keep only the day count.
for duration_col in ('pre_onset_duration', 'post_onset_duration'):
    as_timedelta = pd.to_timedelta(merged_df[duration_col])
    rounded_up = as_timedelta.apply(lambda delta: delta.ceil('D'))
    merged_df[duration_col] = rounded_up.dt.days


In [None]:
# Upload the final patient-level table with exposure groups, replacing any existing copy
upload_pgbq(project_id, yh_ds, 'dementia_pat_exposure_group_07312024', merged_df)