In [None]:
###### import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tableone import TableOne
import getpass, re, json, sys
from datetime import datetime, timedelta
import pandas_gbq as pgbq
# Load packages for Big Query 
from google.cloud import bigquery
import os
from sqlalchemy import create_engine


%matplotlib inline

In [None]:
# Define configurations for Big Query
project_id = '' # Location of stride datalake

# The credential/project environment variables must be in place BEFORE the
# client is constructed: the client resolves its default credentials at
# construction time, so values set afterwards are not picked up by it.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = ''
os.environ['GCLOUD_PROJECT'] = "" # specify environment

client = bigquery.Client(project=project_id) # Set project to project_id

db = "" # Define the database
stanford_ds = ""  # dataset holding the source diagnosis table (see the pneumonia query)
yh_ds = ""  # dataset holding the cohort / medication / surgery tables used below

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tableone import TableOne
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
# Lift pandas' display limits so DataFrame output is not truncated
# (None = no limit on displayed rows / columns).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
def save_table(project_id, yh_ds, new_table_name, query):
    """Run `query` and store its result as `project_id.yh_ds.new_table_name`.

    The destination table is overwritten if it already exists
    (write disposition ``WRITE_TRUNCATE``). Uses the module-level
    BigQuery ``client`` and blocks until the query job has finished.
    """
    destination = f"{project_id}.{yh_ds}.{new_table_name}"
    overwrite_config = bigquery.QueryJobConfig(
        destination=destination,
        write_disposition="WRITE_TRUNCATE",
    )
    # Submit the job, then wait for it so the table exists when we return.
    running_job = client.query(query, job_config=overwrite_config)
    running_job.result()
    print("Query results loaded to the table {}".format(destination))
def load_pgbq(project_id, yh_ds, table_name):
    """Load an entire BigQuery table into a DataFrame via pandas-gbq.

    Parameters
    ----------
    project_id, yh_ds, table_name : str
        Project, dataset and table that together form the table path.

    Returns
    -------
    The object returned by ``pgbq.read_gbq`` for ``SELECT *`` on that table.
    """
    table_path = f"{project_id}.{yh_ds}.{table_name}"
    # Backtick-quote the path, as the other queries in this notebook do, so
    # that project/dataset/table names containing characters such as '-' are
    # always parsed as a single table reference.
    sql_query = f"SELECT * FROM `{table_path}`"
    return_df = pgbq.read_gbq(sql_query, dialect="standard")
    print(table_path, "is loaded")
    return return_df
def upload_pgbq(project_id, yh_ds, table_name, df):
    """Upload DataFrame `df` to BigQuery as `project_id.yh_ds.table_name`.

    Delegates to ``pgbq.to_gbq`` with its default settings. Only the shape of
    the DataFrame is reported afterwards: printing the frame itself would
    render its rows into the notebook output, which is noisy and can expose
    patient-level data.
    """
    table_id = f"{yh_ds}.{table_name}"
    pgbq.to_gbq(df, table_id, project_id=project_id)
    print(f"dataframe with shape {df.shape} is uploaded as {project_id}.{yh_ds}.{table_name}")
def remove_table(project_id, yh_ds, table_name):
    """Delete the BigQuery table `project_id.yh_ds.table_name`.

    A missing table is not treated as an error (``not_found_ok=True``).
    """
    # Bind the client to the requested project explicitly instead of relying
    # on whatever default project the environment happens to provide.
    client = bigquery.Client(project=project_id)
    table_id = f"{project_id}.{yh_ds}.{table_name}"
    client.delete_table(table_id, not_found_ok=True)  # Make an API request.
    print("Deleted table '{}'.".format(table_id))

In [None]:
# Names of the BigQuery tables used by this notebook.
pat_table = "dementia_pat_complete_cohort_07312024"
surgery_table = "surgeries"
med_table = "dementia_op_med_07312024"
diagnosis_table = "shc_diagnosis"

# Pneumonia flags relative to the exposure start date.
# The CTE selects every diagnosis row whose dx_name contains "pneumonia"
# (case-insensitive). The cohort table is LEFT JOINed to it, so the result has
# one row per (patient, matching diagnosis) pair and one row with 0/0/NULL for
# patients without any match; the rows are collapsed per patient in a later cell.
sql_pneumonia_query = f"""
WITH pneumonia_patients AS (
    SELECT 
        d.pat_deid,
        d.start_date
    FROM `{project_id}.{stanford_ds}.{diagnosis_table}` d
    WHERE LOWER(d.dx_name) LIKE '%pneumonia%'
)

SELECT 
    p.pat_deid,
    -- Check if pneumonia was diagnosed within 7 days before exposure start
    CASE 
        WHEN DATE_DIFF(CAST(p.post_onset_exposure_start_time AS DATE), pneumonia_patients.start_date, DAY) BETWEEN 0 AND 7 THEN 1
        ELSE 0
    END AS pneumonia_before_7days_exposure,

    -- Check if pneumonia was diagnosed after the exposure start
    CASE 
        WHEN DATE_DIFF(pneumonia_patients.start_date, CAST(p.post_onset_exposure_start_time AS DATE), DAY) > 0 THEN 1
        ELSE 0
    END AS pneumonia_after_exposure,

    -- Calculate the date difference between pneumonia diagnosis and exposure start
    CASE
        WHEN DATE_DIFF(pneumonia_patients.start_date, CAST(p.post_onset_exposure_start_time AS DATE), DAY) > 0 
        THEN DATE_DIFF(pneumonia_patients.start_date, CAST(p.post_onset_exposure_start_time AS DATE), DAY)
        ELSE NULL
    END AS pneumonia_days_since_exposure
FROM `{project_id}.{yh_ds}.{pat_table}` p
LEFT JOIN pneumonia_patients 
    ON p.pat_deid = pneumonia_patients.pat_deid;
"""

pneumonia_df = pgbq.read_gbq(sql_pneumonia_query, dialect="standard")

# Opioid details for the exposure start date.
# The inner join keeps only cohort patients that have a medication row whose
# order_start_time falls on the same calendar date as
# post_onset_exposure_start_time. Per patient it returns the MAX of
# opioid_strength_classification and the first ingredient in date order.
# NOTE(review): every joined row already shares the exposure date, so the
# ORDER BY on the order date cannot distinguish between several same-day
# orders -- which ingredient ends up at OFFSET(0) may be arbitrary; confirm.
sql_med_query = f"""
SELECT 
    p.pat_deid,  -- Assuming patient_id is the unique identifier for each patient
    MAX(m.opioid_strength_classification) AS opioid_strength_classification,
    ARRAY_AGG(m.ingredient ORDER BY CAST(m.order_start_time AS DATE) ASC)[OFFSET(0)] AS ingredient
FROM 
    `{project_id}.{yh_ds}.{pat_table}` p
JOIN 
    `{project_id}.{yh_ds}.{med_table}` m
    ON p.pat_deid = m.pat_deid  -- Join both tables on patient ID
    AND CAST(p.post_onset_exposure_start_time AS DATE) = CAST(m.order_start_time AS DATE)  -- Ensure the date matches the first exposure date
WHERE 
    m.order_start_time IS NOT NULL  -- Filter to make sure medication data is available
GROUP BY 
    p.pat_deid, CAST(p.post_onset_exposure_start_time AS DATE);
"""


med_df = pgbq.read_gbq(sql_med_query, dialect="standard")

In [None]:
# Collapse the pneumonia rows to one row per patient:
#   - a flag is 1 if any of the patient's rows has it set (max)
#   - the day count is the smallest recorded gap after exposure (min)
aggregation_dict = {
    'pneumonia_before_7days_exposure': 'max',
    'pneumonia_after_exposure': 'max',
    'pneumonia_days_since_exposure': 'min',
}

pneumonia_ag_df = (
    pneumonia_df
    .groupby('pat_deid')
    .agg(aggregation_dict)
    .reset_index()
    .drop_duplicates(subset='pat_deid')
)

In [None]:
med_df = med_df.drop_duplicates(subset = 'pat_deid')

In [None]:
cohort_df = load_pgbq(project_id, yh_ds, pat_table)

In [None]:
cohort_df = cohort_df.drop_duplicates(subset=['pat_deid'])

In [None]:
cohort_df2 = pd.merge(cohort_df,pneumonia_ag_df, on='pat_deid', how='left')

In [None]:
cohort_df3 = pd.merge(cohort_df2, med_df, on='pat_deid', how='left')

### Sanity check

In [None]:
cohort_df3.exposure_group.value_counts()

In [None]:
sanity_check_columns = ['post_onset_post_opioid_death_days', 'post_onset_op_exposure_days', 'death_from_diagnosis']

In [None]:
import matplotlib.pyplot as plt

# Plot a histogram of the distribution
cohort_df3['post_onset_post_opioid_death_days'].hist(bins=15, edgecolor='black')
plt.title("Distribution of 'post_onset_post_opioid_death_days'")
plt.xlabel('Days')
plt.ylabel('Frequency')
plt.show()

In [None]:
def _to_naive_datetime(values, errors='raise'):
    """Parse `values` as datetimes and drop any timezone information.

    `errors` is forwarded to ``pd.to_datetime``; with ``'coerce'`` unparseable
    entries become NaT instead of raising.
    """
    return pd.to_datetime(values, errors=errors).dt.tz_localize(None)


# Every date column goes through the same conversion so that later comparisons
# and subtractions always operate on tz-naive datetimes. Previously
# 'diagnosis_date' and 'death_date' used `.dt` without being converted first,
# which only works when they already arrive with a datetime dtype.

# Fill 'final_followup_date' with 'death_date' if available; otherwise, use 'last_encounter_date'
cohort_df3['final_followup_date'] = _to_naive_datetime(
    cohort_df3['death_date'].fillna(cohort_df3['last_encounter_date']),
    errors='coerce',
)
cohort_df3['post_onset_exposure_start_time'] = _to_naive_datetime(
    cohort_df3['post_onset_exposure_start_time'],
    errors='coerce',
)
# These two were never coerced before, so unparseable values still raise here
# rather than silently turning into NaT (a NaT death date would read as "alive").
cohort_df3['diagnosis_date'] = _to_naive_datetime(cohort_df3['diagnosis_date'])
cohort_df3['death_date'] = _to_naive_datetime(cohort_df3['death_date'])

# Follow-up length in days from exposure start to final follow-up; the +1
# counts both the first and the last day.
cohort_df3['follow_up_post_diagnosis_first_op_exposure'] = (
    cohort_df3['final_followup_date'] - cohort_df3['post_onset_exposure_start_time']
).dt.days + 1

cohort_df3.shape[0]

In [None]:

# Step 1: keep only patients that have a final follow-up date.
# (The following cell recomputes cohort_df4 with the same filter.)
has_followup_date = cohort_df3['final_followup_date'].notna()
cohort_df4 = cohort_df3.loc[has_followup_date]
print(f"Step 1: Sample size after filtering for non-null 'final_followup_date': {cohort_df4.shape[0]}")

In [None]:
# Sequential cohort exclusions. Each step derives a new frame from the previous
# one and prints the remaining sample size.
opioid_groups = ['new user', 'consistent user']

# Step 1: a final follow-up date must be present
has_followup = cohort_df3['final_followup_date'].notna()
cohort_df4 = cohort_df3.loc[has_followup]
print(f"Step 1: Sample size after filtering for non-null 'final_followup_date': {cohort_df4.shape[0]}")

# Step 2: follow-up must not end before the diagnosis date
followup_after_dx = cohort_df4['final_followup_date'] >= cohort_df4['diagnosis_date']
cohort_df5 = cohort_df4.loc[followup_after_dx]
print(f"Step 2: Sample size after ensuring 'final_followup_date' >= 'diagnosis_date': {cohort_df5.shape[0]}")

# Step 3: 'death_from_diagnosis' must be missing or non-negative
death_gap_ok = cohort_df5['death_from_diagnosis'].isna() | (cohort_df5['death_from_diagnosis'] >= 0)
cohort_df6 = cohort_df5.loc[death_gap_ok]
print(f"Step 3: Sample size after filtering for null or non-negative 'death_from_diagnosis': {cohort_df6.shape[0]}")

# Step 3b: opioid users with at most 14 days between exposure and death
early_death_step3 = (
    cohort_df6['exposure_group'].isin(opioid_groups)
    & (cohort_df6['post_onset_post_opioid_death_days'] <= 14)
)
count2 = np.sum(early_death_step3)
print(f"Step 3b: Number of deaths within 14 days of opioid exposure: {count2}")

# Step 4: drop opioid users whose exposure starts on/after their final follow-up
no_followup_after_exposure = (
    cohort_df6['exposure_group'].isin(opioid_groups)
    & (cohort_df6['post_onset_exposure_start_time'] >= cohort_df6['final_followup_date'])
)
cohort_df7 = cohort_df6.loc[~no_followup_after_exposure]
print(f"Step 4: Sample size after excluding invalid follow-up durations: {cohort_df7.shape[0]}")

# Step 5: drop opioid users whose death date is on/before the exposure start
death_before_exposure = (
    cohort_df7['exposure_group'].isin(opioid_groups)
    & cohort_df7['death_date'].notna()
    & (cohort_df7['death_date'] <= cohort_df7['post_onset_exposure_start_time'])
)
cohort_df8 = cohort_df7.loc[~death_before_exposure]
print(f"Step 5: Sample size after excluding invalid death dates: {cohort_df8.shape[0]}")

# Step 6: drop opioid users with a negative exposure-to-death interval
negative_death_interval = (
    cohort_df8['exposure_group'].isin(opioid_groups)
    & cohort_df8['post_onset_post_opioid_death_days'].notna()
    & (cohort_df8['post_onset_post_opioid_death_days'] < 0)
)
cohort_df9 = cohort_df8.loc[~negative_death_interval]
print(f"Step 6: Sample size after excluding negative 'post_onset_post_opioid_death_days': {cohort_df9.shape[0]}")

# Final: same 14-day death count, now on the cleaned cohort
early_death_final = (
    cohort_df9['exposure_group'].isin(opioid_groups)
    & (cohort_df9['post_onset_post_opioid_death_days'] <= 14)
)
count3 = np.sum(early_death_final)
print(f"Final: Number of deaths within 14 days of opioid exposure: {count3}")


In [None]:
# Restrict the cohort to the study window starting on 2015-01-01.
study_start = '2015-01-01'

# Step 1: death date is either unknown or falls inside the study window
death_in_window = cohort_df9['death_date'].isna() | (cohort_df9['death_date'] >= study_start)
cohort_df10 = cohort_df9.loc[death_in_window]
print(f"Step 1: Sample size after filtering for valid 'death_date': {cohort_df10.shape[0]}")

# Step 2: final follow-up falls inside the study window
followup_in_window = cohort_df10['final_followup_date'] >= study_start
cohort_df11 = cohort_df10.loc[followup_in_window]
print(f"Step 2: Sample size after filtering for 'final_followup_date' >= '2015-01-01': {cohort_df11.shape[0]}")

# Size of the cohort once all of the filters above have been applied
print("Sample size of cohort:", cohort_df11.shape[0])

# Opioid users ('consistent user' / 'new user') with at most 14 days between
# exposure and death
early_opioid_death = (
    cohort_df11['exposure_group'].isin(['consistent user', 'new user'])
    & (cohort_df11['post_onset_post_opioid_death_days'] <= 14)
)
count4 = np.sum(early_opioid_death)

print(f"Final: Number of deaths within 14 days of opioid exposure: {count4}")


In [None]:
# Surgery records from BigQuery
df_surgery = load_pgbq(project_id, yh_ds, surgery_table)

# Pair every cohort patient with each of their surgeries (inner join on 'pat_deid')
df_surgery_combined = cohort_df11.merge(df_surgery, on='pat_deid')

# Strip timezone info so the two dates can be subtracted from each other
df_surgery_combined['surgery_date'] = df_surgery_combined['surg_date'].dt.tz_localize(None)
df_surgery_combined['death_date'] = df_surgery_combined['death_date'].dt.tz_localize(None)

# Whole days from surgery to death (NaN when there is no death date)
surgery_to_death = df_surgery_combined['death_date'] - df_surgery_combined['surgery_date']
df_surgery_combined['time_to_death_from_surgery'] = surgery_to_death.dt.days

# Patients with a surgery at most 14 days before death.
# NOTE(review): `<= 14` has no lower bound, so a surgery dated after the death
# date (negative interval) is matched as well -- confirm this is intended.
died_soon_after_surgery = df_surgery_combined['time_to_death_from_surgery'] <= 14
patients_to_exclude = df_surgery_combined.loc[died_soon_after_surgery, 'pat_deid']

# Remove those patients from the cohort
keep_patient = ~cohort_df11['pat_deid'].isin(patients_to_exclude)
df_cohort_surgery_excluded = cohort_df11.loc[keep_patient]

print(f"Final cohort size after excluding deaths within 14 days of surgery: {df_cohort_surgery_excluded.shape[0]}")


In [None]:
# Step 1: cohort size coming out of the surgery exclusion
print("Sample size of cohort after exclusion of patients who died within 14 days after surgery:", df_cohort_surgery_excluded.shape[0])

# Step 2: continuity of care -- more than 2 visits both before and after diagnosis
enough_visits = (
    (df_cohort_surgery_excluded['visits_before'] > 2)
    & (df_cohort_surgery_excluded['visits_after'] > 2)
)
df_continued_care = df_cohort_surgery_excluded.loc[enough_visits]
print("Sample size of cohort who received continuity of care before and after dementia diagnosis:", df_continued_care.shape[0])

# Step 3: drop patients whose death is fewer than 7 days after their diagnosis date
# (patients without a death date are kept)
days_dx_to_death = (df_continued_care['death_date'] - df_continued_care['diagnosis_date']).dt.days
diagnosed_near_death = days_dx_to_death < 7
df_continued_care2 = df_continued_care.loc[~diagnosed_near_death]
print("Sample size of cohort after exclusion of patients who received first dementia diagnosis at the time of death:", df_continued_care2.shape[0])

# Step 4: opioid users with at most 14 days between exposure and death
early_opioid_death = (
    df_continued_care2['exposure_group'].isin(['consistent user', 'new user'])
    & (df_continued_care2['post_onset_post_opioid_death_days'] <= 14)
)
death_count = np.sum(early_opioid_death)
print(f"Number of deaths within 14 days of opioid exposure in the filtered cohort: {death_count}")


In [None]:
# Work on a copy whose 'diagnosis_date' is a tz-naive datetime; the input frame
# itself is left untouched.
df_copy = df_continued_care2.assign(
    diagnosis_date=lambda frame: pd.to_datetime(frame['diagnosis_date']).dt.tz_localize(None)
)

# Cutoff: July 31, 2024 (tz-naive)
cutoff_date = pd.Timestamp('2024-07-31')

# Keep patients diagnosed on or before the cutoff, one row per 'pat_deid'
diagnosed_by_cutoff = df_copy['diagnosis_date'] <= cutoff_date
df_final = df_copy.loc[diagnosed_by_cutoff].drop_duplicates(subset='pat_deid')

print("Sample size of cohort after exclusion of patients diagnosed after July 2024:", df_final.shape[0])


In [None]:
# get long-term opioid use category 
df_final2 = df_final.copy()
df_final2['pre_longterm_opioid'] = np.where(df_final2['pre_onset_duration'] >= 90, 1, 0)
df_final2['post_longterm_opioid'] = np.where(df_final2['post_onset_duration'] >= 90, 1, 0)
df_final2['pre_longterm_opioid_consistent'] = np.where((df_final2['pre_longterm_opioid'] == 1)&(df_final2['exposure_group'] == 'consistent user') , 1, 0)
df_final2['post_longterm_opioid_consistent'] = np.where((df_final2['post_longterm_opioid'] == 1)&(df_final2['exposure_group'] == 'consistent user') , 1, 0)

In [None]:
df_final2.exposure_group.value_counts()

In [None]:
#df_final2.to_csv('final_cohort_07312024.csv')