In [None]:
# Standard library
import os
from datetime import datetime, timedelta

# Core libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis tools
from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.impute import SimpleImputer
from tableone import TableOne

# BigQuery interaction
import pandas_gbq as pgbq
from google.cloud import bigquery

# Database connection management
from sqlalchemy import create_engine

# Enable inline plotting for Jupyter Notebook
%matplotlib inline


In [None]:
# Define configurations for BigQuery
project_id = ''  # Location of stride datalake
db = ""  # Define the database
stanford_ds = ""
yh_ds = ""

# Export the credential/project environment variables BEFORE building the
# client: the client looks up its default credentials when it is constructed,
# so setting these afterwards has no effect on `client`.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = ''
os.environ['GCLOUD_PROJECT'] = ""  # specify environment

client = bigquery.Client(project=project_id)  # Set project to project_id

In [None]:
def save_table(project_id, yh_ds, new_table_name, query):
    """Run ``query`` and write its result to ``project_id.yh_ds.new_table_name``.

    The job is submitted through the module-level ``client`` with the
    ``WRITE_TRUNCATE`` disposition, and the call blocks until the job
    finishes.

    Args:
        project_id: Project part of the destination table id.
        yh_ds: Dataset part of the destination table id.
        new_table_name: Table part of the destination table id.
        query: SQL text to execute.
    """
    destination = "{}.{}.{}".format(project_id, yh_ds, new_table_name)
    config = bigquery.QueryJobConfig(
        destination=destination,
        write_disposition="WRITE_TRUNCATE",
    )
    # Submit the query and wait for it to complete.
    client.query(query, job_config=config).result()
    print(f"Query results loaded to the table {destination}")
def load_pgbq(project_id, yh_ds, table_name):
    """Read every row of ``project_id.yh_ds.table_name`` into a DataFrame.

    Args:
        project_id: Project part of the table id.
        yh_ds: Dataset part of the table id.
        table_name: Table part of the table id.

    Returns:
        The frame returned by ``pgbq.read_gbq`` for ``SELECT *`` on the table.
    """
    qualified_name = f"{project_id}.{yh_ds}.{table_name}"
    frame = pgbq.read_gbq("SELECT * FROM " + qualified_name, dialect="standard")
    print(qualified_name, "is loaded")
    return frame
def upload_pgbq(project_id, yh_ds, table_name, df):
    """Upload ``df`` to the table ``yh_ds.table_name`` in ``project_id``.

    Args:
        project_id: Project that receives the table.
        yh_ds: Destination dataset.
        table_name: Destination table name.
        df: DataFrame to upload.
    """
    table_id = f"{yh_ds}.{table_name}"
    pgbq.to_gbq(df, table_id, project_id=project_id)
    # Report only the frame's shape: printing the frame itself dumps its rows
    # into the notebook output.
    print(f"dataframe with shape {df.shape} is uploaded as {project_id}.{table_id}")
def remove_table(project_id, yh_ds, table_name):
    """Delete the table ``project_id.yh_ds.table_name``.

    A new default ``bigquery.Client()`` is created for the request (the
    module-level ``client`` is not used). ``not_found_ok=True`` is passed to
    ``delete_table``, and the confirmation line is printed unconditionally
    afterwards.
    """
    target = f"{project_id}.{yh_ds}.{table_name}"
    bigquery.Client().delete_table(target, not_found_ok=True)  # Make an API request.
    print(f"Deleted table '{target}'.")

In [None]:
def get_death_duration_column(death_after_exposure, observation_days):
    """Return the follow-up duration within an observation window.

    Args:
        death_after_exposure: Days from exposure to death; missing (NaN/None)
            when no death is recorded.
        observation_days: Length of the observation window in days.

    Returns:
        ``observation_days`` when the death is missing or falls after the
        window, ``0`` when the value is zero or negative, otherwise
        ``death_after_exposure`` unchanged.
    """
    # No recorded death, or death after the cut-off: follow-up ends at the window.
    if pd.isna(death_after_exposure) or death_after_exposure > observation_days:
        return observation_days
    # Non-positive values are clamped to zero; everything else passes through.
    return 0 if death_after_exposure <= 0 else death_after_exposure
def get_death_outcome_column(death_after_exposure, observation_days):
    """Return whether a death counts as an event within the observation window.

    Args:
        death_after_exposure: Days from exposure to death; missing (NaN/None)
            when no death is recorded.
        observation_days: Length of the observation window in days.

    Returns:
        ``False`` when the death is missing or occurs after the window;
        ``True`` for every other value (including zero and negative values).
    """
    outside_window = pd.isna(death_after_exposure) or death_after_exposure > observation_days
    return not outside_window

In [None]:
# Load the cohort CSV; low_memory=False makes pandas read the file in one pass
# instead of chunk-by-chunk dtype inference.
cohort_csv_path = "../A_Cohort/final_cohort_07312024.csv"
cohort_df = pd.read_csv(cohort_csv_path, low_memory=False)

In [None]:
# Keep only the two exposure groups that are compared in this notebook.
comparison_groups = ['new user', 'consistent user']
df_comparison = cohort_df[cohort_df['exposure_group'].isin(comparison_groups)]

In [None]:
# Lookup tables: 'deceased' flag -> boolean, 'exposure_group' -> numeric indicator
death_mapping_dict = {'Y': True, 'N': False}
exposure_mapping_dict = {'new user': 1, 'consistent user': 0}

# Add both derived columns in a single assign (values not present in a
# mapping become NaN, as with any Series.map lookup).
df_comparison = df_comparison.assign(
    deceased_boolean=lambda frame: frame['deceased'].map(death_mapping_dict),
    exposure_float=lambda frame: frame['exposure_group'].map(exposure_mapping_dict),
)

# Display the count of each category in the 'exposure_group' column
exposure_group_counts = df_comparison['exposure_group'].value_counts()
print(exposure_group_counts)

In [None]:
# Names of the category tables whose schemas are read below
medication_table_name = 'dementia_medication_categories'
comorbidity_table_name = 'dementia_comorbidity_categories_aggregated'
comorbidity_before_exposure_table_name = 'dementia_comorbidity_categories_before_exposure_aggregated'

# Fully qualified table ids: <db>.<dataset>.<table>
medication_table_id = "{}.{}.{}".format(db, yh_ds, medication_table_name)
comorbidity_table_id = "{}.{}.{}".format(db, yh_ds, comorbidity_table_name)
comorbidity_before_exposure_table_id = "{}.{}.{}".format(
    db, yh_ds, comorbidity_before_exposure_table_name
)

# Fetch the table metadata through the BigQuery client
medication_table = client.get_table(medication_table_id)
comorbidity_table = client.get_table(comorbidity_table_id)
comorbidity_before_exposure_table = client.get_table(comorbidity_before_exposure_table_id)

# Get the list of column names from each schema
comorbidity_column_names = [field.name for field in comorbidity_table.schema]
comorbidity_before_exposure_table_column_names = [
    field.name for field in comorbidity_before_exposure_table.schema
]
medication_column_names = [field.name for field in medication_table.schema]

In [None]:
# Plain Python list of the comparison frame's column labels
columns_list = list(df_comparison.columns)
#columns_list

In [None]:
# Select covariate columns by name prefix
comorbid_prefix = 'before_exposure'
medication_prefix = 'exposure_within_1_year_before_first'

comorbid_columns = [
    name for name in comorbidity_before_exposure_table_column_names
    if name.startswith(comorbid_prefix)
]
medication_columns = [
    name for name in medication_column_names
    if name.startswith(medication_prefix)
]

# One count per line: comorbidity columns first, then medication columns
print(len(comorbid_columns), len(medication_columns), sep="\n")
health_covariate_columns = [*comorbid_columns, *medication_columns]

In [None]:
# Feature engineering: Imputation and BMI categorization
df_imputed = df_comparison.copy()

# Convert 'last_bmi_before_exposure' to numeric, handling invalid values
df_imputed['last_bmi_before_exposure'] = pd.to_numeric(
    df_imputed['last_bmi_before_exposure'], errors='coerce'
)

# Impute missing BMI values with the median.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas and does
# not modify df_imputed when Copy-on-Write is enabled.
median_value = df_imputed['last_bmi_before_exposure'].median()
df_imputed['last_bmi_before_exposure'] = df_imputed['last_bmi_before_exposure'].fillna(median_value)

# Define BMI bins and labels, including an upper boundary for extreme BMIs
bins = [0, 18.5, 24.9, 29.9, 34.9, 40, float('inf')]
labels = ['Underweight', 'Normal weight', 'Overweight', 'Obese', 'Severely obese', 'Morbidly obese']
category_to_numeric = {
    'Underweight': 1,
    'Normal weight': 2,
    'Overweight': 3,
    'Obese': 4,
    'Severely obese': 5,
    'Morbidly obese': 6
}

# Categorize BMI into ordinal groups (right=False: each bin includes its
# lower edge and excludes its upper edge)
df_imputed['last_bmi_before_exposure_category'] = pd.cut(
    df_imputed['last_bmi_before_exposure'], bins=bins, labels=labels, right=False
)

# Check for uncategorized values (values outside every bin become NaN)
uncategorized_values = df_imputed[df_imputed['last_bmi_before_exposure_category'].isnull()]
print(f"Number of uncategorized rows: {uncategorized_values.shape[0]}")

# Map BMI categories to numeric ordinal values
df_imputed['last_bmi_before_exposure_ordinal'] = df_imputed['last_bmi_before_exposure_category'].map(category_to_numeric)

# Ensure BMI ordinal column is numeric
df_imputed['last_bmi_before_exposure_ordinal'] = df_imputed['last_bmi_before_exposure_ordinal'].astype(float)

# Define covariate columns
other_covariates = ['age_at_diagnosis', 'sex', 'ethnic_group', 'race', 'last_bmi_before_exposure_ordinal', 'mapped_insurance_type']
covariate_columns = health_covariate_columns + other_covariates

# Identify numerical covariates with more than two unique values
numerical_covariates = [
    col for col in df_imputed[covariate_columns].select_dtypes(include=['float64', 'int64']).columns
    if df_imputed[col].nunique() > 2
]

# Identify binary categorical covariates (exactly two unique values)
binary_categorical_covariates = [
    col for col in df_imputed[covariate_columns].columns
    if df_imputed[col].nunique() == 2
]

# Identify non-binary categorical covariates
categorical_covariates = df_imputed[covariate_columns].select_dtypes(include=['object', 'category']).columns.tolist()

# Combine binary and non-binary categorical covariates.
# An object/category column with exactly two values appears in both lists;
# dict.fromkeys drops the repeat while keeping first-seen order, so no column
# is selected (and imputed) twice below.
all_categorical_covariates = list(
    dict.fromkeys(categorical_covariates + binary_categorical_covariates)
)

# Create imputers for numerical and categorical covariates
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# Impute missing values for numerical covariates
df_imputed[numerical_covariates] = median_imputer.fit_transform(df_imputed[numerical_covariates])

# Impute missing values for categorical covariates
df_imputed[all_categorical_covariates] = mode_imputer.fit_transform(df_imputed[all_categorical_covariates])

# Create dummy variables for categorical covariates
df_imputed_with_dummies = pd.get_dummies(
    df_imputed, columns=categorical_covariates, drop_first=True
)

# Identify dummy columns (newly created after one-hot encoding)
dummy_columns = df_imputed_with_dummies.columns.difference(df_imputed.columns)

# Combine binary and dummy columns for covariate analysis
categorical_covariate_columns_with_dummies = (
    list(df_imputed_with_dummies.columns[df_imputed_with_dummies.columns.isin(all_categorical_covariates)]) +
    list(dummy_columns)
)

# Define a threshold for minimum samples in categorical variables:
# a column is "sparse" when its rarest value occurs fewer than this many times
min_samples_threshold = 300
sparse_columns = [
    col for col in categorical_covariate_columns_with_dummies
    if df_imputed_with_dummies[col].value_counts().min() < min_samples_threshold
]

# Retain non-sparse columns for analysis
non_sparse_columns = numerical_covariates + [
    col for col in categorical_covariate_columns_with_dummies if col not in sparse_columns
]

# Drop sparse columns to create the final analysis DataFrame
df_analysis = df_imputed_with_dummies.drop(columns=sparse_columns)

In [None]:
# Derive, for each observation window, the follow-up duration and the death
# indicator from the days-to-death column. One loop replaces the per-window
# copies; applying on the Series (instead of row-wise on the whole frame)
# avoids building a row object per record and also works on an empty frame.
# The window order matches the resulting column order.
death_days = df_analysis['post_onset_post_opioid_death_days']
for observation_days in (3000, 14, 28, 30, 60, 90, 180, 365):
    df_analysis[f'death{observation_days}_duration'] = death_days.apply(
        get_death_duration_column, args=(observation_days,)
    )
    df_analysis[f'death{observation_days}_outcome'] = death_days.apply(
        get_death_outcome_column, args=(observation_days,)
    )

## subgroup analysis


### Long-term consistent users

In [None]:
# Drop consistent users whose 'pre_longterm_opioid_consistent' flag is 0;
# every other row (including all new users) is kept.
is_consistent_user = df_analysis['exposure_group'] == 'consistent user'
flag_is_zero = df_analysis['pre_longterm_opioid_consistent'] == 0
df_analysis2 = df_analysis[~(is_consistent_user & flag_is_zero)]

In [None]:
# Set font to Helvetica globally for the plot
plt.rcParams['font.family'] = 'Helvetica'

# Prepare data for lifelines CoxPHFitter: exposure indicator plus retained covariates
x_cols = ['exposure_float'] + non_sparse_columns
X_lifelines = df_analysis2[x_cols].copy()

# Add 'duration' and 'event' columns (14-day window)
X_lifelines['duration'] = df_analysis2['death14_duration']
X_lifelines['event'] = df_analysis2['death14_outcome']

# Fit the Cox Proportional Hazards Model using lifelines
cph = CoxPHFitter()
cph.fit(X_lifelines, duration_col='duration', event_col='event')

# Get hazard ratios, confidence intervals (exponentiated to the HR scale) and p-values
hazard_ratios = cph.hazard_ratios_
conf_int = np.exp(cph.confidence_intervals_)
p_values = cph.summary['p']

# Print hazard ratios and confidence intervals
print("Hazard Ratios (Exponentiated Coefficients):")
print(hazard_ratios)

# Significance cut-offs, strictest first, so the first match gives the most stars
significance_levels = ((0.0001, "****"), (0.001, "***"), (0.01, "**"), (0.05, "*"))

print("Hazard Ratios (Exponentiated Coefficients) with Statistical Significance:")
# Iterate over the fitted covariates only. 'duration' and 'event' are columns
# of X_lifelines but have no coefficient, so looping over X_lifelines.columns
# printed an all-NaN line for each of them.
for var in hazard_ratios.index:
    hr = hazard_ratios[var]
    ci_low = conf_int.loc[var, '95% lower-bound']
    ci_high = conf_int.loc[var, '95% upper-bound']
    p_value = p_values[var]
    stars = next((mark for cutoff, mark in significance_levels if p_value <= cutoff), "")

    print(f"{var}: HR = {hr:.2f} ({ci_low:.2f}, {ci_high:.2f}) {p_value} {stars}")

# Kaplan-Meier estimator plot
plt.figure(figsize=(10, 6))
# Use lifelines KaplanMeierFitter to plot survival curves
kmf = KaplanMeierFitter()

# Define color scheme
color_mapping = {
    'new user': '#FF6347',  # Tomato
    'consistent user': '#4682B4'  # SteelBlue
}

# Loop through each exposure group to plot survival curves
for value in df_analysis2['exposure_group'].unique():
    mask = df_analysis2['exposure_group'] == value
    kmf.fit(durations=df_analysis2['death14_duration'][mask],
            event_observed=df_analysis2['death14_outcome'][mask],
            label=f"{value} (n = {mask.sum()})")

    # Plot the Kaplan-Meier curve for this group using the specified colors
    kmf.plot_survival_function(ci_show=True, color=color_mapping.get(value, 'black'))

# Set the limits for the y-axis to focus on the survival probability range
plt.ylim(0.5, 1)

# Enhance plot labels and title for publication quality
plt.ylabel(r"Estimated Probability of Survival $\hat{S}(t)$", fontsize=14)
plt.xlabel("Time (days)", fontsize=14)
plt.title("14-days Survival Curve of New User and Longterm Consistent User", fontsize=16)

# Customize tick marks for readability with larger font sizes
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Adjust legend and remove frame, with increased font size
plt.legend(loc="best", fontsize=16, frameon=False)

plt.savefig("../../figures/longterm_consistent_14days_survival_curves.pdf", format="pdf", bbox_inches="tight")

# Display the plot
plt.show()


### strong opioid

In [None]:
# Strong-opioid subgroup: rows with opioid_strength_classification equal to 2
strong_opioid_mask = df_analysis['opioid_strength_classification'] == 2
df_analysis3 = df_analysis[strong_opioid_mask]

In [None]:
# Set font to Helvetica globally for the plot
plt.rcParams['font.family'] = 'Helvetica'

# Prepare data for lifelines CoxPHFitter: exposure indicator plus retained covariates
x_cols = ['exposure_float'] + non_sparse_columns
X_lifelines = df_analysis3[x_cols].copy()

# Add 'duration' and 'event' columns (14-day window)
X_lifelines['duration'] = df_analysis3['death14_duration']
X_lifelines['event'] = df_analysis3['death14_outcome']

# Fit the Cox Proportional Hazards Model using lifelines
cph = CoxPHFitter()
cph.fit(X_lifelines, duration_col='duration', event_col='event')

# Get hazard ratios, confidence intervals (exponentiated to the HR scale), and p-values
hazard_ratios = cph.hazard_ratios_
conf_int = np.exp(cph.confidence_intervals_)
p_values = cph.summary['p']

# Significance cut-offs, strictest first, so the first match gives the most stars
significance_levels = ((0.0001, "****"), (0.001, "***"), (0.01, "**"), (0.05, "*"))

# Print hazard ratios and confidence intervals with statistical significance
print("Hazard Ratios (Exponentiated Coefficients) with Statistical Significance:")
# Iterate over the fitted covariates only. 'duration' and 'event' are columns
# of X_lifelines but have no coefficient, so looping over X_lifelines.columns
# printed an all-NaN line for each of them.
for var in hazard_ratios.index:
    hr = hazard_ratios[var]
    ci_low = conf_int.loc[var, '95% lower-bound']
    ci_high = conf_int.loc[var, '95% upper-bound']
    p_value = p_values[var]
    stars = next((mark for cutoff, mark in significance_levels if p_value <= cutoff), "")

    print(f"{var}: HR = {hr:.2f} ({ci_low:.2f}, {ci_high:.2f}) {p_value} {stars}")

# Kaplan-Meier estimator plot
plt.figure(figsize=(12, 8))  # Increased figure size for better readability

# Use lifelines KaplanMeierFitter to plot survival curves
kmf = KaplanMeierFitter()

# Define color scheme
color_mapping = {
    'new user': '#FF6347',  # Tomato
    'consistent user': '#4682B4'  # SteelBlue
}

# Loop through each exposure group to plot survival curves with specified colors
for value in df_analysis3['exposure_group'].unique():
    mask = df_analysis3['exposure_group'] == value
    kmf.fit(durations=df_analysis3['death14_duration'][mask],
            event_observed=df_analysis3['death14_outcome'][mask],
            label=f"{value} (n = {mask.sum()})")

    # Plot the Kaplan-Meier curve for this group using the color scheme
    kmf.plot_survival_function(ci_show=True, color=color_mapping.get(value, 'black'))

# Set the limits for the y-axis to focus on the survival probability range
plt.ylim(0.5, 1)

# Enhance plot labels and title for publication quality with larger font sizes
plt.ylabel(r"Survival probability", fontsize=18)
plt.xlabel("Time (days)", fontsize=18)
plt.title("14-days Survival Curves by Strong Opioid Exposure Group", fontsize=20)

# Customize tick marks for readability with larger font sizes
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Adjust legend and remove frame, with increased font size
plt.legend(loc="best", fontsize=16, frameon=False)

# Save the figure as vector images (SVG and PDF)
#plt.savefig("kaplan_meier_survival_curves_with_significance.svg", format="svg", bbox_inches="tight")
#plt.savefig("../../figures/strong_opioid_14days_survival_curves.pdf", format="pdf", bbox_inches="tight")

# Display the plot
plt.show()


In [None]:
# Filter the dataset for opioid strength classification == 1
df_analysis4 = df_analysis[df_analysis.opioid_strength_classification == 1]

# Prepare data for lifelines CoxPHFitter: exposure indicator plus retained covariates
x_cols = ['exposure_float'] + non_sparse_columns
X_lifelines = df_analysis4[x_cols].copy()

# Add 'duration' and 'event' columns (14-day window)
X_lifelines['duration'] = df_analysis4['death14_duration']
X_lifelines['event'] = df_analysis4['death14_outcome']

# Fit the Cox Proportional Hazards Model using lifelines
cph = CoxPHFitter()
cph.fit(X_lifelines, duration_col='duration', event_col='event')

# Get hazard ratios, confidence intervals (exponentiated to the HR scale), and p-values
hazard_ratios = cph.hazard_ratios_
conf_int = np.exp(cph.confidence_intervals_)
p_values = cph.summary['p']

# Significance cut-offs, strictest first, so the first match gives the most stars
significance_levels = ((0.0001, "****"), (0.001, "***"), (0.01, "**"), (0.05, "*"))

# Print hazard ratios and confidence intervals with statistical significance
print("Hazard Ratios (Exponentiated Coefficients) with Statistical Significance:")
# Iterate over the fitted covariates only. 'duration' and 'event' are columns
# of X_lifelines but have no coefficient, so looping over X_lifelines.columns
# printed an all-NaN line for each of them.
for var in hazard_ratios.index:
    hr = hazard_ratios[var]
    ci_low = conf_int.loc[var, '95% lower-bound']
    ci_high = conf_int.loc[var, '95% upper-bound']
    p_value = p_values[var]
    stars = next((mark for cutoff, mark in significance_levels if p_value <= cutoff), "")

    print(f"{var}: HR = {hr:.2f} ({ci_low:.2f}, {ci_high:.2f}) {p_value} {stars}")

# Kaplan-Meier estimator plot
plt.figure(figsize=(12, 8))  # Increased figure size for better readability

# Use lifelines KaplanMeierFitter to plot survival curves
kmf = KaplanMeierFitter()

# Define color scheme
color_mapping = {
    'new user': '#FF6347',  # Tomato
    'consistent user': '#4682B4'  # SteelBlue
}

# Loop through each exposure group to plot survival curves with specified colors
for value in df_analysis4['exposure_group'].unique():
    mask = df_analysis4['exposure_group'] == value
    kmf.fit(durations=df_analysis4['death14_duration'][mask],
            event_observed=df_analysis4['death14_outcome'][mask],
            label=f"{value} (n = {mask.sum()})")

    # Plot the Kaplan-Meier curve for this group using the color scheme
    kmf.plot_survival_function(ci_show=True, color=color_mapping.get(value, 'black'))

# Set the limits for the y-axis to focus on the survival probability range
plt.ylim(0.7, 1)

# Enhance plot labels and title for publication quality with larger font sizes
plt.ylabel(r"Estimated Probability of Survival $\hat{S}(t)$", fontsize=18)
plt.xlabel("Time (days)", fontsize=18)
plt.title("14-days Survival Curves by Weak Opioid Exposure Group", fontsize=20)

# Customize tick marks for readability with larger font sizes
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Adjust legend and remove frame, with increased font size
plt.legend(loc="best", fontsize=16, frameon=False)

# Save the figure as vector images (SVG and PDF)
#plt.savefig("kaplan_meier_survival_curves_opioid_strength_1.svg", format="svg", bbox_inches="tight")
#plt.savefig("../figures/weak_opioid_14days_survival_curves_opioid_strength.pdf", format="pdf", bbox_inches="tight")

# Display the plot
plt.show()


In [None]:
# Adjusting the function to save and display the figure with much larger fonts
def analyze_and_plot_and_save(df_strong, df_weak, ylim_strong, ylim_weak, title_strong, title_weak, save_path):
    """Draw side-by-side 14-day Kaplan-Meier curves for two cohorts and save them.

    The left panel shows ``df_strong`` and the right panel ``df_weak``. Within
    each panel one curve is drawn per distinct ``exposure_group`` value, fitted
    on the ``death14_duration`` / ``death14_outcome`` columns.

    Args:
        df_strong: Frame plotted in the left panel.
        df_weak: Frame plotted in the right panel.
        ylim_strong: ``(low, high)`` y-axis limits for the left panel.
        ylim_weak: ``(low, high)`` y-axis limits for the right panel.
        title_strong: Title of the left panel.
        title_weak: Title of the right panel.
        save_path: Destination of the PDF written with ``plt.savefig``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(30, 10))  # Much larger figure size

    kmf = KaplanMeierFitter()
    color_mapping = {'new user': '#FF6347', 'consistent user': '#4682B4'}

    # (axis, cohort, y-limits, title) for the strong panel, then the weak panel
    panels = (
        (axes[0], df_strong, ylim_strong, title_strong),
        (axes[1], df_weak, ylim_weak, title_weak),
    )
    for ax, cohort, y_range, panel_title in panels:
        groups = cohort['exposure_group']
        for group in groups.unique():
            in_group = groups == group
            kmf.fit(
                durations=cohort['death14_duration'][in_group],
                event_observed=cohort['death14_outcome'][in_group],
                label=f"{group} (n = {in_group.sum()})"
            )
            # Groups without a configured color fall back to black
            kmf.plot_survival_function(ax=ax, ci_show=True, color=color_mapping.get(group, 'black'))

        ax.set_ylim(*y_range)
        ax.set_ylabel(r"Survival probability", fontsize=30)
        ax.set_xlabel("Time (days)", fontsize=30)
        ax.set_title(panel_title, fontsize=36)
        ax.tick_params(axis='both', labelsize=28)
        ax.legend(loc="best", fontsize=28, frameon=False)

    # Adjust layout, save the figure, then display it
    plt.tight_layout()
    plt.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()

# Build the two cohorts here: no cell in the notebook defines
# df_analysis_strong / df_analysis_weak, so the call would raise NameError on
# a fresh kernel. Classification 2 is the strong-opioid subgroup and
# classification 1 the weak-opioid subgroup, as in the per-strength cells above.
df_analysis_strong = df_analysis[df_analysis.opioid_strength_classification == 2]
df_analysis_weak = df_analysis[df_analysis.opioid_strength_classification == 1]

# Call the function with larger fonts and save the figure
analyze_and_plot_and_save(
    df_analysis_strong,
    df_analysis_weak,
    ylim_strong=(0.5, 1),
    ylim_weak=(0.5, 1),
    title_strong="14-days Survival by Strong Opioid Exposure",
    title_weak="14-days Survival by Weak Opioid Exposure",
    save_path="../../figures/opioid_strength_14days_survival_curves_opioid_strength.pdf"
)


## dementia and MCI subgroup 

In [None]:
# MCI subgroup: rows flagged MCI == 1
MCI = df_analysis.loc[df_analysis['MCI'] == 1]

# Dementia subgroup: rows where at least one of the dementia flags equals 1
dementia_flag_columns = ['AD', 'FTD', 'VD', 'LBD', 'other_D']
dementia = df_analysis.loc[(df_analysis[dementia_flag_columns] == 1).any(axis=1)]

In [None]:
# Prepare data for lifelines CoxPHFitter: exposure indicator plus retained covariates
x_cols = ['exposure_float'] + non_sparse_columns
X_lifelines = MCI[x_cols].copy()

# Add 'duration' and 'event' columns (14-day window)
X_lifelines['duration'] = MCI['death14_duration']
X_lifelines['event'] = MCI['death14_outcome']

# Fit the Cox Proportional Hazards Model using lifelines
cph = CoxPHFitter()
cph.fit(X_lifelines, duration_col='duration', event_col='event')

# Get hazard ratios, confidence intervals (exponentiated to the HR scale), and p-values
hazard_ratios = cph.hazard_ratios_
conf_int = np.exp(cph.confidence_intervals_)
p_values = cph.summary['p']

# Significance cut-offs, strictest first, so the first match gives the most stars
significance_levels = ((0.0001, "****"), (0.001, "***"), (0.01, "**"), (0.05, "*"))

# Print hazard ratios and confidence intervals with statistical significance
print("Hazard Ratios (Exponentiated Coefficients) with Statistical Significance:")
# Iterate over the fitted covariates only. 'duration' and 'event' are columns
# of X_lifelines but have no coefficient, so looping over X_lifelines.columns
# printed an all-NaN line for each of them.
for var in hazard_ratios.index:
    hr = hazard_ratios[var]
    ci_low = conf_int.loc[var, '95% lower-bound']
    ci_high = conf_int.loc[var, '95% upper-bound']
    p_value = p_values[var]
    stars = next((mark for cutoff, mark in significance_levels if p_value <= cutoff), "")

    print(f"{var}: HR = {hr:.2f} ({ci_low:.2f}, {ci_high:.2f}) {p_value} {stars}")

# Kaplan-Meier estimator plot
plt.figure(figsize=(12, 8))  # Increased figure size for better readability

# Use lifelines KaplanMeierFitter to plot survival curves
kmf = KaplanMeierFitter()

# Define color scheme
color_mapping = {
    'new user': '#FF6347',  # Tomato
    'consistent user': '#4682B4'  # SteelBlue
}

# Loop through each exposure group to plot survival curves with specified colors
for value in MCI['exposure_group'].unique():
    mask = MCI['exposure_group'] == value
    kmf.fit(durations=MCI['death14_duration'][mask],
            event_observed=MCI['death14_outcome'][mask],
            label=f"{value} (n = {mask.sum()})")

    # Plot the Kaplan-Meier curve for this group using the color scheme
    kmf.plot_survival_function(ci_show=True, color=color_mapping.get(value, 'black'))

# Set the limits for the y-axis to focus on the survival probability range
plt.ylim(0.7, 1)

# Enhance plot labels and title for publication quality with larger font sizes
plt.ylabel(r"Estimated Probability of Survival $\hat{S}(t)$", fontsize=18)
plt.xlabel("Time (days)", fontsize=18)
plt.title("14-days Survival Curves by MCI Exposure Group", fontsize=20)

# Customize tick marks for readability with larger font sizes
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Adjust legend and remove frame, with increased font size
plt.legend(loc="best", fontsize=16, frameon=False)

# Save the figure as vector images (SVG and PDF)
#plt.savefig("kaplan_meier_survival_curves_MCI.svg", format="svg", bbox_inches="tight")
#plt.savefig("../figures/MCI_14days_survival_curves.pdf", format="pdf", bbox_inches="tight")

# Display the plot
plt.show()

In [None]:
# Prepare data for lifelines CoxPHFitter
# NOTE(review): non_sparse_columns appears to hold column-name strings, so the
# comparison with the integer 30 below never removes anything and new_list is
# just a copy. If a specific covariate was meant to be excluded (e.g. the one
# at position 30), filter by its name instead — TODO confirm the intent.
new_list = [x for x in non_sparse_columns if x != 30]
x_cols = ['exposure_float'] + new_list
X_lifelines = dementia[x_cols].copy()

# Add 'duration' and 'event' columns (14-day window)
X_lifelines['duration'] = dementia['death14_duration']
X_lifelines['event'] = dementia['death14_outcome']

# Fit the Cox Proportional Hazards Model using lifelines
cph = CoxPHFitter()
cph.fit(X_lifelines, duration_col='duration', event_col='event')

# Get hazard ratios, confidence intervals (exponentiated to the HR scale), and p-values
hazard_ratios = cph.hazard_ratios_
conf_int = np.exp(cph.confidence_intervals_)
p_values = cph.summary['p']

# Significance cut-offs, strictest first, so the first match gives the most stars
significance_levels = ((0.0001, "****"), (0.001, "***"), (0.01, "**"), (0.05, "*"))

# Print hazard ratios and confidence intervals with statistical significance
print("Hazard Ratios (Exponentiated Coefficients) with Statistical Significance:")
# Iterate over the fitted covariates only. 'duration' and 'event' are columns
# of X_lifelines but have no coefficient, so looping over X_lifelines.columns
# printed an all-NaN line for each of them.
for var in hazard_ratios.index:
    hr = hazard_ratios[var]
    ci_low = conf_int.loc[var, '95% lower-bound']
    ci_high = conf_int.loc[var, '95% upper-bound']
    p_value = p_values[var]
    stars = next((mark for cutoff, mark in significance_levels if p_value <= cutoff), "")

    print(f"{var}: HR = {hr:.2f} ({ci_low:.2f}, {ci_high:.2f}) {p_value} {stars}")

# Kaplan-Meier estimator plot
plt.figure(figsize=(12, 8))  # Increased figure size for better readability

# Use lifelines KaplanMeierFitter to plot survival curves
kmf = KaplanMeierFitter()

# Define color scheme
color_mapping = {
    'new user': '#FF6347',  # Tomato
    'consistent user': '#4682B4'  # SteelBlue
}

# Loop through each exposure group to plot survival curves with specified colors
for value in dementia['exposure_group'].unique():
    mask = dementia['exposure_group'] == value
    kmf.fit(durations=dementia['death14_duration'][mask],
            event_observed=dementia['death14_outcome'][mask],
            label=f"{value} (n = {mask.sum()})")

    # Plot the Kaplan-Meier curve for this group using the color scheme
    kmf.plot_survival_function(ci_show=True, color=color_mapping.get(value, 'black'))

# Set the limits for the y-axis to focus on the survival probability range
plt.ylim(0.7, 1)

# Enhance plot labels and title for publication quality with larger font sizes
plt.ylabel(r"Estimated Probability of Survival $\hat{S}(t)$", fontsize=18)
plt.xlabel("Time (days)", fontsize=18)
plt.title("14-days Survival Curves by Dementia Exposure Group", fontsize=20)

# Customize tick marks for readability with larger font sizes
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Adjust legend and remove frame, with increased font size
plt.legend(loc="best", fontsize=16, frameon=False)

# Save the figure as vector images (SVG and PDF)
#plt.savefig("kaplan_meier_survival_curves_dementia.svg", format="svg", bbox_inches="tight")
#plt.savefig("../figures/dementia_14days_survival.pdf", format="pdf", bbox_inches="tight")

# Display the plot
plt.show()


In [None]:
def plot_and_save_mci_dementia_survival_curves(df_mci, df_dementia, titles, ylim, save_path=None):
    """Plot side-by-side Kaplan-Meier curves for two cohorts and optionally save them.

    The left panel is drawn from ``df_mci`` and the right panel from
    ``df_dementia``. Within each panel one curve is fitted per distinct value
    of the ``exposure_group`` column, using ``death14_duration`` as the
    duration and ``death14_outcome`` as the event indicator.

    Parameters
    ----------
    df_mci, df_dementia : pandas.DataFrame
        Frames containing ``exposure_group``, ``death14_duration`` and
        ``death14_outcome`` columns.
    titles : sequence of str
        Panel titles, left then right. At least two entries are required.
    ylim : sequence of (low, high)
        Y-axis limits per panel, left then right. At least two entries are
        required.
    save_path : str or path-like, optional
        When given, the figure is written there as a PDF. When ``None`` the
        figure is only displayed.

    Raises
    ------
    ValueError
        If ``titles`` or ``ylim`` has fewer entries than there are panels;
        the shorter sequence would otherwise silently leave a panel blank.
    """
    cohorts = [df_mci, df_dementia]
    titles = list(titles)
    ylim = list(ylim)
    if len(titles) < len(cohorts) or len(ylim) < len(cohorts):
        raise ValueError(
            f"titles and ylim must each provide {len(cohorts)} entries "
            f"(got {len(titles)} titles and {len(ylim)} ylim pairs)"
        )

    fig, axes = plt.subplots(1, 2, figsize=(30, 10))
    kmf = KaplanMeierFitter()
    # Groups missing from this mapping fall back to black.
    color_mapping = {'new user': '#FF6347', 'consistent user': '#4682B4'}

    for df, ax, title, y_lim in zip(cohorts, axes, titles, ylim):
        for group in df['exposure_group'].unique():
            mask = df['exposure_group'] == group
            # The same fitter is re-fitted for every group; the label carries the group size.
            kmf.fit(df['death14_duration'][mask], df['death14_outcome'][mask], label=f"{group} (n={mask.sum()})")
            kmf.plot_survival_function(ax=ax, ci_show=True, color=color_mapping.get(group, 'black'))

        ax.set_ylim(*y_lim)
        ax.set_title(title, fontsize=36)
        ax.set_xlabel("Time (days)", fontsize=30)
        ax.set_ylabel("Survival probability", fontsize=30)
        ax.tick_params(labelsize=28)
        ax.legend(loc="best", fontsize=28, frameon=False)

    plt.tight_layout()
    if save_path is not None:
        fig.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()

# Cohort subsets of df_analysis:
#   df_mci      - rows whose MCI flag equals 1
#   df_dementia - rows where at least one of the listed dementia flags equals 1
dementia_flag_cols = ["AD", "FTD", "VD", "LBD", "other_D"]

df_mci = df_analysis.loc[df_analysis["MCI"] == 1]
df_dementia = df_analysis.loc[df_analysis[dementia_flag_cols].eq(1).any(axis=1)]

# Example usage
# plot_and_save_mci_dementia_survival_curves(
#     df_mci=df_mci,
#     df_dementia=df_dementia,
#     titles=["MCI", "Dementia"],
#     ylim=[(0.5, 1), (0.5, 1)],
#     save_path="../../figures/mci_dementia_survival_curves.pdf"
# )

#### pneumonia analysis 

In [None]:
# Rows of df_analysis whose pneumonia_before_7days_exposure flag equals 1
# (presumably patients with pneumonia recorded shortly before exposure - confirm against the column definition)
has_prior_pneumonia = df_analysis["pneumonia_before_7days_exposure"] == 1
df_pneumonia = df_analysis.loc[has_prior_pneumonia]

In [None]:
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter, KaplanMeierFitter
import matplotlib.pyplot as plt

# Prepare data for lifelines CoxPHFitter
# NOTE(review): new_list (non_sparse_columns without the entry 30) is computed but never
# used; x_cols is built from the unfiltered non_sparse_columns, so column 30 is only
# excluded if non_sparse_columns already omits it. Confirm the intended covariate set.
new_list = [x for x in non_sparse_columns if x != 30]
x_cols = ['exposure_float'] + non_sparse_columns
X_lifelines = df_pneumonia[x_cols].copy()

# Add 'duration' and 'event' columns (14-day death outcome)
X_lifelines['duration'] = df_pneumonia['death14_duration']
X_lifelines['event'] = df_pneumonia['death14_outcome']

# Fit the Cox Proportional Hazards Model using lifelines
cph = CoxPHFitter()
cph.fit(X_lifelines, duration_col='duration', event_col='event')

# Get hazard ratios, confidence intervals, and p-values
hazard_ratios = cph.hazard_ratios_
conf_int = np.exp(cph.confidence_intervals_)  # exponentiate coefficient bounds to the HR scale
p_values = cph.summary['p']

# Print hazard ratios and confidence intervals with statistical significance.
# Iterate over the fitted covariates only: looping over X_lifelines.columns would also
# visit 'duration' and 'event', which are not covariates and print as "HR = nan (nan, nan)".
print("Hazard Ratios (Exponentiated Coefficients) with Statistical Significance:")
for var in hazard_ratios.index:
    hr = hazard_ratios[var]
    ci_low = conf_int.loc[var, '95% lower-bound']
    ci_high = conf_int.loc[var, '95% upper-bound']
    p_value = p_values[var]

    # Determine the number of stars based on the p-value (strictest threshold first)
    stars = ""
    for threshold, marker in ((0.0001, "****"), (0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_value <= threshold:
            stars = marker
            break

    # The p-value is printed alongside the stars, matching the other Cox cells in this notebook
    print(f"{var}: HR = {hr:.2f} ({ci_low:.2f}, {ci_high:.2f}) {p_value} {stars}")

# Kaplan-Meier estimator plot: one 14-day survival curve per exposure group
# found in df_pneumonia, drawn on a single explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))  # Increased figure size for better readability

# One fitter instance is re-fitted for each group in turn
kmf = KaplanMeierFitter()

# Curve colour per exposure group; any group not listed here is drawn in black
color_mapping = {
    'new user': '#FF6347',  # Tomato
    'consistent user': '#4682B4',  # SteelBlue
}

for group_name in df_pneumonia['exposure_group'].unique():
    in_group = df_pneumonia['exposure_group'] == group_name
    kmf.fit(
        durations=df_pneumonia.loc[in_group, 'death14_duration'],
        event_observed=df_pneumonia.loc[in_group, 'death14_outcome'],
        label=f"{group_name} (n = {in_group.sum()})",
    )
    kmf.plot_survival_function(
        ax=ax,
        ci_show=True,
        color=color_mapping.get(group_name, 'black'),
    )

# Zoom the y-axis onto the 0.5-1.0 survival probability range
ax.set_ylim(0.5, 1)

# Axis labels and title, with larger fonts for publication-quality output
ax.set_ylabel(r"Estimated Probability of Survival $\hat{S}(t)$", fontsize=18)
ax.set_xlabel("Time (days)", fontsize=18)
ax.set_title("14-days Survival Curves by Pneumonia Exposure Group", fontsize=20)

# Larger tick labels on both axes
ax.tick_params(axis="both", labelsize=14)

# Frameless legend placed automatically
ax.legend(loc="best", fontsize=16, frameon=False)

# Save the figure as vector images (SVG and PDF)
#plt.savefig("kaplan_meier_survival_curves_pneumonia.svg", format="svg", bbox_inches="tight")
#plt.savefig("../figures/pneumonia_14days_survival_curves.pdf", format="pdf", bbox_inches="tight")

# Display the plot
plt.show()

In [None]:
# Rows of df_analysis whose pneumonia_before_7days_exposure flag is not 1.
# Taken as an explicit copy because later cells add columns to this frame; assigning
# into a bare boolean-mask slice triggers pandas' SettingWithCopyWarning.
df_non_pneumonia = df_analysis[df_analysis.pneumonia_before_7days_exposure != 1].copy()

In [None]:
# Frequency of each distinct pneumonia_after_exposure value in the non-pneumonia cohort
df_non_pneumonia["pneumonia_after_exposure"].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Rebuild the non-pneumonia cohort as an explicit copy: later cells add columns to it,
# and assigning into a bare boolean-mask slice triggers SettingWithCopyWarning.
df_non_pneumonia = df_analysis[df_analysis.pneumonia_before_7days_exposure != 1].copy()

# Plot the distribution of pneumonia_days_since_exposure (histogram with KDE overlay)
plt.figure(figsize=(10, 6))
sns.histplot(df_non_pneumonia['pneumonia_days_since_exposure'], kde=True, bins=30)

# Add labels and title (the template placeholders are replaced with the plotted column)
plt.title("Distribution of pneumonia_days_since_exposure", fontsize=16)
plt.xlabel('pneumonia_days_since_exposure', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# Display the plot
plt.show()

In [None]:
def get_pneumonia_duration_column(pneumonia_after_exposure, observation_days):
    """Return the follow-up duration for the pneumonia outcome, capped to the window.

    Parameters
    ----------
    pneumonia_after_exposure : number or missing
        Value compared against the window; callers in this notebook pass
        ``pneumonia_days_since_exposure`` (presumably days from exposure to
        pneumonia - confirm against the column definition).
    observation_days : number
        Length of the observation window.

    Returns
    -------
    ``observation_days`` when the value is missing or lies beyond the window,
    ``0`` when the value is zero or negative, otherwise the value itself.
    """
    # No event recorded, or the event falls after the cut-off: follow-up ends at the window edge
    if pd.isna(pneumonia_after_exposure) or pneumonia_after_exposure > observation_days:
        return observation_days
    # Events at or before time zero are clamped to a duration of 0
    if pneumonia_after_exposure <= 0:
        return 0
    return pneumonia_after_exposure
def get_pneumonia_outcome_column(pneumonia_after_exposure, observation_days):
    """Return whether the pneumonia event counts as observed within the window.

    Parameters
    ----------
    pneumonia_after_exposure : number or missing
        Value compared against the window; callers in this notebook pass
        ``pneumonia_days_since_exposure``.
    observation_days : number
        Length of the observation window.

    Returns
    -------
    bool
        ``False`` when the value is missing or greater than
        ``observation_days``; ``True`` otherwise (including zero and negative
        values).
    """
    if pd.isna(pneumonia_after_exposure):
        return False
    # Anything not strictly beyond the cut-off is treated as an observed event
    return not (pneumonia_after_exposure > observation_days)

In [None]:
# Derive duration/outcome columns for the 14- and 60-day observation windows.
# The helpers only need pneumonia_days_since_exposure, so they are applied to that
# column directly instead of building a full row object per record with apply(axis=1).
days_to_pneumonia = df_non_pneumonia["pneumonia_days_since_exposure"]
for window_days in (14, 60):
    df_non_pneumonia[f"pneumonia{window_days}_duration"] = days_to_pneumonia.apply(
        get_pneumonia_duration_column, args=(window_days,)
    )
    df_non_pneumonia[f"pneumonia{window_days}_outcome"] = days_to_pneumonia.apply(
        get_pneumonia_outcome_column, args=(window_days,)
    )

In [None]:
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter, KaplanMeierFitter
import matplotlib.pyplot as plt
# Rows of df_analysis whose pneumonia_before_7days_exposure flag is not 1.
# Taken as an explicit copy because columns are added to this frame further down the
# cell; assigning into a bare boolean-mask slice triggers SettingWithCopyWarning.
df_non_pneumonia = df_analysis[df_analysis.pneumonia_before_7days_exposure != 1].copy()
def get_pneumonia_duration_column(pneumonia_after_exposure, observation_days):
    """Follow-up duration for the pneumonia outcome, limited to the observation window.

    Returns ``observation_days`` when ``pneumonia_after_exposure`` is missing
    or greater than ``observation_days``, ``0`` when it is zero or negative,
    and the value itself otherwise.
    """
    missing_or_late = pd.isna(pneumonia_after_exposure) or pneumonia_after_exposure > observation_days
    if missing_or_late:
        # No event inside the window: follow-up runs to the cut-off
        return observation_days
    # Values at or before time zero are clamped to 0
    return 0 if pneumonia_after_exposure <= 0 else pneumonia_after_exposure
def get_pneumonia_outcome_column(pneumonia_after_exposure, observation_days):
    """Event indicator for the pneumonia outcome within the observation window.

    Returns ``False`` when ``pneumonia_after_exposure`` is missing or greater
    than ``observation_days``; returns ``True`` for every other value
    (including zero and negative values).
    """
    if pd.isna(pneumonia_after_exposure):
        return False
    # Only values strictly beyond the cut-off are treated as unobserved
    return not (pneumonia_after_exposure > observation_days)
# Derive duration/outcome columns for the 14- and 60-day observation windows.
# The helpers only need pneumonia_days_since_exposure, so they are applied to that
# column directly instead of building a full row object per record with apply(axis=1).
days_to_pneumonia = df_non_pneumonia["pneumonia_days_since_exposure"]
for window_days in (14, 60):
    df_non_pneumonia[f"pneumonia{window_days}_duration"] = days_to_pneumonia.apply(
        get_pneumonia_duration_column, args=(window_days,)
    )
    df_non_pneumonia[f"pneumonia{window_days}_outcome"] = days_to_pneumonia.apply(
        get_pneumonia_outcome_column, args=(window_days,)
    )
# Prepare data for lifelines CoxPHFitter
# NOTE(review): new_list (non_sparse_columns without the entry 30) is computed but never
# used; x_cols is built from the unfiltered non_sparse_columns, so column 30 is only
# excluded if non_sparse_columns already omits it. Confirm the intended covariate set.
new_list = [x for x in non_sparse_columns if x != 30]
x_cols = ['exposure_float'] + non_sparse_columns
X_lifelines = df_non_pneumonia[x_cols].copy()

# Add 'duration' and 'event' columns (14-day pneumonia outcome)
X_lifelines['duration'] = df_non_pneumonia['pneumonia14_duration']
X_lifelines['event'] = df_non_pneumonia['pneumonia14_outcome']

# Fit the Cox Proportional Hazards Model using lifelines
cph = CoxPHFitter()
cph.fit(X_lifelines, duration_col='duration', event_col='event')

# Get hazard ratios, confidence intervals, and p-values
hazard_ratios = cph.hazard_ratios_
conf_int = np.exp(cph.confidence_intervals_)  # exponentiate coefficient bounds to the HR scale
p_values = cph.summary['p']

# Print hazard ratios and confidence intervals with statistical significance.
# Iterate over the fitted covariates only: looping over X_lifelines.columns would also
# visit 'duration' and 'event', which are not covariates and print as "HR = nan (nan, nan)".
print("Hazard Ratios (Exponentiated Coefficients) with Statistical Significance:")
for var in hazard_ratios.index:
    hr = hazard_ratios[var]
    ci_low = conf_int.loc[var, '95% lower-bound']
    ci_high = conf_int.loc[var, '95% upper-bound']
    p_value = p_values[var]

    # Determine the number of stars based on the p-value (strictest threshold first)
    stars = ""
    for threshold, marker in ((0.0001, "****"), (0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_value <= threshold:
            stars = marker
            break

    print(f"{var}: HR = {hr:.2f} ({ci_low:.2f}, {ci_high:.2f}) {p_value} {stars}")

# Kaplan-Meier estimator plot: one 14-day pneumonia-free curve per exposure group
# found in df_non_pneumonia, drawn on a single explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))  # Increased figure size for better readability

# One fitter instance is re-fitted for each group in turn
kmf = KaplanMeierFitter()

# Curve colour per exposure group; any group not listed here is drawn in black
color_mapping = {
    'new user': '#FF6347',  # Tomato
    'consistent user': '#4682B4',  # SteelBlue
}

for group_name in df_non_pneumonia['exposure_group'].unique():
    in_group = df_non_pneumonia['exposure_group'] == group_name
    kmf.fit(
        durations=df_non_pneumonia.loc[in_group, 'pneumonia14_duration'],
        event_observed=df_non_pneumonia.loc[in_group, 'pneumonia14_outcome'],
        label=f"{group_name} (n = {in_group.sum()})",
    )
    kmf.plot_survival_function(
        ax=ax,
        ci_show=True,
        color=color_mapping.get(group_name, 'black'),
    )

# Zoom the y-axis onto the 0.7-1.0 probability range
ax.set_ylim(0.7, 1)

# Axis labels and title, with larger fonts for publication-quality output
ax.set_ylabel(r"Estimated Probability of Survival $\hat{S}(t)$", fontsize=18)
ax.set_xlabel("Time (days)", fontsize=18)
ax.set_title("14-days Pneumonia Survival Curves by Exposure Group", fontsize=20)

# Larger tick labels on both axes
ax.tick_params(axis="both", labelsize=14)

# Frameless legend placed automatically
ax.legend(loc="best", fontsize=16, frameon=False)

# Save the figure as vector images (SVG and PDF)
#plt.savefig("kaplan_meier_survival_curves_pneumonia.svg", format="svg", bbox_inches="tight")
#plt.savefig("../figures/nonpneumonia_14days_pneumonia_curves.pdf", format="pdf", bbox_inches="tight")

# Display the plot
plt.show()

In [None]:
def _plot_exposure_group_panel(ax, kmf, df, duration_col, event_col, title, ylim, color_mapping):
    """Draw one Kaplan-Meier panel on ``ax``: one curve per value of ``df['exposure_group']``."""
    for group in df['exposure_group'].unique():
        mask = df['exposure_group'] == group
        # The shared fitter is re-fitted for every group; the label carries the group size.
        kmf.fit(df[duration_col][mask], df[event_col][mask], label=f"{group} (n={mask.sum()})")
        kmf.plot_survival_function(ax=ax, ci_show=True, color=color_mapping.get(group, 'black'))

    ax.set_ylim(*ylim)
    ax.set_title(title, fontsize=30)
    ax.set_xlabel("Time (days)", fontsize=20)
    ax.set_ylabel("Survival probability", fontsize=20)
    ax.tick_params(axis='both', labelsize=20)
    ax.legend(loc="best", fontsize=20, frameon=False)


def plot_combined_survival(df1, df2, duration1, event1, duration2, event2, title1, title2, ylim1, ylim2, save_path=None):
    """Plot two sets of Kaplan-Meier curves in a combined top-bottom layout.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Data for the top and bottom panel. Each must contain an
        ``exposure_group`` column plus the named duration and event columns.
    duration1, event1 : str
        Duration and event-indicator column names in ``df1``.
    duration2, event2 : str
        Duration and event-indicator column names in ``df2``.
    title1, title2 : str
        Titles of the top and bottom panel.
    ylim1, ylim2 : (low, high)
        Y-axis limits of the top and bottom panel.
    save_path : str or path-like, optional
        When given, the figure is written there as a PDF. When ``None``
        (the default) the figure is only displayed, so the function can be
        called without producing a file.
    """
    from lifelines import KaplanMeierFitter
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(2, 1, figsize=(12, 16))  # Top and bottom layout
    kmf = KaplanMeierFitter()
    # Groups missing from this mapping fall back to black.
    color_mapping = {'new user': '#FF6347', 'consistent user': '#4682B4'}

    panels = (
        (axes[0], df1, duration1, event1, title1, ylim1),  # Top plot
        (axes[1], df2, duration2, event2, title2, ylim2),  # Bottom plot
    )
    for ax, df, duration_col, event_col, title, ylim in panels:
        _plot_exposure_group_panel(ax, kmf, df, duration_col, event_col, title, ylim, color_mapping)

    plt.tight_layout()
    if save_path is not None:
        fig.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()


# Example usage (display only; uncomment save_path to also write the PDF)
plot_combined_survival(
    df1=df_pneumonia,
    df2=df_non_pneumonia,
    duration1='death14_duration',
    event1='death14_outcome',
    duration2='pneumonia14_duration',
    event2='pneumonia14_outcome',
    title1="Survival curves in patients with underlying pneumonia",
    title2="Pneumonia survival curves in patients without underlying pneumonia",
    ylim1=(0.5, 1),
    ylim2=(0.5, 1),
    #save_path="../../figures/combined_survival_curves.pdf"
)