<a href="https://colab.research.google.com/github/senonaderian/ADRD-an-ADR_Detection_NLP-framework/blob/main/Step_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install upsetplot

In [None]:
from upsetplot import UpSet, from_indicators

In [None]:
# First load of the workbook, using pandas' default header handling.
# NOTE(review): the next cell re-reads the same file with header=1 and rebinds
# `df`, so this frame is discarded as soon as that cell runs.
df = pd.read_excel("/content/blinded-data.xlsx")

In [None]:
# Read the workbook using the row at 0-based index 1 as the column header.
# NOTE(review): presumably the first spreadsheet row is a banner/group title
# rather than real column names — confirm against the file.
df = pd.read_excel("/content/blinded-data.xlsx", header=1)

In [None]:
# Print every column label of the loaded frame as a plain Python list.
print(df.columns.tolist())

In [None]:
# Columns that flag a follow-up complication, one per line for easy diffing.
followup_cols = [
    'FollowUpReStroke',
    'FollowUpSeizure',
    'FollowUpAspirationPneumonia',
    'FollowUpUTI',
    'FollowUpSepsis',
    'FollowUpBedsoresPressureulcers',
    'FollowUpDVT',
    'FollowUpPTE',
    'FollowUpMI',
    'FollowUpGIB',
    'FollowUpDeath',
]

In [None]:
# Follow-up complications: column sum and rate per complication, ranked from the
# largest sum to the smallest.
comp_df = df[followup_cols].sum().reset_index()
comp_df.columns = ['Complication', 'Patient Count']

# Rank first so the running total below accumulates in the displayed order.
# Accumulating before sorting produced a 'Cumulative %' column that did not
# follow the ranked rows.
comp_df = comp_df.sort_values(by='Patient Count', ascending=False).reset_index(drop=True)

comp_df['% of Patients'] = (comp_df['Patient Count'] / len(df) * 100).round(2)
# A percentage is already a rate per 100 rows, so this column mirrors the one above.
comp_df['Per 100 Patients'] = comp_df['% of Patients']
# Accumulate the raw counts and round once, so rounding error does not build up.
# NOTE(review): a patient can presumably have several complications, in which
# case the running total may exceed 100 — confirm this is the intended reading.
comp_df['Cumulative %'] = (comp_df['Patient Count'].cumsum() / len(df) * 100).round(2)

comp_df

In [None]:
# Horizontal bar chart of the follow-up complication column sums, largest first.
complication_counts = df[followup_cols].sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=complication_counts.values, y=complication_counts.index, ax=ax)
ax.set_title("Frequency of Follow-Up Complications")
ax.set_xlabel("Number of Patients")
ax.set_ylabel("Complication Type")
fig.tight_layout()
plt.show()

In [None]:
# Stroke type / location indicator columns, grouped by name prefix.
stroke_cols = [
    # SAH*
    'SAH',
    'SAHICA Siphon/Bifurc',
    'SAHMCA',
    'SAHAComA',
    'SAHPComA',
    'SAHBA',
    'SAHVA',
    'SAHOtherIntracranial',
    'SAHIntracranialUnsp',
    'SAHOther',
    'SAHUnsp',
    # ICH*
    'ICH',
    'ICHHemSubcortical',
    'ICHHemCortical ',  # the trailing space is part of this key; do not strip it
    'ICHHemUnsp',
    'ICHBrainstem',
    'ICHCerebellum',
    'ICHIntraventricular',
    'ICHMultLocalized',
    'ICHOther',
    'ICHUnsp',
    # CI*
    'CI',
    'CILAA',
    'CICE',
    'CI-Lacunar',
    'CIESUS',
    'CIUnknown',
    'CIArterialDissection',
    'CIOthers',
    # remaining columns
    'CVT',
    'TIA',
]


In [None]:
# Stroke subtypes: column sum and rate per stroke column, ranked from the
# largest sum to the smallest.
stroke_counts = df[stroke_cols].sum()
stroke_df = stroke_counts.reset_index()
stroke_df.columns = ['Stroke Subtype', 'Patient Count']

# Rank first so the running total below accumulates in the displayed order.
# Accumulating before sorting produced a 'Cumulative %' column that did not
# follow the ranked rows.
stroke_df = stroke_df.sort_values(by='Patient Count', ascending=False).reset_index(drop=True)

stroke_df['% of Patients'] = (stroke_df['Patient Count'] / len(df) * 100).round(2)
# A percentage is already a rate per 100 rows, so this column mirrors the one above.
stroke_df['Per 100 Patients'] = stroke_df['% of Patients']
# Accumulate the raw counts and round once, so rounding error does not build up.
# NOTE(review): stroke_cols holds main types (e.g. 'SAH') next to what look like
# their subtypes (e.g. 'SAHMCA'); if those overlap, the running total counts the
# same patients more than once — confirm before interpreting this column.
stroke_df['Cumulative %'] = (stroke_df['Patient Count'].cumsum() / len(df) * 100).round(2)

stroke_df


In [None]:
# Horizontal bar chart of the stroke column sums, largest first.
# Rebinds `stroke_counts` to the sorted Series.
stroke_counts = df[stroke_cols].sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=stroke_counts.values, y=stroke_counts.index, ax=ax)
ax.set_title("Frequency of Stroke Types and Locations")
ax.set_xlabel("Number of Patients")
ax.set_ylabel("Stroke Subtype")
fig.tight_layout()
plt.show()

In [None]:
# Columns that flag a stroke-related procedure, one per line for easy diffing.
intervention_cols = [
    'HematomaEvacuation',
    'Clipping',
    'Decompressivecraniectomy',
    'Endarterectomy',
    'Shunting',
    'ExternalVentricularDrain',
    'Coiling',
    'Stenting',
    'MechanicalThrombectomy',
    'Embolization',
    'IntraArterialtPA',
    'IntravenoustPA',
]

In [None]:
# Interventions: column sum and rate per procedure, ranked from the largest sum
# to the smallest.
interv_counts = df[intervention_cols].sum()
interv_df = interv_counts.reset_index()
interv_df.columns = ['Procedure', 'Patient Count']

# Rank first so the running total below accumulates in the displayed order.
# Accumulating before sorting produced a 'Cumulative %' column that did not
# follow the ranked rows.
interv_df = interv_df.sort_values(by='Patient Count', ascending=False).reset_index(drop=True)

interv_df['% of Patients'] = (interv_df['Patient Count'] / len(df) * 100).round(2)
# A percentage is already a rate per 100 rows, so this column mirrors the one above.
interv_df['Per 100 Patients'] = interv_df['% of Patients']
# Accumulate the raw counts and round once, so rounding error does not build up.
# NOTE(review): a patient can presumably receive several procedures, in which
# case the running total may exceed 100 — confirm this is the intended reading.
interv_df['Cumulative %'] = (interv_df['Patient Count'].cumsum() / len(df) * 100).round(2)

interv_df


In [None]:
# Horizontal bar chart of the intervention column sums, largest first.
intervention_counts = df[intervention_cols].sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=intervention_counts.values, y=intervention_counts.index, ax=ax)
ax.set_title("Frequency of Stroke-Related Interventions")
ax.set_xlabel("Number of Patients")
ax.set_ylabel("Procedure")
fig.tight_layout()
plt.show()

In [None]:
# Keep only the rows whose tpaDosage is present and strictly positive.
tpa_df = df.loc[df['tpaDosage'].notna() & df['tpaDosage'].gt(0)]

In [None]:
# tPA dosage: descriptive statistics, rounded to two decimals, with display labels.

# describe() key -> label shown in the summary table (order defines row order).
tpa_labels = {
    'count': 'Count of Patients',
    'mean': 'Mean (mg)',
    'std': 'Std Dev (mg)',
    'min': 'Min (mg)',
    '25%': '25th Percentile',
    '50%': 'Median (50%)',
    '75%': '75th Percentile',
    'max': 'Max (mg)',
}

tpa_summary = tpa_df['tpaDosage'].describe().round(2)
tpa_metrics = tpa_summary[list(tpa_labels)].rename(index=tpa_labels)

tpa_metrics.to_frame(name='tPA Dosage Summary')

In [None]:
# Histogram of the filtered tPA dosages in 20 bins, without a density curve.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(data=tpa_df, x='tpaDosage', bins=20, kde=False, ax=ax)
ax.set_title("Histogram of tPA Dosage")
ax.set_xlabel("tPA Dosage (mg)")
ax.set_ylabel("Number of Patients")
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the filtered tPA dosages.
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=tpa_df, y='tpaDosage', ax=ax)
ax.set_title("Box Plot of tPA Dosage")
ax.set_ylabel("tPA Dosage (mg)")
fig.tight_layout()
plt.show()

In [None]:
# Heatmap — stroke types × follow-up complications.
# Each column is one observed combination of the CI / SAH / ICH flags; each cell
# is the sum of one follow-up complication column within that group.

stroke_main_types = ['CI', 'SAH', 'ICH']

comp_matrix = df.groupby(stroke_main_types)[followup_cols].sum()

# Plot a relabelled copy so `comp_matrix` keeps its original (CI, SAH, ICH) index.
# Without this the axis shows the raw flag tuples (e.g. "0-0-1") under the
# "Stroke Type" label. Set flags are joined with "+"; "Other" means none is set.
comp_by_type = comp_matrix.copy()
comp_by_type.index = [
    "+".join(name for name, flag in zip(stroke_main_types, key) if flag) or "Other"
    for key in comp_matrix.index
]
# Merge groups that end up with the same label, keeping first-seen order.
comp_by_type = comp_by_type.groupby(level=0, sort=False).sum()

plt.figure(figsize=(10, 6))
# ".0f" prints whole numbers for integer and float sums alike; "d" raises on
# floats, which the sums are whenever an indicator column is stored as float.
sns.heatmap(comp_by_type.T, annot=True, fmt=".0f", cmap="Blues")
plt.title("Follow-Up Complications by Stroke Type")
plt.xlabel("Stroke Type")
plt.ylabel("Complication")
plt.tight_layout()
plt.show()


In [None]:
# Heatmap — stroke types × interventions.
# Each column is one observed combination of the CI / SAH / ICH flags; each cell
# is the sum of one intervention column within that group.

intervention_matrix = df.groupby(stroke_main_types)[intervention_cols].sum()

# Plot a relabelled copy so `intervention_matrix` keeps its original index.
# Without this the axis shows the raw flag tuples (e.g. "0-0-1") under the
# "Stroke Type" label. Set flags are joined with "+"; "Other" means none is set.
intervention_by_type = intervention_matrix.copy()
intervention_by_type.index = [
    "+".join(name for name, flag in zip(stroke_main_types, key) if flag) or "Other"
    for key in intervention_matrix.index
]
# Merge groups that end up with the same label, keeping first-seen order.
intervention_by_type = intervention_by_type.groupby(level=0, sort=False).sum()

plt.figure(figsize=(10, 6))
# ".0f" prints whole numbers for integer and float sums alike; "d" raises on
# floats, which the sums are whenever an indicator column is stored as float.
sns.heatmap(intervention_by_type.T, annot=True, fmt=".0f", cmap="Purples")
plt.title("Intervention Usage by Stroke Type")
plt.xlabel("Stroke Type")
plt.ylabel("Procedure")
plt.tight_layout()
plt.show()


In [None]:
# Stacked bar — patients with vs. without any follow-up complication, per stroke type.

# Flag rows whose follow-up complication columns sum to more than zero.
# This adds a 'HasAnyComplication' column to df itself.
df['HasAnyComplication'] = df[followup_cols].sum(axis=1) > 0

# Row counts for every observed (CI, SAH, ICH, HasAnyComplication) combination,
# with the complication flag spread across the columns.
comp_group = df.groupby(['CI', 'SAH', 'ICH', 'HasAnyComplication']).size().unstack(fill_value=0)

# Collapse each (CI, SAH, ICH) tuple to a readable label. Set flags are joined
# with "+" so a group carrying several types reads "CI+SAH" rather than the
# run-together "CISAH"; groups with no flag set are labelled "Other".
comp_group.index = comp_group.index.map(
    lambda flags: "+".join(
        name for name, flag in zip(('CI', 'SAH', 'ICH'), flags) if flag
    ) or "Other"
)

# Merge rows that ended up with the same label.
comp_group = comp_group.groupby(comp_group.index).sum()

comp_group.plot(kind='bar', stacked=True, figsize=(8, 5), colormap='Set2')
plt.title("Patients With vs Without Complications by Stroke Type")
plt.ylabel("Number of Patients")
plt.xlabel("Stroke Type")
plt.legend(title="Any Follow-Up Complication")
plt.tight_layout()
plt.show()

In [None]:
# Column groups used by the analysis.
# NOTE(review): these three lists repeat definitions made in earlier cells with
# the same contents; presumably kept so the cells that follow can run on their
# own — confirm, or drop the duplicates.

# Follow-up complications
followup_cols = [
    'FollowUpReStroke',
    'FollowUpSeizure',
    'FollowUpAspirationPneumonia',
    'FollowUpUTI',
    'FollowUpSepsis',
    'FollowUpBedsoresPressureulcers',
    'FollowUpDVT',
    'FollowUpPTE',
    'FollowUpMI',
    'FollowUpGIB',
    'FollowUpDeath',
]

# Interventions
intervention_cols = [
    'HematomaEvacuation',
    'Clipping',
    'Decompressivecraniectomy',
    'Endarterectomy',
    'Shunting',
    'ExternalVentricularDrain',
    'Coiling',
    'Stenting',
    'MechanicalThrombectomy',
    'Embolization',
    'IntraArterialtPA',
    'IntravenoustPA',
]

# Stroke type (main categories)
stroke_main_types = [
    'CI',
    'SAH',
    'ICH',
]
