In [1]:
import pandas as pd
import numpy as np
import warnings

# Suppress every DeprecationWarning by installing a global 'ignore' filter.
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
# Lowest confidence a rule with consequent {stroke} may have and still be kept
# when stroke_rules is filtered.
MIN_CONFIDENCE = 0.14

In [3]:
# Load the stroke dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

# == Data Exploration ==

# List the column names of the loaded frame.
print(df.columns)

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


In [4]:
# == Data Preprocessing ==

# Missing BMI values are imputed with the median of the column.
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)

# 1544 patients have smoking_status 'Unknown'.
# Every patient younger than 10 is recorded as 'never smoked'.
under_ten = df['age'] < 10
df['smoking_status'] = df['smoking_status'].mask(under_ten, 'never smoked')

# Cast the indicator columns (and the stroke target) to bool.
for flag_col in ('hypertension', 'heart_disease', 'stroke'):
    df[flag_col] = df[flag_col].astype(bool)
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,False,True,Yes,Private,Urban,228.69,36.6,formerly smoked,True
1,51676,Female,61.0,False,False,Yes,Self-employed,Rural,202.21,28.1,never smoked,True
2,31112,Male,80.0,False,True,Yes,Private,Rural,105.92,32.5,never smoked,True
3,60182,Female,49.0,False,False,Yes,Private,Urban,171.23,34.4,smokes,True
4,1665,Female,79.0,True,False,Yes,Self-employed,Rural,174.12,24.0,never smoked,True
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,True,False,Yes,Private,Urban,83.75,28.1,never smoked,False
5106,44873,Female,81.0,False,False,Yes,Self-employed,Urban,125.20,40.0,never smoked,False
5107,19723,Female,35.0,False,False,Yes,Self-employed,Rural,82.99,30.6,never smoked,False
5108,37544,Male,51.0,False,False,Yes,Private,Rural,166.29,25.6,formerly smoked,False


In [5]:
# == Preprocess for Apriori ==
# Work on a copy so the helper *_bin columns created below are not added to
# `df` itself (plain assignment would only alias the same frame).
df1 = df.copy()

# Binning: replace each continuous measurement by a labelled range.
# pd.cut builds right-closed intervals by default, e.g. (0, 18], (18, 40], ...;
# a value outside the outermost edges becomes NaN and gets no dummy column set.
df1['age_bin'] = pd.cut(
    df1['age'],
    bins=[0, 18, 40, 60, 100],
    labels=['child', 'young', 'middle_aged', 'senior'],
)
df1['avg_glucose_level_bin'] = pd.cut(
    df1['avg_glucose_level'],
    bins=[50, 90, 130, 200, 300],
    labels=['low', 'normal', 'high', 'very_high'],
)
df1['bmi_bin'] = pd.cut(
    df1['bmi'],
    bins=[0, 18.5, 25, 30, 100],
    labels=['underweight', 'normal', 'overweight', 'obese'],
)

# Drop the raw continuous columns (now represented by their bins) together with
# the record id, then one-hot encode the remaining categorical columns.
df1 = df1.drop(columns=['id', 'age', 'avg_glucose_level', 'bmi'])
df1 = pd.get_dummies(df1)
df1.columns

Index(['hypertension', 'heart_disease', 'stroke', 'gender_Female',
       'gender_Male', 'gender_Other', 'ever_married_No', 'ever_married_Yes',
       'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes', 'age_bin_child', 'age_bin_young',
       'age_bin_middle_aged', 'age_bin_senior', 'avg_glucose_level_bin_low',
       'avg_glucose_level_bin_normal', 'avg_glucose_level_bin_high',
       'avg_glucose_level_bin_very_high', 'bmi_bin_underweight',
       'bmi_bin_normal', 'bmi_bin_overweight', 'bmi_bin_obese'],
      dtype='object')

In [6]:
# == Apriori Algorithm ==
from mlxtend.frequent_patterns import apriori, association_rules

# Frequent itemsets: column combinations with support of at least 0.01.
freq_itemsets = apriori(df1, min_support=0.01, use_colnames=True)

# Maximal itemsets: frequent itemsets that have no frequent proper superset.
# Every subset of a frequent itemset is itself frequent, so an itemset is
# non-maximal exactly when it equals some frequent itemset with one item
# removed. Collecting those "one item removed" sets needs a single pass over
# the itemsets instead of comparing every pair of them.
all_itemsets = list(freq_itemsets['itemsets'])
covered_itemsets = {
    itemset - {item}
    for itemset in all_itemsets
    for item in itemset
}
maximal_itemsets = [
    itemset for itemset in all_itemsets if itemset not in covered_itemsets
]

maximal_freq_itemsets = freq_itemsets[freq_itemsets['itemsets'].isin(maximal_itemsets)]

# Rules are mined from ALL frequent itemsets (not only the maximal ones),
# keeping those whose lift is at least 1.0.
rules = association_rules(freq_itemsets, metric='lift', min_threshold=1.0)

# Rules whose consequent is exactly {stroke}, highest confidence first,
# restricted to those reaching MIN_CONFIDENCE.
stroke_rules = rules[rules['consequents'] == frozenset({'stroke'})].sort_values(by='confidence', ascending=False)
stroke_rules = stroke_rules[stroke_rules['confidence'] >= MIN_CONFIDENCE]

In [7]:
# Map each antecedent itemset to the confidence of its {stroke} rule; the dict
# keeps the row order of stroke_rules (highest confidence first).
factors = dict(zip(stroke_rules['antecedents'], stroke_rules['confidence']))

# NOTE: the rows below are association rules. Their confidence measures how
# often stroke co-occurs with the listed factors in this dataset; it is not by
# itself evidence of causation.
print('Causal Factors:')
print(' ' + '-' * 82)
print(f'| {"Confidence":^10} | {"Factors":^67} |')
print(f'| {"-"*10} | {"-"*67} |')
for antecedent, confidence in factors.items():
    # A frozenset has no defined iteration order, so the item names are sorted
    # to print the same text on every run.
    print(f'| {confidence:<10.3f} | {", ".join(sorted(antecedent)):<67} |')
print(' ' + '-' * 82)

Causal Factors:
 ----------------------------------------------------------------------------------
| Confidence |                               Factors                               |
| ---------- | ------------------------------------------------------------------- |
| 0.191      | Residence_type_Urban, age_bin_senior, bmi_bin_overweight            |
| 0.184      | age_bin_senior, hypertension                                        |
| 0.157      | age_bin_senior, bmi_bin_overweight                                  |
| 0.157      | age_bin_senior, Residence_type_Urban, work_type_Private             |
| 0.149      | age_bin_senior, ever_married_Yes, bmi_bin_overweight                |
| 0.147      | age_bin_senior, work_type_Private                                   |
| 0.146      | age_bin_senior, smoking_status_formerly smoked, ever_married_Yes    |
| 0.145      | gender_Male, ever_married_Yes, age_bin_senior                       |
| 0.144      | gender_Male, age_bin_senior        

In [8]:
# Display all mined rules (not only the stroke ones), highest confidence first.
rules.sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
16099,"(gender_Male, Residence_type_Urban, work_type_...",(age_bin_child),0.036791,0.179256,0.036791,1.000000,5.578603,1.0,0.030196,inf,0.852093,0.205240,1.000000,0.602620
90324,"(gender_Male, Residence_type_Rural, avg_glucos...",(age_bin_child),0.011155,0.179256,0.011155,1.000000,5.578603,1.0,0.009155,inf,0.830002,0.062227,1.000000,0.531114
89517,"(avg_glucose_level_bin_low, smoking_status_nev...","(age_bin_child, ever_married_No)",0.013307,0.179061,0.013307,1.000000,5.584699,1.0,0.010924,inf,0.832011,0.074317,1.000000,0.537158
89510,"(avg_glucose_level_bin_low, smoking_status_nev...",(ever_married_No),0.013307,0.343836,0.013307,1.000000,2.908367,1.0,0.008732,inf,0.665014,0.038702,1.000000,0.519351
89512,"(ever_married_No, avg_glucose_level_bin_low, s...",(age_bin_child),0.013307,0.179256,0.013307,1.000000,5.578603,1.0,0.010922,inf,0.831813,0.074236,1.000000,0.537118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75502,(ever_married_Yes),"(age_bin_senior, Residence_type_Rural, avg_glu...",0.656164,0.011350,0.010176,0.015508,1.366352,1.0,0.002728,1.004224,0.779804,0.015481,0.004206,0.456030
29593,(ever_married_Yes),"(gender_Male, hypertension, bmi_bin_obese, age...",0.656164,0.010763,0.010176,0.015508,1.440881,1.0,0.003114,1.004820,0.889902,0.015495,0.004797,0.480482
65236,(ever_married_Yes),"(avg_glucose_level_bin_low, smoking_status_Unk...",0.656164,0.010568,0.010176,0.015508,1.467564,1.0,0.003242,1.005019,0.926601,0.015499,0.004994,0.489236
5537,(ever_married_Yes),"(hypertension, avg_glucose_level_bin_very_high...",0.656164,0.010568,0.010176,0.015508,1.467564,1.0,0.003242,1.005019,0.926601,0.015499,0.004994,0.489236
