In [97]:
import pandas as pd
import numpy as np

In [98]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

# == Data Exploration ==

print(df.columns)
# set(df['avg_glucose_level'])

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


In [102]:
# == Data Preprocessing ==

# Fill bmi with median
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# 1544 patients have smoking_status as Unknown
# For patients with age < 10 we set smoking_status to 'never smoked'
df.loc[df['age'] < 10, 'smoking_status'] = 'never smoked'
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.862035,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.699562,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.8,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,32.8,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [106]:
# == Preprocess for Apriori ==
df1 = df

# Binning
df1['age_bin'] = pd.cut(df1['age'], bins = [0, 18, 40, 60, 100], labels = ['child', 'young', 'middle_aged', 'senior'])
df1['avg_glucose_level_bin'] = pd.cut(df1['avg_glucose_level'], bins = [50, 90, 130,200,300], labels = ['low', 'normal', 'high', 'very_high'])
df1['bmi_bin'] = pd.cut(df1['bmi'], bins = [0, 18.5, 25, 30, 100], labels = ['underweight', 'normal', 'overweight', 'obese'])

df1 = df1.drop(columns=['age', 'avg_glucose_level', 'bmi'])

df1 = pd.get_dummies(df1)
df1 = df1.drop(columns='id')
df1.columns

Index(['hypertension', 'heart_disease', 'stroke', 'gender_Female',
       'gender_Male', 'gender_Other', 'ever_married_No', 'ever_married_Yes',
       'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes', 'age_bin_child', 'age_bin_young',
       'age_bin_middle_aged', 'age_bin_senior', 'avg_glucose_level_bin_low',
       'avg_glucose_level_bin_normal', 'avg_glucose_level_bin_high',
       'avg_glucose_level_bin_very_high', 'bmi_bin_underweight',
       'bmi_bin_normal', 'bmi_bin_overweight', 'bmi_bin_obese'],
      dtype='object')

In [181]:
# == Apriori Algorithm ==
from mlxtend.frequent_patterns import apriori, association_rules

frequent_itemsets = apriori(df1, min_support=0.01, use_colnames=True)

maximal_itemsets = []
for itemset in frequent_itemsets['itemsets']:
    flag = True
    for other in frequent_itemsets['itemsets']:
        if itemset == other:
            continue
        if itemset.issubset(other):
            flag = False
            break
    if flag:
        maximal_itemsets.append(itemset)

maximal_frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].isin(maximal_itemsets)]
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)
maximal_frequent_itemsets
rules[rules['consequents'] == frozenset({'stroke'})].sort_values(by='confidence', ascending=False)
# rules[rules['antecedents'] == frozenset({'stroke'})]
# rules



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
7683,"(age_bin_senior, bmi_bin_overweight, Residence...",(stroke),0.053229,0.048728,0.010176,0.191176,3.923340,1.0,0.007582,1.176118,0.787007,0.110874,0.149745,0.200006
386,"(age_bin_senior, hypertension)",(stroke),0.055186,0.048728,0.010176,0.184397,3.784215,1.0,0.007487,1.166342,0.778719,0.108559,0.142619,0.196616
1186,"(age_bin_senior, bmi_bin_overweight)",(stroke),0.105675,0.048728,0.016634,0.157407,3.230329,1.0,0.011485,1.128982,0.772017,0.120739,0.114246,0.249386
7671,"(work_type_Private, Residence_type_Urban, age_...",(stroke),0.067515,0.048728,0.010568,0.156522,3.212153,1.0,0.007278,1.127797,0.738545,0.100000,0.113315,0.186695
7646,"(age_bin_senior, bmi_bin_overweight, ever_marr...",(stroke),0.099804,0.048728,0.014873,0.149020,3.058194,1.0,0.010010,1.117854,0.747626,0.111274,0.105429,0.227120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,(work_type_Private),(stroke),0.572407,0.048728,0.029159,0.050940,1.045399,1.0,0.001266,1.002331,0.101562,0.049256,0.002326,0.324667
72,(bmi_bin_obese),(stroke),0.370450,0.048728,0.018787,0.050713,1.040740,1.0,0.000735,1.002091,0.062179,0.046921,0.002087,0.218128
1017,"(gender_Female, Residence_type_Urban)",(stroke),0.299217,0.048728,0.015068,0.050360,1.033486,1.0,0.000488,1.001718,0.046236,0.045267,0.001715,0.179798
1106,"(avg_glucose_level_bin_normal, ever_married_Yes)",(stroke),0.218591,0.048728,0.010763,0.049239,1.010488,1.0,0.000112,1.000538,0.013282,0.041953,0.000537,0.135061


In [173]:
# == Apriori Algorithm ==
df2 = df1[df1['stroke'] == 1] # dataframe only for patients which had stroke
df3 = df1[df1['stroke'] == 0] # dataframe only for patients which didn't had stroke

from mlxtend.frequent_patterns import apriori, association_rules

frequent_itemsets = apriori(df2, min_support=0.3, use_colnames=True)

maximal_itemsets = []
for itemset in frequent_itemsets['itemsets']:
    flag = True
    for other in frequent_itemsets['itemsets']:
        if itemset == other:
            continue
        if itemset.issubset(other):
            flag = False
            break
    if flag:
        maximal_itemsets.append(itemset)

maximal_frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].isin(maximal_itemsets)]
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)
maximal_frequent_itemsets
# rules[rules['consequents'] == frozenset({'stroke'})]
rules[rules['antecedents'] == frozenset({'stroke'})]

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1,(stroke),(gender_Female),1.0,0.566265,0.566265,0.566265,1.0,1.0,0.0,1.0,0.0,0.566265,0.0,0.783133
3,(stroke),(gender_Male),1.0,0.433735,0.433735,0.433735,1.0,1.0,0.0,1.0,0.0,0.433735,0.0,0.716867
5,(stroke),(ever_married_Yes),1.0,0.883534,0.883534,0.883534,1.0,1.0,0.0,1.0,0.0,0.883534,0.0,0.941767
7,(stroke),(work_type_Private),1.0,0.598394,0.598394,0.598394,1.0,1.0,0.0,1.0,0.0,0.598394,0.0,0.799197
9,(stroke),(Residence_type_Rural),1.0,0.457831,0.457831,0.457831,1.0,1.0,0.0,1.0,0.0,0.457831,0.0,0.728916
10,(stroke),(Residence_type_Urban),1.0,0.542169,0.542169,0.542169,1.0,1.0,0.0,1.0,0.0,0.542169,0.0,0.771084
12,(stroke),(smoking_status_never smoked),1.0,0.365462,0.365462,0.365462,1.0,1.0,0.0,1.0,0.0,0.365462,0.0,0.682731
15,(stroke),(age_bin_senior),1.0,0.710843,0.710843,0.710843,1.0,1.0,0.0,1.0,0.0,0.710843,0.0,0.855422
16,(stroke),(avg_glucose_level_bin_low),1.0,0.35743,0.35743,0.35743,1.0,1.0,0.0,1.0,0.0,0.35743,0.0,0.678715
19,(stroke),(bmi_bin_overweight),1.0,0.461847,0.461847,0.461847,1.0,1.0,0.0,1.0,0.0,0.461847,0.0,0.730924
