In [2]:
import pandas as pd
import numpy as np

In [3]:
# == Data Exploration ==
# Read the stroke dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')

# List the column labels of the loaded frame.
print(df.columns)
# Optional check of one column's distinct values:
# set(df['avg_glucose_level'])

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


In [5]:
# == Data Preprocessing ==
# All cleaning steps are expressed as one chain that yields the cleaned frame:
#   * bmi            -> missing values replaced by the column median
#   * smoking_status -> forced to 'never smoked' for every patient younger
#                       than 10 (1544 patients have smoking_status 'Unknown')
#   * hypertension / heart_disease / stroke -> cast to bool
df = (
    df
    .assign(
        bmi=lambda frame: frame['bmi'].fillna(frame['bmi'].median()),
        smoking_status=lambda frame: frame['smoking_status'].mask(
            frame['age'] < 10, 'never smoked'
        ),
    )
    .astype({'hypertension': bool, 'heart_disease': bool, 'stroke': bool})
)
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,False,True,Yes,Private,Urban,228.69,36.6,formerly smoked,True
1,51676,Female,61.0,False,False,Yes,Self-employed,Rural,202.21,28.1,never smoked,True
2,31112,Male,80.0,False,True,Yes,Private,Rural,105.92,32.5,never smoked,True
3,60182,Female,49.0,False,False,Yes,Private,Urban,171.23,34.4,smokes,True
4,1665,Female,79.0,True,False,Yes,Self-employed,Rural,174.12,24.0,never smoked,True
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,True,False,Yes,Private,Urban,83.75,28.1,never smoked,False
5106,44873,Female,81.0,False,False,Yes,Self-employed,Urban,125.20,40.0,never smoked,False
5107,19723,Female,35.0,False,False,Yes,Self-employed,Rural,82.99,30.6,never smoked,False
5108,37544,Male,51.0,False,False,Yes,Private,Rural,166.29,25.6,formerly smoked,False


In [6]:
# == Preprocess for Apriori ==
# Work on an independent copy. A plain `df1 = df` would only create a second
# name for the same frame, so the *_bin columns added below would also be
# written into `df`.
df1 = df.copy()

# Binning: discretise the continuous features into labelled intervals.
# pd.cut uses right-closed intervals by default, e.g. (0, 18], (18, 40], ...
# A value outside the outermost edges becomes NaN and consequently gets no
# indicator column set by get_dummies below.
df1['age_bin'] = pd.cut(
    df1['age'],
    bins=[0, 18, 40, 60, 100],
    labels=['child', 'young', 'middle_aged', 'senior'],
)
df1['avg_glucose_level_bin'] = pd.cut(
    df1['avg_glucose_level'],
    bins=[50, 90, 130, 200, 300],
    labels=['low', 'normal', 'high', 'very_high'],
)
df1['bmi_bin'] = pd.cut(
    df1['bmi'],
    bins=[0, 18.5, 25, 30, 100],
    labels=['underweight', 'normal', 'overweight', 'obese'],
)

# The raw numeric columns are replaced by their binned versions.
df1 = df1.drop(columns=['age', 'avg_glucose_level', 'bmi'])

# One-hot encode every categorical/object column; the boolean columns
# (hypertension, heart_disease, stroke) pass through unchanged.
df1 = pd.get_dummies(df1)

# `id` is removed so it is not handed to the itemset mining step.
df1 = df1.drop(columns='id')
df1.columns

Index(['hypertension', 'heart_disease', 'stroke', 'gender_Female',
       'gender_Male', 'gender_Other', 'ever_married_No', 'ever_married_Yes',
       'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes', 'age_bin_child', 'age_bin_young',
       'age_bin_middle_aged', 'age_bin_senior', 'avg_glucose_level_bin_low',
       'avg_glucose_level_bin_normal', 'avg_glucose_level_bin_high',
       'avg_glucose_level_bin_very_high', 'bmi_bin_underweight',
       'bmi_bin_normal', 'bmi_bin_overweight', 'bmi_bin_obese'],
      dtype='object')

In [7]:
# == Apriori Algorithm ==
from mlxtend.frequent_patterns import apriori, association_rules

# Mine all itemsets whose support is at least 1% of the rows.
frequent_itemsets = apriori(df1, min_support=0.01, use_colnames=True)

# An itemset is maximal when no *other* frequent itemset contains it.
# For frozensets `a < b` is the proper-subset test, i.e. exactly
# "a != b and a.issubset(b)".
all_itemsets = list(frequent_itemsets['itemsets'])
maximal_itemsets = [
    itemset
    for itemset in all_itemsets
    if not any(itemset < other for other in all_itemsets)
]

maximal_frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].isin(maximal_itemsets)]

# Derive association rules from all frequent itemsets, keeping lift >= 1.0.
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)

# Only the last expression of a cell is rendered automatically; a bare
# `maximal_frequent_itemsets` in the middle of the cell is silently discarded,
# so it is shown explicitly (`display` is provided by the IPython kernel).
display(maximal_frequent_itemsets)

# Rules that predict exactly {stroke}, strongest confidence first.
# Other views: rules[rules['antecedents'] == frozenset({'stroke'})], or `rules`.
rules[rules['consequents'] == frozenset({'stroke'})].sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
7684,"(Residence_type_Urban, bmi_bin_overweight, age...",(stroke),0.053229,0.048728,0.010176,0.191176,3.923340,1.0,0.007582,1.176118,0.787007,0.110874,0.149745,0.200006
388,"(age_bin_senior, hypertension)",(stroke),0.055186,0.048728,0.010176,0.184397,3.784215,1.0,0.007487,1.166342,0.778719,0.108559,0.142619,0.196616
1188,"(bmi_bin_overweight, age_bin_senior)",(stroke),0.105675,0.048728,0.016634,0.157407,3.230329,1.0,0.011485,1.128982,0.772017,0.120739,0.114246,0.249386
7671,"(Residence_type_Urban, work_type_Private, age_...",(stroke),0.067515,0.048728,0.010568,0.156522,3.212153,1.0,0.007278,1.127797,0.738545,0.100000,0.113315,0.186695
7648,"(ever_married_Yes, bmi_bin_overweight, age_bin...",(stroke),0.099804,0.048728,0.014873,0.149020,3.058194,1.0,0.010010,1.117854,0.747626,0.111274,0.105429,0.227120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,(work_type_Private),(stroke),0.572407,0.048728,0.029159,0.050940,1.045399,1.0,0.001266,1.002331,0.101562,0.049256,0.002326,0.324667
73,(bmi_bin_obese),(stroke),0.370450,0.048728,0.018787,0.050713,1.040740,1.0,0.000735,1.002091,0.062179,0.046921,0.002087,0.218128
1016,"(Residence_type_Urban, gender_Female)",(stroke),0.299217,0.048728,0.015068,0.050360,1.033486,1.0,0.000488,1.001718,0.046236,0.045267,0.001715,0.179798
1106,"(ever_married_Yes, avg_glucose_level_bin_normal)",(stroke),0.218591,0.048728,0.010763,0.049239,1.010488,1.0,0.000112,1.000538,0.013282,0.041953,0.000537,0.135061
