In [1]:
import pandas as pd
import numpy as np

In [2]:
# == Data Loading ==
# The stroke dataset is read from a path relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')

# == Data Exploration ==
# Show which columns the raw file provides.
print(df.columns)

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


In [3]:
# == Data Preprocessing ==

# Missing bmi values are imputed with the column median.
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)

# 1544 patients have smoking_status as Unknown.
# Every patient younger than 10 is assigned 'never smoked'.
is_under_ten = df['age'].lt(10)
df.loc[is_under_ten, 'smoking_status'] = 'never smoked'

# The 0/1 indicator columns become booleans.
for flag_column in ('hypertension', 'heart_disease', 'stroke'):
    df[flag_column] = df[flag_column].astype(bool)

df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,False,True,Yes,Private,Urban,228.69,36.6,formerly smoked,True
1,51676,Female,61.0,False,False,Yes,Self-employed,Rural,202.21,28.1,never smoked,True
2,31112,Male,80.0,False,True,Yes,Private,Rural,105.92,32.5,never smoked,True
3,60182,Female,49.0,False,False,Yes,Private,Urban,171.23,34.4,smokes,True
4,1665,Female,79.0,True,False,Yes,Self-employed,Rural,174.12,24.0,never smoked,True
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,True,False,Yes,Private,Urban,83.75,28.1,never smoked,False
5106,44873,Female,81.0,False,False,Yes,Self-employed,Urban,125.20,40.0,never smoked,False
5107,19723,Female,35.0,False,False,Yes,Self-employed,Rural,82.99,30.6,never smoked,False
5108,37544,Male,51.0,False,False,Yes,Private,Rural,166.29,25.6,formerly smoked,False


In [4]:
# == Preprocess for Apriori ==
# Build the one-hot transaction table that the Apriori cell consumes.
# Work on an independent copy: a plain `df1 = df` only aliases the frame, so
# the *_bin columns created below would also be added to `df` itself.
df1 = df.copy()

# Binning: turn the continuous measurements into labelled ranges.
# pd.cut uses right-closed intervals by default; a value outside the outermost
# edges becomes NaN and therefore gets no dummy column set.
df1['age_bin'] = pd.cut(
    df1['age'],
    bins=[0, 18, 40, 60, 100],
    labels=['child', 'young', 'middle_aged', 'senior'],
)
df1['avg_glucose_level_bin'] = pd.cut(
    df1['avg_glucose_level'],
    bins=[50, 90, 130, 200, 300],
    labels=['low', 'normal', 'high', 'very_high'],
)
df1['bmi_bin'] = pd.cut(
    df1['bmi'],
    bins=[0, 18.5, 25, 30, 100],
    labels=['underweight', 'normal', 'overweight', 'obese'],
)

# The raw numeric columns are replaced by their binned versions.
df1 = df1.drop(columns=['age', 'avg_glucose_level', 'bmi'])

# One-hot encode the categorical/binned columns. dtype=bool keeps every
# indicator boolean regardless of the installed pandas version.
df1 = pd.get_dummies(df1, dtype=bool)

# `id` is a row identifier, not an item, so it is dropped from the table.
df1 = df1.drop(columns='id')
df1.columns

Index(['hypertension', 'heart_disease', 'stroke', 'gender_Female',
       'gender_Male', 'gender_Other', 'ever_married_No', 'ever_married_Yes',
       'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes', 'age_bin_child', 'age_bin_young',
       'age_bin_middle_aged', 'age_bin_senior', 'avg_glucose_level_bin_low',
       'avg_glucose_level_bin_normal', 'avg_glucose_level_bin_high',
       'avg_glucose_level_bin_very_high', 'bmi_bin_underweight',
       'bmi_bin_normal', 'bmi_bin_overweight', 'bmi_bin_obese'],
      dtype='object')

In [8]:
# == Apriori Algorithm ==
from mlxtend.frequent_patterns import apriori, association_rules

# Mine itemsets with a minimum support of 0.01, reported by column name.
frequent_itemsets = apriori(df1, min_support=0.01, use_colnames=True)

# An itemset is maximal when no other frequent itemset is a proper superset of it.
all_itemsets = list(frequent_itemsets['itemsets'])
maximal_itemsets = [
    candidate
    for candidate in all_itemsets
    if not any(candidate < superset for superset in all_itemsets)
]

is_maximal = frequent_itemsets['itemsets'].isin(maximal_itemsets)
maximal_frequent_itemsets = frequent_itemsets.loc[is_maximal]

# Rules are generated from all frequent itemsets, keeping those with lift >= 1.
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)

# Keep the rules whose only consequent is 'stroke', highest confidence first.
stroke_consequent = frozenset({'stroke'})
predicts_stroke = rules['consequents'] == stroke_consequent
rules_consequents_stroke = rules.loc[predicts_stroke].sort_values(by='confidence', ascending=False)

# Display only the rules with confidence above 0.17.
rules_consequents_stroke.loc[rules_consequents_stroke['confidence'].gt(0.17)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
7682,"(Residence_type_Urban, bmi_bin_overweight, age...",(stroke),0.053229,0.048728,0.010176,0.191176,3.92334,1.0,0.007582,1.176118,0.787007,0.110874,0.149745,0.200006
386,"(hypertension, age_bin_senior)",(stroke),0.055186,0.048728,0.010176,0.184397,3.784215,1.0,0.007487,1.166342,0.778719,0.108559,0.142619,0.196616
