In [40]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

import plotly.express as px
import plotly.graph_objects as go

In [41]:
df = pd.read_excel("../data/raw/heart 20 features.xlsx", index_col=0)
print(df.shape)
df.head()

(50059, 19)


Unnamed: 0_level_0,DisRheumaticHeartAge,DisHeartDiseaseDx,DisHeartDiseaseAge,DisStrokeDx,DisStrokeAge,DisHypertensionDx,DisHypertensionAge,DisDiabetesDx,DisDiabetesAge,DisCOPDDx,DisCOPDAge,DisRenalFailureDx,DisRenalFailureAge,DisJaundiceDx,DisJaundiceAge,DisLiverDx,DisLiverAge,DisTBDx,DisTBAge
DisRheumaticHeartDx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2,98.0,2,98,2,98,2,98,2,98,2,98,2,98,2,98,2,98,2,98
2,98.0,1,58,2,98,1,59,2,98,2,98,2,98,2,98,2,98,2,98
2,98.0,1,41,2,98,2,98,2,98,2,98,2,98,2,98,2,98,2,98
2,98.0,2,98,2,98,2,98,2,98,2,98,2,98,2,98,2,98,2,98
2,98.0,1,72,2,98,1,72,2,98,2,98,2,98,2,98,2,98,2,98


In [42]:
df.columns 

Index(['DisRheumaticHeartAge', 'DisHeartDiseaseDx', 'DisHeartDiseaseAge',
       'DisStrokeDx', 'DisStrokeAge', 'DisHypertensionDx',
       'DisHypertensionAge', 'DisDiabetesDx', 'DisDiabetesAge', 'DisCOPDDx',
       'DisCOPDAge', 'DisRenalFailureDx', 'DisRenalFailureAge',
       'DisJaundiceDx', 'DisJaundiceAge', 'DisLiverDx', 'DisLiverAge',
       'DisTBDx', 'DisTBAge'],
      dtype='object')

In [43]:
# print count of unique values in each column
for col in df.columns:
    print(col, df[col].nunique())

DisRheumaticHeartAge 37
DisHeartDiseaseDx 2
DisHeartDiseaseAge 65
DisStrokeDx 2
DisStrokeAge 48
DisHypertensionDx 2
DisHypertensionAge 69
DisDiabetesDx 2
DisDiabetesAge 60
DisCOPDDx 2
DisCOPDAge 77
DisRenalFailureDx 2
DisRenalFailureAge 37
DisJaundiceDx 2
DisJaundiceAge 72
DisLiverDx 2
DisLiverAge 23
DisTBDx 3
DisTBAge 75


In [44]:
# Preprocess the data by converting any continuous variables into categorical variables
cols_to_convert = ['DisRheumaticHeartAge', 'DisHeartDiseaseDx', 'DisHeartDiseaseAge',
       'DisStrokeDx', 'DisStrokeAge', 'DisHypertensionDx',
       'DisHypertensionAge', 'DisDiabetesDx', 'DisDiabetesAge', 'DisCOPDDx',
       'DisCOPDAge', 'DisRenalFailureDx', 'DisRenalFailureAge',
       'DisJaundiceDx', 'DisJaundiceAge', 'DisLiverDx', 'DisLiverAge',
       'DisTBDx', 'DisTBAge']
for col in cols_to_convert:
    # print min and max values for each column
    print(col, df[col].min(), df[col].max())

DisRheumaticHeartAge 2.0 98.0
DisHeartDiseaseDx 1 2
DisHeartDiseaseAge 2 99
DisStrokeDx 1 2
DisStrokeAge 15 98
DisHypertensionDx 1 2
DisHypertensionAge 2 98
DisDiabetesDx 1 2
DisDiabetesAge 3 98
DisCOPDDx 1 2
DisCOPDAge 1 99
DisRenalFailureDx 1 2
DisRenalFailureAge 10 98
DisJaundiceDx 1 2
DisJaundiceAge 1 98
DisLiverDx 1 2
DisLiverAge 10 98
DisTBDx 0 2
DisTBAge 0 171


In [45]:
print(df['DisRheumaticHeartAge'].unique())
df['DisRheumaticHeartAge'] = pd.cut(df['DisRheumaticHeartAge'], bins=[0, 30, 40, 50, 60, 70, 80, 90], labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'])
print(df['DisRheumaticHeartAge'].unique())

[98. 30. 20. 25. 12. 34. 31. 42. 22. 33. nan 60. 18. 27. 50. 11. 46. 32.
 53. 70. 40. 43. 15. 41. 10. 14. 35. 38.  2. 21. 26. 39. 49. 16.  7. 54.
 28. 23.]
[NaN, '<30', '30-40', '40-50', '50-60', '60-70']
Categories (7, object): ['<30' < '30-40' < '40-50' < '50-60' < '60-70' < '70-80' < '80+']


In [46]:
print(df['DisHeartDiseaseDx'].unique())
df['DisHeartDiseaseDx'] = pd.cut(df['DisHeartDiseaseDx'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90], labels=['<10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'])
print(df['DisHeartDiseaseDx'].unique())

[2 1]
['<10']
Categories (9, object): ['<10' < '10-20' < '20-30' < '30-40' ... '50-60' < '60-70' < '70-80' < '80+']


In [47]:
print(df['DisHeartDiseaseAge'].unique())
df['DisHeartDiseaseAge'] = pd.cut(df['DisHeartDiseaseAge'], bins=[0, 30, 40, 50, 60, 70, 80, 90], labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'])
print(df['DisHeartDiseaseAge'].unique())

[98 58 41 72 54 55 43 50 39 52 45 53 34 56 46 63 38 40 60 57 69 61 59 64
 71 68 49 65 36 67 37 66 47 32 33 51 74 48 25 35 62 70 44 73 30 27 31 42
 28 19 24 75  6 23 99 29 14 18 16 20 15 17  2 22 78]
[NaN, '50-60', '40-50', '70-80', '30-40', '60-70', '<30']
Categories (7, object): ['<30' < '30-40' < '40-50' < '50-60' < '60-70' < '70-80' < '80+']


In [48]:
print(df['DisStrokeDx'].unique())
df['DisStrokeDx'] = pd.cut(df['DisStrokeDx'], bins=[-1, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, 240], labels=['<15', '15-30', '30-45', '45-60', '60-75', '75-90', '90-105', '105-120', '120-135', '135-150', '150-165', '165-180', '180-195', '195-210', '210-225', '225+'])
print(df['DisStrokeDx'].unique())

[2 1]
['<15']
Categories (16, object): ['<15' < '15-30' < '30-45' < '45-60' ... '180-195' < '195-210' < '210-225' < '225+']


In [49]:
print(df['DisStrokeAge'].unique())
df['DisStrokeAge'] = pd.cut(df['DisStrokeAge'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90], labels=['<10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'])
print(df['DisStrokeAge'].unique())

[98 58 59 47 48 52 56 66 53 68 54 42 57 55 62 64 46 51 72 73 43 39 44 40
 60 32 20 31 50 61 35 38 45 49 69 65 41 67 63 70 36 30 28 71 33 34 15 74]
[NaN, '50-60', '40-50', '60-70', '70-80', '30-40', '10-20', '20-30']
Categories (9, object): ['<10' < '10-20' < '20-30' < '30-40' ... '50-60' < '60-70' < '70-80' < '80+']


In [50]:
print(df['DisHypertensionDx'].unique())
df['DisHypertensionDx'] = pd.cut(df['DisHypertensionDx'], bins=[-1, 30, 40, 50, 60, 70, 80, 90, 120, 150, 180, 210, 240, 270, 300], labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-120', '120-150', '150-180', '180-210', '210-240', '240-270', '270+'])
print(df['DisHypertensionDx'].unique())

[2 1]
['<30']
Categories (14, object): ['<30' < '30-40' < '40-50' < '50-60' ... '180-210' < '210-240' < '240-270' < '270+']


In [51]:
print(df['DisHypertensionAge'].unique())
df['DisHypertensionAge'] = pd.cut(df['DisHypertensionAge'], bins=[-1, 30, 40, 50, 60, 70, 80, 90, 100], labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90+'])
print(df['DisHypertensionAge'].unique())

[98 59 72 43 40 46 52 55 45 42 73 56 35 41 34 70 50 60 63 38 37 44 36 69
 71 61 39 66 51 57 20 47 58 95 68 53 65 26 54 62 49 48 30 31 77 64  3 67
 27 29 32 33 74  6 25 23 16 28 24 75 22 18 21 19 76 15  5 78  2]
['90+', '50-60', '70-80', '40-50', '30-40', '60-70', '<30']
Categories (8, object): ['<30' < '30-40' < '40-50' < '50-60' < '60-70' < '70-80' < '80-90' < '90+']


In [52]:
print(df['DisDiabetesDx'].unique())
df['DisDiabetesDx'] = pd.cut(df['DisDiabetesDx'], bins=[0, 20, 30, 40, 50, 60, 70, 80, 90], labels=['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'])
print(df['DisDiabetesDx'].unique())

[2 1]
['<20']
Categories (8, object): ['<20' < '20-30' < '30-40' < '40-50' < '50-60' < '60-70' < '70-80' < '80+']


In [53]:
print(df['DisDiabetesAge'].unique())
df['DisDiabetesAge'] = pd.cut(df['DisDiabetesAge'], bins=[0, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100', '100-110', '110+'])
print(df['DisDiabetesAge'].unique())

[98 40 41 32 53 68 50 45 72 39 28 46 43 52 38 48 49 35 44 71 42 57 69 75
 66 56 51 58 64 47 59 30 60 55 54 33 67 61 70 62 63 34 74 37 29 73 18 10
 36 65 20 27 22 31 15 26 23  3 25 17]
['90-100', '30-40', '40-50', '50-60', '60-70', '70-80', '<30']
Categories (10, object): ['<30' < '30-40' < '40-50' < '50-60' ... '80-90' < '90-100' < '100-110' < '110+']


In [54]:
print(df['DisCOPDDx'].unique())
df['DisCOPDDx'] = pd.cut(df['DisCOPDDx'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisCOPDDx'].unique())

[2 1]
['<100']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [55]:
print(df['DisCOPDAge'].unique())
df['DisCOPDAge'] = pd.cut(df['DisCOPDAge'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisCOPDAge'].unique())

[98 25 30 20 35 32 52 41 43 39 51 15 44 58 50 34 53 59 45 76 49 56 47 57
 12 48 37 63  1 64 46 54 70 38 36 60 66 67 22 33 40 62 55 61 68 19 71 65
 72 10  7 23 14 42 73 26 31 29 27  8 74 16 24 18  5 75  3 69 21 13 28 17
 11  2  6 99  9]
['<100']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [56]:
print(df['DisRenalFailureDx'].unique())
df['DisRenalFailureDx'] = pd.cut(df['DisRenalFailureDx'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisRenalFailureDx'].unique())

[2 1]
['<100']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [57]:
print(df['DisRenalFailureAge'].unique())
df['DisRenalFailureAge'] = pd.cut(df['DisRenalFailureAge'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisRenalFailureAge'].unique())

[98 43 52 60 38 50 59 53 56 61 54 35 41 51 12 46 30 21 27 45 48 40 73 47
 34 36 28 62 10 22 26 25 39 57 44 67 37]
['<100']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [58]:
print(df['DisJaundiceDx'].unique())
df['DisJaundiceDx'] = pd.cut(df['DisJaundiceDx'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisJaundiceDx'].unique())

[2 1]
['<100']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [59]:
print(df['DisJaundiceAge'].unique())
df['DisJaundiceAge'] = pd.cut(df['DisJaundiceAge'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisJaundiceAge'].unique())

[98 27 28 34 54 20 50 63 41 35 38 30 58 33 56  8 60 40 25 55 26 18 19 44
 15 22 62  6 21 29 64 42 52 16  5 10 24 43 48 61 70 66 32 53 49 57 37 71
 45 46 13 69 23 47 39 51 65 31 11  3 36 67 68 17 59  7 73 12  9  1 14 72]
['<100']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [60]:
print(df['DisLiverDx'].unique())
df['DisLiverDx'] = pd.cut(df['DisLiverDx'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisLiverDx'].unique())

[2 1]
['<100']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [61]:
print(df['DisLiverAge'].unique())
df['DisLiverAge'] = pd.cut(df['DisLiverAge'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisLiverAge'].unique())

[98 50 40 53 47 39 35 15 43 56 42 44 55 30 17 38 37 48 36 29 10 33 49]
['<100']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [62]:
print(df['DisTBDx'].unique())
df['DisTBDx'] = pd.cut(df['DisTBDx'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisTBDx'].unique())

[2 1 0]
['<100', NaN]
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [63]:
print(df['DisTBAge'].unique())
df['DisTBAge'] = pd.cut(df['DisTBAge'], bins=[0, 100, 120, 140, 160, 180, 200, 220], labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'])
print(df['DisTBAge'].unique())

[ 98  31  25  35  30  24  14   9  29  50  45  19  18  27  20   1  36  60
  17  41  65  49  34  12  37  32  38  51  53  22  57  63  15  13  21  42
  58  40  46  61  54  52  10  39  56  26  59  44  68  33   7  28  16  48
  47  55  23  43  69  70   6   3   8  67  71  74  11  72 171  64   0  62
  66   2   5]
['<100', '160-180', NaN]
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [64]:
df.head()

Unnamed: 0_level_0,DisRheumaticHeartAge,DisHeartDiseaseDx,DisHeartDiseaseAge,DisStrokeDx,DisStrokeAge,DisHypertensionDx,DisHypertensionAge,DisDiabetesDx,DisDiabetesAge,DisCOPDDx,DisCOPDAge,DisRenalFailureDx,DisRenalFailureAge,DisJaundiceDx,DisJaundiceAge,DisLiverDx,DisLiverAge,DisTBDx,DisTBAge
DisRheumaticHeartDx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2,,<10,,<15,,<30,90+,<20,90-100,<100,<100,<100,<100,<100,<100,<100,<100,<100,<100
2,,<10,50-60,<15,,<30,50-60,<20,90-100,<100,<100,<100,<100,<100,<100,<100,<100,<100,<100
2,,<10,40-50,<15,,<30,90+,<20,90-100,<100,<100,<100,<100,<100,<100,<100,<100,<100,<100
2,,<10,,<15,,<30,90+,<20,90-100,<100,<100,<100,<100,<100,<100,<100,<100,<100,<100
2,,<10,70-80,<15,,<30,70-80,<20,90-100,<100,<100,<100,<100,<100,<100,<100,<100,<100,<100


In [65]:
df.columns

Index(['DisRheumaticHeartAge', 'DisHeartDiseaseDx', 'DisHeartDiseaseAge',
       'DisStrokeDx', 'DisStrokeAge', 'DisHypertensionDx',
       'DisHypertensionAge', 'DisDiabetesDx', 'DisDiabetesAge', 'DisCOPDDx',
       'DisCOPDAge', 'DisRenalFailureDx', 'DisRenalFailureAge',
       'DisJaundiceDx', 'DisJaundiceAge', 'DisLiverDx', 'DisLiverAge',
       'DisTBDx', 'DisTBAge'],
      dtype='object')

In [66]:
# Create dummy variables for each categorical variable
# df = pd.get_dummies(df, columns=['age.diag', 'age.menarc', 'age.menop', 'lactaton', 'fpregnancy', 'ocp', 'hrt', 'agef', 'weight', 'height'])
df = pd.get_dummies(df, columns=df.columns)
df.head()

Unnamed: 0_level_0,DisRheumaticHeartAge_<30,DisRheumaticHeartAge_30-40,DisRheumaticHeartAge_40-50,DisRheumaticHeartAge_50-60,DisRheumaticHeartAge_60-70,DisRheumaticHeartAge_70-80,DisRheumaticHeartAge_80+,DisHeartDiseaseDx_<10,DisHeartDiseaseDx_10-20,DisHeartDiseaseDx_20-30,...,DisTBDx_160-180,DisTBDx_180-200,DisTBDx_200+,DisTBAge_<100,DisTBAge_100-120,DisTBAge_120-140,DisTBAge_140-160,DisTBAge_160-180,DisTBAge_180-200,DisTBAge_200+
DisRheumaticHeartDx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [67]:
# Use Apriori to identify frequent itemsets
frequent_itemsets = apriori(df, min_support=0.9, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,1.000000,(DisHeartDiseaseDx_<10)
1,1.000000,(DisStrokeDx_<15)
2,1.000000,(DisHypertensionDx_<30)
3,1.000000,(DisDiabetesDx_<20)
4,0.931221,(DisDiabetesAge_90-100)
...,...,...
32762,0.931141,"(DisJaundiceAge_<100, DisRenalFailureDx_<100, ..."
32763,0.931141,"(DisJaundiceAge_<100, DisRenalFailureDx_<100, ..."
32764,0.931141,"(DisJaundiceAge_<100, DisRenalFailureDx_<100, ..."
32765,0.931141,"(DisJaundiceAge_<100, DisRenalFailureDx_<100, ..."


In [68]:
# Generate association rules using the frequent itemsets
association_rules = association_rules(frequent_itemsets, min_threshold=.9)
print(association_rules)

In [30]:
print(association_rules.shape)
association_rules.head()

(4750202, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(DisHeartDiseaseDx_<10),(DisStrokeDx_<15),1.0,1.0,1.0,1.0,1.0,0.0,inf
1,(DisStrokeDx_<15),(DisHeartDiseaseDx_<10),1.0,1.0,1.0,1.0,1.0,0.0,inf
2,(DisHypertensionDx_<30),(DisHeartDiseaseDx_<10),1.0,1.0,1.0,1.0,1.0,0.0,inf
3,(DisHeartDiseaseDx_<10),(DisHypertensionDx_<30),1.0,1.0,1.0,1.0,1.0,0.0,inf
4,(DisDiabetesDx_<20),(DisHeartDiseaseDx_<10),1.0,1.0,1.0,1.0,1.0,0.0,inf


In [31]:
# drop rows where antecedents contains less than 3 items
association_rules = association_rules[association_rules['antecedents'].apply(lambda x: len(x) >= 3)]
print(association_rules.shape)
association_rules.head()

In [33]:
association_rules.to_csv('../content/association_rules.csv', index=False)