In [1]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/cleaned_data.csv",
    index_col=0,
)
# Show (rows, columns) and then preview the first five records.
print(df.shape)
df.head(n=5)

(445, 23)


Unnamed: 0,CODE,age.diag,age.menarc,age.menop,education,marital,gravity,lactaton,fpregnancy,infertility,...,hrt,smoking,personal.Hx,Family,agef,weight,height,lifeevent,stage,G
1,86002380,41,12,47,3,2,1,27,20,0,...,0,0,1,4,38,69,158,0,3a,2
15,3,53,16,50,2,2,1,52,19,0,...,0,0,1,0,44,70,155,0,1,2
22,6000094,41,14,47,3,2,1,0,21,0,...,0,0,1,0,44,63,157,0,3c,2
26,6000178,40,14,47,3,2,1,19,22,0,...,0,0,0,0,44,63,154,0,2a,1
28,6000211,38,11,47,3,2,1,18,22,0,...,0,0,1,1,44,53,150,0,2b,2


In [3]:
# Print the number of distinct (non-null) values per column, one column per line.
for column_name, distinct_count in df.nunique().items():
    print(column_name, distinct_count)

CODE 443
age.diag 52
age.menarc 10
age.menop 24
education 4
marital 3
gravity 3
lactaton 74
fpregnancy 25
infertility 2
reproduct 4
ocpuse 2
ocp 44
hrt 17
smoking 2
personal.Hx 2
Family 7
agef 16
weight 58
height 38
lifeevent 1
stage 8
G 3


In [4]:
# Continuous columns that will be discretised into categorical bins.
# This cell only reports each column's observed range so bin edges can be chosen;
# it does not modify df.
cols_to_convert = [
    'age.diag', 'age.menarc', 'age.menop', 'lactaton', 'fpregnancy',
    'ocp', 'hrt', 'agef', 'weight', 'height',
]
for column_name in cols_to_convert:
    values = df[column_name]
    # column name, smallest value, largest value
    print(column_name, values.min(), values.max())

age.diag 24 81
age.menarc 10 20
age.menop 32 62
lactaton 0 240
fpregnancy 14 42
ocp 0 300
hrt 0 96
agef 24 80
weight 36 115
height 137 178


In [5]:
# Discretise 'age.diag' into decade-wide categories, printing the distinct values
# before and after so the mapping can be eyeballed.
# pd.cut builds right-closed intervals by default, so '<30' covers (0, 30],
# '30-40' covers (30, 40], and so on (a value of exactly 30 lands in '<30').
# The last edge is np.inf rather than 90 so the open-ended '80+' label really is
# open-ended; with a finite edge, larger values fell outside every bin and became NaN.
print(df['age.diag'].unique())
df['age.diag'] = pd.cut(
    df['age.diag'],
    bins=[0, 30, 40, 50, 60, 70, 80, np.inf],
    labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'],
)
print(df['age.diag'].unique())

[41 53 40 38 32 27 46 48 52 30 39 49 45 64 44 63 28 51 58 47 61 43 29 42
 54 66 50 36 31 56 68 55 24 70 57 26 34 59 33 35 65 78 60 37 71 72 62 69
 76 74 73 81]
['40-50', '50-60', '30-40', '<30', '60-70', '70-80', '80+']
Categories (7, object): ['<30' < '30-40' < '40-50' < '50-60' < '60-70' < '70-80' < '80+']


In [6]:
# Discretise 'age.menarc' into decade-wide categories, printing the distinct values
# before and after.
# pd.cut builds right-closed intervals by default, so '<10' covers (0, 10] and
# '10-20' covers (10, 20]; a value of exactly 10 lands in '<10'.
# NOTE(review): the observed values span only 10-20, so nearly every row ends up in
# '10-20' and the feature carries very little information — consider finer bins.
# The last edge is np.inf rather than 90 so the open-ended '80+' label cannot
# silently produce NaN for larger values.
print(df['age.menarc'].unique())
df['age.menarc'] = pd.cut(
    df['age.menarc'],
    bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, np.inf],
    labels=['<10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'],
)
print(df['age.menarc'].unique())

[12 16 14 11 13 15 20 10 17 18]
['10-20', '<10']
Categories (9, object): ['<10' < '10-20' < '20-30' < '30-40' ... '50-60' < '60-70' < '70-80' < '80+']


In [7]:
# Discretise 'age.menop' into decade-wide categories, printing the distinct values
# before and after.
# pd.cut builds right-closed intervals by default, so '<30' covers (0, 30],
# '30-40' covers (30, 40], and so on.
# The last edge is np.inf rather than 90 so the open-ended '80+' label cannot
# silently produce NaN for larger values.
print(df['age.menop'].unique())
df['age.menop'] = pd.cut(
    df['age.menop'],
    bins=[0, 30, 40, 50, 60, 70, 80, np.inf],
    labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'],
)
print(df['age.menop'].unique())

[47 50 45 48 52 46 40 44 39 49 42 51 54 53 55 43 32 41 36 38 62 60 35 37]
['40-50', '50-60', '30-40', '60-70']
Categories (7, object): ['<30' < '30-40' < '40-50' < '50-60' < '60-70' < '70-80' < '80+']


In [8]:
# Discretise 'lactaton' into 15-unit-wide categories, printing the distinct values
# before and after.
# pd.cut builds right-closed intervals by default; the first edge is -1 so that a
# value of 0 falls inside the first bin: '<15' covers (-1, 15], '15-30' covers (15, 30], ...
# The last edge is np.inf rather than 240 so the open-ended '225+' label cannot
# silently produce NaN for values above the currently observed maximum.
print(df['lactaton'].unique())
df['lactaton'] = pd.cut(
    df['lactaton'],
    bins=[-1, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, np.inf],
    labels=[
        '<15', '15-30', '30-45', '45-60', '60-75', '75-90', '90-105', '105-120',
        '120-135', '135-150', '150-165', '165-180', '180-195', '195-210', '210-225', '225+',
    ],
)
print(df['lactaton'].unique())

[ 27  52   0  19  18   9  53  48  12   4  15  36  42 216  66 192 120  54
   5  50  10   1  24  96   3  16  72  40  20 174  30  80  89  84   2 108
  55 144 102  33  44   6  51  58  28  61  43  17  21  60   7 118  38  45
  46 180   8 168 100  22 240  13  26 132 113  78  32  68  25  39  41  90
  14  35]
['15-30', '45-60', '<15', '30-45', '210-225', ..., '165-180', '75-90', '135-150', '225+', '120-135']
Length: 14
Categories (16, object): ['<15' < '15-30' < '30-45' < '45-60' ... '180-195' < '195-210' < '210-225' < '225+']


In [9]:
# Discretise 'fpregnancy' into decade-wide categories, printing the distinct values
# before and after.
# pd.cut builds right-closed intervals by default, so '10-20' covers (10, 20],
# '20-30' covers (20, 30], and so on (a value of exactly 20 lands in '10-20').
# The last edge is np.inf rather than 90 so the open-ended '80+' label cannot
# silently produce NaN for larger values.
print(df['fpregnancy'].unique())
df['fpregnancy'] = pd.cut(
    df['fpregnancy'],
    bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, np.inf],
    labels=['<10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'],
)
print(df['fpregnancy'].unique())

[20 19 21 22 18 34 17 30 16 27 23 15 24 25 14 28 29 31 26 32 40 35 41 42
 33]
['10-20', '20-30', '30-40', '40-50']
Categories (9, object): ['<10' < '10-20' < '20-30' < '30-40' ... '50-60' < '60-70' < '70-80' < '80+']


In [10]:
print(df['ocp'].unique())
df['ocp'] = pd.cut(df['ocp'], bins=[-1, 30, 40, 50, 60, 70, 80, 90, 120, 150, 180, 210, 240, 270, 300], labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-120', '120-150', '150-180', '180-210', '210-240', '240-270', '270+'])
print(df['ocp'].unique())

[ 61  12 300   3  30  72 168   4 120  36  66  96   6   8  60 132 108   0
  84   7 216 228  18   1 240  24 276 204  42   5   2 144  48  54 288 192
  14  90  10 180  41  73  45  78]
['60-70', '<30', '270+', '70-80', '150-180', ..., '120-150', '80-90', '210-240', '180-210', '40-50']
Length: 13
Categories (14, object): ['<30' < '30-40' < '40-50' < '50-60' ... '180-210' < '210-240' < '240-270' < '270+']


In [11]:
# Discretise 'hrt' into categories, printing the distinct values before and after.
# pd.cut builds right-closed intervals by default; the first edge is -1 so that a
# value of 0 falls inside the first bin: '<30' covers (-1, 30], '30-40' covers (30, 40], ...
# The last edge is np.inf rather than 100 so the open-ended '90+' label cannot
# silently produce NaN for larger values.
print(df['hrt'].unique())
df['hrt'] = pd.cut(
    df['hrt'],
    bins=[-1, 30, 40, 50, 60, 70, 80, 90, np.inf],
    labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90+'],
)
print(df['hrt'].unique())

[ 0  2  6  1 24 96 36  3 48  9  5 30  8 12 60  4 54]
['<30', '90+', '30-40', '40-50', '50-60']
Categories (8, object): ['<30' < '30-40' < '40-50' < '50-60' < '60-70' < '70-80' < '80-90' < '90+']


In [12]:
# Discretise 'agef' into decade-wide categories, printing the distinct values
# before and after.
# pd.cut builds right-closed intervals by default, so '<20' covers (0, 20],
# '20-30' covers (20, 30], and so on (a value of exactly 30 lands in '20-30').
# The last edge is np.inf rather than 90 so the open-ended '80+' label cannot
# silently produce NaN for larger values.
print(df['agef'].unique())
df['agef'] = pd.cut(
    df['agef'],
    bins=[0, 20, 30, 40, 50, 60, 70, 80, np.inf],
    labels=['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'],
)
print(df['agef'].unique())

[38 44 32 45 60 29 30 35 41 40 24 55 80 46 65 42]
['30-40', '40-50', '50-60', '20-30', '70-80', '60-70']
Categories (8, object): ['<20' < '20-30' < '30-40' < '40-50' < '50-60' < '60-70' < '70-80' < '80+']


In [13]:
# Discretise 'weight' into 10-unit-wide categories, printing the distinct values
# before and after.
# pd.cut builds right-closed intervals by default, so '<30' covers (0, 30],
# '30-40' covers (30, 40], and so on.
# The last edge is np.inf rather than 120 so the open-ended '110+' label cannot
# silently produce NaN for values above 120.
print(df['weight'].unique())
df['weight'] = pd.cut(
    df['weight'],
    bins=[0, 30, 40, 50, 60, 70, 80, 90, 100, 110, np.inf],
    labels=['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100', '100-110', '110+'],
)
print(df['weight'].unique())

[ 69  70  63  53  60  68  75  59  57  52  48  54  72  55 100  65  61  90
  95  78  62  64  67  76  58  73  74  80  93  66  79  71  81  83  50  88
  36  41  85  86 105  77  45  44  84  96  82  92  99  89  87  56  42  43
 108  49 115  98]
['60-70', '50-60', '70-80', '40-50', '90-100', '80-90', '30-40', '100-110', '110+']
Categories (10, object): ['<30' < '30-40' < '40-50' < '50-60' ... '80-90' < '90-100' < '100-110' < '110+']


In [14]:
# Discretise 'height' into 20-unit-wide categories, printing the distinct values
# before and after.
# pd.cut builds right-closed intervals by default, so '<100' covers (0, 100],
# '100-120' covers (100, 120], and so on (a value of exactly 140 lands in '120-140').
# The last edge is np.inf rather than 220 so the open-ended '200+' label cannot
# silently produce NaN for larger values.
print(df['height'].unique())
df['height'] = pd.cut(
    df['height'],
    bins=[0, 100, 120, 140, 160, 180, 200, np.inf],
    labels=['<100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+'],
)
print(df['height'].unique())

[158 155 157 154 150 166 167 160 153 168 159 152 156 165 163 148 162 149
 164 145 151 143 161 147 178 142 171 172 174 140 170 146 169 173 141 138
 137 175]
['140-160', '160-180', '120-140']
Categories (7, object): ['<100' < '100-120' < '120-140' < '140-160' < '160-180' < '180-200' < '200+']


In [15]:
# Remove the 'CODE' column in place. It has almost one distinct value per row, so it
# would only add hundreds of near-empty dummy columns when every column is one-hot encoded.
df.drop(labels='CODE', axis='columns', inplace=True)

In [16]:
# Preview the first five rows after binning and dropping 'CODE'.
df.head(n=5)

Unnamed: 0,age.diag,age.menarc,age.menop,education,marital,gravity,lactaton,fpregnancy,infertility,reproduct,...,hrt,smoking,personal.Hx,Family,agef,weight,height,lifeevent,stage,G
1,40-50,10-20,40-50,3,2,1,15-30,10-20,0,1,...,<30,0,1,4,30-40,60-70,140-160,0,3a,2
15,50-60,10-20,40-50,2,2,1,45-60,10-20,0,3,...,<30,0,1,0,40-50,60-70,140-160,0,1,2
22,40-50,10-20,40-50,3,2,1,<15,20-30,0,2,...,<30,0,1,0,40-50,60-70,140-160,0,3c,2
26,30-40,10-20,40-50,3,2,1,15-30,20-30,0,1,...,<30,0,0,0,40-50,60-70,140-160,0,2a,1
28,30-40,10-20,40-50,3,2,1,15-30,20-30,0,1,...,<30,0,1,1,40-50,50-60,140-160,0,2b,2


In [17]:
# List the remaining column labels; every one of them is one-hot encoded in the next step.
df.columns

Index(['age.diag', 'age.menarc', 'age.menop', 'education', 'marital',
       'gravity', 'lactaton', 'fpregnancy', 'infertility', 'reproduct',
       'ocpuse', 'ocp', 'hrt', 'smoking', 'personal.Hx', 'Family', 'agef',
       'weight', 'height', 'lifeevent', 'stage', 'G'],
      dtype='object')

In [18]:
# One-hot encode every column (binned and originally-coded alike) so each
# column/value pair becomes its own indicator column, as apriori expects.
# `columns` must be passed explicitly: by default get_dummies would skip the
# integer-coded columns such as 'education' or 'marital'.
# NOTE(review): 'lifeevent' has a single distinct value in this data, so its dummy
# column is constant and will show up in every frequent itemset — consider dropping it.
columns_to_encode = df.columns.tolist()
df = pd.get_dummies(data=df, columns=columns_to_encode)
df.head(n=5)

Unnamed: 0,age.diag_<30,age.diag_30-40,age.diag_40-50,age.diag_50-60,age.diag_60-70,age.diag_70-80,age.diag_80+,age.menarc_<10,age.menarc_10-20,age.menarc_20-30,...,stage_1,stage_2a,stage_2b,stage_3a,stage_3b,stage_3c,stage_4,G_1,G_2,G_3
1,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
15,0,0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
22,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
26,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
28,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


In [19]:
# Mine frequent itemsets with Apriori: keep item combinations present in at least
# 60% of rows, reporting items by column name instead of column position.
frequent_itemsets = apriori(
    df=df,
    min_support=0.6,
    use_colnames=True,
)



In [20]:
# Generate association rules from the frequent itemsets, keeping rules whose
# confidence (mlxtend's default metric, spelled out here) is at least 0.7.
#
# The result is stored under the name `association_rules`, which rebinds the function
# imported from mlxtend.frequent_patterns. Calling the bare name therefore fails with
# "'DataFrame' object is not callable" the second time this cell is run. Going through
# the module keeps the cell re-runnable while leaving the variable name unchanged for
# the cells that use it afterwards.
import mlxtend.frequent_patterns as _mlx_patterns

association_rules = _mlx_patterns.association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)
print(association_rules)

                 antecedents  \
0         (age.menarc_10-20)   
1          (age.menop_40-50)   
2         (age.menarc_10-20)   
3                (marital_2)   
4         (age.menarc_10-20)   
...                      ...   
45645   (hrt_<30, marital_2)   
45646    (hrt_<30, Family_0)   
45647  (marital_2, Family_0)   
45648            (marital_2)   
45649             (Family_0)   

                                             consequents  antecedent support  \
0                                      (age.menop_40-50)            0.986517   
1                                     (age.menarc_10-20)            0.871910   
2                                            (marital_2)            0.986517   
3                                     (age.menarc_10-20)            0.831461   
4                                            (gravity_1)            0.986517   
...                                                  ...                 ...   
45645  (agef_40-50, age.menarc_10-20, infertility_0, ..

In [24]:
# Report the size of the rule table as (rules, columns), then preview the first five rules.
rule_table_shape = association_rules.shape
print(rule_table_shape)
association_rules.head(n=5)

(45650, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(age.menarc_10-20),(age.menop_40-50),0.986517,0.87191,0.860674,0.872437,1.000605,0.00052,1.004133
1,(age.menop_40-50),(age.menarc_10-20),0.87191,0.986517,0.860674,0.987113,1.000605,0.00052,1.046292
2,(age.menarc_10-20),(marital_2),0.986517,0.831461,0.820225,0.831435,0.999969,-2.5e-05,0.999848
3,(marital_2),(age.menarc_10-20),0.831461,0.986517,0.820225,0.986486,0.999969,-2.5e-05,0.997753
4,(age.menarc_10-20),(gravity_1),0.986517,0.937079,0.925843,0.938497,1.001513,0.001399,1.023055


In [25]:
# Keep only rules whose antecedent set holds three or more items;
# rules with fewer antecedent items are discarded.
has_enough_items = association_rules['antecedents'].apply(lambda itemset: len(itemset) >= 3)
association_rules = association_rules[has_enough_items]
# Show how many rules survive the filter, then preview them.
print(association_rules.shape)
association_rules.head(n=5)

(31458, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1039,"(age.menop_40-50, age.menarc_10-20, marital_2)",(gravity_1),0.734831,0.937079,0.689888,0.938838,1.001877,0.001293,1.028764
1040,"(age.menarc_10-20, marital_2, gravity_1)",(age.menop_40-50),0.770787,0.87191,0.689888,0.895044,1.026532,0.017831,1.220412
1041,"(age.menarc_10-20, age.menop_40-50, gravity_1)",(marital_2),0.806742,0.831461,0.689888,0.855153,1.028495,0.019114,1.16357
1042,"(age.menop_40-50, marital_2, gravity_1)",(age.menarc_10-20),0.698876,0.986517,0.689888,0.987138,1.00063,0.000434,1.048315
1052,"(age.menarc_10-20, infertility_0, marital_2)",(age.menop_40-50),0.806742,0.87191,0.721348,0.89415,1.025508,0.017942,1.210112


In [26]:
# Write the filtered rules to the reports folder, omitting the row index.
association_rules.to_csv(
    path_or_buf='../reports/association_rules.csv',
    index=False,
)