In [1]:
from pathlib import Path

import mlxtend.frequent_patterns as mlx_patterns
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Read the cleaned dataset, using the first CSV column as the row index,
# then report its dimensions and preview the first rows.
df = pd.read_csv(
    "../data/processed/cleaned_data.csv",
    index_col=0,
)
print(df.shape)
df.head(n=5)

(3757, 23)


Unnamed: 0_level_0,LB.sign,RB.sign,age,age.diag,age.menarc,education,fpregnancy,gravity,hrt,hystrectomy,...,mens,no.biopsy,ocp,ocpuse,other.CA,personal.Hx,personal.other,reproduct,smoking,bmi
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,40~45,40~45,10~15,primary,15~20,1,1,0,...,regular,0.0,20~25,1,0,0,0,premenopouse,0,35~40
1,0,0,50~55,40~45,10~15,high school,20~25,1,1,0,...,regular,1.0,<20,0,1,1,0,premenopouse,0,25~30
1,0,0,75~80,65~70,10~15,illitrate,15~20,1,1,0,...,no,1.0,<20,0,1,1,0,postmenopouse,1,25~30
1,0,0,50~55,50~55,10~15,university,<5,0,1,0,...,no,0.0,<20,1,0,0,0,primenopouse,1,20~25
0,0,0,60~65,50~55,15~20,primary,15~20,1,1,0,...,no,1.0,<20,0,1,1,0,postmenopouse,0,25~30


In [3]:
# List the column labels of the loaded frame to see which variables are available.
df.columns

Index(['LB.sign', 'RB.sign', 'age', 'age.diag', 'age.menarc', 'education',
       'fpregnancy', 'gravity', 'hrt', 'hystrectomy', 'infertility',
       'lactaton', 'marital', 'mens', 'no.biopsy', 'ocp', 'ocpuse', 'other.CA',
       'personal.Hx', 'personal.other', 'reproduct', 'smoking', 'bmi'],
      dtype='object')

In [4]:
# One-hot encode every column: each (variable, category) pair becomes its own
# indicator column named "<variable>_<category>".
columns_to_encode = df.columns.tolist()
df = pd.get_dummies(df, columns=columns_to_encode)
df.head(n=5)

Unnamed: 0_level_0,LB.sign_0,LB.sign_1,RB.sign_0,RB.sign_1,age_0~5,age_15~20,age_20~25,age_25~30,age_30~35,age_35~40,...,bmi_20~25,bmi_25~30,bmi_30~35,bmi_35~40,bmi_40~45,bmi_45~50,bmi_50~55,bmi_55~60,bmi_60+,bmi_<5
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
0,1,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
# Use Apriori to identify frequent itemsets: combinations of one-hot columns
# that are set together in at least 60% of the rows (min_support=.6).
# mlxtend documents that non-boolean input (e.g. 0/1 integers) is slower and
# deprecated, so cast the indicator frame to bool explicitly. The cast is a
# no-op when the indicators are already boolean.
frequent_itemsets = apriori(df.astype(bool), min_support=.6, use_colnames=True)



In [6]:
# Generate association rules from the frequent itemsets, keeping rules whose
# confidence (mlxtend's default metric) is at least 0.7.
# The result is stored under the same name as the imported mlxtend function
# because later cells read `association_rules` as a DataFrame. Calling the
# function through the module alias keeps this cell re-runnable: the direct
# call `association_rules(...)` raised "'DataFrame' object is not callable"
# the second time the cell was executed.
association_rules = mlx_patterns.association_rules(frequent_itemsets, min_threshold=.7)
print(association_rules)

                     antecedents  \
0                    (RB.sign_0)   
1                    (LB.sign_0)   
2             (age.menarc_10~15)   
3                    (LB.sign_0)   
4                    (gravity_1)   
...                          ...   
3419  (gravity_1, infertility_0)   
3420  (infertility_0, smoking_0)   
3421      (gravity_1, smoking_0)   
3422                 (gravity_1)   
3423                 (smoking_0)   

                                            consequents  antecedent support  \
0                                           (LB.sign_0)            0.796114   
1                                           (RB.sign_0)            0.807825   
2                                           (LB.sign_0)            0.785467   
3                                    (age.menarc_10~15)            0.807825   
4                                           (LB.sign_0)            0.865318   
...                                                 ...                 ...   
3419  (hystrec

In [7]:
# Report the dimensions of the rules table, then preview its first five rows.
print(association_rules.shape)
association_rules.head(n=5)

(3424, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(RB.sign_0),(LB.sign_0),0.796114,0.807825,0.773223,0.971247,1.202298,0.130102,6.683654
1,(LB.sign_0),(RB.sign_0),0.807825,0.796114,0.773223,0.957166,1.202298,0.130102,4.759956
2,(age.menarc_10~15),(LB.sign_0),0.785467,0.807825,0.634815,0.808201,1.000464,0.000295,1.001956
3,(LB.sign_0),(age.menarc_10~15),0.807825,0.785467,0.634815,0.785832,1.000464,0.000295,1.001703
4,(gravity_1),(LB.sign_0),0.865318,0.807825,0.70189,0.811135,1.004097,0.002864,1.017524


In [8]:
# Keep only the rules whose antecedent set contains at least three items.
antecedent_sizes = association_rules['antecedents'].map(len)
association_rules = association_rules.loc[antecedent_sizes >= 3]
print(association_rules.shape)
association_rules.head(n=5)

(1205, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
806,"(RB.sign_0, age.menarc_10~15, personal.other_0)",(LB.sign_0),0.620176,0.807825,0.604205,0.974249,1.206014,0.103212,7.462781
807,"(RB.sign_0, LB.sign_0, personal.other_0)",(age.menarc_10~15),0.766835,0.785467,0.604205,0.787921,1.003124,0.001882,1.01157
808,"(RB.sign_0, age.menarc_10~15, LB.sign_0)",(personal.other_0),0.610061,0.99308,0.604205,0.990401,0.997303,-0.001634,0.720981
809,"(LB.sign_0, age.menarc_10~15, personal.other_0)",(RB.sign_0),0.628959,0.796114,0.604205,0.960643,1.206666,0.103482,5.18046
819,"(RB.sign_0, gravity_1, hrt_1)",(LB.sign_0),0.647857,0.807825,0.631887,0.975349,1.207376,0.108532,7.795883


In [9]:
# Write the rules to CSV without the row index. DataFrame.to_csv does not
# create missing parent directories, so make sure the reports folder exists
# first; otherwise this cell fails on a fresh checkout.
rules_csv_path = Path('../reports/association_rules.csv')
rules_csv_path.parent.mkdir(parents=True, exist_ok=True)
association_rules.to_csv(rules_csv_path, index=False)