In [169]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth, fpmax
from FIM import eclat

import plotly.express as px
import plotly.graph_objects as go

Docs: https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

In [170]:
df = pd.read_csv("../data/processed/cleaned_data.csv")
df = df.astype("int")
print(df.shape)
df.head()

(5896, 246)


Unnamed: 0,Family_0,Family_1,Family_2,Family_3,Family_4,Family_5,Family_6,Family_7,LB.sign_0,LB.sign_1,...,age_48_53,age_53_58,age_58_63,age_63_68,age_68_73,age_73_78,age_78_83,age_83_88,age_88_93,age_93_98
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0


## Association Rules

### Apriori

In [171]:
# Use Apriori to identify frequent itemsets
min_support = 0.7
frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)



In [172]:
print(frequent_itemsets.shape)
frequent_itemsets

(143, 2)


Unnamed: 0,support,itemsets
0,0.830224,(Family_0)
1,0.868046,(LB.sign_0)
2,0.873474,(RB.sign_0)
3,0.846336,(gravity_1)
4,0.948779,(hrt_0)
...,...,...
138,0.767809,"(smoking_0, personal.other_0, infertility_0, h..."
139,0.718962,"(smoking_0, personal.other_0, hystrectomy_0, i..."
140,0.711669,"(RB.sign_0, personal.other_0, infertility_0, h..."
141,0.712347,"(personal.other_0, hystrectomy_0, infertility_..."


In [173]:
# Generate association rules using the frequent itemsets
min_confidence = 0.9
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Family_0),(hrt_0),0.830224,0.948779,0.790536,0.952196,1.003602,0.002837,1.071486
1,(Family_0),(infertility_0),0.830224,0.966757,0.802408,0.966496,0.999730,-0.000216,0.992219
2,(Family_0),(personal.other_0),0.830224,0.991011,0.824457,0.993054,1.002062,0.001696,1.294173
3,(RB.sign_0),(LB.sign_0),0.873474,0.868046,0.788840,0.903107,1.040390,0.030625,1.361849
4,(LB.sign_0),(RB.sign_0),0.868046,0.873474,0.788840,0.908753,1.040390,0.030625,1.386643
...,...,...,...,...,...,...,...,...,...
312,"(RB.sign_0, hystrectomy_0, infertility_0, hrt_0)",(personal.other_0),0.722863,0.991011,0.718284,0.993665,1.002678,0.001919,1.418953
313,"(RB.sign_0, personal.other_0, hystrectomy_0)","(infertility_0, hrt_0)",0.769505,0.920794,0.718284,0.933436,1.013730,0.009728,1.189930
314,"(RB.sign_0, hystrectomy_0, infertility_0)","(personal.other_0, hrt_0)",0.748813,0.940807,0.718284,0.959230,1.019582,0.013795,1.451865
315,"(RB.sign_0, hystrectomy_0, hrt_0)","(personal.other_0, infertility_0)",0.744573,0.959125,0.718284,0.964692,1.005805,0.004146,1.157690


In [174]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Family_0),(hrt_0),0.830224,0.948779,0.790536,0.952196,1.003602,0.002837,1.071486
1,(Family_0),(infertility_0),0.830224,0.966757,0.802408,0.966496,0.99973,-0.000216,0.992219
2,(Family_0),(personal.other_0),0.830224,0.991011,0.824457,0.993054,1.002062,0.001696,1.294173
3,(RB.sign_0),(LB.sign_0),0.873474,0.868046,0.78884,0.903107,1.04039,0.030625,1.361849
4,(LB.sign_0),(RB.sign_0),0.868046,0.873474,0.78884,0.908753,1.04039,0.030625,1.386643


In [175]:
# drop rows where antecedents contains less than 3 items
rules = rules[rules['antecedents'].apply(lambda x: len(x) >= 2)]
print(rules.shape)
rules.head()

(245, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
36,"(Family_0, LB.sign_0)",(personal.other_0),0.720149,0.991011,0.7154,0.993406,1.002416,0.001725,1.36314
37,"(Family_0, RB.sign_0)",(infertility_0),0.725407,0.966757,0.701153,0.966565,0.999802,-0.000139,0.994264
38,"(Family_0, RB.sign_0)",(personal.other_0),0.725407,0.991011,0.720828,0.993687,1.002701,0.001941,1.423947
39,"(Family_0, hystrectomy_0)",(hrt_0),0.746269,0.948779,0.71998,0.964773,1.016857,0.011936,1.45402
40,"(Family_0, hrt_0)",(hystrectomy_0),0.790536,0.893996,0.71998,0.910749,1.018739,0.013244,1.187704


In [176]:
rules.to_csv('../reports/arm_apriori.csv', index=False)

### FP Growth

In [177]:
min_support = 0.7
frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True)



In [178]:
print(frequent_itemsets.shape)
frequent_itemsets

(143, 2)


Unnamed: 0,support,itemsets
0,0.991011,(personal.other_0)
1,0.966757,(infertility_0)
2,0.948779,(hrt_0)
3,0.893996,(hystrectomy_0)
4,0.873474,(RB.sign_0)
...,...,...
138,0.740332,"(hystrectomy_0, LB.sign_0, hrt_0)"
139,0.737280,"(infertility_0, personal.other_0, hystrectomy_..."
140,0.735414,"(personal.other_0, hystrectomy_0, LB.sign_0, h..."
141,0.717096,"(infertility_0, hystrectomy_0, LB.sign_0, hrt_0)"


In [179]:
# Generate association rules using the frequent itemsets
min_confidence = 0.9
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(personal.other_0),(infertility_0),0.991011,0.966757,0.959125,0.967825,1.001104,0.001058,1.033182
1,(infertility_0),(personal.other_0),0.966757,0.991011,0.959125,0.992105,1.001104,0.001058,1.138625
2,(personal.other_0),(hrt_0),0.991011,0.948779,0.940807,0.949341,1.000593,0.000557,1.011099
3,(hrt_0),(personal.other_0),0.948779,0.991011,0.940807,0.991598,1.000593,0.000557,1.069900
4,(infertility_0),(hrt_0),0.966757,0.948779,0.920794,0.952456,1.003876,0.003555,1.077346
...,...,...,...,...,...,...,...,...,...
312,"(LB.sign_0, hystrectomy_0, infertility_0, hrt_0)",(personal.other_0),0.717096,0.991011,0.712347,0.993377,1.002388,0.001697,1.357361
313,"(personal.other_0, hystrectomy_0, LB.sign_0)","(infertility_0, hrt_0)",0.764756,0.920794,0.712347,0.931470,1.011595,0.008165,1.155796
314,"(LB.sign_0, hystrectomy_0, infertility_0)","(personal.other_0, hrt_0)",0.742877,0.940807,0.712347,0.958904,1.019235,0.013444,1.440355
315,"(LB.sign_0, hystrectomy_0, hrt_0)","(personal.other_0, infertility_0)",0.740332,0.959125,0.712347,0.962199,1.003206,0.002276,1.081334


In [180]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(personal.other_0),(infertility_0),0.991011,0.966757,0.959125,0.967825,1.001104,0.001058,1.033182
1,(infertility_0),(personal.other_0),0.966757,0.991011,0.959125,0.992105,1.001104,0.001058,1.138625
2,(personal.other_0),(hrt_0),0.991011,0.948779,0.940807,0.949341,1.000593,0.000557,1.011099
3,(hrt_0),(personal.other_0),0.948779,0.991011,0.940807,0.991598,1.000593,0.000557,1.0699
4,(infertility_0),(hrt_0),0.966757,0.948779,0.920794,0.952456,1.003876,0.003555,1.077346


In [181]:
# drop rows where antecedents contains less than 3 items
rules = rules[rules['antecedents'].apply(lambda x: len(x) >= 2)]
print(rules.shape)
rules.head()

(245, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
6,"(personal.other_0, infertility_0)",(hrt_0),0.959125,0.948779,0.914009,0.952962,1.004409,0.004012,1.088931
7,"(personal.other_0, hrt_0)",(infertility_0),0.940807,0.966757,0.914009,0.971516,1.004923,0.004477,1.167077
8,"(infertility_0, hrt_0)",(personal.other_0),0.920794,0.991011,0.914009,0.992632,1.001636,0.001493,1.220052
16,"(personal.other_0, hystrectomy_0)",(infertility_0),0.887042,0.966757,0.857022,0.966157,0.999379,-0.000533,0.982261
17,"(hystrectomy_0, infertility_0)",(personal.other_0),0.862619,0.991011,0.857022,0.993512,1.002523,0.002157,1.385418


In [182]:
rules.to_csv('../reports/arm_fpgrowth.csv', index=False)

### FP Max

In [183]:
min_support = 0.7
frequent_itemsets = fpmax(df, min_support=min_support, use_colnames=True)



In [184]:
print(frequent_itemsets.shape)
frequent_itemsets

(27, 2)


Unnamed: 0,support,itemsets
0,0.721506,"(personal.Hx_no, personal.other_0)"
1,0.713704,"(other.CA_0, personal.other_0, infertility_0, ..."
2,0.704206,"(gravity_1, marital_2, infertility_0)"
3,0.718284,"(gravity_1, marital_2, personal.other_0)"
4,0.718962,"(marital_2, personal.other_0, hystrectomy_0)"
5,0.734905,"(marital_2, personal.other_0, infertility_0, h..."
6,0.7154,"(Family_0, personal.other_0, LB.sign_0)"
7,0.701153,"(Family_0, RB.sign_0, infertility_0)"
8,0.720828,"(Family_0, RB.sign_0, personal.other_0)"
9,0.71557,"(Family_0, personal.other_0, hystrectomy_0, in..."


In [185]:
frequent_itemsets.to_csv('../reports/arm_fpmax.csv', index=False)

In [186]:
# Generate association rules using the frequent itemsets
min_confidence = 0.9
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence, support_only=True)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [187]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [188]:
# drop rows where antecedents contains less than 2 items
rules = rules[rules['antecedents'].apply(lambda x: len(x) >= 2)]
print(rules.shape)
rules.head()

(0, 0)


In [189]:
# rules.to_csv('../reports/arm_fpmax.csv', index=False)

### ECLAT

In [190]:
frequent_itemsets = eclat(df, min_support=0.6)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.830224,(Family_0)
1,0.720149,"(Family_0, LB.sign_0)"
2,0.654681,"(Family_0, RB.sign_0, LB.sign_0)"
3,0.61652,"(Family_0, RB.sign_0, LB.sign_0, hrt_0)"
4,0.61211,"(RB.sign_0, personal.other_0, Family_0, hrt_0,..."
...,...,...
455,0.600237,"(personal.Hx_no, smoking_0, personal.other_0)"
456,0.603799,"(personal.Hx_no, smoking_0)"
457,0.991011,(personal.other_0)
458,0.831581,"(smoking_0, personal.other_0)"


In [191]:
# Generate association rules using the frequent itemsets
min_confidence = 0.9
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Family_0, RB.sign_0)",(LB.sign_0),0.725407,0.868046,0.654681,0.902502,1.039693,0.024994,1.353397
1,"(Family_0, LB.sign_0)",(RB.sign_0),0.720149,0.873474,0.654681,0.909091,1.040777,0.025650,1.391791
2,"(Family_0, RB.sign_0, LB.sign_0)",(hrt_0),0.654681,0.948779,0.616520,0.941710,0.992549,-0.004628,0.878728
3,"(Family_0, LB.sign_0, hrt_0)",(RB.sign_0),0.681140,0.873474,0.616520,0.905129,1.036241,0.021562,1.333675
4,"(Family_0, RB.sign_0, personal.other_0, LB.sig...",(hrt_0),0.650102,0.948779,0.612110,0.941560,0.992392,-0.004693,0.876476
...,...,...,...,...,...,...,...,...,...
1152,(other.CA_0),(personal.other_0),0.766621,0.991011,0.761533,0.993363,1.002373,0.001803,1.354365
1153,"(smoking_0, other.CA_0)",(personal.other_0),0.657564,0.991011,0.652815,0.992778,1.001783,0.001162,1.244676
1154,(personal.Hx_no),(personal.other_0),0.725407,0.991011,0.721506,0.994622,1.003644,0.002620,1.671590
1155,"(personal.Hx_no, smoking_0)",(personal.other_0),0.603799,0.991011,0.600237,0.994101,1.003118,0.001866,1.523874


In [192]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Family_0, RB.sign_0)",(LB.sign_0),0.725407,0.868046,0.654681,0.902502,1.039693,0.024994,1.353397
1,"(Family_0, LB.sign_0)",(RB.sign_0),0.720149,0.873474,0.654681,0.909091,1.040777,0.02565,1.391791
2,"(Family_0, RB.sign_0, LB.sign_0)",(hrt_0),0.654681,0.948779,0.61652,0.94171,0.992549,-0.004628,0.878728
3,"(Family_0, LB.sign_0, hrt_0)",(RB.sign_0),0.68114,0.873474,0.61652,0.905129,1.036241,0.021562,1.333675
4,"(Family_0, RB.sign_0, personal.other_0, LB.sig...",(hrt_0),0.650102,0.948779,0.61211,0.94156,0.992392,-0.004693,0.876476


In [193]:
# drop rows where antecedents contains less than 3 items
rules = rules[rules['antecedents'].apply(lambda x: len(x) >= 2)]
print(rules.shape)
rules.head()

(1072, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Family_0, RB.sign_0)",(LB.sign_0),0.725407,0.868046,0.654681,0.902502,1.039693,0.024994,1.353397
1,"(Family_0, LB.sign_0)",(RB.sign_0),0.720149,0.873474,0.654681,0.909091,1.040777,0.02565,1.391791
2,"(Family_0, RB.sign_0, LB.sign_0)",(hrt_0),0.654681,0.948779,0.61652,0.94171,0.992549,-0.004628,0.878728
3,"(Family_0, LB.sign_0, hrt_0)",(RB.sign_0),0.68114,0.873474,0.61652,0.905129,1.036241,0.021562,1.333675
4,"(Family_0, RB.sign_0, personal.other_0, LB.sig...",(hrt_0),0.650102,0.948779,0.61211,0.94156,0.992392,-0.004693,0.876476


In [194]:
rules.to_csv('../reports/arm_eclat.csv', index=False)