In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Fetch OpenML dataset 183 (abalone measurements, judging by the column names
# below). as_frame=True is requested explicitly so that `bunch.frame` is always
# a DataFrame instead of depending on the library's version-specific default.
bunch = fetch_openml(data_id=183, as_frame=True)

# Continuous measurements that are binned before being one-hot encoded.
to_be_discretized = ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight',
       'Viscera_weight', 'Shell_weight']
columns = []
for column in bunch.frame.columns:
    col = bunch.frame[column]
    if column in to_be_discretized:
        # Five equal-width bins labelled 0..4; the label becomes the suffix of
        # the dummy column name (e.g. "Height_0").
        col = pd.cut(col, 5, labels=range(5))

    # One indicator column per distinct value. dtype=bool keeps the result
    # boolean on every pandas version, which is the input type the frequent
    # pattern miners expect; comparisons such as `== 1` still hold for True.
    col = pd.get_dummies(col, prefix=column, dtype=bool)
    columns.append(col)

# Stitch all indicator columns together into a single transaction matrix and
# hold out a test split (default split size, fixed seed for reproducibility).
frame = pd.concat(columns, axis=1)
train, test = train_test_split(frame, random_state=0)
print (len(frame), len(train), len(test))

4177 3132 1045


In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth, association_rules


# Mine itemsets that occur in at least 25% of the training rows; with
# use_colnames=True the itemsets are reported by column name.
frequent_itemsets = fpgrowth(
    train,
    min_support=0.25,
    use_colnames=True,
)
# Derive association rules and keep those with training confidence >= 0.7.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Shucked_weight_0),(Height_0),0.436143,0.994253,0.436143,1.000000,1.005780,0.002507,inf
1,(Viscera_weight_0),(Height_0),0.431673,0.994253,0.431354,0.999260,1.005036,0.002162,7.770115
2,(Shucked_weight_0),(Viscera_weight_0),0.436143,0.431673,0.392401,0.899707,2.084233,0.204130,5.666676
3,(Viscera_weight_0),(Shucked_weight_0),0.431673,0.436143,0.392401,0.909024,2.084233,0.204130,6.197842
4,"(Height_0, Shucked_weight_0)",(Viscera_weight_0),0.436143,0.431673,0.392401,0.899707,2.084233,0.204130,5.666676
...,...,...,...,...,...,...,...,...,...
725,"(Length_3, Diameter_3)","(Height_0, Shell_weight_1)",0.442529,0.452427,0.374202,0.845599,1.869030,0.173990,3.546434
726,(Shell_weight_1),"(Height_0, Length_3, Diameter_3)",0.454023,0.439336,0.374202,0.824191,1.875994,0.174733,3.189057
727,(Length_3),"(Height_0, Shell_weight_1, Diameter_3)",0.501277,0.393678,0.374202,0.746497,1.896211,0.176860,2.391772
728,(Diameter_3),"(Height_0, Shell_weight_1, Length_3)",0.474777,0.398787,0.374202,0.788164,1.976405,0.184867,2.838108


In [3]:
def _all_items_present(data, items):
    """Return a boolean Series flagging rows of `data` where every column in `items` equals 1."""
    return (data[list(items)] == 1).all(axis=1)


# Re-evaluate every mined rule on the held-out split:
# confidence = (#rows matching antecedent AND consequent) / (#rows matching antecedent).
confidences = []
for antecedents, consequents in zip(rules.antecedents, rules.consequents):
    acondition = _all_items_present(test, antecedents)
    ccondition = _all_items_present(test, consequents)

    antecedent_rows = int(acondition.sum())
    if antecedent_rows == 0:
        # No test row satisfies the antecedent, so the confidence is undefined.
        # Record NaN instead of dividing by zero; nanmean below ignores it.
        confidences.append(np.nan)
        continue

    confidences.append(int((acondition & ccondition).sum()) / antecedent_rows)

# Average held-out confidence plus the mean antecedent / consequent sizes.
# np.nanmean equals np.mean whenever every rule had a defined confidence.
print ("test condidence = %.2f, alen = %.2f, clen = %.2f" % (np.nanmean(confidences), 
                                                            rules.antecedents.apply(len).mean(), 
                                                            rules.consequents.apply(len).mean()))

test condidence = 0.87, alen = 2.15, clen = 1.71
