In [2]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np

# Download OpenML dataset 181. as_frame=True is requested explicitly because
# the following cells read `bunch.frame`; relying on the library default for
# `as_frame` would make that attribute depend on the installed version.
bunch = fetch_openml(data_id = 181, as_frame = True)
bunch.frame

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,class_protein_localization
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


In [3]:
from sklearn.model_selection import train_test_split

# Turn every column into one-hot indicator columns so that each row becomes a
# set of items suitable for frequent-pattern mining.
frame = bunch.frame
to_be_discretized = ['mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc']
columns = []
for column in frame.columns:
    col = frame[column]
    if column in to_be_discretized:
        # Numeric feature: 5 equal-width bins labelled 0..4.
        # NOTE(review): the bin edges are derived from the whole column, i.e.
        # before the train/test split below -- confirm this is acceptable.
        col = pd.cut(col, 5, labels = range(5))

    # dtype=bool keeps the indicator columns boolean on every pandas version
    # (older releases default to uint8); the itemset miner used on this frame
    # expects a boolean DataFrame.
    col = pd.get_dummies(col, prefix = column, dtype = bool)
    columns.append(col)
frame = pd.concat(columns, axis=1)
# Stratify on the original (un-encoded) class label; random_state fixes the split.
train, test = train_test_split(frame, stratify = bunch.frame.class_protein_localization, random_state = 0)
print (len(frame), len(train), len(test))

1484 1113 371


In [4]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth, association_rules

# Mine itemsets whose support on the training split is at least 0.5, labelled
# by column name rather than by column position.
frequent_itemsets = fpgrowth(
    train,
    min_support=0.5,
    use_colnames=True,
)

# Derive association rules from those itemsets, keeping the ones whose
# confidence reaches the 0.7 threshold.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(pox_0),(erl_0),0.989218,0.991015,0.980234,0.990917,0.999901,-9.7e-05,0.989218
1,(erl_0),(pox_0),0.991015,0.989218,0.980234,0.989121,0.999901,-9.7e-05,0.991015
2,(nuc_1),(erl_0),0.87062,0.991015,0.861635,0.98968,0.998653,-0.001162,0.87062
3,(erl_0),(nuc_1),0.991015,0.87062,0.861635,0.869447,0.998653,-0.001162,0.991015
4,(nuc_1),(pox_0),0.87062,0.989218,0.860737,0.988648,0.999424,-0.000496,0.949767
5,(pox_0),(nuc_1),0.989218,0.87062,0.860737,0.870118,0.999424,-0.000496,0.996136
6,(nuc_1),(vac_3),0.87062,0.879605,0.774483,0.889577,1.011337,0.008682,1.090309
7,(vac_3),(nuc_1),0.879605,0.87062,0.774483,0.88049,1.011337,0.008682,1.08259
8,"(nuc_1, pox_0)",(erl_0),0.860737,0.991015,0.851752,0.989562,0.998533,-0.001251,0.860737
9,"(nuc_1, erl_0)",(pox_0),0.861635,0.989218,0.851752,0.98853,0.999304,-0.000593,0.939966


In [5]:
import numpy as np

# Re-evaluate every mined rule on the held-out split. The test confidence of a
# rule is the fraction of test rows matching the antecedent that also match
# the consequent.
confidences = []
for antecedent, consequent in zip(rules.antecedents, rules.consequents):
    # A row matches an itemset when all of its indicator columns equal 1
    # (this also holds for boolean columns, where True == 1).
    acondition = test[list(antecedent)].eq(1).all(axis=1)
    ccondition = test[list(consequent)].eq(1).all(axis=1)

    antecedent_rows = int(acondition.sum())
    if antecedent_rows == 0:
        # No test row satisfies the antecedent, so the confidence is undefined.
        # Record NaN instead of dividing by zero; nanmean below ignores it.
        confidences.append(np.nan)
        continue

    confidences.append(int((acondition & ccondition).sum()) / antecedent_rows)

# Mean test confidence plus the average antecedent / consequent size of the rules.
print ("test condidence = %.2f, alen = %.2f, clen = %.2f" % (np.nanmean(confidences),
                                                            rules.antecedents.apply(len).mean(),
                                                            rules.consequents.apply(len).mean()))

test condidence = 0.90, alen = 1.52, clen = 1.52
