In [1]:
from sklearn.datasets import fetch_openml

# Download the "adult" census-income dataset (version 2) from OpenML.
# as_frame=True is requested explicitly because later cells read `bunch.frame`,
# which is only populated when the data is returned as pandas objects; relying
# on the library default makes that dependency implicit.
bunch = fetch_openml('adult', version=2, as_frame=True)
bunch.frame

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
48838,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
48839,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
48840,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Numeric columns that are binned into 5 equal-width intervals (labelled 0..4)
# before one-hot encoding. The list previously contained 'Sex', which matches no
# column (the column is the categorical, lowercase 'sex'), while the numeric
# 'age' column was missing and therefore got one dummy column per distinct age.
to_be_discretized = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week']

# Drop rows with missing values; the cleaned frame is stored back on the bunch
# so that the stratification column below stays row-aligned with `frame`.
bunch.frame = bunch.frame.dropna()

# Fail loudly on typos: a name that is not a column would otherwise be skipped
# silently by the membership test in the loop below.
unknown_columns = sorted(set(to_be_discretized) - set(bunch.frame.columns))
assert not unknown_columns, "columns to discretize not found: %s" % unknown_columns

# Turn every column (including the 'class' target) into 0/1 indicator columns
# named "<column>_<value>", binning the numeric ones first.
columns = []
for column in bunch.frame.columns:
    col = bunch.frame[column]
    if column in to_be_discretized:
        col = pd.cut(col, 5, labels = range(5))

    columns.append(pd.get_dummies(col, prefix = column))
frame = pd.concat(columns, axis=1)

# Hold out a test set, keeping the class proportions equal in both parts;
# random_state is fixed so the split is reproducible.
train, test = train_test_split(frame, stratify = bunch.frame['class'], random_state = 0)
assert len(train) + len(test) == len(frame)
print (len(frame), len(train), len(test))

45222 33916 11306


In [4]:
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth, association_rules

# Mine the itemsets that occur in at least half of the training rows, keeping
# the indicator column names (rather than column indices) as item labels.
frequent_itemsets = fpgrowth(
    train,
    min_support=0.5,
    use_colnames=True,
)

# Derive association rules from those itemsets, retaining only the rules whose
# confidence on the training data is at least 0.7.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(capital-gain_0),(capital-loss_0),0.991744,0.954328,0.946073,0.953948,0.999602,-0.000377,0.991744
1,(capital-loss_0),(capital-gain_0),0.954328,0.991744,0.946073,0.991349,0.999602,-0.000377,0.954328
2,(native-country_United-States),(capital-gain_0),0.912755,0.991744,0.905119,0.991634,0.999888,-0.000101,0.986762
3,(capital-gain_0),(native-country_United-States),0.991744,0.912755,0.905119,0.912653,0.999888,-0.000101,0.998833
4,(native-country_United-States),(capital-loss_0),0.912755,0.954328,0.870533,0.953742,0.999386,-0.000535,0.987331
...,...,...,...,...,...,...,...,...,...
912,"(education-num_2, capital-gain_0, capital-loss_0)",(native-country_United-States),0.539156,0.912755,0.504718,0.936126,1.025605,0.012601,1.365892
913,"(education-num_2, native-country_United-States)","(capital-gain_0, capital-loss_0)",0.526153,0.946073,0.504718,0.959260,1.013939,0.006939,1.323705
914,"(education-num_2, capital-gain_0)","(native-country_United-States, capital-loss_0)",0.560149,0.870533,0.504718,0.901042,1.035046,0.017090,1.308305
915,"(education-num_2, capital-loss_0)","(native-country_United-States, capital-gain_0)",0.540895,0.905119,0.504718,0.933115,1.030932,0.015143,1.418582


In [5]:
import numpy as np

def _all_items_present(data, items):
    """Return a boolean mask of the rows of `data` where every column in `items` equals 1."""
    return (data[list(items)] == 1).all(axis=1)


# Re-evaluate each mined rule on the held-out test set:
# confidence = (#rows matching antecedent AND consequent) / (#rows matching antecedent).
confidences = []
for antecedents, consequents in zip(rules.antecedents, rules.consequents):
    acondition = _all_items_present(test, antecedents)
    ccondition = _all_items_present(test, consequents)

    antecedent_count = acondition.sum()
    if antecedent_count == 0:
        # The antecedent never occurs in the test set, so the confidence is
        # undefined; record NaN instead of raising ZeroDivisionError.
        confidences.append(np.nan)
        continue

    confidences.append((acondition & ccondition).sum() / antecedent_count)

# nanmean ignores rules whose test confidence is undefined; when there are none
# it is identical to the plain mean. alen / clen are the average number of items
# in the antecedent and consequent of a rule.
print ("test condidence = %.2f, alen = %.2f, clen = %.2f" % (np.nanmean(confidences), 
                                                            rules.antecedents.apply(len).mean(), 
                                                            rules.consequents.apply(len).mean()))

test condidence = 0.85, alen = 2.24, clen = 1.72
