# Artificial data

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the three synthetic clusters are reproducible.
np.random.seed(0)
N = 10000  # number of samples drawn for each cluster

# Cluster 1: four independent features centred on 2 (diagonal covariance).
data1 = np.random.multivariate_normal(mean=[2, 2, 2, 2], cov=np.eye(4) * 0.01, size=N)

# Cluster 2: the covariances among V1, V2 and V3 equal their variances,
# i.e. the lower-right 3x3 block of the matrix is all ones before scaling.
cov = np.eye(4)
cov[1:, 1:] = 1
data2 = np.random.multivariate_normal(mean=[-8, 4, 4, 4], cov=cov * 0.01, size=N)

# Cluster 3: only V2 and V3 share covariance (lower-right 2x2 block of ones).
cov = np.eye(4)
cov[2:, 2:] = 1
data3 = np.random.multivariate_normal(mean=[-8, 8, 4, 4], cov=cov * 0.01, size=N)

# Stack the clusters row-wise: rows [0, N) are cluster 1, [N, 2N) cluster 2, [2N, 3N) cluster 3.
data = np.vstack((data1, data2, data3))

names = ['V0', 'V1', 'V2', 'V3']
frame = pd.DataFrame(data, columns=names)
frame

Unnamed: 0,V0,V1,V2,V3
0,2.176405,2.040016,2.097874,2.224089
1,2.186756,1.902272,2.095009,1.984864
2,1.989678,2.041060,2.014404,2.145427
3,2.076104,2.012168,2.044386,2.033367
4,2.149408,1.979484,2.031307,1.914590
...,...,...,...,...
29995,-8.142520,8.107092,4.015167,4.015167
29996,-8.058391,7.970773,3.862682,3.862682
29997,-7.999241,7.990291,4.104746,4.104746
29998,-7.992725,7.851514,3.977892,3.977892


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Columns listed here are cut into 5 equal-width bins (labelled 0..4)
# before being one-hot encoded; any other column is one-hot encoded as-is.
to_be_discretized = ['V0', 'V1', 'V2', 'V3']

columns = []
for column in frame.columns:
    col = (
        pd.cut(frame[column], 5, labels=range(5))
        if column in to_be_discretized
        else frame[column]
    )
    # One indicator column per bin/value, named "<column>_<label>".
    col = pd.get_dummies(col, prefix=column)
    columns.append(col)

# `frame` is rebound to the encoded table here, so this cell expects the
# un-encoded frame as its input.
frame = pd.concat(columns, axis=1)

# Fixed random_state keeps the train/test partition reproducible.
train, test = train_test_split(frame, random_state=0)
print(len(train), len(test))

22500 7500


In [3]:
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth, association_rules

# Thresholds for the mining step, named so they are easy to find and tune.
MIN_SUPPORT = 0.5      # minimum itemset support on the training split
MIN_CONFIDENCE = 0.7   # minimum confidence for a rule to be kept

# Mine frequent itemsets on the training split only, then derive the rules.
frequent_itemsets = fpgrowth(
    train,
    min_support=MIN_SUPPORT,
    use_colnames=True,
)
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=MIN_CONFIDENCE,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(V3_4),(V0_0),0.535689,0.666133,0.535689,1.0,1.501201,0.178849,inf
1,(V0_0),(V3_4),0.666133,0.535689,0.535689,0.804177,1.501201,0.178849,2.371072
2,(V0_0),(V2_4),0.666133,0.529644,0.529644,0.795103,1.501201,0.176831,2.295568
3,(V2_4),(V0_0),0.529644,0.666133,0.529644,1.0,1.501201,0.176831,inf
4,(V3_4),(V2_4),0.535689,0.529644,0.529644,0.988717,1.866755,0.24592,41.685261
5,(V2_4),(V3_4),0.529644,0.535689,0.529644,1.0,1.866755,0.24592,inf
6,"(V3_4, V0_0)",(V2_4),0.535689,0.529644,0.529644,0.988717,1.866755,0.24592,41.685261
7,"(V3_4, V2_4)",(V0_0),0.529644,0.666133,0.529644,1.0,1.501201,0.176831,inf
8,"(V0_0, V2_4)",(V3_4),0.529644,0.535689,0.529644,1.0,1.866755,0.24592,inf
9,(V3_4),"(V0_0, V2_4)",0.535689,0.529644,0.529644,0.988717,1.866755,0.24592,41.685261


In [4]:
def _rule_mask(items, data):
    """Return a boolean Series marking the rows of `data` where every column in `items` equals 1."""
    return (data[list(items)] == 1).all(axis=1)


# Re-measure each mined rule on the held-out split:
# confidence = (#rows matching antecedents AND consequents) / (#rows matching antecedents).
confidences = []
for i, row in rules.iterrows():
    acondition = _rule_mask(row.antecedents, test)
    ccondition = _rule_mask(row.consequents, test)

    antecedent_count = int(acondition.sum())
    if antecedent_count == 0:
        # No test row satisfies the antecedent, so the confidence is undefined.
        # Record NaN instead of dividing by zero; nanmean below skips it.
        confidences.append(np.nan)
        continue

    confidences.append(int((acondition & ccondition).sum()) / antecedent_count)

# Mean test confidence over the rules where it is defined, plus the average
# number of items on each side of the rules.
print("test confidence = %.2f, alen = %.2f, clen = %.2f" % (np.nanmean(confidences),
                                                            rules.antecedents.apply(len).mean(),
                                                            rules.consequents.apply(len).mean()))

test condidence = 0.94, alen = 1.25, clen = 1.25
