In [1]:
from sklearn.datasets import fetch_openml
from spn.algorithms.Inference import log_likelihood
from spn.algorithms.LearningWrappers import learn_parametric
from spn.structure.Base import Context
from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian, Bernoulli
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


# Load version 2 of the OpenML "adult" dataset.
# as_frame=True is requested explicitly because every later cell reads
# `bunch.frame`; that attribute is only populated when the data is returned
# as a pandas DataFrame, so we do not rely on the library default.
bunch = fetch_openml('adult', version = 2, as_frame = True)
print ("%s rows" % (len(bunch.frame)))
print (bunch.frame.shape)
# Last expression of the cell: rich display of the first rows.
bunch.frame.head()

48842 rows
(48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [2]:
# Work only with complete rows; `bunch.frame` itself is left untouched.
frame = bunch.frame.dropna()
names = frame.columns.tolist()

# Columns modelled as continuous; every other column is treated as categorical.
continuous = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
categories = [name for name in names if name not in continuous]

# Per-column flag (aligned with `names`) and the matching leaf type.
categorical = [name in categories for name in names]
ptypes = [Categorical if is_categorical else Gaussian for is_categorical in categorical]

# Build one numeric column per feature. Categorical columns are replaced by
# integer codes; `value_text[i]` keeps the code -> label list for column i
# (None for continuous columns) so codes can be turned back into text later.
cols = []
value_text = []
for name, is_categorical in zip(names, categorical):
    if not is_categorical:
        cols.append(frame[name])
        value_text.append(None)
        continue
    codes, labels = pd.factorize(frame[name], sort=True)
    value_text.append(labels.tolist())
    cols.append(codes)

data = np.stack(cols, axis=1)

# Split stratified on the 'class' column, with a fixed seed for repeatability.
train, test = train_test_split(data, stratify=frame['class'], random_state=0)
print("%d in train, %d in test" % (len(train), len(test)))

33916 in train, 11306 in test


In [9]:
# Describe the columns to the learner: one parametric leaf type per column,
# with value domains derived from the training split.
ds_context = Context(parametric_types=ptypes).add_domains(train)

# Learn the network on the training split. The minimum slice size is set to
# 1% of the number of training rows.
net = learn_parametric(
    train,
    ds_context=ds_context,
    rows="gmm",
    ohe=True,
    min_instances_slice=len(train) / 100,
)

  return array(a, dtype, copy=False, order=order, subok=True)


In [10]:
import csi2
from spn.structure.Base import get_nodes_by_type, Product



def format_condition(condition):
    """Render a single rule condition as human-readable text.

    ``condition`` must be a string made of exactly three tokens separated by
    single spaces: ``"<feature> <operator> <value>"``.

    * For the operators ``==`` and ``!=`` the value token is read as an
      integer index into the label list kept in the module-level
      ``value_text`` for that feature (the feature position is looked up in
      the module-level ``names``), and the label is shown instead of the index.
    * For any other operator the value token is parsed as a float and shown
      with two decimals.

    Returns:
        The formatted ``"<feature> <operator> <value>"`` string.

    Raises:
        ValueError: if ``condition`` does not split into exactly three tokens
            (the unpacking error is attached as the cause), if the feature is
            not present in ``names``, or if the value token is not numeric.
    """
    try:
        feature, sign, raw_value = condition.split(" ")
    except ValueError as e:
        # Chain the original error so the traceback shows what was malformed.
        raise ValueError("Some nodes don't have conditions. reduce min_impurity_decrease") from e

    if sign in ('==', '!='):
        # Equality tests carry a category code; translate it back to its label.
        column = names.index(feature)
        label = value_text[column][int(raw_value)]
        return ("%s %s %s" % (feature, sign, label))

    # Any other operator is a numeric threshold.
    return ("%s %s %.2f" % (feature, sign, float(raw_value)))

print ('#product nodes = ', len(get_nodes_by_type(net, (Product))))

csi2.annotate_spn(net, names, categorical, 
             min_impurity_decrease = 0.05, max_depth = 2)


def summarize_rules(rules):
    """Format extracted rules and print a one-line summary.

    Each rule is unpacked as ``(antecedent, consequent, *scores)``. The
    antecedent and consequent are rendered with ``csi2.format_antecedent``
    (using ``format_condition`` for individual conditions) and
    ``csi2.format_consequent``, and their sizes are obtained from
    ``csi2.antecedent_count`` / ``csi2.consequent_count``.

    The rule string template expects exactly three score values per rule
    (presumably precision, recall and an instance count -- confirm against
    csi2).

    Prints ``"<number of rules>, <mean antecedent count>, <mean consequent
    count>"``. With no rules the means are reported as ``nan`` without
    triggering NumPy's empty-mean warning.

    Returns:
        A tuple ``(formatted_rules, antecedent_counts, consequent_counts)``
        of three lists aligned by rule.
    """
    formatted_rules = []
    antecedent_counts = []
    consequent_counts = []
    for antecedent, consequent, *scores in rules:
        A = csi2.format_antecedent(antecedent, format_condition)
        C = csi2.format_consequent(consequent)
        a_count = csi2.antecedent_count(A)
        c_count = csi2.consequent_count(C)
        antecedent_counts.append(a_count)
        consequent_counts.append(c_count)
        formatted_rules.append("{%s} => {%s} | %.2f, %.2f, %d | %d %d" % (A, C, *scores, a_count, c_count))

    # np.mean([]) warns and yields nan; produce the nan directly instead.
    mean_antecedent = np.mean(antecedent_counts) if antecedent_counts else float("nan")
    mean_consequent = np.mean(consequent_counts) if consequent_counts else float("nan")
    print ("%d, %.2f, %.2f" % (len(formatted_rules), mean_antecedent, mean_consequent))
    return formatted_rules, antecedent_counts, consequent_counts


# All rules, no filtering. The original cell issued this exact call twice in a
# row and discarded the first result; it is issued once here.
# NOTE(review): assumes context_specific_independences does not modify `net` -- confirm.
rules = csi2.context_specific_independences(net, instance_threshold = 0)
csis, ac, cc = summarize_rules(rules)

# Rules filtered by instance count (5% of the training rows) and by
# precision / recall thresholds. `csis`, `ac` and `cc` end up holding the
# results of this second, filtered run.
rules = csi2.context_specific_independences(net, instance_threshold = len(train) / 20,
                                           precision_threshold = 0.7, recall_threshold = 0.7)
csis, ac, cc = summarize_rules(rules)

#product nodes =  263
263, 14.49, 4.02
19, 7.37, 2.74


In [11]:
# Display the formatted rule strings currently held in `csis`, i.e. those
# produced by the most recent rule-extraction run in the previous cell.
csis

['{(race == White) & (native-country == United-States)} => {(race), (native-country), (age,workclass,education,education-num,marital-status,occupation,relationship,sex,capital-gain,hours-per-week,class), (fnlwgt), (capital-loss)} | 1.00, 1.00, 27266 | 2 5',
 '{(race != White) | ((race == White) & (native-country != United-States))} => {(age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,hours-per-week,native-country,class), (capital-loss)} | 1.00, 1.00, 6650 | 3 2',
 '{[(race != White) | ((race == White) & (native-country != United-States))] & [capital-gain <= 57.00]} => {(capital-gain), (age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,class)} | 1.00, 1.00, 6215 | 4 2',
 '{[(race == White) & (native-country == United-States)] & [capital-gain <= 70654.50] & [capital-gain <= 57.00]} => {(capital-gain), (age,workclass,education,education-num,marital-status,occu

In [12]:
from spn.algorithms.Inference import log_likelihood

# Mean log-likelihood of the held-out test rows under the learned network.
test_log_likelihoods = log_likelihood(net, test)
print("%.2f" % np.mean(test_log_likelihoods))

-5.52
