In [1]:
import numpy as np
import pandas as pd
import wittgenstein as lw
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sympy import And, Not, Or, Symbol, false, simplify_logic, symbols

from architecture.classifier import DeepBinaryClassifier
from architecture.nodes.ripper import make_ripper_node

In [2]:
# Read the MNIST train/test splits; each CSV keeps its target in the "label" column.
train_df = pd.read_csv("./data/mnist/mnist_train.csv")
test_df = pd.read_csv("./data/mnist/mnist_test.csv")

# Feature names: every training column except the target.
column_names = train_df.columns.drop("label")

# Features and targets are cast to boolean NumPy arrays, one pair per split.
input_values_train, target_values_train = (
    train_df.drop(columns="label").to_numpy(dtype=bool),
    train_df["label"].to_numpy(dtype=bool),
)
input_values_test, target_values_test = (
    test_df.drop(columns="label").to_numpy(dtype=bool),
    test_df["label"].to_numpy(dtype=bool),
)

# Sanity check: array shapes and the fraction of positive targets in each split.
print("Dataset shapes:")
print(f"TRAIN | input: {input_values_train.shape}, target: {target_values_train.shape}, pos. frac.: {target_values_train.mean():.3f}")
print(f"TEST  | input: {input_values_test.shape}, target: {target_values_test.shape}, pos. frac.: {target_values_test.mean():.3f}")

Dataset shapes:
TRAIN | input: (10000, 196), target: (10000,), pos. frac.: 0.486
TEST  | input: (1500, 196), target: (1500,), pos. frac.: 0.497


In [10]:
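## NOTE: disabled alternative data source. It loads the 100-bit artificial dataset, balances the
## classes by down-sampling to the smaller class, and makes a stratified 80/20 train/test split.
## The "Dataset shapes" output shown under this cell comes from an earlier run of this code.
#dataset_df = pd.read_csv("./data/100_bit_artificial/1a.csv")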
#input_values = dataset_df.drop(columns="class").to_numpy(bool)
#target_values = dataset_df["class"].to_numpy(bool)
#column_names = dataset_df.drop(columns="class").columns
#
## balance
#pos_idx = np.where(target_values == 1)[0]
#neg_idx = np.where(target_values == 0)[0]
#
#num_samples = min(len(pos_idx), len(neg_idx))
#
#balanced_pos_idx = resample(pos_idx, n_samples=num_samples, replace=False, random_state=42)
#balanced_neg_idx = resample(neg_idx, n_samples=num_samples, replace=False, random_state=42)
#
#balanced_idx = np.concatenate([balanced_pos_idx, balanced_neg_idx])
#input_values, target_values = input_values[balanced_idx], target_values[balanced_idx]
#
## split
#input_values_train, input_values_test, target_values_train, target_values_test \
#    = train_test_split(input_values, target_values, test_size=0.2, random_state=42, stratify=target_values)
#
#print("Dataset shapes:")
#print(f"TRAIN | input: {input_values_train.shape}, target: {target_values_train.shape}, pos. frac.: {target_values_train.mean():.3f}")
#print(f"TEST  | input: {input_values_test.shape}, target: {target_values_test.shape}, pos. frac.: {target_values_test.mean():.3f}")

Dataset shapes:
TRAIN | input: (6790, 100), target: (6790,), pos. frac.: 0.500
TEST  | input: (1698, 100), target: (1698,), pos. frac.: 0.500


In [3]:
input_values_train_df = pd.DataFrame(input_values_train, columns=column_names)
input_values_test_df  = pd.DataFrame(input_values_test, columns=column_names)
target_values_train_df = pd.Series(target_values_train, name="class")
target_values_test_df  = pd.Series(target_values_test, name="class")

ripper = lw.RIPPER(random_state=42)
%time ripper.fit(pd.concat([input_values_train_df, target_values_train_df], axis=1), class_feat="class", pos_class=True)

train_acc = ripper.score(input_values_train_df, target_values_train_df)
test_acc  = ripper.score(input_values_test_df, target_values_test_df)

print(f"Test accuracy:     {test_acc:.3f}")

CPU times: user 42.1 s, sys: 533 ms, total: 42.6 s
Wall time: 43.5 s
Test accuracy:     0.887


In [4]:
def _ruleset_to_expression(ruleset):
    """Convert a rule set into a SymPy boolean expression (OR of ANDs).

    Each rule becomes the conjunction of its conditions, and the rules are
    joined by a disjunction.

    Parameters
    ----------
    ruleset : iterable of rules, or a falsy value
        Every rule must expose ``conds``; every condition must expose
        ``feature`` (the feature name) and ``val``. A truthy ``val`` yields
        the positive literal, a falsy ``val`` the negated literal.

    Returns
    -------
    sympy boolean expression
        ``false`` when ``ruleset`` is empty or falsy, otherwise
        ``Or(And(...), And(...), ...)``. A rule without conditions
        contributes ``And()``, which SymPy evaluates to ``true``.
    """
    if not ruleset:
        return false

    conjunctions = []
    for rule in ruleset:
        literals = []
        for cond in rule.conds:
            # Symbol(), not symbols(): symbols() parses its argument, so a
            # feature name containing a comma, whitespace or a colon would
            # produce a tuple/several symbols instead of a single one.
            # str() also admits non-string column labels (e.g. integers).
            feature = Symbol(str(cond.feature))
            literals.append(feature if cond.val else Not(feature))

        conjunctions.append(And(*literals))

    return Or(*conjunctions)


# Turn the fitted RIPPER rule set into a boolean formula and show it as-is.
expr = _ruleset_to_expression(ripper.ruleset_)
print("Original expression:", expr, sep="\n")

# Ask SymPy for a simplified disjunctive normal form of the same formula.
# NOTE(review): per the SymPy docs, simplify_logic only minimises expressions
# with more than 8 variables when force=True, so for a rule set of this size
# the "simplified" output is presumably unchanged -- TODO confirm.
simple_expr = simplify_logic(expr, form="dnf")
print("\nSimplified expression:", simple_expr, sep="\n")

Original expression:
(p177 & ~p103) | (p105 & p130 & p76 & p90) | (p119 & p173 & p51 & ~p148) | (p106 & p63 & p65 & p73 & p87) | (p133 & p174 & ~p104 & ~p65) | (p106 & p174 & p62 & p64 & ~p48) | (p106 & p49 & p74 & p87 & ~p147) | (p134 & p161 & p176 & p49 & ~p34) | (p173 & p62 & p63 & p92 & ~p129) | (p161 & p73 & p91 & ~p134 & ~p148) | (p49 & p74 & p89 & ~p102 & ~p75) | (p60 & p75 & p76 & ~p115 & ~p62) | (p73 & p75 & p78 & ~p102 & ~p51) | (p104 & p131 & p162 & p74 & p89 & ~p102) | (p61 & p76 & ~p116 & ~p133 & ~p63) | (p74 & p92 & ~p106 & ~p148 & ~p78) | (p106 & p163 & p48 & p73 & ~p143 & ~p66) | (p106 & p36 & p62 & p89 & ~p64 & ~p79) | (p117 & p144 & p76 & p91 & ~p108 & ~p64) | (p117 & p61 & p78 & p87 & ~p50 & ~p51) | (p49 & p62 & p74 & p92 & ~p63 & ~p65) | (p118 & p146 & p92 & ~p106 & ~p34 & ~p78) | (p49 & p75 & p89 & ~p102 & ~p34 & ~p76) | (p61 & p75 & p91 & ~p106 & ~p120 & ~p62) | (p75 & p90 & p91 & ~p133 & ~p76 & ~p80) | (p78 & p90 & p94 & ~p105 & ~p51 & ~p64) | (p106 & p63 & ~p120

In [None]:
layer_node_counts = [32] * 4
config = dict(layer_node_counts=layer_node_counts, layer_bit_counts=[16]*len(layer_node_counts), seed=42)

net = DeepBinaryClassifier(**config, node_factory=make_ripper_node, jobs=8)
%time net.fit(input_values_train, target_values_train)

pred_values_test = net.predict(input_values_test)
ref_values_test = np.broadcast_to(target_values_test[:, None], pred_values_test.shape)
accuracies_test = (pred_values_test == ref_values_test).mean(axis=0)

best_output_node_idx = int(np.argmax(accuracies_test))
best_output_node_name = net.node_names[-1][best_output_node_idx]

print("Best node:", best_output_node_name, "with accuracy", accuracies_test[best_output_node_idx])