In [11]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from architecture_yfinal.deep_binary_classifier import DeepBinaryClassifier
from architecture_yfinal.ripper_node_mine import make_ripper_node

In [12]:
def load_boolean_dataset(csv_path, label_column="class"):
    """Read a CSV file and return it together with boolean feature/label arrays.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    label_column : str, default "class"
        Column holding the target label; every other column is used as a feature.

    Returns
    -------
    tuple
        ``(dataset_df, X, y)`` where ``dataset_df`` is the DataFrame as read,
        ``X`` is the feature columns converted to a boolean NumPy array and
        ``y`` is the label column converted to a boolean NumPy array.
    """
    dataset_df = pd.read_csv(csv_path)
    X = dataset_df.drop(columns=label_column).to_numpy(bool)
    y = dataset_df[label_column].to_numpy(bool)
    return dataset_df, X, y


def stratified_split(X, y, test_size=0.2, random_state=42):
    """Split ``X``/``y`` into train and test parts, stratified on ``y``.

    Returns ``(X_train, X_test, y_train, y_test)`` exactly as produced by
    ``train_test_split``. The defaults reproduce the 80/20 split with the fixed
    seed that both datasets in this notebook use.
    """
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )


# NOTE: the `*_X_df` / `*_y_df` names hold NumPy arrays, not DataFrames; the names
# are kept unchanged so that any other cell referring to them keeps working.
small_dataset_df, small_X_df, small_y_df = load_boolean_dataset(
    "./data/10_bit_artificial/107.csv"
)
small_X_train, small_X_test, small_y_train, small_y_test = stratified_split(
    small_X_df, small_y_df
)

large_dataset_df, large_X_df, large_y_df = load_boolean_dataset(
    "./data/100_bit_artificial/1a.csv"
)
large_X_train, large_X_test, large_y_train, large_y_test = stratified_split(
    large_X_df, large_y_df
)

print("Small dataset shapes:")
print(f"X_train: {small_X_train.shape}, X_test: {small_X_test.shape}")
print(f"y_train: {small_y_train.shape}, y_test: {small_y_test.shape}")

print("\nLarge dataset shapes:")
print(f"X_train: {large_X_train.shape}, X_test: {large_X_test.shape}")
print(f"y_train: {large_y_train.shape}, y_test: {large_y_test.shape}")

Small dataset shapes:
X_train: (819, 10), X_test: (205, 10)
y_train: (819,), y_test: (205,)

Large dataset shapes:
X_train: (8000, 100), X_test: (2000, 100)
y_train: (8000,), y_test: (2000,)


In [13]:
config = dict(
    layer_node_counts=[8]*3 + [1],
    layer_bit_counts=[4]*4,
    seed=42
)

X_train = small_X_train
y_train = small_y_train
X_test = small_X_test
y_test = small_y_test

net = DeepBinaryClassifier(**config, node_factory=make_ripper_node, jobs=8)
%time net.fit(X_train, y_train)
pred_test = net.predict(X_test)
acc_test = accuracy_score(y_test, pred_test)
print(f"Accuracy on test set: {acc_test:.6f}")

TypeError: Can't instantiate abstract class RipperNode with abstract method get_metadata

ValueError: Found input variables with inconsistent numbers of samples: [205, 2050]

In [4]:
# Pick one node of the fitted network for inspection.
# NOTE(review): the original author flagged the layer indexing as suspicious. In the
# recorded output `net.layers[1][1]` is the node named "L2N1": the layer number in the
# name is one higher than the list index while the node number is not.
# TODO confirm the intended indexing/naming convention.
node = net.layers[1][1]

# Must run before get_expr() below; presumably simplifies the node's stored
# expression in place — confirm against the node implementation.
node.reduce_expr()

node_expr = node.get_expr()
print(node_expr)

# Truth table of the node plus the matching column names, wrapped in a DataFrame
# so it can be displayed and extended with extra columns.
node_truth_table, col_names = node.get_truth_table()
node_truth_table_df = pd.DataFrame(node_truth_table, columns=col_names)

# The expression is evaluated against this truth table in a later cell to verify that they agree.


node_truth_table_df

(L1N6 & ~L1N5) | (L1N1 & L1N3 & L1N5)


Unnamed: 0,L1N3,L1N6,L1N5,L1N1,L2N1 (output)
0,False,False,False,False,False
1,True,False,False,False,False
2,False,True,False,False,True
3,True,True,False,False,True
4,False,False,True,False,False
5,True,False,True,False,False
6,False,True,True,False,False
7,True,True,True,False,False
8,False,False,False,True,False
9,True,False,False,True,False


In [5]:
# Invoke the node's `test_node` method. Its behavior is defined in the project module
# and is not visible here; the recorded output was "Hitting new one".
# TODO: document what this call checks.
node.test_node()

Hitting new one


In [6]:
from sympy import lambdify, symbols

print(node_expr)

# Turn the node's symbolic expression into a NumPy-vectorized callable whose
# positional arguments follow the order of node.input_names.
syms = list(map(symbols, node.input_names))
f = lambdify(syms, node_expr, modules="numpy")

# Feed each input column of the truth table to the callable and store the result
# next to the recorded output.
input_columns = (node_truth_table_df[col].values for col in node.input_names)
node_truth_table_df["expr_eval"] = f(*input_columns)

# Row-wise flag: does the stored output equal the value computed from the expression?
output_column = f"{node.name} (output)"
node_truth_table_df["matches"] = node_truth_table_df[output_column].eq(
    node_truth_table_df["expr_eval"]
)

node_truth_table_df

(L1N6 & ~L1N5) | (L1N1 & L1N3 & L1N5)


Unnamed: 0,L1N3,L1N6,L1N5,L1N1,L2N1 (output),expr_eval,matches
0,False,False,False,False,False,False,True
1,True,False,False,False,False,False,True
2,False,True,False,False,True,True,True
3,True,True,False,False,True,True,True
4,False,False,True,False,False,False,True
5,True,False,True,False,False,False,True
6,False,True,True,False,False,False,True
7,True,True,True,False,False,False,True
8,False,False,False,True,False,False,True
9,True,False,False,True,False,False,True


In [8]:
from sympy import lambdify, symbols
import numpy as np

# Fetch the node's expression again and compile it into a NumPy-vectorized
# function of the node inputs (argument order follows node.input_names).
expr = node.get_expr()
syms = list(map(symbols, node.input_names))
f = lambdify(syms, expr, modules="numpy")

# Evaluate the compiled expression on the truth table's input columns.
recomputed = f(*(node_truth_table_df[col].values for col in node.input_names))

# A single boolean: True only when every row's stored output equals the recomputed value.
stored_output = node_truth_table_df[f"{node.name} (output)"].values
matches = np.all(recomputed == stored_output)

print("Do outputs match expression? ->", matches)


Do outputs match expression? -> True


In [8]:
# Show the inspected node's input names (the same names used above as the
# truth-table input columns).
feature_names = node.input_names
print(feature_names)

['L1N3', 'L1N6', 'L1N5', 'L1N1']
