In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from architecture_yfinal.deep_binary_classifier import DeepBinaryClassifier
from architecture_yfinal.ripper_node import make_ripper_node

In [2]:
def load_bool_dataset(csv_path, target_column="class"):
    """Read a CSV file and split it into boolean feature and label arrays.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file. It must contain a ``target_column`` column.
    target_column : str, default "class"
        Name of the label column; every other column is used as a feature.

    Returns
    -------
    tuple
        ``(frame, features, labels)``: the DataFrame as read from disk, the
        feature columns converted with ``to_numpy(bool)``, and the label
        column converted with ``to_numpy(bool)``.
    """
    frame = pd.read_csv(csv_path)
    features = frame.drop(columns=target_column).to_numpy(bool)
    labels = frame[target_column].to_numpy(bool)
    return frame, features, labels


def stratified_split(features, labels, test_size=0.2, random_state=42):
    """Stratified train/test split with a fixed seed.

    Thin wrapper around ``train_test_split`` that stratifies on ``labels`` so
    both datasets are split with exactly the same settings.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )


def print_split_shapes(title, X_train, X_test, y_train, y_test):
    """Print ``title`` followed by the shapes of the four split arrays."""
    print(title)
    print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
    print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")


# NOTE: the `*_X_df` / `*_y_df` names hold NumPy arrays, not DataFrames. The
# names are kept unchanged because other cells may reference them.
small_dataset_df, small_X_df, small_y_df = load_bool_dataset("./data/10_bit_artificial/107.csv")
small_X_train, small_X_test, small_y_train, small_y_test = stratified_split(small_X_df, small_y_df)

large_dataset_df, large_X_df, large_y_df = load_bool_dataset("./data/100_bit_artificial/1a.csv")
large_X_train, large_X_test, large_y_train, large_y_test = stratified_split(large_X_df, large_y_df)

print_split_shapes("Small dataset shapes:", small_X_train, small_X_test, small_y_train, small_y_test)
# The leading newline separates the two reports in the cell output.
print_split_shapes("\nLarge dataset shapes:", large_X_train, large_X_test, large_y_train, large_y_test)

Small dataset shapes:
X_train: (819, 10), X_test: (205, 10)
y_train: (819,), y_test: (205,)

Large dataset shapes:
X_train: (8000, 100), X_test: (2000, 100)
y_train: (8000,), y_test: (2000,)


In [3]:
config = dict(
    layer_node_counts=[8]*3 + [1],
    layer_bit_counts=[4]*4,
    seed=42
)

X_train = small_X_train
y_train = small_y_train
X_test = small_X_test
y_test = small_y_test

net = DeepBinaryClassifier(**config, node_factory=make_ripper_node, jobs=8)
%time net.fit(X_train, y_train)
pred_test = net.predict(X_test)
acc_test = accuracy_score(y_test, pred_test)
print(f"Accuracy on test set: {acc_test:.6f}")

CPU times: user 52.5 ms, sys: 148 ms, total: 200 ms
Wall time: 4.81 s
Accuracy on test set: 0.848780


In [4]:
# Fetch the truth table of the node stored at net.layers[1][1] together with
# its column labels, and wrap it in a DataFrame for rich display.
node_truth_table, col_names = net.layers[1][1].get_truth_table()
node_truth_table_df = pd.DataFrame(data=node_truth_table, columns=col_names)
# A bare last expression makes the notebook render the frame.
node_truth_table_df

Unnamed: 0,L1N1,L1N3,L1N5,L1N6,L2N1 (output)
0,True,True,True,True,True
1,True,True,True,False,True
2,True,True,False,True,True
3,True,True,False,False,True
4,True,False,True,True,True
5,True,False,True,False,False
6,True,False,False,True,False
7,True,False,False,False,False
8,False,True,True,True,False
9,False,True,True,False,False


In [6]:
# Node under inspection: position [1][1] in `net.layers`.
# NOTE(review): this node's output column is labelled "L2N1", so the layer
# label appears to be offset by one from the list index while the node label
# is not — confirm the intended naming/indexing convention.
node = net.layers[1][1]

# TODO: decide whether to call node.reduce_expression() before printing. It is
# currently disabled, and the printed expression contains redundant terms.
node_expr = node.get_expression()
print(node_expr)

node_truth_table, col_names = node.get_truth_table()
node_truth_table_df = pd.DataFrame(data=node_truth_table, columns=col_names)

# TODO: evaluate the (sympy) expression on every input row and assert that it
# reproduces the output column of the truth table. Not implemented yet.

node_truth_table_df

(L1N1 & L1N3) | (L1N1 & L1N5 & L1N6) | (L1N1 & L1N3 & ~L1N5) | (L1N1 & L1N3 & L1N5 & L1N6)


Unnamed: 0,L1N1,L1N3,L1N5,L1N6,L2N1 (output)
0,True,True,True,True,True
1,True,True,True,False,True
2,True,True,False,True,True
3,True,True,False,False,True
4,True,False,True,True,True
5,True,False,True,False,False
6,True,False,False,True,False
7,True,False,False,False,False
8,False,True,True,True,False
9,False,True,True,False,False


In [7]:
# Display the names of the inputs wired into `node`; in the recorded output
# they match the input columns of the node's truth table.
node.input_names

['L1N1', 'L1N3', 'L1N5', 'L1N6']