In [None]:
#Age of patient at time of operation (numerical)
#Patient's year of operation (year - 1900, numerical)
#Number of positive axillary nodes detected (numerical)
#Survival status (class attribute)
#1 = the patient survived 5 years or longer
#2 = the patient died within 5 year


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Define a simple label encoder function
def encode_column(column):
    if column.dtype == object:
        column = column.astype('category').cat.codes
    return column

# Apply the encoder to each column

# Assuming 'data.csv' is your CSV file
df = pd.read_csv('car_evaluation.csv')
df = df.apply(encode_column)
print(df.head())
# Separate the features and the target variable
#buying,maint,doors,persons,lug_boot,safety low,eval
X = df[['buying','maint','doors','persons','lug_boot','safety']].to_numpy()
y = df['eval']


# Convert the species labels to integers
species_to_int = {species: idx for idx, species in enumerate(np.unique(y))}
y_int = np.array([species_to_int[eval] for eval in y])

# Optionally, convert the integer labels to one-hot encoding
def one_hot_encode(labels, num_classes):
    return np.eye(num_classes)[labels]

num_classes = len(species_to_int)
y_one_hot = one_hot_encode(y_int, num_classes)
print(y_one_hot.shape, num_classes, X.shape)
print(X)


   buying  maint  doors  persons  lug_boot  safety  eval
0       3      3      0        0         2       1     2
1       3      3      0        0         2       2     2
2       3      3      0        0         2       0     2
3       3      3      0        0         1       1     2
4       3      3      0        0         1       2     2
(1728, 4) 4 (1728, 6)
[[3 3 0 0 2 1]
 [3 3 0 0 2 2]
 [3 3 0 0 2 0]
 ...
 [1 1 3 2 0 1]
 [1 1 3 2 0 2]
 [1 1 3 2 0 0]]


In [9]:
import torch
# Convert to PyTorch tensors
x_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y_one_hot, dtype=torch.float32)

In [25]:
num_cut = [1,1,1,1,1,1]  
num_leaf = np.prod(np.array(num_cut) + 1)
num_class = 4
d = X.shape[1]
print(X.shape, y.shape, d, num_cut, num_leaf, num_class)

(1728, 6) (1728,) 6 [1, 1, 1, 1, 1] 32 4


In [27]:
# Initialize variables
cut_points_list = [torch.nn.Parameter(torch.rand(i)) for i in num_cut]
leaf_score = torch.nn.Parameter(torch.rand(num_leaf, num_class))

# Define loss and optimizer
optimizer = torch.optim.Adam([*cut_points_list, leaf_score], lr=0.1)
loss_fn = torch.nn.CrossEntropyLoss()
loss_function = torch.nn.CrossEntropyLoss()

In [28]:
from pytorch_neural import nn_decision_tree
# Training loop
for i in range(1000):
    optimizer.zero_grad()
    y_pred = nn_decision_tree(x_tensor, cut_points_list, leaf_score, temperature=0.1)
    loss = loss_fn(y_pred, y_tensor)
    loss.backward()
    optimizer.step()

    if i % 200 == 0:
        print(loss.item())

1.4477026462554932
0.6314515471458435
0.6301831007003784
0.6259850859642029
0.6257281303405762


In [23]:
# Evaluate
with torch.no_grad():
    y_pred_eval = nn_decision_tree(x_tensor, cut_points_list, leaf_score, temperature=0.1)
    error_rate = 1 - (y_pred_eval.argmax(1) == y_tensor.argmax(1)).float().mean()
    print('error rate %.2f' % error_rate)

error rate 0.23


In [22]:
from sklearn.metrics import accuracy_score

y_pred = nn_decision_tree(x_tensor, cut_points_list, leaf_score, temperature=0.1)
accuracy_score(y_tensor.argmax(1), y_pred.argmax(1))
print(accuracy_score(y_tensor.argmax(1), y_pred.argmax(1)))

0.7690972222222222


In [16]:
sample_x0 = np.repeat(np.linspace(0, np.max(X[:,0]), 100), 100).reshape(-1,1)
sample_x1 = np.tile(np.linspace(0, np.max(X[:,1]), 100).reshape(-1,1), [100,1])
sample_x = np.hstack([sample_x0, sample_x1])
sample_x_tensor = torch.tensor(sample_x, dtype=torch.float32)

In [17]:
import matplotlib.pyplot as plt
with torch.no_grad():
    sample_label = nn_decision_tree(sample_x_tensor, cut_points_list, leaf_score, temperature=0.1).argmax(1)

plt.figure(figsize=(8,8))
plt.scatter(X[:,0], X[:,1], c=np.argmax(y_one_hot, axis=1), marker='o', s=50, cmap='summer', edgecolors='black')
plt.scatter(sample_x0.flatten(), sample_x1.flatten(), c=sample_label.numpy().flatten(), marker='D', s=20, cmap='summer', edgecolors='none', alpha=0.33)
plt.show()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (10000x0 and 1x2)