In [None]:
#Age of patient at time of operation (numerical)
#Patient's year of operation (year - 1900, numerical)
#Number of positive axillary nodes detected (numerical)
#Survival status (class attribute)
#1 = the patient survived 5 years or longer
#2 = the patient died within 5 year


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Define a simple label encoder function
def encode_column(column):
    if column.dtype == object:
        column = column.astype('category').cat.codes
    return column

# Apply the encoder to each column

# Assuming 'data.csv' is your CSV file
df = pd.read_csv('titanic.csv')
df = df.apply(encode_column)
print(df.head())
# Separate the features and the target variable

#PassengserId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
X = df[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]].to_numpy()
#X = df[['buying','maint','doors','persons','lug_boot','safety']].to_numpy()
y = df['Survived'].to_numpy()


# Convert the species labels to integers
species_to_int = {species: idx for idx, species in enumerate(np.unique(y))}
y_int = np.array([species_to_int[eval] for eval in y])

# Optionally, convert the integer labels to one-hot encoding
def one_hot_encode(labels, num_classes):
    return np.eye(num_classes)[labels]

num_classes = len(species_to_int)
y_one_hot = one_hot_encode(y_int, num_classes)
print(y_one_hot.shape, num_classes, X.shape)
print(X)


   PassengerId  Survived  Pclass  Name  Sex   Age  SibSp  Parch  Ticket  \
0          892         0       3   206    1  34.5      0      0     152   
1          893         1       3   403    0  47.0      1      0     221   
2          894         0       2   269    1  62.0      0      0      73   
3          895         0       3   408    1  27.0      0      0     147   
4          896         1       3   178    0  22.0      1      1     138   

      Fare  Cabin  Embarked  
0   7.8292     -1         1  
1   7.0000     -1         2  
2   9.6875     -1         1  
3   8.6625     -1         2  
4  12.2875     -1         2  
(418, 2) 2 (418, 7)
[[ 3.      1.     34.5    ...  0.      7.8292  1.    ]
 [ 3.      0.     47.     ...  0.      7.      2.    ]
 [ 2.      1.     62.     ...  0.      9.6875  1.    ]
 ...
 [ 3.      1.     38.5    ...  0.      7.25    2.    ]
 [ 3.      1.         nan ...  0.      8.05    2.    ]
 [ 3.      1.         nan ...  1.     22.3583  0.    ]]


In [3]:
import torch
# Convert to PyTorch tensors
x_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y_one_hot, dtype=torch.float32)

In [12]:
num_cut = [1,1,1,1,1,1]  
num_leaf = np.prod(np.array(num_cut) + 1)
num_class = 2
d = X.shape[1]
print(X.shape, y.shape, d, num_cut, num_leaf, num_class)

(418, 7) (418,) 7 [1, 1, 1, 1, 1, 1] 64 2


In [13]:
# Initialize variables
cut_points_list = [torch.nn.Parameter(torch.rand(i)) for i in num_cut]
leaf_score = torch.nn.Parameter(torch.rand(num_leaf, num_class))

# Define loss and optimizer
optimizer = torch.optim.Adam([*cut_points_list, leaf_score], lr=0.1)
loss_fn = torch.nn.CrossEntropyLoss()
loss_function = torch.nn.CrossEntropyLoss()

In [14]:
from pytorch_neural import nn_decision_tree
# Training loop
for i in range(1000):
    optimizer.zero_grad()
    y_pred = nn_decision_tree(x_tensor, cut_points_list, leaf_score, temperature=0.1)
    loss = loss_fn(y_pred, y_tensor)
    loss.backward()
    optimizer.step()

    if i % 200 == 0:
        print(loss.item())

nan
nan
nan
nan
nan


In [15]:
# Evaluate
with torch.no_grad():
    y_pred_eval = nn_decision_tree(x_tensor, cut_points_list, leaf_score, temperature=0.1)
    error_rate = 1 - (y_pred_eval.argmax(1) == y_tensor.argmax(1)).float().mean()
    print('error rate %.2f' % error_rate)

error rate 0.36


In [17]:
from sklearn.metrics import accuracy_score

y_pred = nn_decision_tree(x_tensor, cut_points_list, leaf_score, temperature=0.1)
accuracy_score(y_tensor.argmax(1), y_pred.argmax(1))
print(accuracy_score(y_tensor.argmax(1), y_pred.argmax(1)))

0.6363636363636364


In [10]:
sample_x0 = np.repeat(np.linspace(0, np.max(X[:,0]), 100), 100).reshape(-1,1)
sample_x1 = np.tile(np.linspace(0, np.max(X[:,1]), 100).reshape(-1,1), [100,1])
sample_x = np.hstack([sample_x0, sample_x1])
sample_x_tensor = torch.tensor(sample_x, dtype=torch.float32)

In [11]:
import matplotlib.pyplot as plt
with torch.no_grad():
    sample_label = nn_decision_tree(sample_x_tensor, cut_points_list, leaf_score, temperature=0.1).argmax(1)

plt.figure(figsize=(8,8))
plt.scatter(X[:,0], X[:,1], c=np.argmax(y_one_hot, axis=1), marker='o', s=50, cmap='summer', edgecolors='black')
plt.scatter(sample_x0.flatten(), sample_x1.flatten(), c=sample_label.numpy().flatten(), marker='D', s=20, cmap='summer', edgecolors='none', alpha=0.33)
plt.show()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (10000x0 and 1x2)