In [1]:
import pandas as pd


df = pd.read_csv("data/titanic.csv").drop(
    columns=["PassengerId", "Name", "Ticket", "Cabin"]
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [3]:
import numpy as np


X = df.drop(columns=["Survived"]).to_numpy()
y = df["Survived"].astype(np.int8).to_numpy()
feature_types = [
    "categorical",
    "categorical",
    "numeric",
    "categorical",
    "categorical",
    "numeric",
    "categorical",
]

X.shape, y.shape

((891, 7), (891,))

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
from tree_id3 import DecisionTreeID3
from rich.syntax import Syntax
from rich.console import Console


tree = DecisionTreeID3(max_depth=5, criterion="donskoy")
tree.fit(X_train, y_train, feature_types)

console = Console()
tree_str = str(tree)
syntax = Syntax(tree_str, "python", theme="light")
console.print(syntax)

In [6]:
tree.prune(X_test, y_test)
tree_str = str(tree)
syntax = Syntax(tree_str, "python", theme="light")
console.print(syntax)


In [7]:
from evaluate import evaluate_classifier

tree = DecisionTreeID3(max_depth=5, criterion="entropy")

# Before pruning
before_pruning_scores = evaluate_classifier(
    tree, X_train, y_train, X_test, y_test, feature_types
)

# After pruning
tree.prune(X_test, y_test)
after_pruning_scores = evaluate_classifier(
    tree, X_train, y_train, X_test, y_test, feature_types
)

# Create markdown table comparing before and after pruning
print("|Metric|Before Pruning|After Pruning|")
print("|------|--------------|-------------|")
print(f"|Accuracy|{before_pruning_scores.accuracy:.4f}|{after_pruning_scores.accuracy:.4f}|")
print(f"|Precision|{before_pruning_scores.precision:.4f}|{after_pruning_scores.precision:.4f}|") 
print(f"|Recall|{before_pruning_scores.recall:.4f}|{after_pruning_scores.recall:.4f}|")
print(f"|F1|{before_pruning_scores.f1:.4f}|{after_pruning_scores.f1:.4f}|")



|Metric|Before Pruning|After Pruning|
|------|--------------|-------------|
|Accuracy|0.6480|0.6480|
|Precision|0.6533|0.6533|
|Recall|0.6480|0.6480|
|F1|0.6094|0.6094|


In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Create masks for categorical and numeric features
categorical_mask = [ft == "categorical" for ft in feature_types]
numeric_mask = [ft == "numeric" for ft in feature_types]

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_mask),
        ("cat", OneHotEncoder(sparse_output=False), categorical_mask),
    ]
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

custom_tree = DecisionTreeID3(max_depth=5, criterion="entropy")
custom_tree.fit(X_train, y_train, feature_types)
custom_tree.prune(X_test, y_test)

custom_scores = evaluate_classifier(
    custom_tree, X_train, y_train, X_test, y_test, feature_types
)

clf = DecisionTreeClassifier(max_depth=5)
sklearn_scores = evaluate_classifier(
    clf, X_train_processed, y_train, X_test_processed, y_test, feature_types
)

print("|Metric|Custom|sklearn|")
print("|------|--------------|-------------|")
print(f"|Accuracy|{custom_scores.accuracy:.4f}|{sklearn_scores.accuracy:.4f}|")
print(f"|Precision|{custom_scores.precision:.4f}|{sklearn_scores.precision:.4f}|") 
print(f"|Recall|{custom_scores.recall:.4f}|{sklearn_scores.recall:.4f}|")
print(f"|F1|{custom_scores.f1:.4f}|{sklearn_scores.f1:.4f}|")

|Metric|Custom|sklearn|
|------|--------------|-------------|
|Accuracy|0.6480|0.7933|
|Precision|0.6533|0.7977|
|Recall|0.6480|0.7933|
|F1|0.6094|0.7876|
