In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rice_ml.supervised_learning import BaggingTreeClassifier
from rice_ml.supervised_learning import DecisionTreeClassifier

# Read the preprocessed lesion table (path is relative to the notebook's directory).
df = pd.read_csv("../data/lesions_processed.csv")

# Target column, cast to int.
y = df["tampered"].values.astype(int)

# Feature matrix; the order of this list fixes the column order of X.
feature_cols = ["x_norm", "y_norm", "slice_norm", "r_xy", "experiment"]
X = df[feature_cols].values


In [4]:
# Hold-out split: shuffle the row indices with a fixed seed, then cut at 75%.
rng = np.random.default_rng(42)
idx = np.arange(len(y))
rng.shuffle(idx)  # permutes idx in place

# Number of training rows; int() truncates, so any remainder goes to the test set.
split = int(0.75 * len(y))
tr, te = np.split(idx, [split])  # same as idx[:split], idx[split:]

Xtr, ytr = X[tr], y[tr]
Xte, yte = X[te], y[te]


In [5]:
# Baseline model: a single decision tree limited to depth 4, fitted on the training split.
tree = DecisionTreeClassifier(random_state=42, max_depth=4)
tree.fit(Xtr, ytr)

# Hold-out accuracy: fraction of test rows whose predicted label matches the true label.
pred_tree = tree.predict(Xte)
acc_tree = (pred_tree == yte).mean()
acc_tree


np.float64(0.7441860465116279)

In [6]:
# Bagged ensemble: 25 trees of depth 4, same seed and depth as the single-tree baseline.
# NOTE(review): max_features=0.8 was described as "random-forest style feature subsampling";
# presumably it is a fraction of the feature columns, and whether the subset is drawn per
# tree or per split depends on BaggingTreeClassifier -- confirm against its documentation.
ens = BaggingTreeClassifier(n_estimators=25, max_depth=4, max_features=0.8, random_state=42)
ens.fit(Xtr, ytr)

# Hold-out accuracy on the same test rows used for the single tree.
pred_ens = ens.predict(Xte)
acc_ens = (pred_ens == yte).mean()
acc_ens


np.float64(0.7441860465116279)

We train many decision trees on bootstrapped samples of the training data and aggregate their predictions, either by majority vote over the predicted labels (hard voting) or by averaging the predicted class probabilities (soft voting). Averaging over many trees reduces variance and often improves stability compared to a single tree.

In [7]:
# Seed sweep: refit both models with several random_state values and record the
# hold-out accuracy of each. The train/test split (Xtr/Xte) stays fixed throughout.
# Note: the loop rebinds tree, ens, pred_tree and pred_ens, so after this cell they
# refer to the fits from the last seed, not the seed-42 models trained above.
seeds = list(range(5))
tree_accs, ens_accs = [], []

for seed in seeds:
    # Single depth-4 tree for this seed.
    tree = DecisionTreeClassifier(max_depth=4, random_state=seed)
    tree.fit(Xtr, ytr)
    pred_tree = tree.predict(Xte)
    tree_accs.append(float((pred_tree == yte).mean()))

    # Bagged ensemble with the same seed and the same settings as before.
    ens = BaggingTreeClassifier(
        n_estimators=25,
        max_depth=4,
        max_features=0.8,
        random_state=seed,
    )
    ens.fit(Xtr, ytr)
    pred_ens = ens.predict(Xte)
    ens_accs.append(float((pred_ens == yte).mean()))

# Per-seed accuracies, aligned with `seeds`: (single tree, ensemble).
tree_accs, ens_accs


([0.7441860465116279,
  0.7441860465116279,
  0.7441860465116279,
  0.7441860465116279,
  0.7441860465116279],
 [0.7209302325581395,
  0.813953488372093,
  0.7906976744186046,
  0.7674418604651163,
  0.7674418604651163])

In [8]:
# Mean and standard deviation of accuracy across seeds, in the order
# (tree mean, tree std, ensemble mean, ensemble std). np.std defaults to ddof=0.
tuple(stat(accs) for accs in (tree_accs, ens_accs) for stat in (np.mean, np.std))


(np.float64(0.7441860465116279),
 np.float64(0.0),
 np.float64(0.772093023255814),
 np.float64(0.030852323631213038))

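Bagging is expected to reduce variance; we compare accuracy across multiple random seeds to observe stability. Note that the train/test split is fixed here, so the seed only changes each model's internal randomness. The single tree scores the same accuracy for every seed (mean 0.744, std 0.0), i.e. it is effectively deterministic on this split, while the ensemble varies with its bootstrap and feature sampling (mean ≈ 0.772, std ≈ 0.031, range 0.721–0.814). This sweep therefore measures sensitivity to the seed, not variance over training sets; to observe the variance reduction bagging is meant to provide, the training data itself would need to be resampled (e.g. repeated splits or cross-validation).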