In [None]:
import numpy as np
import matplotlib.pyplot as plt

class DecisionTree:
    """A single node of a binary decision tree.

    The same class represents both internal nodes and leaves; which role a
    node plays is determined by the attributes that are filled in after
    construction. A freshly constructed node has every attribute set to None.
    """

    def __init__(self):
        # Split rule: column index tested at this node and the cut-off value.
        self.feature_index = self.threshold = None
        # Child nodes reached when the split test passes / fails.
        self.left = self.right = None
        # Label returned when this node acts as a leaf.
        self.prediction = None

def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in *y*.

    Args:
        y: Array-like of class labels.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the distinct labels in *y*.
        A set containing a single distinct label yields exactly 0, and an
        empty input is defined to have entropy 0.
    """
    _, counts = np.unique(y, return_counts=True)
    total = counts.sum()
    if total == 0:
        # No labels at all: nothing to be uncertain about.
        return np.float64(0.0)

    probabilities = counts / total
    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 needs no smoothing term. Adding one would
    # bias the result (a pure set would come out slightly negative).
    # The trailing "+ 0.0" turns an IEEE negative zero into a plain 0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

def information_gain(y, y_left, y_right):
    """Return the entropy reduction obtained by splitting *y* in two.

    Args:
        y: Labels of the parent node.
        y_left: Labels routed to the left child.
        y_right: Labels routed to the right child.

    Returns:
        Parent entropy minus the size-weighted sum of the child entropies.
    """
    parent_size = len(y)
    left_term = len(y_left) / parent_size * entropy(y_left)
    right_term = len(y_right) / parent_size * entropy(y_right)
    return entropy(y) - (left_term + right_term)

def build_tree(X, y, depth=0, max_depth=None):
    """Recursively grow a binary classification tree using information gain.

    Every candidate split ``X[:, feature] <= threshold`` (one per distinct
    value of every feature) is scored with ``information_gain`` and the
    best strictly-positive one is applied; both halves are then grown
    recursively.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        y: 1-D array of non-negative integer class labels (required by
            ``np.bincount``), aligned with the rows of X.
        depth: Depth of the node being built; callers normally leave this
            at its default.
        max_depth: Maximum depth of the tree, or None for no limit.

    Returns:
        The root ``DecisionTree`` node. Every terminal node carries the
        majority label of its samples in ``prediction``. The only exception
        is an empty training set, for which no label can be derived and
        ``prediction`` stays None.
    """
    n_samples, n_features = X.shape

    def make_leaf():
        # Every terminal node must carry a label; otherwise predict_tree
        # falls through to the split test on a node that has no split.
        leaf = DecisionTree()
        if n_samples > 0:
            leaf.prediction = np.argmax(np.bincount(y))
        return leaf

    # ">=" rather than "==" so a caller-supplied depth beyond the cap still stops.
    depth_exhausted = max_depth is not None and depth >= max_depth
    if depth_exhausted or n_samples < 2 or len(np.unique(y)) == 1:
        return make_leaf()

    best_gain = 0
    best_feature = None
    best_threshold = None

    for feature in range(n_features):
        column = X[:, feature]
        for threshold in np.unique(column):
            left_mask = column <= threshold
            # A split that leaves one side empty separates nothing.
            if not left_mask.any() or left_mask.all():
                continue
            gain = information_gain(y, y[left_mask], y[~left_mask])
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold

    # best_feature is only set by a split with strictly positive gain.
    if best_feature is None:
        return make_leaf()

    tree = DecisionTree()
    tree.feature_index = best_feature
    tree.threshold = best_threshold
    left_mask = X[:, best_feature] <= best_threshold
    tree.left = build_tree(X[left_mask], y[left_mask], depth + 1, max_depth)
    tree.right = build_tree(X[~left_mask], y[~left_mask], depth + 1, max_depth)
    return tree

def predict_tree(tree, x):
    """Return the label the tree rooted at *tree* assigns to sample *x*.

    Args:
        tree: Root node; each node exposes ``prediction``, ``feature_index``,
            ``threshold``, ``left`` and ``right``.
        x: Indexable feature vector for a single sample.

    Returns:
        The ``prediction`` of the first node along the path whose
        ``prediction`` is not None.
    """
    node = tree
    # Walk down until a node that carries a label is reached.
    while node.prediction is None:
        goes_left = x[node.feature_index] <= node.threshold
        node = node.left if goes_left else node.right
    return node.prediction

def bootstrap_sample(X, y):
    """Draw a bootstrap resample of the rows of X together with their labels.

    Row indices are drawn uniformly with replacement from the global NumPy
    random state, as many as there are rows in X.

    Returns:
        A tuple ``(X_resampled, y_resampled)`` indexed by the same draws.
    """
    row_count = X.shape[0]
    picks = np.random.choice(row_count, size=row_count, replace=True)
    resampled_X = X[picks]
    resampled_y = y[picks]
    return resampled_X, resampled_y

def bagged_trees(X, y, n_trees, max_depth=None):
    """Train an ensemble of trees, each on its own bootstrap resample.

    Args:
        X: Feature matrix passed to ``bootstrap_sample``.
        y: Labels aligned with the rows of X.
        n_trees: Number of trees to train.
        max_depth: Forwarded to ``build_tree`` for every tree.

    Returns:
        A list of ``n_trees`` tree roots, in training order.
    """
    # One fresh resample per tree; the pair returned by bootstrap_sample is
    # unpacked straight into build_tree's (X, y) positions.
    return [
        build_tree(*bootstrap_sample(X, y), max_depth=max_depth)
        for _ in range(n_trees)
    ]

def bagged_predict(trees, X):
    """Predict a label for every row of X by majority vote over *trees*.

    Args:
        trees: Sequence of tree roots accepted by ``predict_tree``.
        X: Sequence of samples; each one is passed to every tree.

    Returns:
        Array with one label per sample: the most frequent vote, with ties
        resolved towards the smallest label (first maximum of the bincount).
    """
    n_trees, n_rows = len(trees), len(X)

    # Collect votes tree by tree into one flat list, then view it as a
    # (tree, sample) grid.
    flat_votes = []
    for member in trees:
        flat_votes.extend(predict_tree(member, row) for row in X)
    vote_grid = np.array(flat_votes).reshape(n_trees, n_rows)

    # Column j holds every tree's vote for sample j.
    winners = [np.bincount(vote_grid[:, j]).argmax() for j in range(n_rows)]
    return np.array(winners)

def load_data(filename):
    """Load a comma-separated dataset and encode it numerically.

    The last column is treated as the label and all preceding columns as
    features.

    Args:
        filename: Path of the CSV file, read with ``np.genfromtxt``.

    Returns:
        A tuple ``(X_encoded, y)`` where ``X_encoded`` is a float array with
        one column per feature and ``y`` is an int array that is 1 where the
        label equals ``'yes'`` and 0 otherwise.

    Note:
        Columns that cannot be parsed as floats are replaced by the index of
        each value in that column's sorted distinct values. The codes are
        derived from this file alone, so two files receive matching codes
        only when the column holds the same set of values in both.
    """
    data = np.genfromtxt(filename, delimiter=',', dtype=str)

    raw_features = data[:, :-1]
    raw_labels = data[:, -1]

    X_encoded = np.zeros(raw_features.shape, dtype=float)
    for i, column in enumerate(raw_features.T):
        try:
            X_encoded[:, i] = column.astype(float)
        except ValueError:
            # Categorical column: return_inverse gives, for every entry, its
            # position in the sorted unique values in a single pass.
            _, codes = np.unique(column, return_inverse=True)
            X_encoded[:, i] = codes.ravel()

    y = (raw_labels == 'yes').astype(int)

    return X_encoded, y


In [None]:
# Load the train and test splits. load_data encodes every feature column as
# floats and maps the label column to 1 for 'yes' and 0 otherwise.
X_train, y_train = load_data('train_bank.csv')
X_test, y_test = load_data('test_bank.csv')

# Ensemble sizes to evaluate; one train/test error is recorded per size.
T_values = [1, 5, 10, 20, 50, 100, 200, 500]
train_errors = []
test_errors = []

for T in T_values:
    # A fresh ensemble of T unpruned trees (no max_depth) is trained for each
    # size; nothing is reused between iterations.
    trees = bagged_trees(X_train, y_train, T)
    
    train_preds = bagged_predict(trees, X_train)
    test_preds = bagged_predict(trees, X_test)
    
    # Error rate = fraction of samples whose majority-vote label is wrong.
    train_error = np.mean(train_preds != y_train)
    test_error = np.mean(test_preds != y_test)
    
    train_errors.append(train_error)
    test_errors.append(test_error)
    
    print(f"Number of trees: {T}")
    print(f"Train error: {train_error:.4f}")
    print(f"Test error: {test_error:.4f}")
    print()

# Plot both error curves against ensemble size. The x axis is logarithmic
# because T_values spans 1 to 500.
plt.figure(figsize=(10, 6))
plt.plot(T_values, train_errors, label='Training Error')
plt.plot(T_values, test_errors, label='Test Error')
plt.xscale('log')
plt.xlabel('Number of Trees')
plt.ylabel('Error Rate')
plt.title('Bagged Trees Performance')
plt.legend()
plt.grid(True)
plt.show()