In [1]:
import numpy as np
from collections import Counter

class Node:
    """A single node of a decision tree.

    A node acts as a leaf when ``label`` is set, and as an internal split on
    ``attribute`` otherwise.

    Attributes:
        attribute: Index of the column this node splits on (None for leaves).
        threshold: Numeric cut point for a two-way split, or None when the
            node branches on the raw attribute value.
        label: Class label returned when prediction stops at this node.
        branches: Mapping from branch key to child node.
    """

    def __init__(self, attribute=None, threshold=None, label=None, branches=None):
        # A missing (or empty) mapping is replaced by a fresh dict so that
        # nodes never share one mutable default.
        child_map = branches if branches else {}

        self.branches = child_map
        self.label = label
        self.threshold = threshold
        self.attribute = attribute

def entropy(data, label_index):
    """Return the Shannon entropy (in bits) of the labels in ``data``.

    Args:
        data: Sequence of rows; each row is indexable by ``label_index``.
        label_index: Position of the class label inside a row.

    Returns:
        The entropy of the label distribution, 0.0 for empty or pure data.
    """
    labels = [row[label_index] for row in data]
    if not labels:
        return 0.0

    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / len(labels)
    # Every count reported by np.unique is >= 1, so each probability is
    # strictly positive and log2 needs no epsilon; adding one only biases the
    # result (a pure set used to come out slightly negative).
    bits = -np.sum(probabilities * np.log2(probabilities))
    # Adding 0.0 turns the -0.0 produced for a pure set into a plain 0.0.
    return bits + 0.0

def information_gain(data, attribute, label_index, criterion='info_gain'):
    """Return the information gain of splitting ``data`` on ``attribute``.

    Rows whose value for ``attribute`` is the string ``'unknown'`` cannot be
    routed to a branch. The gain is therefore measured on the rows with a
    known value and scaled by the fraction of rows that are known, so an
    attribute is not rewarded merely for having many missing values. When no
    value is ``'unknown'`` this is the usual
    ``H(data) - sum(|S_v| / |data| * H(S_v))``.

    The attribute is treated as numeric (two-way split at the median of the
    known values) only when the first row holds an ``int``/``float`` for it.
    String values, including numeric-looking ones such as ``'42'``, take the
    categorical path with one partition per distinct value.

    Args:
        data: Sequence of rows.
        attribute: Column index to evaluate.
        label_index: Column index of the class label.
        criterion: Accepted for interface compatibility; currently unused.

    Returns:
        The gain in bits; 0.0 when ``data`` is empty or every value is
        ``'unknown'``.
    """
    if not data:
        return 0.0

    known_rows = [row for row in data if row[attribute] != 'unknown']
    if not known_rows:
        # Nothing can be split, so the attribute carries no information.
        return 0.0

    if isinstance(data[0][attribute], (int, float)):
        threshold = np.median([float(row[attribute]) for row in known_rows])
        at_or_below = [row for row in known_rows if float(row[attribute]) <= threshold]
        above = [row for row in known_rows if float(row[attribute]) > threshold]
        partitions = [part for part in (at_or_below, above) if part]
    else:
        # Single pass grouping instead of rescanning the data once per value.
        by_value = {}
        for row in known_rows:
            by_value.setdefault(row[attribute], []).append(row)
        partitions = list(by_value.values())

    weighted_entropy = sum(
        len(part) / len(known_rows) * entropy(part, label_index)
        for part in partitions
    )
    known_fraction = len(known_rows) / len(data)
    return known_fraction * (entropy(known_rows, label_index) - weighted_entropy)

def choose_best_attribute(data, attributes, label_index, criterion='info_gain'):
    """Pick the attribute with the highest information gain.

    Args:
        data: Non-empty sequence of rows.
        attributes: Iterable of candidate column indices.
        label_index: Column index of the class label.
        criterion: Forwarded to ``information_gain``.

    Returns:
        ``(best_attribute, best_threshold)``. ``best_threshold`` is the median
        of the known values when the winning attribute holds an
        ``int``/``float`` in the first row, and None otherwise. Both are None
        when ``attributes`` is empty. Ties keep the earliest attribute.
    """
    best_gain = -float('inf')
    best_attribute = None
    best_threshold = None

    for attribute in attributes:
        gain = information_gain(data, attribute, label_index, criterion)
        if gain > best_gain:
            best_gain = gain
            best_attribute = attribute
            if isinstance(data[0][attribute], (int, float)):
                known_values = [
                    float(row[attribute]) for row in data if row[attribute] != 'unknown'
                ]
                best_threshold = np.median(known_values)
            else:
                # Reset explicitly: otherwise the threshold of a previously
                # leading numeric attribute would be returned alongside this
                # categorical one.
                best_threshold = None

    return best_attribute, best_threshold

def id3(data, attributes, label_index, max_depth, criterion='info_gain'):
    """Recursively build a decision tree with the ID3 procedure.

    Args:
        data: Sequence of rows.
        attributes: Column indices that may be used for splitting.
        label_index: Column index of the class label.
        max_depth: Maximum number of splits along any path, or None for no
            limit.
        criterion: Forwarded to ``choose_best_attribute``.

    Returns:
        The root ``Node`` of the (sub)tree. Empty data yields a leaf whose
        label is None; pure data yields a leaf with that label; exhausting
        the attributes or the depth budget yields a majority-label leaf.

    Rows whose value for the chosen attribute is ``'unknown'`` are not passed
    to any child.
    """
    labels = [row[label_index] for row in data]

    if not labels:
        return Node(label=None)

    if len(set(labels)) == 1:
        return Node(label=labels[0])

    # Counter breaks ties by first occurrence, so the result does not depend
    # on set iteration order.
    majority_label = Counter(labels).most_common(1)[0][0]

    if len(attributes) == 0 or (max_depth is not None and max_depth <= 0):
        return Node(label=majority_label)

    best_attribute, threshold = choose_best_attribute(data, attributes, label_index, criterion)
    known_rows = [row for row in data if row[best_attribute] != 'unknown']

    if threshold is not None:
        candidates = {
            f"<={threshold}": [row for row in known_rows if float(row[best_attribute]) <= threshold],
            f">{threshold}": [row for row in known_rows if float(row[best_attribute]) > threshold],
        }
    else:
        candidates = {}
        for row in known_rows:
            candidates.setdefault(row[best_attribute], []).append(row)
    partitions = {key: subset for key, subset in candidates.items() if subset}

    if len(partitions) < 2:
        # The attribute does not separate the rows. Recursing on the single
        # branch would repeat the same call forever when max_depth is None,
        # so drop the attribute and try the remaining ones on the same data.
        # No node is created, hence the depth budget is left untouched.
        remaining = [candidate for candidate in attributes if candidate != best_attribute]
        return id3(data, remaining, label_index, max_depth, criterion)

    child_depth = None if max_depth is None else max_depth - 1
    node = Node(attribute=best_attribute, threshold=threshold)
    for key, subset in partitions.items():
        node.branches[key] = id3(subset, attributes, label_index, child_depth, criterion)
    return node

def predict(node, instance):
    """Classify ``instance`` by walking the tree rooted at ``node``.

    Args:
        node: Tree node exposing ``label``, ``attribute``, ``threshold`` and
            ``branches``.
        instance: Sequence of attribute values, indexed by column.

    Returns:
        The label of the leaf that is reached, or None when no prediction can
        be made: the leaf carries no label, the split column is missing from
        ``instance``, the value is ``'unknown'`` at a threshold split, or no
        branch exists for the value.

    Raises:
        ValueError: If a threshold split meets a value that ``float`` cannot
            parse.
    """
    # Iterative descent; avoids the recursion limit on very deep trees.
    while True:
        if node.label is not None:
            return node.label
        if node.attribute is None:
            # Leaf without a label (built from empty data): there is nothing
            # to split on, and comparing None with an int would raise.
            return None
        if node.attribute >= len(instance):
            return None

        value = instance[node.attribute]
        if node.threshold is not None:
            if value == 'unknown':
                return None
            if float(value) <= node.threshold:
                branch = f"<={node.threshold}"
            else:
                branch = f">{node.threshold}"
        else:
            branch = value

        if branch not in node.branches:
            return None
        node = node.branches[branch]

def bootstrap_sample(data):
    """Resample ``data`` with replacement.

    Returns a new list of ``len(data)`` rows, each picked uniformly at random
    via NumPy's global random state. The rows themselves are not copied.
    """
    size = len(data)
    resampled = []
    for _ in range(size):
        pick = np.random.randint(size)
        resampled.append(data[pick])
    return resampled

def load_data(filename, convert_numeric=False):
    """Read a comma-separated file into a list of rows.

    Args:
        filename: Path of the file, passed to ``np.genfromtxt``.
        convert_numeric: When False (the default) every field is returned as
            a ``str``, exactly as before. When True, each column other than
            the last is converted to ``float`` if every one of its fields
            parses as a finite number; all other columns stay strings. The
            last column is never converted because the rest of this file
            treats it as the class label.

    Returns:
        The parsed table as nested Python lists (``ndarray.tolist()``).
    """
    table = np.genfromtxt(filename, delimiter=',', dtype=str)
    rows = table.tolist()
    # genfromtxt yields a 1-D array for a single-line file; there are no
    # columns to inspect in that shape, so hand it back untouched.
    if not convert_numeric or table.ndim != 2:
        return rows

    def _as_finite_float(text):
        """Return ``text`` as a finite float, or None if it is not one."""
        try:
            number = float(text)
        except ValueError:
            return None
        return number if np.isfinite(number) else None

    for column in range(table.shape[1] - 1):
        parsed = [_as_finite_float(row[column]) for row in rows]
        # Convert a column only when it is numeric throughout, so a column is
        # never left with a mix of floats and strings.
        if all(value is not None for value in parsed):
            for row, value in zip(rows, parsed):
                row[column] = value
    return rows

def bagged_trees(data, n_trees, max_depth):
    """Train ``n_trees`` ID3 trees, each on its own bootstrap sample of ``data``.

    The last column of every row is used as the class label and all preceding
    columns as candidate attributes.

    Args:
        data: Non-empty sequence of rows.
        n_trees: Number of trees to build.
        max_depth: Depth limit handed to ``id3`` (None for unlimited).

    Returns:
        A list with the root node of each tree, in training order.
    """
    n_columns = len(data[0])
    label_index = n_columns - 1
    attributes = list(range(label_index))

    return [
        id3(bootstrap_sample(data), attributes, label_index, max_depth)
        for _ in range(n_trees)
    ]

def bagged_predict(trees, instance):
    """Return the majority vote of ``trees`` for ``instance``.

    Trees that cannot classify the instance (``predict`` returns None) do not
    vote.

    Args:
        trees: Iterable of tree root nodes.
        instance: Sequence of attribute values.

    Returns:
        The most frequent prediction, or None when no tree produced one.
        Ties go to the label that was predicted first, so the outcome does
        not depend on set iteration order.
    """
    votes = Counter(
        prediction
        for prediction in (predict(tree, instance) for tree in trees)
        if prediction is not None
    )
    if not votes:
        return None
    return votes.most_common(1)[0][0]


In [None]:
train_data = load_data('train_bank.csv')
test_data = load_data('test_bank.csv')

# Experiment
# Every random draw below goes through NumPy's global generator; seeding it
# makes a fresh "Restart & Run All" print the same numbers.
SEED = 42
np.random.seed(SEED)

n_repeats = 100   # independent repetitions of the whole procedure
n_samples = 1000  # training rows drawn (without replacement) per repetition
n_trees = 500     # trees per bagged ensemble


def encode_labels(labels):
    """Map the label 'yes' to 1 and every other value (including None) to 0."""
    return [1 if label == 'yes' else 0 for label in labels]


def bias_variance(predictions, targets):
    """Decompose the squared error of repeated 0/1 predictions.

    Args:
        predictions: Array of shape (n_repeats, n_instances).
        targets: Array of shape (n_instances,) with the true 0/1 labels.

    Returns:
        ``(mean_prediction, bias, variance)`` where ``bias`` is the mean
        squared difference between the per-instance mean prediction and the
        target, and ``variance`` is the mean per-instance population variance
        of the predictions.
    """
    mean_prediction = np.mean(predictions, axis=0)
    bias = np.mean((mean_prediction - targets) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    return mean_prediction, bias, variance


single_tree_predictions = []
bagged_tree_predictions = []

for _ in range(n_repeats):
    sampled_indices = np.random.choice(len(train_data), n_samples, replace=False)
    sampled_data = [train_data[i] for i in sampled_indices]

    trees = bagged_trees(sampled_data, n_trees, max_depth=None)

    # The first tree of each ensemble stands in for "a single tree".
    single_tree_predictions.append([predict(trees[0], instance) for instance in test_data])
    bagged_tree_predictions.append([bagged_predict(trees, instance) for instance in test_data])

true_labels = np.array(encode_labels([instance[-1] for instance in test_data]))
single_tree_predictions = np.array([encode_labels(preds) for preds in single_tree_predictions])
bagged_tree_predictions = np.array([encode_labels(preds) for preds in bagged_tree_predictions])

single_tree_mean_pred, single_tree_bias, single_tree_variance = bias_variance(
    single_tree_predictions, true_labels
)
single_tree_error = single_tree_bias + single_tree_variance

bagged_tree_mean_pred, bagged_tree_bias, bagged_tree_variance = bias_variance(
    bagged_tree_predictions, true_labels
)
bagged_tree_error = bagged_tree_bias + bagged_tree_variance

print("Single Tree Results:")
print(f"Bias: {single_tree_bias:.4f}")
print(f"Variance: {single_tree_variance:.4f}")
print(f"General Squared Error: {single_tree_error:.4f}")

print("\nBagged Trees Results:")
print(f"Bias: {bagged_tree_bias:.4f}")
print(f"Variance: {bagged_tree_variance:.4f}")
print(f"General Squared Error: {bagged_tree_error:.4f}")
