# Preliminaries

In [16]:
# import modules
from typing import Tuple, List

import numpy as np

In [17]:
# base classes

class Node:
    pass


class Tree:
    def __init__(self):
        self.root = Node()

    def find_leaf(self, x) -> Node:
        node = self.root
        while hasattr(node, "feature"):
            j = node.feature
            if x[j] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node

# Density Tree

In [18]:
class DensityTree(Tree):
    def __init__(self):
        super(DensityTree, self).__init__()

    def train(self, data, prior, n_min=20):
        '''
        data: the feature matrix for the digit under consideration
        prior: the prior probability of this digit
        n_min: termination criterion (don't split if a node contains fewer instances)
        '''
        self.prior = prior
        N, D = data.shape
        D_try = int(np.sqrt(D))  # number of features to consider for each split decision

        # find and remember the tree's bounding box, 
        # i.e. the lower and upper limits of the training feature set
        m, M = calc_bbox(data)
        self.box = m, M

        # identify invalid features and adjust the bounding box
        # (If m[j] == M[j] for some j, the bounding box has zero volume, 
        #  causing divide-by-zero errors later on. We must exclude these
        #  features from splitting and adjust the bounding box limits 
        #  such that invalid features have no effect on the volume.)
        valid_features = np.where(m != M)[0]
        invalid_features = np.where(m == M)[0]
        M[invalid_features] = m[invalid_features] + 1

        # initialize the root node
        self.root.data = data
        self.root.box = m.copy(), M.copy()

        # build the tree
        stack = [self.root]
        while len(stack):
            node = stack.pop()
            n = node.data.shape[0]  # number of instances in present node
            if n >= n_min:
                # Call 'make_density_split_node()' with 'D_try' randomly selected 
                # indices from 'valid_features'. This turns 'node' into a split node
                # and returns the two children, which must be placed on the 'stack'.
                indices = np.random.choice(np.arange(0, len(valid_features)), D_try)
                left, right = make_density_split_node(node, N, indices)
                stack.append(left)
                stack.append(right)
            else:
                # Call 'make_density_leaf_node()' to turn 'node' into a leaf node.
                make_density_leaf_node(node, N)

    def predict(self, x):
        leaf = self.find_leaf(x)
        # return p(x | y) * p(y) if x is within the tree's bounding box 
        # and return 0 otherwise
        return leaf.response

In [None]:
def calc_loo_error(N_m: int, N: int, V_m: float) -> float:
    return -(2 * N_m * (N_m - 1)) / (N * (N - 1) * V_m) + (N_m ** 2) / ((N ** 2) * V_m)


def calc_volume(bounding_box: Tuple[np.ndarray, np.ndarray]):
    m, M = bounding_box
    diff = M - m

    return np.prod(diff)


def make_density_split_node(node, N, feature_indices):
    '''
    node: the node to be split
    N:    the total number of training instances for the current class
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split
    '''
    n, D = node.data.shape
    m, M = node.box

    # find best feature j (among 'feature_indices') and best threshold t for the split
    e_min = float("inf")
    j_min, t_min = None, None

    for j in feature_indices:
        # Hint: For each feature considered, first remove duplicate feature values using 
        # 'np.unique()'. Describe here why this is necessary.
        # It's necessary because otherwise if two instances have the same feature value the mean between them is the feature value self so the threshold would not be in the mid between feature values anymore
        data_unique = np.unique(node.data[:, j])
        # Compute candidate thresholds
        tj = 0.5 * (data_unique[1:] + data_unique[:-1])

        # Illustration: for loop - hint: vectorized version is possible
        for t in tj:
            # Compute the error
            left = node.data[np.where(node.data[:, j] <= t)]
            right = node.data[np.where(node.data[:, j] > t)]

            loo_err_m = calc_loo_error(len(node.data), N, calc_volume(node.box))
            loo_err_l = calc_loo_error(len(left), N, calc_volume(calc_bbox(left)))
            loo_err_r = calc_loo_error(len(right), N, calc_volume(calc_bbox(right)))

            loo_error = (loo_err_l + loo_err_r) - loo_err_m

            # choose the best threshold that
            if loo_error < e_min:
                e_min = loo_error
                j_min = j
                t_min = t

    # create children
    left = Node()
    right = Node()

    # initialize 'left' and 'right' with the data subsets and bounding boxes
    # according to the optimal split found above
    left.data = node.data[np.where(node.data[:, j_min] <= t_min)]  # store data in left node -- for subsequent splits
    left.box = calc_bbox(left.data)  # store bounding box in left node
    right.data = node.data[np.where(node.data[:, j_min] > t_min)]
    right.box = calc_bbox(right.data)

    # turn the current 'node' into a split node
    # (store children and split condition)
    node.left = left
    node.right = right
    node.feature = j_min
    node.threshold = t_min

    # return the children (to be placed on the stack)
    return left, right

In [20]:
def make_density_leaf_node(node, N):
    '''
    node: the node to become a leaf
    N:    the total number of training instances for the current class
    '''
    # compute and store leaf response
    n = node.data.shape[0]
    v = calc_volume(node.box)
    node.response = n / (N * v)

# Decision Tree

In [21]:
class DecisionTree(Tree):
    def __init__(self):
        super(DecisionTree, self).__init__()

    def train(self, data, labels, n_min=20):
        '''
        data: the feature matrix for all digits
        labels: the corresponding ground-truth responses
        n_min: termination criterion (don't split if a node contains fewer instances)
        '''
        N, D = data.shape
        D_try = int(np.sqrt(D))  # how many features to consider for each split decision

        # initialize the root node
        self.root.data = data
        self.root.labels = labels

        stack = [self.root]
        while len(stack):
            node = stack.pop()
            n = node.data.shape[0]  # number of instances in present node
            if n >= n_min and not node_is_pure(node):
                # Call 'make_decision_split_node()' with 'D_try' randomly selected 
                # feature indices. This turns 'node' into a split node
                # and returns the two children, which must be placed on the 'stack'.
                ...  # your code here
            else:
                # Call 'make_decision_leaf_node()' to turn 'node' into a leaf node.
                ...  # your code here

    def predict(self, x):
        leaf = self.find_leaf(x)
        # compute p(y | x)
        return ...  # your code here

In [22]:
def make_decision_split_node(node, feature_indices):
    '''
    node: the node to be split
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split
    '''
    n, D = node.data.shape

    # find best feature j (among 'feature_indices') and best threshold t for the split
    ...  # your code here

    # create children
    left = Node()
    right = Node()

    # initialize 'left' and 'right' with the data subsets and labels
    # according to the optimal split found above
    left.data = ...  # data in left node
    left.labels = ...  # corresponding labels
    right.data = ...
    right.labels = ...

    # turn the current 'node' into a split node
    # (store children and split condition)
    node.left = left
    node.right = right
    node.feature = ...
    node.threshold = ...

    # return the children (to be placed on the stack)
    return left, right

In [23]:
def make_decision_leaf_node(node):
    '''
    node: the node to become a leaf
    '''
    # compute and store leaf response
    node.N = ...
    node.response = ...  # your code here

In [24]:
def node_is_pure(node):
    '''
    check if 'node' ontains only instances of the same digit
    '''
    return ...  # your code here

# Evaluation of Density and Decision Tree

In [25]:
# read and prepare the digits data
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits


def calc_bbox(data) -> (np.ndarray, np.ndarray):
    return np.min(data, axis=0).copy(), np.max(data, axis=0).copy()


digits = load_digits()

data = digits["data"]
images = digits["images"]
target = digits["target"]
target_names = digits["target_names"]

# Removing features where min value == max value == 0, because this feature does not contain any information (it's the same for all instances)
smallest, biggest = calc_bbox(data)
distances = biggest - smallest
print(distances.shape)

dims_with_information = np.where(distances > 0)[0]
print(dims_with_information)

reduced_data = data[:, dims_with_information]
print(reduced_data.shape)

# Normalizing to values between 0 and 2
normalized_data = reduced_data / 16
print(calc_bbox(normalized_data))

(64,)
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 33 34 35 36 37 38 40 41 42 43 44 45 46 47 48 49 50
 51 52 53 54 55 56 57 58 59 60 61 62 63]
(1797, 61)
(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0.5   , 1.    , 1.    , 1.    , 1.    , 1.    , 0.9375, 0.125 ,
       1.    , 1.    , 1.    , 1.    , 1.    , 1.    , 0.75  , 0.125 ,
       1.    , 1.    , 1.    , 1.    , 1.    , 1.    , 0.5   , 0.0625,
       0.9375, 1.    , 1.    , 1.    , 1.    , 0.9375, 0.0625, 0.875 ,
       1.    , 1.    , 1.    , 1.    , 0.875 , 0.25  , 1.    , 1.    ,
       1.    , 1.    , 1.    , 1.    , 0.375 , 0.5   , 1.    , 1.    ,
       1.    , 1.    , 1.    , 1.    , 0.8125, 0.0625, 0.5625, 1.    ,
       1.    , 1.    , 1

In [26]:
# train trees, plot training error confusion matrices, and comment on your results
density_trees: List[DensityTree] = []
for number in range(10):
    indices = np.where(target == number)
    filtered_data = normalized_data[indices]
    prior = len(filtered_data) / len(data)

    density_tree = DensityTree()
    density_tree.train(filtered_data, prior)

calculated_target = np.zeros(len(target))

for i, instance in enumerate(normalized_data):
    p_max = -1
    num_max = -1
    for number, tree in enumerate(density_trees):
        p = tree.predict(instance)
        if p > p_max:
            p_max = p
            num_max = number
    calculated_target[i] = num_max

density_tree_err = calculated_target != target
print(density_tree_err)

density_tree_err_rate = np.sum(density_tree_err) / len(target)
print(density_tree_err_rate)

  return -(2 * N_m * (N_m - 1)) / (N * (N - 1) * V_m) + (N_m ** 2) / ((N ** 2) * V_m)
  return -(2 * N_m * (N_m - 1)) / (N * (N - 1) * V_m) + (N_m ** 2) / ((N ** 2) * V_m)


TypeError: '<=' not supported between instances of 'float' and 'NoneType'

# Density and Decision Forest

In [None]:
class DensityForest():
    def __init__(self, n_trees):
        # create ensemble
        self.trees = [DensityTree() for i in range(n_trees)]

    def train(self, data, prior, n_min=20):
        for tree in self.trees:
            # train each tree, using a bootstrap sample of the data
            ...  # your code here

    def predict(self, x):
        # compute the ensemble prediction
        return ...  # your code here

In [None]:
class DecisionForest():
    def __init__(self, n_trees):
        # create ensemble
        self.trees = [DecisionTree() for i in range(n_trees)]

    def train(self, data, labels, n_min=0):
        for tree in self.trees:
            # train each tree, using a bootstrap sample of the data
            ...  # your code here

    def predict(self, x):
        # compute the ensemble prediction
        return ...  # your code here

# Evaluation of Density and Decision Forest

In [None]:
# train forests (with 20 trees per forest), plot training error confusion matrices, and comment on your results
...  # your code here
