In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Excercise 1. AdaBoost
Implement AdaBoost using Python (incl. Numpy etc.) and use it on the SPAM-Dataset
.
The weak classifiers should be decision stumps (i.e. decision trees with one node).


### Import the spambase dataset

In [2]:
data = np.array(pd.read_csv('/Users/Eva/Downloads/spambase.data', header=None))

X = data[:,:-1] # features
y = data[:,-1] # Last column is label

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, shuffle=True, stratify=y)

df_names = pd.read_csv('/Users/Eva/Downloads/spambase.names', header=None, names=["feature_names"], skiprows=32)
df_names["feature_names"] = df_names["feature_names"].str.replace(":  *continuous.", "", regex=True)

In [3]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3450, 57), (1151, 57), (3450,), (1151,))

### Use the Tree Implementation from Week 6

In [4]:
class ClassificationTree():


    def __init__(self, node_min_size=2, tree_max_depth=None, post_pruning=True):
        """
        If no maximum depth is defined, the nodes are split while they contain at least node_min_size many points.
        If maximum depth is defined, nodes are split further until they contain less than node_min_size many points or
        have reached the maximum number of allowed split levels.
        """
        self.depth = 1
        self.node_min_size = node_min_size
        self.tree_max_depth = tree_max_depth
        self.prune = 0
        self.post_pruning = post_pruning
        self.feature_importances = defaultdict(int)
        
    ################ Tree building ################
    
    def split_data(self, X, j, v):
        """
        Split data X at variable with index j and splitting point v.
        """
        split_left = X[np.where(X[:,j]<=v)]
        split_right = X[np.where(X[:,j]>v)]
        return split_left, split_right

    
    def gini_index(self, groups):
        """
        Calculate gini index for a data set splitted on a certain variable and splitting point.
        groups: two subsets that result from splitting a data set
        y: associated class labels
        """
        gini = 0
        if type(groups[0]) == np.ndarray:
            n = sum(map(len,groups))
            # get class proportions for each split group
            for group in groups:
                size = float(len(group))
                y = group[:,-1]
                if size == 0:
                    continue    
                score = 0
                # sum class proportions within one split groups as sum(p_mk*(1-p_mk))
                for c in set(y):
                    proportions = np.count_nonzero(y==c)/size
                    score += proportions*(1-proportions)
                # sum "inversely squared" class proportions per split group in weighted sum (weighted by proportion of group size)
                gini += score * size/n
        else:
            size = len(groups)
            for c in set(groups):
                proportions = groups.count(c)/size
                gini += proportions*(1-proportions)
        return gini


    def split_node(self, data):
        """
        Find optimal split for a given node "node" and class labels "y" that minimizes the loss function (gini index).
        """
        split_var, split_point, split_gini, groups = 9999, 9999, 9999, None
        # for every candidate split X*p* calculate gini index and select split which minimizes the score
        for j in range(data.shape[1]-1):
            candidate_p = np.unique(X[:,j])
            for v in candidate_p:
                split_groups = self.split_data(data, j, v)
                s = self.gini_index(split_groups) 
                if s < split_gini: 
                    split_var, split_point, split_gini, groups = j, v, s, split_groups
        return {'split_var': split_var, 'split_point': split_point, 'split_gini': split_gini, 'groups': groups}


    def make_leaf(self, node_labels):
        """
        Convert node "node" to leaf by assigning it the majority vote of labels "y".
        """
        leaf_labels = list(node_labels)
        return {'label': max(set(leaf_labels), key=leaf_labels.count), 'class_sizes': [leaf_labels.count(c) for c in set(leaf_labels)]} 

    
    def grow(self, node):
        """
        Recusively split nodes until stopping criteria are fulfilled,
        i.e. a node has too little data points in its region to be splitted further or the the has
        reached its maximum allowd depth.
        """
        left, right = node['groups']
        del(node['groups'])
        # Check for empty nodes (pruned in post-processing)
        if not left.any() or not right.any():
            node['left'] = node['right'] = self.make_leaf(np.r_[left[:,-1], right[:,-1]])
            return None
        # Check for max depth
        if self.tree_max_depth:
            if self.depth >= self.tree_max_depth:
                node['left'] = self.make_leaf(left[:,-1])
                node['right'] = self.make_leaf(right[:,-1])
                return None
        # Check left node size
        if len(left) <= self.node_min_size-1:
            node['left'] = self.make_leaf(left[:,-1])
        else:
            node['left'] = self.split_node(left)
            self.depth += 1
            self.grow(node['left'])
        # Check right node size
        if len(right) <= self.node_min_size:
            node['right'] = self.make_leaf(right[:,-1])
        else:
            node['right'] = self.split_node(right)
            self.depth += 1
            self.grow(node['right'])
        return None
    
    ################ Post-Pruning ################
    
    def check_twin_leaves(self, subtree):
        """
        Prune tree while there still are twin leaves
        """
        if not 'label' in subtree.keys():  
            if 'label' in subtree['left'].keys() and 'label' in subtree['right'].keys():
                if subtree['left']['label'] == subtree['right']['label']:
                    self.prune += 1
            else:
                self.check_twin_leaves(subtree['left'])
                self.check_twin_leaves(subtree['right'])

    def prune_tree(self, subtree):
        """
        If a node results in leaves of the same label, remove leaves and make node to leaf
        """
        # check if A is node or leaf
        if not 'label' in subtree.keys():  
            # at least one subtree B1, B2 has to be a node
            if not 'label' in subtree['left'].keys() or not 'label' in subtree['right'].keys():
                # prune left subtree B1?
                if not 'label' in subtree['left'].keys():
                    tmp = subtree['left']
                    # check if node has two leaves
                    if 'label' in tmp['left'] and 'label' in tmp['right']:
                        if tmp['left']['label'] == tmp['right']['label'] :
                            subtree['left'] = {'label': tmp['left']['label'], 'class_sizes':list(map(sum,list(zip(tmp['left']['class_sizes'],tmp['right']['class_sizes']))))}
                            self.prune -= 1
                            # if B2 is a leaf, decrement depth
                            if 'label' in subtree['right'].keys():
                                self.depth -= 1
                    else:
                        self.prune_tree(tmp)
                # prune right subtree B2?           
                if not 'label' in subtree['right'].keys():
                    tmp = subtree['right']
                    # check if node has two leaves
                    if 'label' in tmp['left'] and 'label' in tmp['right']:
                        if tmp['left']['label'] == tmp['right']['label'] :
                            subtree['right'] = {'label': tmp['left']['label'], 'class_sizes':list(map(sum,list(zip(tmp['left']['class_sizes'],tmp['right']['class_sizes']))))}
                            self.prune -= 1
                            # if B2 is a leaf, decrement depth
                            if 'label' in subtree['left'].keys():
                                self.depth -= 1
                    else:
                        self.prune_tree(tmp)

    ################ Build tree from data  ################
    
    def grow_tree(self, X, y):
        """
        Grow a decision tree for feature data X with class labels y that has at most tree_max_depth many levels
        and at least node_mins_size many data points that support each leaf prediction.
        """
        print('Build tree from data')
        if X.shape[0] <= self.node_min_size:
            raise Exception("Data set is too small to build a tree!")
        root = self.split_node(np.c_[X,y])
        self.grow(root)
        if self.post_pruning:
            print('Post-pruning')
            self.check_twin_leaves(root)
            while self.prune >= 1:
                self.prune_tree(root)
                self.check_twin_leaves(root)
        print('Feature importances')
        for i in range(X.shape[1]):
            self.get_feature_importance(root,i)
        f = float(sum(self.feature_importances.values()))
        for k,v in self.feature_importances.items():
            self.feature_importances[k] = v/f
        return root   
            
    ################ Classification ################
    
    def get_label(self, test, subtree):
        """
        Process data query through the decision tree and retrieve resulting leaf label.
        """
        if not 'label' in subtree.keys():
            if test[subtree['split_var']] <= subtree['split_point']:
                subtree = subtree['left']
                return self.get_label(test, subtree)
            else:
                subtree = subtree['right']
                return self.get_label(test, subtree)
        else:
            return subtree['label']
        
        
    def predict(self, tree, X):
        """
        Predict class labels for data set X.
        """
        y_pred = []
        for item in X:
            y_pred.append(self.get_label(item, tree))
        return y_pred
    
    ################ Evaluation ################
    
    def calculate_accuracy(self, true_y, pred_y, classes):
        """
        Calculate accuracy for a classified set.
        """
        class_sum = 0
        for class_num in classes:
            val_sum = 0
            for true_val, pred_val in zip(true_y, pred_y): 
                if class_num == true_val:
                    if true_val == pred_val:
                        val_sum += 1
            class_sum += val_sum
        return class_sum / len(true_y)
    
    def get_feature_importance(self, subtree, var):
        """
        This Function gets the gini loss from all subtree nodes.
        Possible TODO in the Future: weight scores by probability to reach respective node, i.e. node size/data size
        """
        if not 'label' in subtree.keys():
            if subtree['split_var'] == var:
                if 'class_sizes' in subtree['left'].keys():
                    ginil = self.gini_index(subtree['left']['class_sizes'])
                else: 
                    ginil = subtree['left']['split_gini']
                if 'class_sizes' in subtree['right'].keys():
                    ginir = self.gini_index(subtree['right']['class_sizes'])
                else:
                    ginir = subtree['right']['split_gini']
                self.feature_importances[var] += subtree['split_var']-(ginir+ginil)

            self.get_feature_importance(subtree['left'], var)
            self.get_feature_importance(subtree['right'], var)


### Implement Ada-Boost

In [5]:
class Ada_Boost:
    def __init__(self, n_predictors, tree_depth=1):
        self.n_predictors = n_predictors
        self.tree_depth = tree_depth
        self.classifier_list = []
        
    def calculate_Em(self, sample_weights, y_train, y_pred):
        return np.sum(sample_weights * (y_train == y_pred).astype(int)) / np.linalg.norm(sample_weights, ord=1)
        
    
    def train_ada_boost(self, X_train, y_train):
        N = X_train.shape[0]
        #1) initialize wi
        sample_weights = np.array([1 / N] * N)
        X_resampled = X_train.copy()
        #2) for m in {1,...,M}
        for m in range(self.n_predictors):
            # resample X_train according to the weights
            choices = np.random.choice(np.arange(N), N, p=sample_weights)
            X_resampled = X_resampled[choices]
            # set the weights of the resampled set to 1/N
            sample_weights = np.array([1 / N] * N)
            # 2a) train a classifier on X_train with weights wi
            classifier = ClassificationTree(tree_max_depth=self.tree_depth)
            tree = classifier.grow_tree(X_resampled, y_train)
            y_train_pred = classifier.predict(tree, X_resampled)
            # 2b) compute the classification error
            classification_error = self.calculate_Em(sample_weights, y_train, y_train_pred)
            # 2c) compute classifier weight
            classifier_weight = np.log((1 - classification_error) / classification_error) / 2
            self.classifier_list.append((classifier_weight, tree, classifier))
            # 2d) recompute sample weights
            sample_weights = sample_weights * np.exp(-classifier_weight * y_train * y_train_pred)
            sample_weights = sample_weights / np.linalg.norm(sample_weights, ord=1)
        
    def predict(self, X_test):
        # 3) return ensemble model output
        X_m = np.array([am * np.array(classifier.predict(tree, X_test)) for am, tree, classifier in self.classifier_list])
        return np.sign(np.sum(X_m, axis=0))

### Change the labels to -1/1

In [6]:
y_train_changed = np.where(y_train==0, -1, 1)
y_test_changed = np.where(y_test==0, -1, 1)

### Use Ada-Boost on the Spambase Dataset

In [None]:
ada = Ada_Boost(n_predictors = 3, tree_depth = 1)
ada.train_ada_boost(X_train, y_train_changed)

Build tree from data


In [105]:
ada.classifier_list

[(-0.22335153182646397,
  {'split_var': 34,
   'split_point': 1.49,
   'split_gini': 0.4758341785826028,
   'left': {'label': -1.0, 'class_sizes': [1314, 2059]},
   'right': {'label': 1.0, 'class_sizes': [45, 32]}},
  <__main__.ClassificationTree at 0x2b070076608>),
 (-0.22152492231862247,
  {'split_var': 2,
   'split_point': 2.38,
   'split_gini': 0.47622545981451764,
   'left': {'label': -1.0, 'class_sizes': [1336, 2078]},
   'right': {'label': 1.0, 'class_sizes': [23, 13]}},
  <__main__.ClassificationTree at 0x2b06e1cb0c8>),
 (-0.2221336299823221,
  {'split_var': 22,
   'split_point': 1.62,
   'split_gini': 0.47603358346867003,
   'left': {'label': -1.0, 'class_sizes': [1338, 2081]},
   'right': {'label': 1.0, 'class_sizes': [21, 10]}},
  <__main__.ClassificationTree at 0x2b06dc2eec8>)]

In [107]:
y_pred = ada.predict(y_test_changed)
print(y_pred.shape)
y_pred

IndexError: invalid index to scalar variable.

## (a) Print a confusion matrix.

In [59]:
def calculate_confusion_matrix(y_pred, y_true, class_label_list):
    """
    Returns a confusion matrix (ndarray) for all class labels given in class_label_list.
    The order of class_label_list is preserved.
    The first returned dimension(rows) are the predicted labels, the second one(columns) are the true labels.
    """
    confusion_matrix = []
    for class_label_pred in class_label_list:
        class_row = []
        for class_label_true in class_label_list:
            bool_pred = (y_pred == class_label_pred)
            bool_true = (y_true == class_label_true)
            occurrences = np.sum(np.logical_and(bool_pred, bool_true))
            class_row.append(occurrences)
        confusion_matrix.append(class_row)
    return np.array(confusion_matrix)

def display_confusion_matrix(y_pred, y_true, class_label_list):
    """Returns a labeled pandas DataFrame made from a confusion matrix (ndarray)"""
    confusion_matrix = calculate_confusion_matrix(y_pred, y_true, class_label_list)
    pred_labels = ["pred: " + str(x) for x in class_label_list]
    true_labels = ["true: " + str(x) for x in class_label_list]
    return pd.DataFrame(confusion_matrix, index=pred_labels, columns=true_labels)

In [63]:
def calculate_accuracy(true_y, pred_y, classes):
        """
        Calculate accuracy for a classified set.
        """
        class_sum = 0
        for class_num in classes:
            val_sum = 0
            for true_val, pred_val in zip(true_y, pred_y): 
                if class_num == true_val:
                    if true_val == pred_val:
                        val_sum += 1
            class_sum += val_sum
        return class_sum / len(true_y)

In [None]:
display_confusion_matrix(y_pred, y_test_changed, np.unique(y_test_changed))

In [None]:
calculate_accuracy(y_test_changed, y_pred, np.unique(y_test_changed))

## (b) Is AdaBoost better when using stronger weak learners? Why or why not? Compare your results to using depth-2 decision trees.

In [None]:
ada_depth2 = Ada_Boost(n_predictors = 5, tree_depth = 2)
ada_depth2.train_ada_boost(X_train, y_train_changed)
y_pred_depth2 = ada_depth2.predict(X_test)
y_pred_depth2

In [None]:
display_confusion_matrix(y_pred_depth2, y_test_changed, np.unique(y_test_changed))

In [None]:
calculate_accuracy(y_test_changed, y_pred_depth2, np.unique(y_test_changed))

# Excercise 2 (Bonus). Viola-Jones Face Detection
Implement the Viola-Jones algorithm (without the cascade mechanism) and use it on a
LFW-Face-subset
to classify faces.

## (a) Visualize the top ten face classifiers.

# Excercise 3 (Bonus). Cascade-Classification
Implement a cascade algorithm to classify faces in a picture of your choice (there should be
more than a face on your image, e.g. skimage.data.astronaut())