# Mustererkennung/Machine Learning - Assignment 6



In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

### Load the spam dataset:

In [2]:
data = np.array(pd.read_csv('spambase.data', header=None))

X = data[:,:-1] # features
y = data[:,-1] # Last column is label

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, shuffle=True, stratify=y)


In [24]:
df = pd.read_csv('spambase.data', header=None)
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1
5,0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,...,0.0,0.223,0.0,0.0,0.0,0.0,3.0,15,54,1
6,0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,...,0.0,0.054,0.0,0.164,0.054,0.0,1.671,4,112,1
7,0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,...,0.0,0.206,0.0,0.0,0.0,0.0,2.45,11,49,1
8,0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,...,0.0,0.271,0.0,0.181,0.203,0.022,9.744,445,1257,1
9,0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,...,0.04,0.03,0.0,0.244,0.081,0.0,1.729,43,749,1


# Exercise 1. Decision Trees
Implement a decision tree (classification tree to be precise) using Python (incl. Numpy etc.)
and use it on the SPAM-Dataset1
. Use a metric of your choice as a loss function.

In [84]:
class ClassificationTree:
    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.root = None
        
    class LeafNode:
        def __init__(self, parent, class_id):
            self.parent = parent
            self.class_id = class_id

    class InnerNode:
        def __init__(self, parent, l_child, r_child, split_feature, split_value):
            self.parent = parent
            self.l_child = l_child
            self.r_child = r_child
            self.split_feature = split_feature
            self.split_value = split_value

        def classify(self, X_sample):
            return_class = None
            if X_sample[self.split_feature] > self.split_value:
                if type(self.r_child) is type(self):
                    return_class = self.r_child.classify(X_sample)
                else:
                    return_class = self.r_child.class_id
            else:
                if type(self.r_child) is type(self):
                    return_class = self.l_child.classify(X_sample)
                else:
                    return_class = self.l_child.class_id
            return return_class
    
    def p_mk(self, Ym, k):
        return np.where(Ym == k)[0].shape[0] / Ym.shape[0]

    def gini_index(self, groups, classes):
        return np.sum([np.sum([self.p_mk(Ym, k) * (1 - self.p_mk(Ym, k)) for k in classes]) for Ym in groups])
    
    def minimize_loss_function(self, Xm, Ym):
        min_Q_tot = np.inf
        min_j, min_z = None, None 
        classes = np.unique(Ym)
        for j in range(Xm.shape[1]):
            xi = Xm[:, j]
            zi = np.array([((xi[i] + xi[i+1]) / 2) for i in range(Xm.shape[0] - 1)])
            for z in zi:
                left_index = np.where(xi <= z)
                right_index = np.where(xi > z)
                Ym_left, Ym_right = (Ym[left_index], Ym[right_index])
                Q_tot = self.gini_index([Ym_left, Ym_right], classes)
                if Q_tot < min_Q_tot:
                    min_Q_tot = Q_tot
                    min_j, min_z = j, z
        return (min_j, min_z)
    
    def get_inner_node(self, Xm, Ym, depth):
        min_j, min_z = self.minimize_loss_function(X_train, y_train)
        left_index = np.where(Xm[:, min_j] <= min_z)
        right_index = np.where(Xm[:, min_j] > min_z)
        Xm_left, Xm_right = Xm[left_index], Xm[right_index]
        Ym_left, Ym_right = Ym[left_index], Ym[right_index]
        if self.max_depth - depth > 0:
            depth += 1
            inner_node = self.InnerNode(None, self.get_inner_node(Xm_left, Ym_left, depth), 
                                        self.get_inner_node(Xm_right, Ym_right, depth), min_j, min_z)
        else:
            inner_node = self.InnerNode(None, self.get_leaf_node(Ym_left), self.get_leaf_node(Ym_right), min_j, min_z)
        return inner_node
    
    def get_leaf_node(self, Ym):
        if len(Ym) > 1:
            return self.LeafNode(None, np.argmax(np.bincount(Ym.astype(int))))
        else:
            return self.LeafNode(None, None)
    
    def fit(self, X_train, y_train):
        min_j, min_z = self.minimize_loss_function(X_train, y_train)
        left_index = np.where(X_train[:, min_j] <= min_z)
        right_index = np.where(X_train[:, min_j] > min_z)
        Xm_left, Xm_right = X_train[left_index], X_train[right_index]
        Ym_left, Ym_right = y_train[left_index], y_train[right_index]
        if self.max_depth > 1:
            root_node = self.InnerNode(None, self.get_inner_node(Xm_left, Ym_left, 1), 
                                       self.get_inner_node(Xm_right, Ym_right, 1), min_j, min_z)
        else:
            root_node = self.InnerNode(None, self.get_leaf_node(Ym_left), self.get_leaf_node(Ym_right), min_j, min_z)
        self.root = root_node
    
    def classify(self, X):
        if not self.root is None:
            return np.array([self.root.classify(X[i,:]) for i in range(X.shape[0])])
        else:
            return None
    
    def show_tree(self):
        self.root

def calculate_accuracy(true_y, pred_y, classes):
        """
        Calculate accuracy for a classified set.
        """
        class_sum = 0
        for class_num in classes:
            val_sum = 0
            for true_val, pred_val in zip(true_y, pred_y): 
                if class_num == true_val:
                    if true_val == pred_val:
                        val_sum += 1
            class_sum += val_sum
        return class_sum / len(true_y)

In [91]:
#X_train, X_test, y_train, y_test
CT = ClassificationTree(max_depth=3)
CT.fit(X_train, y_train)

In [92]:
y_pred = CT.classify(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [93]:
print("classes: ", np.unique(y_test.astype(int)))
print("accuracy:", calculate_accuracy(y_test.astype(int), y_pred, np.unique(y_test.astype(int))))

classes:  [0 1]
accuracy: 0.6255430060816681


## (a) Assume that classifying a genuine E-Mail as spam is ten times worse than classifying spam as genuine. How would you change the design of your decision tree?

## (b) Use your tree to analyze feature importance. Plot the difference between the top 5 features (check spambase.names to check what features those belong to).

# Exercise 2. Random Forests
Implement a Random Forest and use it on the SPAM-Dataset.

## (a) Print a confusion matrix (you can use package implementations here).

## (b) What is a good number of trees in the forest?