# Mustererkennung/Machine Learning - Assignment 7 Solution


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [8]:
class Classifier:
    """Base class providing evaluation helpers shared by the classifiers."""

    def accuracy(self, labels, predictions):
        """Return the fraction of predictions that equal their true label.

        Args:
            labels: Array-like of true labels.
            predictions: Array-like of predicted labels, same length as
                ``labels``.

        Returns:
            The mean of the element-wise equality (a value in [0, 1]).
        """
        return np.mean(np.asarray(labels) == np.asarray(predictions))

    def confusion_matrix(self, labels, predictions):
        """Count how often each true class was predicted as each class.

        Rows correspond to true classes and columns to predicted classes,
        both ordered by ascending class value. Classes are mapped to matrix
        positions through their rank among the distinct values, so label
        encodings such as {-1, +1} work as well as {0, ..., k-1}. Using the
        raw label as an index would send -1 to the last row/column and make
        it collide with another class.

        Args:
            labels: Array-like of true labels.
            predictions: Array-like of predicted labels, same length as
                ``labels``.

        Returns:
            A float array of shape (k, k), where k is the number of distinct
            values occurring in ``labels`` or ``predictions``.
        """
        labels = np.asarray(labels).ravel()
        predictions = np.asarray(predictions).ravel()

        # Include predicted-only classes so every prediction has a column.
        classes = np.unique(np.concatenate([labels, predictions]))
        rows = np.searchsorted(classes, labels)
        cols = np.searchsorted(classes, predictions)

        matrix = np.zeros((len(classes), len(classes)))
        # add.at accumulates correctly when the same (row, col) pair repeats.
        np.add.at(matrix, (rows, cols), 1)
        return matrix

In [9]:
# Load the Spambase CSV (it has no header row) into a plain 2-D array.
spambase_path = '/content/drive/MyDrive/ML_Class_2020/Bagging Trees/spambase.data'
data = np.array(pd.read_csv(spambase_path, header=None))

# Every column except the last is a feature; the last column is the label.
X, y = data[:, :-1], data[:, -1]
# AdaBoost below expects labels in {-1, +1}, so relabel class 0 as -1 in place.
y[y == 0] = -1

# Stratified, shuffled split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, shuffle=True, random_state=0
)

In [10]:
class DecisionStump(Classifier):
    """Weak learner that thresholds a single feature column.

    Samples whose value in column ``feature`` is greater than or equal to
    ``threshold`` receive ``label``; all other samples receive
    ``alternative_label``.
    """

    def __init__(self, feature, label, alternative_label, threshold=0):
        self.feature, self.threshold = feature, threshold
        self.label, self.alternative_label = label, alternative_label

    def predict(self, X):
        """Return one label per row of the 2-D array ``X``."""
        column = X[:, self.feature]
        at_or_above = column >= self.threshold
        return np.where(at_or_above, self.label, self.alternative_label)

In [11]:
def create_classifier_pool(X):
    """Build the pool of candidate decision stumps for the 2-D array ``X``.

    For every feature column and every distinct value in that column (taken
    in ascending order), two stumps are created with that value as the
    threshold: one predicting +1 at or above it (-1 below), followed by its
    mirror image.

    Returns:
        A list of DecisionStump objects, grouped by feature, then threshold.
    """
    pool = []

    for column in range(X.shape[1]):
        distinct_values = sorted(set(X[:, column]))
        for cut in distinct_values:
            pool.append(DecisionStump(column, 1, -1, cut))
            pool.append(DecisionStump(column, -1, 1, cut))

    return pool

## AdaBoost

In [12]:
class AdaBoost(Classifier):
    """Discrete AdaBoost that selects weak learners from a fixed pool.

    Each boosting round picks the not-yet-used pool classifier with the
    smallest weighted training error, assigns it a vote weight (alpha) and
    re-weights the training samples so that misclassified samples count more
    in the next round. Labels are expected to be -1 / +1.

    Attributes set by ``fit``:
        classifiers: The selected weak learners, in selection order.
        weights: The vote weight (alpha) of each selected learner.
    """

    def __init__(self, classifier_pool, num_classifiers):
        """Store a private copy of the pool and the number of rounds.

        Args:
            classifier_pool: Iterable of objects with a ``predict(X)`` method.
            num_classifiers: Number of boosting rounds, i.e. how many weak
                learners end up in the ensemble.
        """
        self.classifier_pool = list(classifier_pool)
        self.num_classifiers = num_classifiers

        self.classifiers = []
        self.weights = []

    def fit(self, X, y):
        """Select and weight ``num_classifiers`` weak learners.

        The pool itself is left untouched and the ensemble is rebuilt from
        scratch, so ``fit`` can safely be called more than once.

        Args:
            X: 2-D array of training samples, passed to each pool
                classifier's ``predict``.
            y: 1-D array of training labels (-1 / +1).

        Raises:
            ValueError: If more rounds are requested than the pool has
                classifiers (each classifier is used at most once).
        """
        pool_size = len(self.classifier_pool)
        if self.num_classifiers > pool_size:
            raise ValueError(
                "num_classifiers (%s) exceeds the size of the classifier pool (%s)"
                % (self.num_classifiers, pool_size)
            )

        # Start from a clean ensemble so a repeated fit() does not append to
        # the result of a previous one.
        self.classifiers = []
        self.weights = []

        # scouting_matrix[i, j] is True when pool classifier i misclassifies
        # training sample j.
        scouting_matrix = np.array([clf.predict(X) != y for clf in self.classifier_pool])
        # Mask of classifiers that may still be selected; avoids copying the
        # whole scouting matrix every round just to drop one row.
        available = np.ones(pool_size, dtype=bool)
        w = np.ones(len(y))  # Initially, all sample weights are the same
        tiny = np.finfo(float).eps

        for _ in range(self.num_classifiers):
            # Step 1: weighted error of every classifier; pick the best unused one.
            errors = scouting_matrix @ w
            errors[~available] = np.inf
            best = errors.argmin()
            available[best] = False

            # Step 2: vote weight from the share of correctly classified weight.
            We = errors[best]
            W = w.sum()
            correct_share = (W - We) / W
            # Keep the share strictly inside (0, 1): a perfect (or always
            # wrong) stump would otherwise give an infinite alpha and turn
            # all subsequent sample weights into inf/NaN.
            correct_share = np.clip(correct_share, tiny, 1 - tiny)
            alpha = 0.5 * -np.log((1 - correct_share) / correct_share)

            self.classifiers.append(self.classifier_pool[best])
            self.weights.append(alpha)

            # Step 3: scale misclassified samples by exp(alpha) and correctly
            # classified samples by exp(-alpha).
            w = w * np.exp(np.where(scouting_matrix[best], 1, -1) * alpha)

    def predict(self, X):
        """Return the sign (+1 / -1) of the alpha-weighted vote for each row of X.

        A weighted vote of exactly zero is resolved to +1.
        """
        preds = np.array([cl.predict(X) for cl in self.classifiers])
        weighted_preds = np.dot(self.weights, preds)
        return np.where(weighted_preds >= 0, 1, -1)

In [13]:
# Build every candidate stump from the training features and report how many there are.
classifier_pool = create_classifier_pool(X_train)
pool_size = len(classifier_pool)
print("Created %d weak learners" % pool_size)

Created 26586 weak learners


In [14]:
# Boost 100 stumps selected from the pool on the training split,
# then label the held-out test split with the resulting ensemble.
model = AdaBoost(classifier_pool, num_classifiers=100)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [15]:
# Fraction of test samples whose predicted label matches the true label.
model.accuracy(y_test, predictions)

0.9496090356211989