In [1]:
import numpy as np
import pandas as pd
import random
import math
import sys

In [2]:
# df = pd.read_csv("vote/vote.data", delimiter='\t', header=None) 
# df.head()

In [3]:
# format_outputs(df)

In [4]:
def fold_dataset(dataset, k):
    """Shuffle the dataset and cut it into k folds.

    The shuffle uses numpy's global random state and happens in place, so
    the caller's array is reordered as a side effect.

    Arguments:
        dataset -- Array of instances (one instance per row).
        k -- Number of folds to produce.

    Returns:
        A list of k arrays; fold sizes differ by at most one row.
    """
    np.random.shuffle(dataset)
    folds = np.array_split(dataset, k)
    return folds


def format_outputs(dataset):
    """Recode the class label (last column) of every instance as +1 / -1.

    A label equal to "d" becomes 1; any other label becomes -1. Rows are
    modified in place and the same dataset object is returned.

    Arguments:
        dataset -- Iterable of mutable rows whose last element is the label.
    """
    for row in dataset:
        row[-1] = 1 if row[-1] == "d" else -1
    return dataset


def separate_attributes(dataset):
    """Split a dataset into its attribute columns and its label column.

    Arguments:
        dataset -- Dataset represented as a 2-D numpy array; the last
                   column holds the output label.

    Returns:
        A dictionary with two entries:
        "input"  -- every column except the last.
        "output" -- the last column, converted to true integers.
    """
    inputs = dataset[:, 0:-1]
    outputs = dataset[:, -1].astype(int)
    return {"input": inputs, "output": outputs}

In [5]:
class AdaBoost:
    """AdaBoost ensemble of single-attribute decision stumps.

    Each stump tests whether one attribute equals one categorical value and
    predicts `state` (+1 or -1) on a match and `-state` otherwise.

    Both datasets are dictionaries with an "input" entry (2-D array, one
    instance per row) and an "output" entry (1-D array of +1 / -1 labels).
    """

    # Categorical values a stump may test an attribute against.
    POSSIBLE_VALUES = ("y", "n", "?")
    # Label a stump predicts when the tested value matches.
    POSSIBLE_STATES = (1, -1)
    # A weighted error of exactly 0 or 1 would make alpha infinite and turn
    # the weights into NaN, so the error is clipped into (0, 1) by this margin.
    _ERROR_EPSILON = 1e-10

    def __init__(self, training_set, testing_set):
        """Store the datasets and start from uniform instance weights.

        Arguments:
            training_set -- Dict with "input" and "output" used to fit stumps.
            testing_set -- Dict with "input" and "output" used for evaluation.
        """
        self.training_set = training_set
        self.testing_set = testing_set

        self.m_tr = training_set["input"].shape[0]  # training instances
        self.n_tr = training_set["input"].shape[1]  # attributes per instance

        self.m_ts = testing_set["input"].shape[0]  # testing instances

        # Every training instance starts with weight 1 / m_tr.
        self.weights = np.divide(np.ones(self.m_tr), self.m_tr)

        self.ensemble = []  # selected stumps, in boosting order

        self.alpha = []  # vote weight of each stump in `ensemble`

    def evaluate_stump(self, stump):
        """Compute a stump's weighted training error and its predictions.

        Arguments:
            stump -- Dict with "attribute" (column index), "value" (value to
                     test for) and "state" (prediction on a match).

        Returns:
            (error, predictions): the sum of the weights of misclassified
            training instances, and a float array with the +1 / -1
            prediction for every training instance.
        """
        column = np.asarray(self.training_set["input"])[:, stump["attribute"]]
        outputs = np.asarray(self.training_set["output"])
        state = stump["state"]

        # Matching instances get `state`, all others the opposite label.
        predictions = np.where(column == stump["value"], state, -state).astype(float)
        misclassified = (predictions != outputs).astype(float)

        error = np.sum(np.multiply(self.weights, misclassified))

        return error, predictions

    def find_best_stump(self):
        """Search every (attribute, value, state) stump for the lowest error.

        Ties keep the first stump found, because only a strictly lower
        error replaces the current best.

        Returns:
            The best stump dict, extended with its "error" and "predictions".
        """
        best_stump = {}
        lowest_error = float("inf")
        for a in range(self.n_tr):
            for value in self.POSSIBLE_VALUES:
                for state in self.POSSIBLE_STATES:
                    stump = {"attribute": a, "value": value, "state": state}

                    error, predictions = self.evaluate_stump(stump)
                    stump["error"] = error
                    stump["predictions"] = predictions

                    if error < lowest_error:
                        lowest_error = error
                        best_stump = stump

        return best_stump

    def calculate_alpha(self, model):
        """Return the vote weight 0.5 * ln((1 - error) / error) of a stump.

        The error is clipped away from 0 and 1 first so that a perfect (or
        perfectly wrong) stump yields a large finite alpha instead of an
        infinite one.

        Arguments:
            model -- Stump dict holding its weighted "error".
        """
        error = np.clip(model["error"],
                        self._ERROR_EPSILON,
                        1 - self._ERROR_EPSILON)
        alpha = 0.5 * np.log((1 - error) / error)

        return alpha

    def update_weights(self, model, alpha):
        """Re-weight the training instances after adding a stump.

        Each weight is multiplied by exp(-alpha * output * prediction), which
        shrinks correctly classified instances and grows misclassified ones;
        the weights are then normalised to sum to 1.

        Arguments:
            model -- Stump dict holding its training "predictions".
            alpha -- Vote weight of that stump.
        """
        agreement = np.multiply(self.training_set["output"],
                                model["predictions"])
        self.weights = np.multiply(self.weights, np.exp(-1 * alpha * agreement))

        self.weights = np.divide(self.weights, np.sum(self.weights))

    def evaluate_ensemble(self):
        """Score the current ensemble on the testing set.

        The ensemble prediction is the sign of the alpha-weighted sum of the
        stump predictions. A sum of exactly zero has sign 0, which matches
        neither label and is therefore counted as a misclassification.

        Returns:
            (accuracy, error) as percentages of the testing instances.
        """
        inputs = np.asarray(self.testing_set["input"])
        outputs = np.asarray(self.testing_set["output"])

        # Accumulate the weighted votes stump by stump for all instances.
        scores = np.zeros(self.m_ts)
        for model, alpha in zip(self.ensemble, self.alpha):
            column = inputs[:, model["attribute"]]
            votes = np.where(column == model["value"],
                             model["state"],
                             -model["state"])
            scores += alpha * votes

        correct = int(np.count_nonzero(np.sign(scores) == outputs))

        accuracy = (correct / self.m_ts) * 100
        error = 100 - accuracy

        return accuracy, error

    def boost(self, num_iterations):
        """Run boosting rounds, adding one stump to the ensemble per round.

        Arguments:
            num_iterations -- Number of stumps to add.

        Returns:
            (accuracies, errors, model_errors): three lists with one entry
            per round -- the ensemble's test accuracy (%), the ensemble's
            test error (%), and the selected stump's weighted training
            error (%).
        """
        accuracies = []
        errors = []
        model_errors = []
        for _ in range(num_iterations):
            best_model = self.find_best_stump()
            # Keep the alpha in a local so the weight update uses this
            # round's value even when boost() is called more than once.
            alpha = self.calculate_alpha(best_model)

            model_errors.append(best_model["error"] * 100)
            self.ensemble.append(best_model)
            self.alpha.append(alpha)

            accuracy, error = self.evaluate_ensemble()
            accuracies.append(accuracy)
            errors.append(error)

            self.update_weights(best_model, alpha)
        return accuracies, errors, model_errors


In [6]:
# Per-fold result accumulators for the cross-validation run.
cv_accuracies, cv_errors, cv_model_errors = [], [], []

In [7]:

# Cross-validation configuration.
k = 10  # number of folds
NUM_ITERATIONS = 301  # boosting rounds run on every fold
SEED = 0  # fixes the fold shuffle so a re-run reproduces the same split

np.random.seed(SEED)

dataset = pd.read_csv("vote/vote.data", delimiter='\t', header=None)
dataset = np.array(dataset)
dataset = format_outputs(dataset)
dataset = fold_dataset(dataset, k)

# Per-fold results are collected in fresh local lists so this cell can be
# re-run on its own: the cv_* names are rebound to arrays at the end and
# could not be appended to a second time.
fold_accuracies = []
fold_errors = []
fold_model_errors = []

for i in range(k):
    testing_set = separate_attributes(dataset[i])
    # Folds can differ in size by one row, so the training folds are
    # concatenated directly; np.delete on the list would first have to build
    # a ragged array, which recent NumPy versions reject.
    remaining_folds = np.concatenate(
        [fold for j, fold in enumerate(dataset) if j != i]
    )
    training_set = separate_attributes(remaining_folds)

    ada = AdaBoost(training_set, testing_set)
    results = ada.boost(NUM_ITERATIONS)

    fold_accuracies.append(results[0])
    fold_errors.append(results[1])
    fold_model_errors.append(results[2])

# Average over the folds (axis 0), leaving one value per boosting round.
cv_accuracies = np.divide(np.sum(np.asarray(fold_accuracies), axis=0), k)
cv_errors = np.divide(np.sum(np.asarray(fold_errors), axis=0), k)
cv_model_errors = np.divide(np.sum(np.asarray(fold_model_errors), axis=0), k)



In [10]:
# Mean test accuracy (%) over the k folds after the first boosting round.
cv_accuracies[0]

95.63424947145879