### PLEASE comment/document what each function is doing in your own words to demonstrate your understanding. Of course, push it up to your GitHub on completion.

In [10]:
#import relevant packages
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
# import make_hastie_10_2:
# Generates data for binary classification used in Hastie et al. 2009, Example 10.2.
from sklearn.datasets import make_hastie_10_2
import matplotlib.pyplot as plt


In [4]:
""" HELPER FUNCTION: GET ERROR RATE ========================================="""
def get_error_rate(pred, Y):
    '''Return the misclassification rate of predictions against true labels.

    pred : array-like of predicted labels (list, tuple, ndarray, Series, ...)
    Y    : array-like of true labels, same length as pred

    Returns the fraction of positions where pred and Y differ, i.e.
    (number of mismatches) / len(Y).

    Raises ZeroDivisionError when Y is empty.
    '''
    # Convert both inputs to arrays so that `!=` is always element-wise.
    # Comparing two plain lists with `!=` yields a single bool, which cannot
    # be summed.
    pred = np.asarray(pred)
    Y = np.asarray(Y)
    n_wrong = np.count_nonzero(pred != Y)
    return n_wrong / float(len(Y))

""" HELPER FUNCTION: PRINT ERROR RATE ======================================="""
def print_error_rate(err):
    '''Print a (training error, test error) pair with four decimal places.

    err : tuple holding the training error rate followed by the test error
          rate; it is fed directly to %-formatting.
    '''
    template = 'Error rate: Training: %.4f - Test: %.4f'
    print(template % err)

""" HELPER FUNCTION: GENERIC CLASSIFIER ====================================="""
# input training and test data and model
def generic_clf(Y_train, X_train, Y_test, X_test, clf):
    '''Fit a model on the training data and report its error rates.

    Y_train, X_train : training labels and features
    Y_test, X_test   : test labels and features
    clf              : model object exposing fit(X, y) and predict(X);
                       it is fitted in place

    Returns a tuple (training error rate, test error rate) computed with
    get_error_rate.
    '''
    # Train once on the training split.
    clf.fit(X_train, Y_train)

    # Predict on both splits first (training, then test) ...
    predictions = [clf.predict(features) for features in (X_train, X_test)]

    # ... then score each set of predictions against its true labels.
    truths = (Y_train, Y_test)
    return tuple(get_error_rate(guess, truth)
                 for guess, truth in zip(predictions, truths))

In [5]:
""" ADABOOST IMPLEMENTATION ================================================="""
# Inputs: training and test data, M (number of boosting rounds, i.e. model fits), and the base model
def adaboost_clf(Y_train, X_train, Y_test, X_test, M, clf):
    '''Run M rounds of AdaBoost with the given base model and score the result.

    In every round the base model is refitted on the training data using the
    current sample weights. Samples that the round misclassifies get a larger
    weight, correctly classified samples a smaller one, so the next round
    concentrates on the hard cases. Each round's predictions are added to a
    running vote, scaled by that round's alpha (larger alpha for a lower
    weighted error). The final prediction is the sign of the accumulated vote.

    Y_train, X_train : training labels and features
    Y_test, X_test   : test labels and features
    M                : number of boosting rounds
    clf              : base model exposing fit(X, y, sample_weight=...) and
                       predict(X); the same object is refitted every round

    Because the final prediction is np.sign(...) of the vote, it takes the
    values -1, 0 or 1; labels are therefore expected to be encoded as -1/+1.

    Returns a tuple (training error rate, test error rate) computed with
    get_error_rate.
    '''
    # Number of samples in the training and test sets
    n_train, n_test = len(X_train), len(X_test)
    # Array view of the true training labels for element-wise comparison
    y_true = np.asarray(Y_train)

    # Start with uniform sample weights
    w = np.ones(n_train) / n_train
    # Running alpha-weighted vote for every training / test sample
    pred_train, pred_test = np.zeros(n_train), np.zeros(n_test)

    # Bounds for the weighted error: keeps alpha finite when a round is
    # perfect (error 0) or entirely wrong (error 1), which would otherwise
    # give log(inf) / log(0) and poison the weights with inf/NaN.
    eps = np.finfo(float).eps

    for _ in range(M):
        # Fit the base model with the current sample weights
        clf.fit(X_train, Y_train, sample_weight=w)

        # Predictions of this round for training and test data
        pred_train_i = np.asarray(clf.predict(X_train))
        pred_test_i = np.asarray(clf.predict(X_test))

        # Boolean mask: True where this round's training prediction is wrong
        miss = pred_train_i != y_true

        # Weighted error of this round (share of weight on wrong samples)
        err_m = np.dot(w, miss) / w.sum()
        err_m = np.clip(err_m, eps, 1 - eps)

        # Alpha: how much say this round gets in the final vote
        alpha_m = 0.5 * np.log((1 - err_m) / err_m)

        # Re-weight: multiply by exp(+alpha) where wrong, exp(-alpha) where
        # right, then renormalise so the weights cannot drift towards zero
        # over many rounds.
        w = w * np.exp(np.where(miss, alpha_m, -alpha_m))
        w = w / w.sum()

        # Add this round's alpha-scaled predictions to the running vote
        pred_train = pred_train + alpha_m * pred_train_i
        pred_test = pred_test + alpha_m * pred_test_i

    # Collapse the vote to a class label: < 0 -> -1, > 0 -> 1, 0 stays 0
    pred_train, pred_test = np.sign(pred_train), np.sign(pred_test)
    # Error rate on the training and test set
    return get_error_rate(pred_train, Y_train), \
           get_error_rate(pred_test, Y_test)

In [6]:

""" PLOT FUNCTION ==========================================================="""
def plot_error_rate(er_train, er_test, iter_step=10):
    '''Plot training and test error rate against the number of iterations.

    Draws:
    1. the training error (light blue) and test error (dark blue) as solid
       lines
    2. a dashed red horizontal line at the first test error, er_test[0], as a
       visual baseline

    er_train  : sequence of training error rates
    er_test   : sequence of test error rates
    iter_step : number of iterations between consecutive entries; entry k is
                drawn at x = k * iter_step. The default of 10 reproduces the
                scale of the previously hard-coded tick labels
                (0, 50, ..., 400) -- NOTE(review): confirm callers sample
                the error every 10 iterations.

    Raises IndexError when er_test is empty.
    '''
    # One column per series: build a two-row frame and transpose it
    df_error = pd.DataFrame([er_train, er_test]).T
    df_error.columns = ['Training', 'Test']
    # Put the real iteration count on the x axis instead of relabelling
    # whatever ticks the auto-locator happens to choose; labels and tick
    # positions then always agree.
    df_error.index = df_error.index * iter_step

    # Two lines to compare training and test error
    plot1 = df_error.plot(linewidth = 3, figsize = (8,6),
            color = ['lightblue', 'darkblue'], grid = True)
    # Axis labels and title
    plot1.set_xlabel('Number of iterations', fontsize = 12)
    plot1.set_ylabel('Error rate', fontsize = 12)
    plot1.set_title('Error rate vs number of iterations', fontsize = 16)
    # Dashed red baseline at the initial test error, drawn on this plot's own
    # axes rather than on pyplot's "current" axes
    plot1.axhline(y=er_test[0], linewidth=1, color = 'red', ls = 'dashed')

