In [38]:
import pandas
import numpy as np
import math

In [39]:
# Load the tab-separated data set (expects 'Species', 'Sepal.Length' and
# 'Sepal.Width' columns).
data = pandas.read_csv('hw2_data_1.txt', sep="\t")
# Encode the two species as a numeric target. Series.map leaves NaN for any
# species value that is not a key of this dict.
data['y'] = data['Species'].map({'setosa':0, 'virginica':1})
species = data['Species'].unique()

# Positional train/test split: the first n_train rows train the model and the
# remaining rows are held out. A single boundary is used for both slices so no
# row is dropped (the previous 0:69 / 70: pair silently skipped row 69).
n_train = 70
feature_columns = ['Sepal.Length', 'Sepal.Width']

train_data = data.iloc[:n_train]
train_X = train_data[feature_columns]
train_y = train_data['y']

test_data = data.iloc[n_train:].reset_index(drop=True)
test_X = test_data[feature_columns]
test_y = test_data['y']

In [97]:
class BaseLearner():
    """Decision stump: a one-feature threshold classifier used as the weak learner.

    The attributes are filled in by ``AdaBoost.fit``:
      feature_index -- column of X that the stump tests.
      threshold     -- samples whose feature value is below this are on the "low" side.
      alpha         -- weight of this stump's vote in the ensemble.
      polarity      -- +1 if the "high" side votes +1, -1 if the two sides are swapped.
    """
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.alpha = None
        self.polarity = 1

    def predict(self, X):
        """Return this stump's votes for the rows of X as an array of -1.0 / +1.0."""
        column = np.asarray(X)[:, self.feature_index]
        return self.polarity * np.where(column < self.threshold, -1.0, 1.0)


class AdaBoost():
    """Discrete AdaBoost for binary classification with decision stumps.

    Labels may use any two distinct values (e.g. 0/1 or -1/+1). Internally the
    larger label is treated as +1 and the smaller as -1, because the weight
    update and the weighted vote are only correct for signed labels.
    ``predict`` translates the votes back to the original label values.

    Parameters:
        n_estimators -- number of boosting rounds (stumps) to train.
    """
    # Keeps alpha finite when a stump classifies the weighted sample perfectly.
    _EPS = 1e-10

    def __init__(self, n_estimators=10):
        self.n_estimators = n_estimators
        self.estimators = []
        self.classes_ = None

    def fit(self, X, y):
        """Train ``n_estimators`` stumps on X (n_samples, n_features) and labels y.

        Returns self.
        Raises ValueError if X is empty or not 2-D, if X and y differ in
        length, or if y contains more than two distinct values.
        """
        X = np.asarray(X, dtype=float)
        # Plain ndarray: avoids pandas index alignment leaking into the arithmetic.
        y = np.asarray(y).ravel()
        if X.ndim != 2 or X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features)")
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        classes = np.unique(y)
        if len(classes) > 2:
            raise ValueError("AdaBoost supports at most two distinct labels, got %d" % len(classes))
        self.classes_ = classes
        # Signed targets: larger label -> +1, smaller label -> -1.
        signed_y = np.where(y == classes[-1], 1.0, -1.0)

        # Initialize weights to 1/N
        weights = np.full(n_samples, 1.0 / n_samples)
        self.estimators = []

        for _ in range(self.n_estimators):
            clf = BaseLearner()
            min_error = float('inf')
            for feature_i in range(n_features):
                column = X[:, feature_i]
                for threshold in np.unique(column):
                    # Votes -1 below the threshold, +1 at or above it
                    prediction = np.where(column < threshold, -1.0, 1.0)
                    # Error = sum of weights of misclassified samples
                    error = np.sum(weights[prediction != signed_y])
                    polarity = 1
                    if error > 0.5:
                        # A stump that is wrong more often than right is a
                        # useful stump once its two sides are swapped.
                        error = 1.0 - error
                        polarity = -1

                    if error < min_error:
                        min_error = error
                        clf.threshold = threshold
                        clf.feature_index = feature_i
                        clf.polarity = polarity

            # Guard against tiny negative / >1 values caused by rounding.
            min_error = min(max(min_error, 0.0), 1.0)
            clf.alpha = 0.5 * math.log((1.0 - min_error) / (min_error + self._EPS))

            # Update weights: y * h(x) is +1 for a correct vote and -1 for a
            # wrong one, so mistakes gain weight and correct samples lose it.
            weights *= np.exp(-clf.alpha * signed_y * clf.predict(X))
            # Normalize
            weights /= np.sum(weights)

            self.estimators.append(clf)

        return self

    def predict(self, X):
        """Return the predicted label for each row of X as a 1-D array.

        The result contains only label values seen in ``fit``. An exactly
        tied vote (score 0) is resolved in favour of the larger label.
        Raises RuntimeError if called before ``fit``.
        """
        if self.classes_ is None:
            raise RuntimeError("fit must be called before predict")
        X = np.asarray(X, dtype=float)
        score = np.zeros(X.shape[0])
        for clf in self.estimators:
            score += clf.alpha * clf.predict(X)

        return np.where(score >= 0, self.classes_[-1], self.classes_[0])

In [99]:
# Train one ensemble per size and report its error rate on the held-out test set.
for i in [3, 5, 10, 20]:
    ada = AdaBoost(n_estimators=i)
    ada.fit(train_X.values, train_y)
    y_pred = ada.predict(test_X.values)
    # Error rate = fraction of test samples whose predicted label differs from
    # the true label. Counting mismatches (rather than summing |y_pred - y|)
    # stays within [0, 1] whatever numeric values the labels take.
    error_rate = np.mean(np.asarray(y_pred) != test_y.values)
    print("Iterations: " + str(i), end=" ")
    print("\tError rate:", end=" ")
    print("%.3f" % error_rate)

Iterations: 3 	Error rate: 0.000
Iterations: 5 	Error rate: 0.000
Iterations: 10 	Error rate: 0.000
Iterations: 20 	Error rate: 0.000
