In [17]:
import numpy as np
import pandas as pd
import random
import math
import sys

In [18]:
df = pd.read_csv("CRX/crx.data", delimiter='\t', header=None) 
df.columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'label']
df.head()


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,label
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [19]:
X = pd.get_dummies(df.drop('label', axis=1))

In [20]:
X.head()

Unnamed: 0,f2,f7,f10,f14,f0_?,f0_a,f0_b,f1_13.75,f1_15.17,f1_15.75,...,f13_00680,f13_00711,f13_00720,f13_00760,f13_00840,f13_00928,f13_00980,f13_01160,f13_02000,f13_?
0,0.0,1.25,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.46,3.04,6,560,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.5,1.5,0,824,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.54,3.75,5,3,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.625,1.71,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
df['label'].replace('-', -1,inplace=True)
df['label'].replace('+', 1,inplace=True)
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,label
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,1


In [24]:
y = df.iloc[:, -1]
df = X.join(y)

In [25]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    index_list = df.index.tolist()
    test_indexes = random.sample(population=index_list, k=test_size)

    test_df = df.loc[test_indexes]
    train_df = df.drop(test_indexes)
    
    return train_df, test_df

In [26]:
random.seed(3)
train_df, test_df = train_test_split(df, 0.20)

In [27]:
X_train = train_df.iloc[: , :-1]
y_train = train_df.iloc[: , -1]

X_test = test_df.iloc[: , :-1]
y_test = test_df.iloc[: , -1]

In [28]:
# Determines if sample shall be classified as -1 or 1 given threshold
def make_clf_dict():
    clf_dict = {'label': 1, 'column': None, 'threshold': None, 'alpha': None}
    return clf_dict

In [29]:
def predict(X):
    n_samples = np.shape(X)[0]
    y_pred = np.zeros((n_samples, 1))
    for clf in classifiers:
        predictions = np.ones(np.shape(y_pred))
        index_neg = (clf['label'] * X[clf['column']] < clf['label'] * clf['threshold'])
        predictions[index_neg] = -1
        y_pred += clf['alpha'] * predictions

    y_pred = np.sign(y_pred).flatten()

    return y_pred




In [30]:
iters = 15

def adaboost(X, y):
    n_samples, n_features = np.shape(X)
    w = np.full(n_samples, (1 / n_samples))

    classifiers = []

    for i in range(iters):
       
      clf = make_clf_dict()

        min_err = float('inf')

        for column in X_train.columns:
            feature_values = np.expand_dims(X[column], axis=1)
            unique_values = np.unique(feature_values)

            for threshold in unique_values:
                p = 1

                prediction = np.ones(np.shape(y))
                prediction[X[column] < threshold] = -1
                error = sum(w[y != prediction])

                if error > 0.5:
                    error = 1 - error
                    p = -1

                if error < min_err:
                    clf['label'] = p
                    clf['threshold'] = threshold
                    clf['column'] = column
                    min_err = error
        clf['alpha'] = 0.5 * math.log((1.0 - min_err) / (min_err + 1e-10))
        predictions = np.ones(np.shape(y))
        index_neg = (clf['label'] * X[clf['column']] < clf['label'] * clf['threshold'])
        predictions[index_neg] = -1
        w *= np.exp(-clf['alpha'] * y * predictions)
        w /= np.sum(w)

        classifiers.append(clf)
    return classifiers



In [31]:
classifiers = adaboost(X_train, y_train)
y_pred = predict(X_test)
y_train_pred = predict(X_train)




In [32]:
np.mean(np.array(y_test) == y_pred)

0.8768115942028986