In [17]:
import numpy as np
import pandas as pd
import random
import math
import sys

In [18]:
df = pd.read_csv("CRX/crx.data", delimiter='\t', header=None) 
df.columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'label']
df.head()


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,label
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [19]:
X = pd.get_dummies(df.drop('label', axis=1))

In [20]:
X.head()

Unnamed: 0,f2,f7,f10,f14,f0_?,f0_a,f0_b,f1_13.75,f1_15.17,f1_15.75,...,f13_00680,f13_00711,f13_00720,f13_00760,f13_00840,f13_00928,f13_00980,f13_01160,f13_02000,f13_?
0,0.0,1.25,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.46,3.04,6,560,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.5,1.5,0,824,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.54,3.75,5,3,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.625,1.71,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
df['label'].replace('-', -1,inplace=True)
df['label'].replace('+', 1,inplace=True)
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,label
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,1


In [24]:
y = df.iloc[:, -1]
df = X.join(y)

In [25]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    index_list = df.index.tolist()
    test_indexes = random.sample(population=index_list, k=test_size)

    test_df = df.loc[test_indexes]
    train_df = df.drop(test_indexes)
    
    return train_df, test_df

In [26]:
random.seed(3)
train_df, test_df = train_test_split(df, 0.20)

In [27]:
X_train = train_df.iloc[: , :-1]
y_train = train_df.iloc[: , -1]

X_test = test_df.iloc[: , :-1]
y_test = test_df.iloc[: , -1]

In [28]:
# Determines if sample shall be classified as -1 or 1 given threshold
def make_clf_dict():
    clf_dict = {'polarity': 1, 'feature_index': None, 'threshold': None, 'alpha': None}
    return clf_dict

In [29]:
def predict(X):
    n_samples = np.shape(X)[0]
    y_pred = np.zeros((n_samples, 1))
    # For each classifier => label the samples
    for clf in clfs:
        # Set all predictions to '1' initially
        predictions = np.ones(np.shape(y_pred))
        # The indexes where the sample values are below threshold
        negative_idx = (clf['polarity'] * X[clf['feature_index']] < clf['polarity'] * clf['threshold'])
        # Label those as '-1'
        predictions[negative_idx] = -1
        # Add predictions weighted by the classifiers alpha
        # (alpha indicative of classifier's proficiency)
        y_pred += clf['alpha'] * predictions

    # Return sign of prediction sum
    y_pred = np.sign(y_pred).flatten()

    return y_pred




In [30]:
n_clf = 15

def fit(X, y):
    n_samples, n_features = np.shape(X)

    # Initialize weights to 1/N
    w = np.full(n_samples, (1 / n_samples))

    clfs = []
    # Iterate through classifiers
    for _ in range(n_clf):
        
        clf = make_clf_dict()
        # Minimum error given for using a certain feature value threshold
        # for predicting sample label
        min_error = float('inf')
        # Iterate throught every unique feature value and see what value
        # makes the best threshold for predicting y
        for feature_i in X_train.columns:
            feature_values = np.expand_dims(X[feature_i], axis=1)
            unique_values = np.unique(feature_values)
            # Try every unique feature value as threshold
            for threshold in unique_values:
                p = 1
                # Set all predictions to '1' initially
                prediction = np.ones(np.shape(y))
                # Label the samples whose values are below threshold as '-1'
                prediction[X[feature_i] < threshold] = -1
                # Error = sum of weights of misclassified samples
                error = sum(w[y != prediction])

                # If the error is over 50% we flip the polarity so that samples that
                # were classified as 0 are classified as 1, and vice versa
                # E.g error = 0.8 => (1 - error) = 0.2
                if error > 0.5:
                    error = 1 - error
                    p = -1

                # If this threshold resulted in the smallest error we save the
                # configuration
                if error < min_error:
                    clf['polarity'] = p
                    clf['threshold'] = threshold
                    clf['feature_index'] = feature_i
                    min_error = error
        # Calculate the alpha which is used to update the sample weights,
        # Alpha is also an approximation of this classifier's proficiency
        clf['alpha'] = 0.5 * math.log((1.0 - min_error) / (min_error + 1e-10))
        # Set all predictions to '1' initially
        predictions = np.ones(np.shape(y))
        # The indexes where the sample values are below threshold
        negative_idx = (clf['polarity'] * X[clf['feature_index']] < clf['polarity'] * clf['threshold'])
        # Label those as '-1'
        predictions[negative_idx] = -1
        # Calculate new weights 
        # Missclassified samples gets larger weights and correctly classified samples smaller
        w *= np.exp(-clf['alpha'] * y * predictions)
        # Normalize to one
        w /= np.sum(w)

        # Save classifier
        clfs.append(clf)
    return clfs



In [31]:

#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Adaboost classification with 5 weak classifiers
# X_train = np.array(X_train)
# y_train = np.array(y_train)
clfs = fit(X_train, y_train)
y_pred = predict(X_test)
y_train_pred = predict(X_train)




In [32]:
np.mean(np.array(y_test) == y_pred)

0.8768115942028986