In [1]:
import numpy as np
import pandas as pd
import random
from __future__ import division, print_function
import math

In [2]:
word_labels = ["make", "address", "all", "3d", "our", "over", "remove", "internet",
                "order", "mail", "receive", "will", "people", "report", "addresses",
                "free", "business", "email", "you", "credit", "your", "font", "000",
                "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
                "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
                "meeting", "original", "project", "re", "edu", "table", "conference", "char_freq1", "char_freq2", "char_freq3", 
              "char_freq4", "char_freq5", "char_freq6", "cap_run_length_avg", "cap_run_length_longest", "cap_run_length_total", "label"]
df = pd.read_csv("../spambase/spambase.data", names = word_labels, header=None) 
df.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq1,char_freq2,char_freq3,char_freq4,char_freq5,char_freq6,cap_run_length_avg,cap_run_length_longest,cap_run_length_total,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [3]:
df['label'].replace(0, -1,inplace=True)
df.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq1,char_freq2,char_freq3,char_freq4,char_freq5,char_freq6,cap_run_length_avg,cap_run_length_longest,cap_run_length_total,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    index_list = df.index.tolist()
    test_indexes = random.sample(population=index_list, k=test_size)

    test_df = df.loc[test_indexes]
    train_df = df.drop(test_indexes)
    
    return train_df, test_df

In [5]:
random.seed(3)
train_df, test_df = train_test_split(df, 0.20)

In [6]:
X_train = train_df.iloc[: , :-1]
y_train = train_df.iloc[: , -1]

X_test = test_df.iloc[: , :-1]
y_test = test_df.iloc[: , -1]

In [7]:
# Determines if sample shall be classified as -1 or 1 given threshold
def make_clf_dict():
    clf_dict = {'polarity': 1, 'feature_index': None, 'threshold': None, 'alpha': None}
    return clf_dict

In [8]:
def predict(X):
    n_samples = np.shape(X)[0]
    y_pred = np.zeros((n_samples, 1))
    # For each classifier => label the samples
    for clf in clfs:
        # Set all predictions to '1' initially
        predictions = np.ones(np.shape(y_pred))
        # The indexes where the sample values are below threshold
        negative_idx = (clf['polarity'] * X[clf['feature_index']] < clf['polarity'] * clf['threshold'])
        # Label those as '-1'
        predictions[negative_idx] = -1
        # Add predictions weighted by the classifiers alpha
        # (alpha indicative of classifier's proficiency)
        y_pred += clf['alpha'] * predictions

    # Return sign of prediction sum
    y_pred = np.sign(y_pred).flatten()

    return y_pred




In [9]:
n_clf = 15

def fit(X, y):
    n_samples, n_features = np.shape(X)

    # Initialize weights to 1/N
    w = np.full(n_samples, (1 / n_samples))

    clfs = []
    # Iterate through classifiers
    for _ in range(n_clf):
        
        clf = make_clf_dict()
        # Minimum error given for using a certain feature value threshold
        # for predicting sample label
        min_error = float('inf')
        # Iterate throught every unique feature value and see what value
        # makes the best threshold for predicting y
        for feature_i in X_train.columns:
            feature_values = np.expand_dims(X[feature_i], axis=1)
            unique_values = np.unique(feature_values)
            # Try every unique feature value as threshold
            for threshold in unique_values:
                p = 1
                # Set all predictions to '1' initially
                prediction = np.ones(np.shape(y))
                # Label the samples whose values are below threshold as '-1'
                prediction[X[feature_i] < threshold] = -1
                # Error = sum of weights of misclassified samples
                error = sum(w[y != prediction])

                # If the error is over 50% we flip the polarity so that samples that
                # were classified as 0 are classified as 1, and vice versa
                # E.g error = 0.8 => (1 - error) = 0.2
                if error > 0.5:
                    error = 1 - error
                    p = -1

                # If this threshold resulted in the smallest error we save the
                # configuration
                if error < min_error:
                    clf['polarity'] = p
                    clf['threshold'] = threshold
                    clf['feature_index'] = feature_i
                    min_error = error
        # Calculate the alpha which is used to update the sample weights,
        # Alpha is also an approximation of this classifier's proficiency
        clf['alpha'] = 0.5 * math.log((1.0 - min_error) / (min_error + 1e-10))
        # Set all predictions to '1' initially
        predictions = np.ones(np.shape(y))
        # The indexes where the sample values are below threshold
        negative_idx = (clf['polarity'] * X[clf['feature_index']] < clf['polarity'] * clf['threshold'])
        # Label those as '-1'
        predictions[negative_idx] = -1
        # Calculate new weights 
        # Missclassified samples gets larger weights and correctly classified samples smaller
        w *= np.exp(-clf['alpha'] * y * predictions)
        # Normalize to one
        w /= np.sum(w)

        # Save classifier
        clfs.append(clf)
    return clfs



In [10]:

#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Adaboost classification with 5 weak classifiers
# X_train = np.array(X_train)
# y_train = np.array(y_train)
clfs = fit(X_train, y_train)
y_pred = predict(X_test)
y_train_pred = predict(X_train)




In [11]:
l = list()
a = 5
b = 6
c = 9
l.append(a)
l.append(b)
l.append(c)

In [12]:
l

[5, 6, 9]

In [13]:
np.mean(np.array(y_test) == y_pred)

0.9065217391304348