In [1]:
import numpy as np
import pandas as pd
import random
from __future__ import division, print_function
import math

In [2]:
word_labels = ["make", "address", "all", "3d", "our", "over", "remove", "internet",
                "order", "mail", "receive", "will", "people", "report", "addresses",
                "free", "business", "email", "you", "credit", "your", "font", "000",
                "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
                "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
                "meeting", "original", "project", "re", "edu", "table", "conference", "char_freq1", "char_freq2", "char_freq3", 
              "char_freq4", "char_freq5", "char_freq6", "cap_run_length_avg", "cap_run_length_longest", "cap_run_length_total", "label"]
df = pd.read_csv("../spambase/spambase.data", names = word_labels, header=None) 
df.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq1,char_freq2,char_freq3,char_freq4,char_freq5,char_freq6,cap_run_length_avg,cap_run_length_longest,cap_run_length_total,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [3]:
df['label'].replace(0, -1,inplace=True)
df.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq1,char_freq2,char_freq3,char_freq4,char_freq5,char_freq6,cap_run_length_avg,cap_run_length_longest,cap_run_length_total,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    index_list = df.index.tolist()
    test_indexes = random.sample(population=index_list, k=test_size)

    test_df = df.loc[test_indexes]
    train_df = df.drop(test_indexes)
    
    return train_df, test_df

In [5]:
random.seed(3)
train_df, test_df = train_test_split(df, 0.20)

In [6]:
X_train = train_df.iloc[: , :-1]
y_train = train_df.iloc[: , -1]

X_test = test_df.iloc[: , :-1]
y_test = test_df.iloc[: , -1]

In [7]:
# Determines if sample shall be classified as -1 or 1 given threshold
def make_clf_dict():
    clf_dict = {'label': 1, 'feature_name': None, 'threshold': None, 'alpha': None}
    return clf_dict

In [8]:
def predict(X):
    n_samples = np.shape(X)[0]
    y_pred = np.zeros((n_samples, 1))
    # For each classifier => label the samples
    for clf in clfs:
        # Set all predictions to '1' initially
        predictions = np.ones(np.shape(y_pred))
        # The indexes where the sample values are below threshold
        negative_idx = (clf['label'] * X[clf['feature_name']] < clf['label'] * clf['threshold'])
        # Label those as '-1'
        predictions[negative_idx] = -1
        # Add predictions weighted by the classifiers alpha
        # (alpha indicative of classifier's proficiency)
        y_pred += clf['alpha'] * predictions

    # Return sign of prediction sum
    y_pred = np.sign(y_pred).flatten()

    return y_pred




In [9]:
n_clf = 15

def fit(X, y):
    n_samples, n_features = np.shape(X)

    # Initialize weights to 1/N
    w = np.full(n_samples, (1 / n_samples))

    clfs = []
    # Iterate through classifiers
    for _ in range(n_clf):
        diff = []
        clf = make_clf_dict()
        # Minimum error given for using a certain feature value threshold
        # for predicting sample label
        min_error = float('inf')
        # Iterate throught every unique feature value and see what value
        # makes the best threshold for predicting y
        for column in X_train.columns:
            feature_values = np.expand_dims(X[column], axis=1)
            unique_values = np.unique(feature_values)
            # Try every unique feature value as threshold
            for threshold in unique_values:
                p = 1
                # Set all predictions to '1' initially
                prediction = np.ones(np.shape(y))
                # Label the samples whose values are below threshold as '-1'
                prediction[X[column] < threshold] = -1
                # Error = sum of weights of misclassified samples
                error = sum(w[y != prediction])

                # If the error is over 50% we flip the label so that samples that
                # were classified as 0 are classified as 1, and vice versa
                # E.g error = 0.8 => (1 - error) = 0.2
                if error > 0.5:
                    error = 1 - error
                    p = -1

                # If this threshold resulted in the smallest error we save the
                # configuration
                if error < min_error:
                    clf['label'] = p
                    clf['threshold'] = threshold
                    clf['feature_name'] = column
                    min_error = error
        # Calculate the alpha which is used to update the sample weights,
        # Alpha is also an approximation of this classifier's proficiency
        clf['alpha'] = 0.5 * math.log((1.0 - min_error) / (min_error + 1e-10))
        # Set all predictions to '1' initially
        predictions = np.ones(np.shape(y))
        # The indexes where the sample values are below threshold
        negative_idx = (clf['label'] * X[clf['feature_name']] < clf['label'] * clf['threshold'])
        diff.append(abs(clf['label'] * X[clf['feature_name']] - clf['label'] * clf['threshold']))
        diff.sort()
        s = len(diff)
        per = round((s*2)/100)
        dat = diff[:per]
        # Label those as '-1'
        predictions[negative_idx] = -1
        # Calculate new weights 
        # Missclassified samples gets larger weights and correctly classified samples smaller
        w *= np.exp(-clf['alpha'] * y * predictions)
        # Normalize to one
        w /= np.sum(w)

        # Save classifier
        clfs.append(clf)
    return clfs



In [10]:
clfs = fit(X_train, y_train)
y_pred = predict(X_test)
y_train_pred = predict(X_train)




0       0.694
1       0.288
2       0.192
3       0.053
4       0.051
5       0.084
6       0.080
7       0.084
8       0.097
9       0.160
10      0.378
11      0.579
12      0.702
16      0.344
17      1.891
18      0.371
20      0.645
21      0.166
23      0.583
24      0.308
25      0.166
26      0.308
27      0.284
30      0.802
31      2.028
32      0.084
33      0.084
34      0.084
35      1.459
36      0.789
        ...  
4566    0.084
4567    0.084
4568    0.084
4569    0.084
4570    0.084
4571    0.002
4572    0.084
4573    0.015
4574    0.084
4575    0.084
4576    0.084
4577    0.084
4580    0.084
4582    0.084
4583    0.084
4584    0.043
4586    0.084
4587    0.084
4588    0.084
4589    0.084
4590    0.084
4591    0.084
4592    0.084
4593    0.968
4594    0.084
4595    0.084
4597    0.269
4598    0.084
4599    0.084
4600    0.041
Name: char_freq4, Length: 3681, dtype: float64
0       0.019
1       0.161
2       0.165
3       0.019
4       0.019
5       0.019
6       0.035
7

0       0.08
1       0.26
2       0.83
3       0.23
4       0.23
5       1.45
6       1.52
7       1.48
8       0.21
9       0.21
10      0.40
11      0.02
12      0.06
16      0.54
17      0.40
18      0.71
20      0.40
21      0.36
23      0.76
24      0.40
25      0.36
26      0.40
27      0.40
30      0.40
31      0.40
32      1.49
33      0.40
34      0.40
35      0.40
36      2.54
        ... 
4566    0.05
4567    0.40
4568    0.27
4569    0.15
4570    0.40
4571    0.17
4572    0.40
4573    0.22
4574    0.40
4575    0.40
4576    0.40
4577    0.40
4580    0.40
4582    0.40
4583    0.40
4584    0.40
4586    0.04
4587    0.40
4588    0.40
4589    0.14
4590    0.40
4591    0.40
4592    2.10
4593    0.40
4594    0.40
4595    0.40
4597    0.40
4598    0.40
4599    0.08
4600    0.40
Name: our, Length: 3681, dtype: float64
0       0.06
1       0.01
2       0.06
3       0.06
4       0.06
5       0.06
6       0.06
7       0.06
8       0.06
9       0.06
10      0.06
11      0.06
12      0.0

In [11]:
clfs

[{'label': 1,
  'feature_name': 'char_freq4',
  'threshold': 0.084,
  'alpha': 0.6882377113777169},
 {'label': 1,
  'feature_name': 'char_freq5',
  'threshold': 0.019,
  'alpha': 0.6228088061977658},
 {'label': -1,
  'feature_name': 'hp',
  'threshold': 0.1,
  'alpha': 0.4269492682024884},
 {'label': 1,
  'feature_name': 'remove',
  'threshold': 0.02,
  'alpha': 0.4851532637554392},
 {'label': 1,
  'feature_name': 'cap_run_length_longest',
  'threshold': 10,
  'alpha': 0.36781909063787244},
 {'label': 1,
  'feature_name': 'your',
  'threshold': 0.62,
  'alpha': 0.2619125490356106},
 {'label': -1,
  'feature_name': 'george',
  'threshold': 0.01,
  'alpha': 0.2526422307316929},
 {'label': 1,
  'feature_name': 'free',
  'threshold': 1.08,
  'alpha': 0.27059219164757936},
 {'label': -1,
  'feature_name': 're',
  'threshold': 0.13,
  'alpha': 0.2375124945508494},
 {'label': -1,
  'feature_name': 'char_freq4',
  'threshold': 0.037000000000000005,
  'alpha': 0.1914944565533873},
 {'label': 1,

In [12]:
np.mean(np.array(y_test) == y_pred)

0.9065217391304348

In [13]:
testPredict = pd.DataFrame()
trainPredict = pd.DataFrame()
testPredict['y_predict'] = y_pred
trainPredict['y_predict'] = y_train_pred


testPredict.to_csv('y_test_predict.csv')
trainPredict.to_csv('y_train_predict.csv')