In [16]:
import numpy as np
import pandas as pd
from numpy.linalg import inv

In [17]:
def get_data(column_names):
    data_frame = pd.read_csv('./data/spambase.txt', sep = ',')
    data_frame.columns = column_names
    
    return data_frame

In [18]:
column_names = ['word_freq_make','word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 
               'word_freq_over','word_freq_remove','word_freq_internet','word_freq_order','word_freq_mail',
               'word_freq_receive','word_freq_will','word_freq_people','word_freq_report','word_freq_addresses',
               'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you',
               'word_freq_credit','word_freq_your','word_freq_font','word_freq_000','word_freq_money',
               'word_freq_hp','word_freq_hpl','word_freq_george','word_freq_650','word_freq_lab',
               'word_freq_labs','word_freq_telnet','word_freq_857','word_freq_data','word_freq_415',
               'word_freq_85','word_freq_technology','word_freq_1999','word_freq_parts','word_freq_pm',
               'word_freq_direct','word_freq_cs','word_freq_meeting','word_freq_original','word_freq_project',
               'word_freq_re','word_freq_edu','word_freq_table','word_freq_conference','char_freq_;',
               'char_freq_(','char_freq_[','char_freq_!','char_freq_$','char_freq_#','capital_run_length_average',
               'capital_run_length_longest','capital_run_length_total','spam_label']
dataframe = get_data(column_names)
dataframe.describe()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam_label
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,...,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,0.104576,0.212922,0.280578,0.065439,0.312222,0.095922,0.114233,0.105317,0.090087,0.239465,...,0.038583,0.139061,0.01698,0.26896,0.075827,0.044248,5.191827,52.17087,283.290435,0.393913
std,0.305387,1.2907,0.50417,1.395303,0.672586,0.27385,0.39148,0.401112,0.278643,0.644816,...,0.243497,0.270377,0.109406,0.815726,0.245906,0.429388,31.732891,194.912453,606.413764,0.488669
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.2755,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.3825,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.31425,0.052,0.0,3.70525,43.0,265.25,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


In [19]:
def normalize(dataset):
    
    maxs = dataset.max()
    mins = dataset.min()
    
    for feature in dataset.columns[:-1]:        
        for i, entry in dataset.iterrows():
            dataset.at[i, feature] = (entry[feature] - mins[feature]) / (maxs[feature] - mins[feature])
            
    return dataset

In [20]:
# dataframe = normalize(dataframe)

In [21]:
def train_test_split(dataframe, percent = 20):
    test_data_size = int(len(dataframe) * percent / 100)
    test_data = dataframe[:test_data_size]    
    train_data = dataframe[test_data_size:]
    
    return train_data, test_data

In [22]:
train_data, test_data = train_test_split(dataframe)

In [23]:
def get_weights(train_data):
    x = train_data.drop(['spam_label'], axis = 1).values
    y = train_data['spam_label'].values

    bias = np.ones(len(train_data))
    x = np.append(np.ones([len(x),1]),x,1)
 
    w = np.dot(np.dot(inv(np.dot(x.T, x)), x.T), y)
    return w

In [24]:
weights = get_weights(train_data)

In [25]:
def predict(test_data, weights):
    test_data = test_data.drop(['spam_label'], axis = 1).values
    
    bias = np.ones(len(test_data))
    test_data = np.append(np.ones([len(test_data),1]),test_data,1)
 
    preds = {}
    
    for i in range(len(test_data)):
        preds[i] = rounder(np.dot(test_data[i], weights))
        
    return preds

In [26]:
def rounder(x):
    if x >= 0.26:
        return 1
    return 0

In [27]:
def get_mse(test_data, preds):
    test_labels = test_data['spam_label'].values
    errors = []

    for i, label in enumerate(test_labels):
        errors.append(np.square(label - preds[i]))
    
    mse = pd.Series(errors).mean()
    return mse

In [28]:
preds = predict(test_data, weights)
print('MSE test for SpamBase: {}'.format(get_mse(test_data, preds)))

MSE test for SpamBase: 0.08586956521739131


In [29]:
def accuracy(test_data, preds):
    test_labels = test_data['spam_label'].values
    
    correct_count = 0
    
    for i in range(len(preds)):
        if test_labels[i] == preds[i]:
            correct_count += 1
        
    return correct_count / len(test_labels)

In [30]:
acc = accuracy(test_data, preds)
print('Accuracy for SpamBase: {}'.format(acc))

Accuracy for SpamBase: 0.9141304347826087


In [31]:
preds_train = predict(train_data, weights)
print('MSE train spambase: {}'.format(get_mse(train_data, preds_train)))

MSE train spambase: 0.1092391304347826


In [32]:
acc_train = accuracy(train_data, preds_train)
print('Accuracy for SpamBase: {}'.format(acc_train))

Accuracy for SpamBase: 0.8907608695652174
