## Ridge Regularization on SpamBase dataset

In [1350]:
import numpy as np
import pandas as pd
from numpy.linalg import inv

In [1351]:
def get_data(column_names):
    data_frame = pd.read_csv('./data/spambase.txt', sep = ',')
    data_frame.columns = column_names
    
    return data_frame

In [1352]:
column_names = ['word_freq_make','word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 
               'word_freq_over','word_freq_remove','word_freq_internet','word_freq_order','word_freq_mail',
               'word_freq_receive','word_freq_will','word_freq_people','word_freq_report','word_freq_addresses',
               'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you',
               'word_freq_credit','word_freq_your','word_freq_font','word_freq_000','word_freq_money',
               'word_freq_hp','word_freq_hpl','word_freq_george','word_freq_650','word_freq_lab',
               'word_freq_labs','word_freq_telnet','word_freq_857','word_freq_data','word_freq_415',
               'word_freq_85','word_freq_technology','word_freq_1999','word_freq_parts','word_freq_pm',
               'word_freq_direct','word_freq_cs','word_freq_meeting','word_freq_original','word_freq_project',
               'word_freq_re','word_freq_edu','word_freq_table','word_freq_conference','char_freq_;',
               'char_freq_(','char_freq_[','char_freq_!','char_freq_$','char_freq_#','capital_run_length_average',
               'capital_run_length_longest','capital_run_length_total','spam_label']
dataframe = get_data(column_names)

In [1353]:
def normalize(dataset):    
    means = dataset.mean(axis = 0)
    dataset = dataset - means
    return dataset, means

In [1354]:
def train_test_split(dataframe, test_percent = 20, val_percent = 15):
    test_data_size = int(len(dataframe) * test_percent / 100)
    test_data = dataframe[:test_data_size]
    train_data = dataframe[test_data_size:]
    
    return train_data, test_data

In [1355]:
train_data, test_data = train_test_split(dataframe)

In [1356]:
def get_regularized_weights(train_data, reg_strength):
    
    x = train_data.drop(['spam_label'], axis = 1).values
    y = train_data['spam_label'].values
    
    # center the data
    z, means = normalize(x) 
    z = np.append(np.ones([len(z),1]),z,1)
    
    I = np.eye(len(z[0]))
    I[0,0] = y.mean()
    
    inverse = inv(np.dot(z.T,z) + reg_strength * I)
    w = np.dot(np.dot(inverse, z.T), y)

    return w, means

In [1357]:
weights, means = get_regularized_weights(train_data, 0.15)

In [1358]:
def predict(test_data, weights, means):
    test_data = test_data.drop(['spam_label'], axis = 1).values
    
    test_data = test_data - means
    test_data = np.append(np.ones([len(test_data),1]),test_data,1)
 
    preds = {}
    
    for i in range(len(test_data)):
        preds[i] = rounder(np.dot(test_data[i], weights))
        
    return preds

In [1359]:
def rounder(x):
    if x >= 0.26:
        return 1
    return 0

In [1360]:
def get_mse(test_data, preds):
    test_labels = test_data['spam_label'].values
    errors = []

    for i, label in enumerate(test_labels):
        errors.append(np.square(label - preds[i]))
    
    mse = pd.Series(errors).mean()
    return mse

In [1361]:
preds = predict(test_data, weights, means)
print('MSE for SpamBase with Ridge Regularization: {}'.format(get_mse(test_data, preds)))

MSE for SpamBase with Ridge Regularization: 0.08586956521739131


In [1362]:
def accuracy(test_data, preds):
    test_labels = test_data['spam_label'].values
    
    correct_count = 0
    
    for i in range(len(preds)):
        if test_labels[i] == preds[i]:
            correct_count += 1
        
    return correct_count / len(test_labels)

In [1363]:
acc = accuracy(test_data, preds)
print('Accuracy for SpamBase with Ridge Regularization: {}'.format(acc))

Accuracy for SpamBase with Ridge Regularization: 0.9141304347826087
