In [1]:
import numpy as np
import pandas as pd
from numpy.linalg import pinv
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import scipy.optimize as sp
import random
from pprint import pprint

In [2]:
word_labels = ["make", "address", "all", "3d", "our", "over", "remove", "internet",
                "order", "mail", "receive", "will", "people", "report", "addresses",
                "free", "business", "email", "you", "credit", "your", "font", "000",
                "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
                "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
                "meeting", "original", "project", "re", "edu", "table", "conference", "char_freq1", "char_freq2", "char_freq3", 
              "char_freq4", "char_freq5", "char_freq6", "cap_run_length_avg", "cap_run_length_longest", "cap_run_length_total", "label"]
df = pd.read_csv("spambase/spambase.data", names = word_labels, header=None) 


In [3]:
df_norm = df.iloc[:, :-1]
df_norm = (df_norm - df_norm.mean()) / (df_norm.max() - df_norm.min())
df = df_norm.join(df.iloc[:, -1])
df

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq1,char_freq2,char_freq3,char_freq4,char_freq5,char_freq6,cap_run_length_avg,cap_run_length_longest,cap_run_length_total,label
0,-0.023029,0.029901,0.070460,-0.001528,0.000778,-0.016310,-0.015709,-0.009477,-0.017123,-0.013169,...,-0.008797,-0.014257,-0.004160,0.015670,-0.012629,-0.002231,-0.001303,0.000884,-0.000334,1
1,0.023226,0.004691,0.043009,-0.001528,-0.017222,0.031309,0.013176,-0.003177,-0.017123,0.038536,...,-0.008797,-0.000721,-0.004160,0.003169,0.017356,0.000190,-0.000070,0.004889,0.047015,1
2,-0.009814,-0.014917,0.084185,-0.001528,0.091778,0.016003,0.010425,0.001324,0.104550,0.000582,...,-0.006516,0.000407,-0.004160,0.000213,0.018023,-0.001727,0.004203,0.043335,0.124729,1
3,-0.023029,-0.014917,-0.055031,-0.001528,0.031778,-0.016310,0.026932,0.047228,0.041812,0.021484,...,-0.008797,-0.000208,-0.004160,-0.004066,-0.012629,-0.002231,-0.001502,-0.001219,-0.005826,1
4,-0.023029,-0.014917,-0.055031,-0.001528,0.031778,-0.016310,0.026932,0.047228,0.041812,0.021484,...,-0.008797,-0.000413,-0.004160,-0.004128,-0.012629,-0.002231,-0.001502,-0.001219,-0.005826,1
5,-0.023029,-0.014917,-0.055031,-0.001528,0.153778,-0.016310,-0.015709,0.157039,-0.017123,-0.013169,...,-0.008797,0.008610,-0.004160,-0.008285,-0.012629,-0.002231,-0.001990,-0.003722,-0.014475,1
6,-0.023029,-0.014917,-0.055031,-0.001528,0.160778,-0.016310,-0.015709,-0.009477,-0.017123,0.022034,...,-0.008797,-0.008719,-0.004160,-0.003235,-0.003633,-0.002231,-0.003196,-0.004823,-0.010814,1
7,-0.023029,-0.014917,-0.055031,-0.001528,0.156778,-0.016310,-0.015709,0.159739,-0.017123,-0.013169,...,-0.008797,0.006867,-0.004160,-0.008285,-0.012629,-0.002231,-0.002489,-0.004122,-0.014791,1
8,0.010010,-0.014917,0.035165,-0.001528,0.029778,-0.016310,0.025556,-0.009477,0.157782,0.028635,...,-0.008797,0.013533,-0.004160,-0.002712,0.021188,-0.001121,0.004133,0.039330,0.061472,1
9,-0.009814,-0.006514,0.095950,-0.001528,-0.012222,0.038112,0.036560,-0.009477,-0.005716,-0.013169,...,0.000325,-0.011180,-0.004160,-0.000772,0.000864,-0.002231,-0.003143,-0.000918,0.029401,1


In [4]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))
    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

In [5]:
random.seed(1)
train_df, test_df = train_test_split(df, 0.2)
train_df.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq1,char_freq2,char_freq3,char_freq4,char_freq5,char_freq6,cap_run_length_avg,cap_run_length_longest,cap_run_length_total,label
0,-0.023029,0.029901,0.07046,-0.001528,0.000778,-0.01631,-0.015709,-0.009477,-0.017123,-0.013169,...,-0.008797,-0.014257,-0.00416,0.01567,-0.012629,-0.002231,-0.001303,0.000884,-0.000334,1
1,0.023226,0.004691,0.043009,-0.001528,-0.017222,0.031309,0.013176,-0.003177,-0.017123,0.038536,...,-0.008797,-0.000721,-0.00416,0.003169,0.017356,0.00019,-7e-05,0.004889,0.047015,1
2,-0.009814,-0.014917,0.084185,-0.001528,0.091778,0.016003,0.010425,0.001324,0.10455,0.000582,...,-0.006516,0.000407,-0.00416,0.000213,0.018023,-0.001727,0.004203,0.043335,0.124729,1
3,-0.023029,-0.014917,-0.055031,-0.001528,0.031778,-0.01631,0.026932,0.047228,0.041812,0.021484,...,-0.008797,-0.000208,-0.00416,-0.004066,-0.012629,-0.002231,-0.001502,-0.001219,-0.005826,1
4,-0.023029,-0.014917,-0.055031,-0.001528,0.031778,-0.01631,0.026932,0.047228,0.041812,0.021484,...,-0.008797,-0.000413,-0.00416,-0.004128,-0.012629,-0.002231,-0.001502,-0.001219,-0.005826,1


In [6]:
def sigmoid(theta, X): 

    return 1.0/(1 + np.exp(-np.dot(X, theta.T))) 


In [7]:
def gradient(theta, X, y): 

    hypothesis_cost = sigmoid(theta, X) - y.reshape(X.shape[0], -1) 
    gradient = np.dot(hypothesis_cost.T, X) 
    return gradient 
  

In [8]:
def cost_function(theta, X, y): 

    hypothesis = sigmoid(theta, X) 
    y = np.squeeze(y) 
    term1 = y * np.log(hypothesis) 
    term2 = (1 - y) * np.log(1 - hypothesis) 
    calculate_cost = -term1 - term2 
    return np.mean(calculate_cost) 
  

In [9]:
def gradient_descent(X, y, theta, alpha=.001, max_cost=.001): 

    cost = cost_function(theta, X, y) 
    change_cost = np.inf
      
    while(change_cost > max_cost): 
        old_cost = cost 
        theta = theta - (alpha * gradient(theta, X, y)) 
        cost = cost_function(theta, X, y) 
        change_cost = old_cost - cost 
      
    return theta
  

In [10]:
def predict_values(theta, X): 

    pred_prob = sigmoid(theta, X) 
    pred_value = np.where(pred_prob >= .5, 1, 0) 
    return np.squeeze(pred_value) 

In [11]:

X = train_df.iloc[:, :-1]
X = np.array(X)


X = np.hstack((np.matrix(np.ones(X.shape[0])).T, X)) 

 
y = train_df.iloc[:, -1] 
y = np.array(y)

theta = np.matrix(np.zeros(X.shape[1])) 


theta = gradient_descent(X, y, theta) 


y_pred = predict_values(theta, X) 

print("Accuracy:", np.mean(y == y_pred)) 


Correctly predicted labels: 0.9299103504482478


In [12]:
X_test = test_df.iloc[:, :-1]
X_test = np.array(X_test)


X_test = np.hstack((np.matrix(np.ones(X_test.shape[0])).T, X_test)) 


y_test = test_df.iloc[:, -1] 
y_test = np.array(y_test)

In [13]:
y_pred_test = predict_values(theta, X_test) 

In [14]:
print("Accuracy:", np.mean(y_test == y_pred_test)) 

Correctly predicted labels: 0.9184782608695652


In [1]:
thresholds = np.linspace(2,-2,105)

ROC = np.zeros((105,2))

for i in range(105):
    t = thresholds[i]

    TP_t = np.logical_and( y_pred_test > t, y_test==1 ).sum()
    TN_t = np.logical_and( y_pred_test <=t, y_test==0 ).sum()
    FP_t = np.logical_and( y_pred_test > t, y_test==0 ).sum()
    FN_t = np.logical_and( y_pred_test <=t, y_test==1 ).sum()

    FPR_t = FP_t / float(FP_t + TN_t)
    ROC[i,0] = FPR_t

    TPR_t = TP_t / float(TP_t + FN_t)
    ROC[i,1] = TPR_t

# Plot the ROC curve.
fig = plt.figure(figsize=(6,6))
plt.plot(ROC[:,0], ROC[:,1], lw=2)
plt.xlabel('$FPR(t)$')
plt.ylabel('$TPR(t)$')
plt.grid()

NameError: name 'np' is not defined

In [16]:
AUC = 0.
for i in range(100):
    AUC += (ROC[i+1,0]-ROC[i,0]) * (ROC[i+1,1]+ROC[i,1])
AUC *= 0.5
AUC

0.9103401931541686