In [1]:
import numpy as np
import pandas as pd
from numpy.linalg import pinv
import random
import math


In [2]:
word_labels = ["make", "address", "all", "3d", "our", "over", "remove", "internet",
                "order", "mail", "receive", "will", "people", "report", "addresses",
                "free", "business", "email", "you", "credit", "your", "font", "000",
                "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
                "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
                "meeting", "original", "project", "re", "edu", "table", "conference", "char_freq1", "char_freq2", "char_freq3", 
              "char_freq4", "char_freq5", "char_freq6", "cap_run_length_avg", "cap_run_length_longest", "cap_run_length_total", "label"]
df = pd.read_csv("spambase/spambase.data", names = word_labels, header=None) 
df_norm = df.iloc[:, :-1]
df_norm = (df_norm - df_norm.mean()) / df_norm.std()
df = df_norm.join(df.iloc[:, -1])

In [3]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    index_list = df.index.tolist()
    test_indices = random.sample(population=index_list, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

In [4]:
random.seed(0)
train_df, test_df = train_test_split(df, test_size=0.20)

In [5]:
X = train_df.iloc[:, :-1]
y = train_df.iloc[:, -1]
m,n = np.shape(X)
x0 = np.ones((m,1))
X = np.c_[x0,X]

In [6]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [8]:
def hessian(X,h):
    return np.dot(X.T,(np.diag(h*(1-h)).dot(X))) 


In [7]:
def cost_func(h, y):
    return np.sum((y * np.log(h) + (1 - y) * np.log(1 - h)))

In [9]:
def Newton_logistic(X,y):
    w = np.zeros(X.shape[1])
    cost=0
    for i in range(1000):

        hypothesis = sigmoid(np.dot(X, w))
        gradient = np.dot(X.T, (y-hypothesis)) 
        hessian_inv=pinv(hessian(X,hypothesis))
        w+=hessian_inv.dot(gradient) 
        prev_cost = cost
        cost=cost_func(hypothesis,y) 

        if cost-prev_cost==0:
            break;

    return w

In [10]:
X

array([[ 1.        , -0.34239649, -0.16505397, ..., -0.05214446,
        -0.06245937, -0.15220518],
       [ 1.        , -0.34239649, -0.16505397, ..., -0.05214446,
        -0.06245937, -0.15220518],
       [ 1.        , -0.34239649, -0.16505397, ..., -0.0690688 ,
        -0.190736  , -0.3781481 ],
       ...,
       [ 1.        , -0.34239649, -0.16505397, ..., -0.11461009,
        -0.24717771, -0.44411683],
       [ 1.        ,  0.6400583 , -0.16505397, ..., -0.11936908,
        -0.23691558, -0.27259812],
       [ 1.        , -0.34239649, -0.16505397, ..., -0.12422262,
        -0.24204665, -0.40123715]])

In [11]:
w = Newton_logistic(X,y)
w

  


array([-5.94527476e+01, -1.33449187e-01, -1.85749716e-01,  6.24221512e-02,
        3.79534269e+00,  4.20289344e-01,  2.32266433e-01,  9.91509565e-01,
        2.10457076e-01,  1.30751029e-01,  7.71657310e-02, -5.42988414e-02,
       -1.18413564e-01, -3.33087973e-02,  2.77924477e-02,  4.09812115e-01,
        7.81961346e-01,  3.40764088e-01,  5.86983604e-02,  1.02157272e-01,
        5.83730334e-01,  2.81416003e-01,  1.45800147e-01,  9.10905965e-01,
        3.08683675e-01, -2.66582400e+00, -7.28951663e-01, -6.12895989e+01,
        2.33120530e-01, -2.09177510e+00, -5.99440667e-01,  2.72769432e-01,
        7.55766121e-01, -5.78066702e-01,  1.56771264e-01, -1.18698671e+00,
        3.47795633e-01, -1.05418297e-02, -1.20614892e-01, -4.59790579e-01,
       -1.20886828e-01, -3.67179751e+02, -2.10977447e+00, -2.32631229e-01,
       -9.76409242e-01, -7.51189050e-01, -1.20893991e+00, -2.04529540e-01,
       -1.14114171e+00, -2.74997275e-01, -5.05540079e-02, -7.30473954e-02,
        4.18940777e-01,  

In [12]:
y_pred = w.T.dot(X.T)

In [13]:
y_pred

array([ 1.20454712,  1.20389388,  0.72672328, ..., -3.22468089,
       -3.41772128, -3.38060643])

In [14]:
threshold = 0.0
y_pred = []
for i in range(0, len(X-1)):
    val = np.dot(X[i], w)   

    if val > threshold:                               
        y_predict = 1                               
    else:                                   
        y_predict = 0
    y_pred.append(y_predict)

In [15]:
print("Accuracy:", np.mean(y == y_pred)) 

Accuracy: 0.9331703341483293


In [16]:
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]
m,n = np.shape(X_test)
x0 = np.ones((m,1))
X_test = np.c_[x0,X_test]

In [17]:
threshold = 0.0
y_pred_test = []
for i in range(0, len(X_test-1)):
    val = np.dot(X_test[i], w)   

    if val > threshold:                               
        y_predict = 1                               
    else:                                   
        y_predict = 0
    y_pred_test.append(y_predict)

In [18]:
print("Accuracy:", np.mean(y_test == y_pred_test)) 

Accuracy: 0.9260869565217391
