In [1]:
#save the spam dataset to the data folder 
import pandas as pd
import numpy 
import math

In [2]:
train = pd.read_csv('data/spambase/spam-train',header=None)
test = pd.read_csv('data/spambase/spam-test',header=None)

In [3]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.0,0.0,0.0,1.01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.088,0.0,0.0,0.088,0.0,6.718,33.0,215.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.098,0.589,0.0,0.0,0.0,0.0,2.044,22.0,92.0,1.0
2,0.0,0.53,0.0,0.0,1.06,0.0,1.6,0.0,0.0,0.53,...,0.0,0.239,0.079,0.159,0.0,0.0,4.555,51.0,123.0,1.0
3,0.0,0.0,0.23,0.0,0.92,0.0,0.0,0.0,0.23,0.0,...,0.0,0.13,0.026,0.026,0.0,0.026,2.222,23.0,480.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.428,4.0,10.0,0.0


In [4]:
X_train = train.drop(57,axis=1)
y_train = train[:][57]

In [5]:
X_test = test.drop(57,axis=1)
y_test = test[:][57]

In [6]:
#standardize columns to have all mean 0 and unit variance
def zmuv(X_in):
    X = X_in.copy()
    X = (X - X.mean(axis=0))/X.std(axis=0)
    return X

In [7]:
zmuv(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,-0.341197,-0.167892,-0.559543,-0.047041,0.990915,-0.358135,-0.27976,-0.274942,-0.3182,-0.395557,...,-0.114562,-0.160496,-0.219886,-0.164386,-0.449056,0.028504,-0.099663,0.054088,-0.129557,-0.114247
1,-0.341197,-0.167892,-0.559543,-0.047041,-0.45072,-0.358135,-0.27976,-0.274942,-0.3182,-0.395557,...,-0.114562,0.244309,2.059477,-0.164386,-0.449056,-0.297565,-0.099663,-0.091047,-0.215281,-0.306383
2,-0.341197,0.259688,-0.559543,-0.047041,1.062283,-0.358135,3.589996,-0.274942,-0.3182,0.46044,...,-0.114562,-0.160496,0.467108,0.603462,-0.169589,-0.297565,-0.099663,-0.013076,0.010717,-0.257958
3,-0.341197,-0.167892,-0.113615,-0.047041,0.862453,-0.358135,-0.27976,-0.274942,0.505942,-0.395557,...,-0.114562,-0.160496,-0.028801,0.088323,-0.403357,-0.297565,-0.037943,-0.08552,-0.207488,0.299705
4,-0.341197,-0.167892,-0.559543,-0.047041,-0.45072,-0.358135,-0.27976,-0.274942,-0.3182,-0.395557,...,-0.114562,-0.160496,-0.620253,-0.164386,-0.449056,-0.297565,-0.099663,-0.110175,-0.355555,-0.434474


In [8]:
#logistic regression model with gradient descent
def sigmoid(x):
    return 1./(1 + math.exp(-x))

v_sigmoid = numpy.vectorize(sigmoid)

In [9]:
#logistic regression model 
class LogR_model:
    
    def fit(self,X_train,y_train,alpha):
        X = X_train.copy()
        X['bias'] = numpy.ones(X.shape[0])
        
        numpy.random.seed(42)
        w = numpy.random.normal(0,1,size=X.shape[1])
        
        epsilon = 1
        while epsilon > 10e-5:
            dLdw = numpy.dot((v_sigmoid(numpy.dot(X,w)) - y_train),X)
            
            w_new = w - alpha*dLdw
            epsilon = numpy.linalg.norm(w_new - w) #euclidian distance
            w = w_new
            
        self.w = w
        return self
    
    def predict(self,X_test):
        X = X_test.copy()
        X['bias'] = numpy.ones(X.shape[0])
        
        return v_sigmoid(numpy.dot(self.w,X.T)) # sigmoid to squash outputs to [0,1]

In [10]:
#calculate error
def error(y_true,y_hat):
    incorrect = 0.
    for ii in range(len(y_true)):
        incorrect += abs(y_true[ii] - round(y_hat[ii]))
        
    return incorrect/len(y_true)

In [11]:
lr = LogR_model()
lr.fit(zmuv(X_train),y_train,0.00001)

<__main__.LogR_model at 0x116147890>

In [12]:
y_train_hat = lr.predict(zmuv(X_train))
print ('train error:', error(y_train,y_train_hat))

train error: 0.07112561174551386


In [13]:
y_test_hat = lr.predict(zmuv(X_test))
print ('test error:', error(y_test,y_test_hat))

test error: 0.06901041666666667
