In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
import random



### Setup: Parameters and data setup

#### Sigmoid function

$$y = \frac{\mathrm{1}}{\mathrm{1} + e^{-x}}$$

In [2]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

#### Data Preparation

In [3]:
df = pd.read_csv("./email_spam.csv")

df.tail(100)

Unnamed: 0.1,Unnamed: 0,spam,to_multiple,from,cc,sent_email,image,attach,dollar,winner,inherit,password,num_char,line_breaks,format,re_subj,exclaim_subj,urgent_subj,exclaim_mess,number
3821,3822,1,no,yes,no,no,no,yes,no,no,no,no,6.340,274,HTML,no,yes,no,3,big
3822,3823,1,no,yes,no,no,no,no,no,no,no,no,0.247,9,HTML,no,no,no,0,none
3823,3824,1,no,yes,no,no,no,no,no,no,no,no,13.252,189,Plain,no,no,no,0,none
3824,3825,1,no,yes,no,no,no,no,no,no,no,no,0.150,10,Plain,no,no,no,0,none
3825,3826,1,no,yes,no,no,no,no,no,no,no,no,0.180,7,Plain,yes,no,no,0,none
3826,3827,1,no,yes,no,no,no,no,no,no,no,no,14.019,205,Plain,no,no,no,1,none
3827,3828,1,no,yes,no,no,no,no,no,no,no,no,0.871,15,HTML,no,no,no,1,small
3828,3829,1,no,yes,no,no,no,no,no,no,no,no,2.540,80,HTML,no,no,no,1,small
3829,3830,1,no,yes,no,no,no,yes,no,no,no,no,0.341,18,Plain,no,no,no,0,none
3830,3831,1,no,yes,no,no,no,no,no,no,no,no,0.992,15,HTML,no,no,no,3,small


In [54]:
unnamed_col = data.columns.tolist()[0]
data = df.drop([unnamed_col], axis=1)

data.head()

Unnamed: 0,spam,to_multiple,from,cc,sent_email,image,attach,dollar,winner,inherit,password,num_char,line_breaks,format,re_subj,exclaim_subj,urgent_subj,exclaim_mess,number
0,0,no,yes,no,no,no,no,no,no,no,no,11.37,202,HTML,no,no,no,0,big
1,0,no,yes,no,no,no,no,no,no,no,no,10.504,202,HTML,no,no,no,1,small
2,0,no,yes,no,no,no,no,yes,no,yes,no,7.773,192,HTML,no,no,no,6,small
3,0,no,yes,no,no,no,no,no,no,no,no,13.256,255,HTML,no,no,no,48,small
4,0,no,yes,no,no,no,no,no,no,no,yes,1.231,29,Plain,no,no,no,1,none


In [55]:
# convert strings to numerical values
keywords = ['yes', 'no', 'HTML', 'Plain', 'none', 'big', 'small']
mapping = [1, 0, 0, 1, 0, 1, 2]

data = data.replace(keywords, mapping)

data.head()

Unnamed: 0,spam,to_multiple,from,cc,sent_email,image,attach,dollar,winner,inherit,password,num_char,line_breaks,format,re_subj,exclaim_subj,urgent_subj,exclaim_mess,number
0,0,0,1,0,0,0,0,0,0,0,0,11.37,202,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,0,10.504,202,0,0,0,0,1,2
2,0,0,1,0,0,0,0,1,0,1,0,7.773,192,0,0,0,0,6,2
3,0,0,1,0,0,0,0,0,0,0,0,13.256,255,0,0,0,0,48,2
4,0,0,1,0,0,0,0,0,0,0,1,1.231,29,1,0,0,0,1,0


In [56]:
# normalize data
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05981,0.049988,0.0,0.0,0.0,0.0,0.0,0.5
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055254,0.049988,0.0,0.0,0.0,0.0,0.000809,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.040887,0.047501,0.0,0.0,0.0,0.0,0.004854,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069732,0.063168,0.0,0.0,0.0,0.0,0.038835,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.006471,0.006963,1.0,0.0,0.0,0.0,0.000809,0.0


In [67]:
# split to train and test sets for holdout crossvalidation
features = data.iloc[:,1:].values
labels = data.iloc[:,0].values

x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=0)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((3136, 18), (3136,), (785, 18), (785,))

#### Log odds or Logits of the independent variables

for,

$$ \beta_{0} + \beta_1 x_1 + \beta_1 x_1 + \dots + \beta_n x_n $$

General Model,

$$ logit(P_{disease}) = \log{\bigg(\frac{P_{disease}}{1 - P_{disease}}\bigg)} = \beta_{0} + \beta_1 x_1 + \beta_1 x_1 + \dots + \beta_n x_n $$

logistics function:

$$ \operatorname{Pr} = \frac{\exp(\beta_{0} + \beta_{1} x_1 + \beta_{2} x_2 + \dots + \beta_{n} x_n)} {1 + \exp(\beta_{0} + \beta_{1} x_1 + \beta_{2} x_2 + \dots + \beta_{n} x_n)} \label{eq:glm1} $$

#### Explanation of a Single Newton Step

Newton's method for maximizing / minimizing a given function $f(\beta)$ iteratively computes the following estimate:

$$
\beta^+ = \beta - Hf(\beta)^{-1}\nabla f(\beta)
$$

The Hessian of the log-likelihood for logistic regression is given by:

hessian of our function = **X => negative tranpose of (N times (p+1))** times **W => (N x N diagional matrix of weights, each is p*(1-p))** times **X** again

$$
Hf(\beta) = -X^TWX
$$
and the gradient is:

gradient of our function = **tranpose of X** times (**column vector** - **N vector of probabilities**)

$$
\nabla f(\beta) = X^T(y-p)
$$
where $$
W := \text{diag}\left(p(1-p)\right)
$$ and $p$ are the predicted probabilites computed at the current value of $\beta$.

In [250]:
def newtons_method_steps(current_beta, X, Y, learning_rate=0.01):
    # create probability matrix, miniminum 2 dimensions, tranpose (flip it)
    p = np.array(sigmoid(X.dot(current_beta[:,0])), ndmin=2).T
    # create weight matrix
    W = np.diag((p * (1-p))[:,0])
    # derive the hessian 
    hessian = X.T.dot(W).dot(X)
    # derive the gradient
    gradient = X.T.dot(Y-p)
    
    # invert of Hessian
    hessian = np.linalg.inv(hessian)
    
    step = learning_rate * np.matmul(hessian, gradient)
    
    # updated beta
    return current_beta[:,] + step[:,]

### Training

In [70]:
def is_coefs_converged(old_beta, new_beta, current_iterations, tolerance = 1e-8):
    # calculate the change in the coefficients
    coef_change = np.abs(old_beta - new_beta)
    
    # if change hasn't reached the threshold and we have more iterations to go, keep training
    return not (np.any(coef_change > tolerance) & (current_iterations < max_iterations))

In [251]:
# Convergance tolerance
tol = 1e-8

# Max iterations
max_iterations = 100

feature_size = x_test.shape[1]

## initial conditions
# initial coefficients (weight values), 2 copies, we'll update one
beta_old, beta = np.ones((feature_size,1)), np.zeros((feature_size,1))

# num iterations we've done so far
iteration_count = 0

# have we reached convergence?
coefs_converged = False

# if we haven't reached convergence... (keep training)
while not coefs_converged:
    # set the old coefficients to our current
    beta_old = beta
    
    # perform a single step of newton's optimization on our data, set our updated beta values
    beta = newtons_method_steps(beta, x_train, y_train)
    
    # increment the number of iterations
    iteration_count += 1
    
    # check for convergence between our old and new beta values
    coefs_converged = is_coefs_converged(beta_old, beta, iteration_count, tol)
    
print('Iterations : {}'.format(iteration_count))
print('Beta : {}'.format(beta))
print('Size of Beta: {}'.format(beta.shape))

Iterations : 100
Beta : [[ 2.59571359e-03  2.59571359e-03  2.59571359e-03 ...  2.59571359e-03
  -6.61127717e-03  2.59571359e-03]
 [-1.47798131e+00 -1.47798131e+00 -1.47798131e+00 ... -1.47798131e+00
   3.38726337e+00 -1.47798131e+00]
 [-4.41035375e-04 -4.41035375e-04 -4.41035375e-04 ... -4.41035375e-04
   1.12678339e-03 -4.41035375e-04]
 ...
 [-9.32780058e-03 -9.32780058e-03 -9.32780058e-03 ... -9.32780058e-03
   2.37613854e-02 -9.32780058e-03]
 [ 5.92486436e-03  5.92486436e-03  5.92486436e-03 ...  5.92486436e-03
  -1.50785974e-02  5.92486436e-03]
 [-1.25770984e-02 -1.25770984e-02 -1.25770984e-02 ... -1.25770984e-02
   3.20432483e-02 -1.25770984e-02]]
Size of Beta: (18, 3136)


In [252]:
# do testing
sig_out_test = [0] * x_test.shape[0]
diff_test = [0] * x_test.shape[0]
for k in range(x_test.shape[0]):
    # compute sigmoid outputs
    sig_out_test[k] = sigmoid(np.dot(x_test[k], beta[:, ]))
    diff_test[k] = sig_out_test[k] - y_test[k]
    
    print("for {} ".format(k), sig_out_test[k])
    break
    
y_pred = np.array(sigmoid(x_test.dot(beta[:,0])), ndmin=2).T
y_pred.shape

for 0  [0.18267326 0.18267326 0.18267326 ... 0.18267326 0.96890543 0.18267326]


(785, 1)

In [253]:
op = pd.DataFrame({'predicted': y_pred[:,0], 'actual': y_test})

op['predicted_binary'] = op.apply(lambda row: 1 if row.predicted > 0.5 else 0, axis=1)

op[['actual', 'predicted']][op.actual != op.predicted_binary]

Unnamed: 0,actual,predicted
6,1.0,0.185731
10,1.0,0.183837
36,1.0,0.183836
50,1.0,0.183818
74,1.0,0.182069
90,1.0,0.183835
115,1.0,0.184459
118,1.0,0.183372
120,1.0,0.183811
127,1.0,0.182303


In [173]:
y_pred = sigmoid(np.dot(x_test, beta[:,1]))

y_pred[120], y_test[120]

(7.240372963762128e-06, 1.0)

In [261]:
pd.DataFrame(beta)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3126,3127,3128,3129,3130,3131,3132,3133,3134,3135
0,0.002596,0.002596,0.002596,0.002596,0.002596,0.002596,0.002596,-0.006611,-0.006611,-0.006611,...,0.002596,0.002596,0.002596,0.002596,0.002596,0.002596,0.002596,0.002596,-0.006611,0.002596
1,-1.477981,-1.477981,-1.477981,-1.477981,-1.477981,-1.477981,-1.477981,3.387263,3.387263,3.387263,...,-1.477981,-1.477981,-1.477981,-1.477981,-1.477981,-1.477981,-1.477981,-1.477981,3.387263,-1.477981
2,-0.000441,-0.000441,-0.000441,-0.000441,-0.000441,-0.000441,-0.000441,0.001127,0.001127,0.001127,...,-0.000441,-0.000441,-0.000441,-0.000441,-0.000441,-0.000441,-0.000441,-0.000441,0.001127,-0.000441
3,-0.004384,-0.004384,-0.004384,-0.004384,-0.004384,-0.004384,-0.004384,0.011169,0.011169,0.011169,...,-0.004384,-0.004384,-0.004384,-0.004384,-0.004384,-0.004384,-0.004384,-0.004384,0.011169,-0.004384
4,-0.000997,-0.000997,-0.000997,-0.000997,-0.000997,-0.000997,-0.000997,0.002541,0.002541,0.002541,...,-0.000997,-0.000997,-0.000997,-0.000997,-0.000997,-0.000997,-0.000997,-0.000997,0.002541,-0.000997
5,-0.00111,-0.00111,-0.00111,-0.00111,-0.00111,-0.00111,-0.00111,0.002827,0.002827,0.002827,...,-0.00111,-0.00111,-0.00111,-0.00111,-0.00111,-0.00111,-0.00111,-0.00111,0.002827,-0.00111
6,-0.009151,-0.009151,-0.009151,-0.009151,-0.009151,-0.009151,-0.009151,0.02332,0.02332,0.02332,...,-0.009151,-0.009151,-0.009151,-0.009151,-0.009151,-0.009151,-0.009151,-0.009151,0.02332,-0.009151
7,0.003694,0.003694,0.003694,0.003694,0.003694,0.003694,0.003694,-0.009415,-0.009415,-0.009415,...,0.003694,0.003694,0.003694,0.003694,0.003694,0.003694,0.003694,0.003694,-0.009415,0.003694
8,0.004687,0.004687,0.004687,0.004687,0.004687,0.004687,0.004687,-0.011945,-0.011945,-0.011945,...,0.004687,0.004687,0.004687,0.004687,0.004687,0.004687,0.004687,0.004687,-0.011945,0.004687
9,-0.001173,-0.001173,-0.001173,-0.001173,-0.001173,-0.001173,-0.001173,0.002988,0.002988,0.002988,...,-0.001173,-0.001173,-0.001173,-0.001173,-0.001173,-0.001173,-0.001173,-0.001173,0.002988,-0.001173


In [254]:
pd.DataFrame(y_test)[0][(pd.DataFrame(y_test)[0] > 0)]

6      1.0
10     1.0
36     1.0
50     1.0
74     1.0
90     1.0
115    1.0
118    1.0
120    1.0
127    1.0
138    1.0
139    1.0
141    1.0
143    1.0
150    1.0
166    1.0
168    1.0
175    1.0
194    1.0
206    1.0
242    1.0
272    1.0
278    1.0
296    1.0
320    1.0
332    1.0
366    1.0
380    1.0
383    1.0
385    1.0
      ... 
481    1.0
497    1.0
498    1.0
503    1.0
507    1.0
514    1.0
520    1.0
532    1.0
535    1.0
539    1.0
545    1.0
578    1.0
585    1.0
588    1.0
601    1.0
604    1.0
624    1.0
629    1.0
631    1.0
637    1.0
641    1.0
657    1.0
660    1.0
668    1.0
684    1.0
695    1.0
719    1.0
744    1.0
756    1.0
772    1.0
Name: 0, Length: 71, dtype: float64

In [259]:
# y_pred[120]
pd.DataFrame(y_pred)[0][(pd.DataFrame(y_pred)[0] > 0.19)]

Series([], Name: 0, dtype: float64)