# Using regularized logistic regression to classify email

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
from sklearn import linear_model
#import sklearn.cross_validation
from sklearn import model_selection
#from sklearn.cross_validation import KFold
import scipy.io
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Enable IPython's autoreload extension so that external Python modules
# (such as utils.py) are reloaded automatically when they are edited;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# No modifications in this cell
# complete the functions in utils.py; then run the cell

# utils.load_spam_data() returns four values, unpacked here as the train/test
# inputs and their labels.
Xtrain,Xtest,ytrain,ytest = utils.load_spam_data()

# Preprocess the data: build three alternative representations of the inputs.
# NOTE(review): the transforms themselves live in utils.py (not visible here);
# judging by their names they are presumably standardization, a log transform
# and binarization -- confirm against utils.py.

# std_features also returns mu and sigma, which are reused for the test set below.
Xtrain_std,mu,sigma = utils.std_features(Xtrain)
Xtrain_logt = utils.log_features(Xtrain)
Xtrain_bin = utils.bin_features(Xtrain)

# The test set is scaled with the mu/sigma obtained from the training set
# (not recomputed on the test set); the log and bin transforms are applied
# to the test set directly.
Xtest_std = (Xtest - mu)/sigma
Xtest_logt = utils.log_features(Xtest)
Xtest_bin = utils.bin_features(Xtest)

# Find a good lambda by cross-validation for each of these three feature sets

def run_dataset(X,ytrain,Xt,ytest,typea,penalty,lambda_low=0.1,lambda_high=5.1,lambda_step=0.5):
    """Pick lambda by cross-validation, fit logistic regression, report test accuracy.

    Parameters
    ----------
    X, ytrain : training inputs and labels; passed to
        utils.select_lambda_crossval and then used to fit the final model.
    Xt, ytest : held-out inputs and labels; used only for the final accuracy.
    typea : str
        Label for the feature representation; only used in the printed report.
    penalty : str
        Passed to LogisticRegression as ``penalty``. "l2" uses the 'lbfgs'
        solver, anything else uses 'liblinear'.
    lambda_low, lambda_high, lambda_step : float, optional
        Forwarded positionally to utils.select_lambda_crossval (presumably the
        bounds and step of the lambda grid -- confirm in utils.py). The
        defaults reproduce the previously hard-coded 0.1, 5.1, 0.5.

    Returns
    -------
    None. Results are printed (no return value, so a trailing call in a
    notebook cell does not produce an extra Out[] display).

    Raises
    ------
    ValueError
        If the selected lambda is not positive, since C = 1/lambda.
    """
    best_lambda = utils.select_lambda_crossval(X,ytrain,lambda_low,lambda_high,lambda_step,penalty)
    print("best_lambda = %.3f" %best_lambda)

    # scikit-learn parameterizes regularization by C = 1/lambda, so lambda must be > 0.
    if best_lambda <= 0:
        raise ValueError("best_lambda must be positive to compute C = 1/lambda, got %r" % (best_lambda,))

    # train a classifier on best_lambda and run it;
    # only the solver differs between the two penalty settings.
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(penalty=penalty,C=1.0/best_lambda,solver=solver,fit_intercept=True,max_iter=1000)
    # Flatten the labels so a column-vector y is handled like a 1-D vector.
    lreg.fit(X,np.ravel(ytrain))

    # Format both values explicitly; previously "%" applied to the intercept only
    # and the intercept was printed under the "Coefficients" label.
    print("Intercept = %s, Coefficients = %s" %(lreg.intercept_,lreg.coef_))

    predy = lreg.predict(Xt)
    # Compare as flat vectors: an (n,) prediction against an (n,1) label array
    # would otherwise broadcast to an (n,n) matrix and give a wrong accuracy.
    accuracy = np.mean(np.ravel(predy) == np.ravel(ytest))
    print("Accuracy on set aside test set for %s = %.4f" %(typea, accuracy))

# Evaluate every feature representation under each penalty: all L2 runs first,
# then all L1 runs, each in the order std, logt, bin.
feature_sets = (
    ("std", Xtrain_std, Xtest_std),
    ("logt", Xtrain_logt, Xtest_logt),
    ("bin", Xtrain_bin, Xtest_bin),
)
penalty_runs = (
    ("L2 Penalty experiments -----------", "l2"),
    ("L1 Penalty experiments -----------", "l1"),
)

for banner, penalty_name in penalty_runs:
    print(banner)
    for set_label, train_inputs, test_inputs in feature_sets:
        run_dataset(train_inputs, ytrain, test_inputs, ytest, set_label, penalty_name)

L2 Penalty experiments -----------
best_lambda = 0.100
Coefficients = [-4.86009525] [[-2.74747273e-02 -2.25136402e-01  1.21907182e-01  2.27522248e+00
   2.70520048e-01  2.32897315e-01  9.27944257e-01  2.95208723e-01
   1.62385572e-01  6.78970232e-02 -8.31678264e-02 -1.60294620e-01
  -4.73224775e-02  1.09256359e-02  1.88459779e-01  8.20164550e-01
   5.10056275e-01  3.99249443e-02  2.67722637e-01  3.47759627e-01
   2.60466272e-01  3.63354351e-01  7.24506472e-01  1.96760767e-01
  -3.15898998e+00 -4.03875376e-01 -1.25626458e+01 -6.05923619e-02
  -1.55586355e+00 -5.63614779e-02 -3.20084077e-02  4.07346185e-01
  -3.68461579e-01 -1.39284860e+00 -5.81733635e-01  4.43810619e-01
   4.21962305e-02 -1.56968921e-01 -4.55665222e-01 -1.02481037e-01
  -3.52682718e+00 -1.73901534e+00 -4.36519687e-01 -1.06148809e+00
  -9.18491783e-01 -1.75143034e+00 -1.67498490e-01 -9.53333133e-01
  -3.65594020e-01 -1.36280389e-01 -6.58843258e-02  2.06700357e-01
   1.70664664e+00  1.22614438e+00 -3.33799718e-01  1.55454