# Regularized Classification 

- Wine dataset from the UCI Machine Learning Repository: [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data), [data dictionary](http://archive.ics.uci.edu/ml/datasets/Wine)
- **Goal:** Predict the origin of wine using chemical analysis

### Load and prepare the wine dataset

In [1]:
# read in the dataset
import pandas as pd
url = './Datasets/wine.data'
wine = pd.read_csv(url, header=None)
wine.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [2]:
# examine the response variable
wine[0].value_counts()

2    71
1    59
3    48
Name: 0, dtype: int64

In [3]:
# define X and y
X = wine.drop(0, axis=1)
y = wine[0]

In [4]:
# split into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Logistic regression (unregularized)

In [5]:
# build a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# examine the coefficients
print (logreg.coef_)

[[ -3.99122559e-01   7.19280930e-01   8.37692346e-01  -6.15868919e-01
   -3.70379019e-02  -2.82429818e-03   1.16591001e+00   5.50064150e-02
   -3.01114676e-01  -1.99446284e-01  -9.27763372e-02   9.50747899e-01
    1.62914287e-02]
 [  8.95380409e-01  -1.25706949e+00  -7.99283789e-01   2.31762996e-01
    2.24082745e-02   3.58806120e-01   5.15658221e-01   1.70236041e-01
    7.77599666e-01  -1.70661033e+00   4.95926028e-01  -2.74442486e-01
   -1.39060755e-02]
 [ -3.48621047e-01   7.05375604e-01   1.11108906e-01   2.02375183e-01
   -1.34734684e-02  -6.06889858e-01  -1.85854094e+00  -3.86951461e-02
   -7.05962486e-01   1.08803431e+00  -3.93735373e-01  -9.41034622e-01
    1.02444280e-03]]


In [7]:
# generate predicted probabilities
y_pred_prob = logreg.predict_proba(X_test)
print (y_pred_prob)

[[  4.18503497e-03   1.31818492e-02   9.82633116e-01]
 [  3.84528941e-05   9.98810551e-01   1.15099615e-03]
 [  9.85235001e-01   1.31000647e-02   1.66493420e-03]
 [  1.70972460e-02   9.81721849e-01   1.18090522e-03]
 [  9.88619406e-01   9.44466886e-06   1.13711498e-02]
 [  2.10450753e-03   2.60259922e-02   9.71869500e-01]
 [  7.55845820e-02   8.28218255e-01   9.61971626e-02]
 [  9.98943579e-01   2.74212513e-07   1.05614650e-03]
 [  5.93269706e-04   1.20002097e-03   9.98206709e-01]
 [  3.22055476e-04   9.67957409e-01   3.17205351e-02]
 [  9.92236002e-01   3.54161314e-04   7.40983709e-03]
 [  1.62195548e-01   8.31946095e-01   5.85835704e-03]
 [  1.90618519e-04   9.99075621e-01   7.33760182e-04]
 [  9.98368175e-01   1.31699657e-03   3.14828379e-04]
 [  1.56512361e-02   9.83719817e-01   6.28947070e-04]
 [  3.77680447e-04   9.94445742e-01   5.17657781e-03]
 [  5.03774025e-07   4.12839671e-01   5.87159825e-01]
 [  9.28633880e-01   6.87946294e-02   2.57149095e-03]
 [  1.30352993e-04   9.91328

In [8]:
# calculate log loss
from sklearn import metrics
print (metrics.log_loss(y_test, y_pred_prob))

0.125171555231


### Logistic regression (regularized)

- [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) documentation
- **C:** must be positive, decrease for more regularization
- **penalty:** l1 (lasso) or l2 (ridge)

In [9]:
# standardize X_train and X_test
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# try C=0.1 with L1 penalty
logreg = LogisticRegression(C=0.1, penalty='l1')
logreg.fit(X_train_scaled, y_train)
print (logreg.coef_)

[[ 0.21034042  0.          0.          0.          0.          0.
   0.48870268  0.          0.          0.          0.          0.15214146
   1.47740159]
 [-0.65723336 -0.05608877 -0.11399467  0.          0.          0.          0.
   0.          0.         -0.73814175  0.24428926  0.         -0.63398233]
 [ 0.          0.          0.          0.          0.          0.
  -0.84171724  0.          0.          0.61529669 -0.49029377 -0.30484304
   0.        ]]


In [11]:
# generate predicted probabilities and calculate log loss
y_pred_prob = logreg.predict_proba(X_test_scaled)
print (metrics.log_loss(y_test, y_pred_prob))

0.362230079986


In [12]:
# try C=0.1 with L2 penalty
logreg = LogisticRegression(C=0.1, penalty='l2')
logreg.fit(X_train_scaled, y_train)
print (logreg.coef_)

[[ 0.59163934  0.06886667  0.33592964 -0.49616684  0.111539    0.21570086
   0.40524509 -0.15526139 -0.02534651  0.05399014  0.14877346  0.42327938
   0.89815007]
 [-0.73545676 -0.32942948 -0.47995296  0.294866   -0.1500246   0.04264373
   0.14500586  0.07250763  0.17409795 -0.70726652  0.4128986   0.09997212
  -0.81284365]
 [ 0.20136567  0.30989025  0.15977925  0.18867218  0.04204443 -0.27108109
  -0.55886639  0.07486943 -0.17471153  0.68266464 -0.52385748 -0.49566967
  -0.02565631]]


In [13]:
# generate predicted probabilities and calculate log loss
y_pred_prob = logreg.predict_proba(X_test_scaled)
print (metrics.log_loss(y_test, y_pred_prob))

0.244588324539


In [14]:
# pipeline of StandardScaler and LogisticRegression
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [15]:
import numpy as np

# grid search for best combination of C and penalty
from sklearn.grid_search import GridSearchCV
C_range = 10.**np.arange(-2, 3)
penalty_options = ['l1', 'l2']
param_grid = dict(logisticregression__C=C_range, logisticregression__penalty=penalty_options)
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='log_loss')
grid.fit(X, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02]), 'logisticregression__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [16]:
# print all log loss scores
grid.grid_scores_

[mean: -1.09861, std: 0.00000, params: {'logisticregression__C': 0.01, 'logisticregression__penalty': 'l1'},
 mean: -0.62547, std: 0.03037, params: {'logisticregression__C': 0.01, 'logisticregression__penalty': 'l2'},
 mean: -0.35491, std: 0.06893, params: {'logisticregression__C': 0.10000000000000001, 'logisticregression__penalty': 'l1'},
 mean: -0.26801, std: 0.04840, params: {'logisticregression__C': 0.10000000000000001, 'logisticregression__penalty': 'l2'},
 mean: -0.09431, std: 0.06113, params: {'logisticregression__C': 1.0, 'logisticregression__penalty': 'l1'},
 mean: -0.10371, std: 0.04894, params: {'logisticregression__C': 1.0, 'logisticregression__penalty': 'l2'},
 mean: -0.05823, std: 0.06375, params: {'logisticregression__C': 10.0, 'logisticregression__penalty': 'l1'},
 mean: -0.06174, std: 0.05651, params: {'logisticregression__C': 10.0, 'logisticregression__penalty': 'l2'},
 mean: -0.07046, std: 0.09259, params: {'logisticregression__C': 100.0, 'logisticregression__penalty

In [17]:
# examine the best model
print (grid.best_score_)
print (grid.best_params_)

-0.0582343588222
{'logisticregression__C': 10.0, 'logisticregression__penalty': 'l1'}


## Comparing regularized linear models with unregularized linear models

**Advantages of regularized linear models:**

- Better performance
- L1 regularization performs automatic feature selection
- Useful for high-dimensional problems (p > n)

**Disadvantages of regularized linear models:**

- Tuning is required
- Feature scaling is recommended