# Logistic Regression, L2 penalty (Logistic Ridge)
* `sbmi25` -- Logistic Ridge, SAG solver (LogisticRegression)
* `sbmi58` -- Logistic Ridge, L-BFGS solver
* `sbmi59` -- Logistic Ridge, Newton-CG solver
* `sbmi60` -- Logistic Ridge, SGD solver
* `sbmi93` -- Logistic Ridge, SAGA solver 
* `sbmi94` -- Logistic Ridge, SAG solver (RidgeClassifier)
* `sbmi95` -- Logistic Ridge, CG solver 
* `sbmi96` -- Logistic Ridge, LSQR inverse 
* `sbmi97` -- Logistic Ridge, SVD inverse 


In [1]:
# Put the parent of the current working directory on the import path
# (presumably so the project-local packages used below resolve -- confirm
# against the repository layout).
import os
import sys

sys.path.append(os.path.realpath("../"))

# general hyperparameter optimization settings
from seasalt import refit_model, select_the_best
from seasalt.sb import cv_settings, print_scores, scorerfun
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# demo datasets
# The dataset's `meta` is aliased to `meta_data` so that it is not overwritten
# by the model's own `meta`, which a later cell imports under that plain name.
from datasets.demo1 import X_train, Y_train, fold_ids, X_valid, Y_valid, meta as meta_data
#meta_data  # uncomment to display the dataset metadata

In [3]:
# model implementation
# Keep exactly ONE of the imports below active. Every variant is imported under
# the same three names (model, hyper, meta), so the rest of the notebook runs
# unchanged whichever solver is selected.
#from potpourri.sbmi25 import model, hyper, meta  # SAG
#from potpourri.sbmi58 import model, hyper, meta  # L-BFGS
#from potpourri.sbmi59 import model, hyper, meta  # Newton-CG
from potpourri.sbmi60 import model, hyper, meta  # SGD
#from potpourri.sbmi93 import model, hyper, meta  # SAGA
#from potpourri.sbmi94 import model, hyper, meta  # SAG, RidgeClassifier
#from potpourri.sbmi95 import model, hyper, meta  # CG
#from potpourri.sbmi96 import model, hyper, meta  # LSQR
#from potpourri.sbmi97 import model, hyper, meta  # SVD
# Last expression of the cell: displays the metadata dict of the active model.
meta

{'id': 'sbmi60',
 'name': 'Logistic Ridge',
 'descriptions': 'Logistic Regression, L2 penalty (Ridge), SGD solver, standard-normal transformed features.',
 'solver': 'Stochastic Gradient Descent (SGD)',
 'active': True,
 'keywords': ['binary classification', 'linear regression', 'SGDClassifier'],
 'output_num': 'single',
 'output_scale': 'binary',
 'output_dtype': 'bool',
 'input_num': 'multi',
 'input_scale': 'interval',
 'input_dtype': 'float'}

## Train

In [4]:
%%time
rscv = RandomizedSearchCV(**{'estimator': model, 'param_distributions': hyper}, **cv_settings)
rscv.fit(X = X_train, y = Y_train)  # Run CV

bestparam, summary = select_the_best(rscv)  # find the "best" parameters
bestmodel = refit_model(model, bestparam, X_train, Y_train)  # Refit the "best" model



CPU times: user 901 ms, sys: 148 ms, total: 1.05 s
Wall time: 11.5 s


In [5]:
#rscv.cv_results_

## Evaluate

In [6]:
# Predict on the validation features with the refitted model.
print("Infer/predict on validation set")
Y_pred = bestmodel.predict(X_valid)

# Headline metric, called as (validation labels, predictions).
print("\nOut of sample score")
print(scorerfun(Y_valid, Y_pred))

# NOTE(review): print_scores is called as (Y_pred, Y_valid), the reverse of the
# scorerfun call above. Asymmetric metrics such as precision and recall swap
# when the arguments swap -- confirm the expected order in seasalt.sb.
print("\nOut of sample score (Other metrics)")
print_scores(Y_pred, Y_valid)

# Header and value emitted on separate lines by a single print call.
print("\nBest model parameters", bestparam, sep="\n")

print("\nIn-sample scores and model variants (from CV)")
# Last expression of the cell, so the summary is rendered as the cell output.
summary

Infer/predict on validation set

Out of sample score
0.9136643851522572

Out of sample score (Other metrics)
            Matthews: 0.913664
           Accurancy: 0.969565
             Jaccard: 0.959064
             Hamming: 0.040936
           Precision: 1.000000
              Recall: 0.939130

Best model parameters
{'lin__alpha': 0.3452217433729583}

In-sample scores and model variants (from CV)


Unnamed: 0,lin__alpha,cvratio,rank_test_score,mean_test_score,std_test_score,mean_fit_time,std_fit_time
49,0.345222,44.036899,5,0.878848,0.019957,0.084287,0.008102
19,0.249595,38.359352,2,0.894944,0.023331,0.084305,0.005978
10,0.26531,38.359352,2,0.894944,0.023331,0.086006,0.00804
21,0.197711,34.374516,4,0.889121,0.025866,0.082133,0.002748
9,0.05611,30.417159,1,0.93143,0.030622,0.10888,0.015797


In [7]:
# Display the full array of predicted labels for the validation set.
# (For larger datasets a slice such as Y_pred[:20] keeps the output readable.)
Y_pred

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])

## Parameters

In [8]:
# Each pipeline step is a (name, estimator) pair; take the one at index 1 and
# display its fitted coefficients.
step_name, fitted_estimator = bestmodel.steps[1]
fitted_estimator.coef_

array([[-0.18167358, -0.15655443, -0.18152938, -0.16866058, -0.08632365,
        -0.09248691, -0.14447016, -0.19729478, -0.05158211,  0.07148875,
        -0.14105141,  0.00154973, -0.12182765, -0.11899139,  0.00043075,
         0.00758878,  0.0158048 , -0.06808006,  0.03155044,  0.07057446,
        -0.2025012 , -0.17834805, -0.19727723, -0.17868358, -0.1397241 ,
        -0.12600209, -0.15949041, -0.20936091, -0.12966389, -0.04188578]])

## Debug, Memory, Misc

In [9]:
# Optional debugging helpers, left disabled: free the CV summary table or dump
# the local namespace.
#del summary
#locals()
# List every interactive variable with its type and a short description.
%whos

Variable             Type                  Data/Info
----------------------------------------------------
RandomizedSearchCV   ABCMeta               <class 'sklearn.model_sel<...>arch.RandomizedSearchCV'>
X_train              ndarray               398x30: 11940 elems, type `float64`, 95520 bytes
X_valid              ndarray               171x30: 5130 elems, type `float64`, 41040 bytes
Y_pred               ndarray               171: 171 elems, type `int64`, 1368 bytes
Y_train              ndarray               398: 398 elems, type `int64`, 3184 bytes
Y_valid              ndarray               171: 171 elems, type `int64`, 1368 bytes
bestmodel            Pipeline              Pipeline(memory=None,\n  <...>se=0, warm_start=True))])
bestparam            dict                  n=1
cv_settings          dict                  n=6
fold_ids             ndarray               398: 398 elems, type `int64`, 3184 bytes
hyper                dict                  n=1
meta                 dict           