# Logistic Regression with L1 penalty (Logistic Lasso)
* `sbmi1` -- Logistic Lasso with SAGA solver
* `sbmi56` -- Logistic Lasso with liblinear solver
* `sbmi57` -- Logistic Lasso with SGD solver

In [1]:
# Append the parent directory (resolved to a real path, relative to the
# current working directory) to the module search path. Kept ahead of the
# imports below in case they rely on it.
import os
import sys

sys.path.append(os.path.realpath("../"))

# General hyperparameter-optimization tooling: selection/refit helpers,
# shared CV settings and scoring utilities, and the randomized search class.
from seasalt import refit_model, select_the_best
from seasalt.sb import cv_settings, print_scores, scorerfun
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# demo datasets: training arrays, validation arrays, fold ids, and the
# dataset's metadata (aliased to `meta_data`; the model-import cell further
# down imports its own `meta`).
# NOTE(review): `fold_ids` is not referenced by any cell visible in this
# notebook -- presumably intended for a custom CV split; confirm or drop.
from datasets.demo1 import X_train, Y_train, fold_ids, X_valid, Y_valid, meta as meta_data
# Uncomment to display the dataset metadata:
#meta_data

In [3]:
# model implementations
# Each variant module is imported for the same three names (`model`, `hyper`,
# `meta`); keep exactly one import active to choose the solver variant.
#from potpourri.sbmi1 import model, hyper, meta  # SAGA
#from potpourri.sbmi56 import model, hyper, meta  # liblinear
from potpourri.sbmi57 import model, hyper, meta  # SGD
meta  # last expression: show the selected model's metadata as the cell output

{'id': 'sbmi57',
 'name': 'Logistic Lasso',
 'descriptions': 'Logistic Regression with L1 penalty (Lasso)',
 'solver': 'stochastic gradient descent',
 'active': True,
 'keywords': ['binary classification', 'linear regression'],
 'output_num': 'single',
 'output_scale': 'binary',
 'output_dtype': 'bool',
 'input_num': 'multi',
 'input_scale': 'interval',
 'input_dtype': 'float'}

## Train

In [4]:
%%time
rscv = RandomizedSearchCV(**{'estimator': model, 'param_distributions': hyper}, **cv_settings)
rscv.fit(X = X_train, y = Y_train)  # Run CV

bestparam, summary = select_the_best(rscv)  # find the "best" parameters
bestmodel = refit_model(model, bestparam, X_train, Y_train)  # Refit the "best" model



CPU times: user 950 ms, sys: 133 ms, total: 1.08 s
Wall time: 12.8 s


## Evaluate

In [5]:
# Predict on the validation set with the refitted model.
print("Infer/predict on validation set")
Y_pred = bestmodel.predict(X_valid)

# Headline score: validation labels are passed first, predictions second.
print("\nOut of sample score")
print(scorerfun(Y_valid, Y_pred))

# NOTE(review): the arguments here are (Y_pred, Y_valid), the reverse of the
# scorerfun call above. Confirm this matches print_scores' signature --
# asymmetric metrics such as precision and recall would be swapped otherwise.
print("\nOut of sample score (Other metrics)")
print_scores(Y_pred, Y_valid)

# Hyperparameters returned by select_the_best for the refitted model.
print("\nBest model parameters")
print(bestparam)

# `summary` is the second value returned by select_the_best; as the last
# expression of the cell it is shown as the cell's rich output.
print("\nIn-sample scores and model variants (from CV)")
summary

Infer/predict on validation set

Out of sample score
0.9497354497354498

Out of sample score (Other metrics)
            Matthews: 0.949735
           Accurancy: 0.974868
             Jaccard: 0.976608
             Hamming: 0.023392
           Precision: 0.981481
              Recall: 0.981481

Best model parameters
{'lin__alpha': 0.006726403087930424}

In-sample scores and model variants (from CV)


Unnamed: 0,lin__alpha,cvratio,rank_test_score,mean_test_score,std_test_score,mean_fit_time,std_fit_time
32,0.006726,43.493146,2,0.936219,0.021526,0.098092,0.006162
42,0.003499,43.493146,2,0.936219,0.021526,0.082895,0.003551
6,0.005984,43.493146,2,0.936219,0.021526,0.135411,0.026509
29,0.004756,43.493146,2,0.936219,0.021526,0.089709,0.008966
10,0.00208,23.960689,1,0.941487,0.039293,0.086847,0.0058


## Parameters

In [6]:
# Fitted coefficients of the estimator stored in the pipeline's second step
# (`steps` holds (name, estimator) pairs; index 1 selects the second pair).
# NOTE(review): exact zeros are presumably features dropped by the L1
# penalty; the positional index assumes the linear model is the second
# step -- confirm against the pipeline definition.
bestmodel.steps[1][1].coef_

array([[-0.28717298, -0.29776681,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -1.51523429,  0.        ,  0.2093156 ,
        -1.59999204,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.33102573,
        -1.66927594, -0.93170008, -0.4651756 ,  0.        , -0.62009103,
         0.        , -0.74006564, -0.08149224, -0.45102008,  0.        ]])

## Debug, Memory, Misc

In [7]:
# Optional clean-up / inspection helpers, left disabled:
#del summary
#locals()
# Tabulate the variables currently defined in the interactive namespace
# (name, type, and a short data/info column).
%whos

Variable             Type                  Data/Info
----------------------------------------------------
RandomizedSearchCV   ABCMeta               <class 'sklearn.model_sel<...>arch.RandomizedSearchCV'>
X_train              ndarray               398x30: 11940 elems, type `float64`, 95520 bytes
X_valid              ndarray               171x30: 5130 elems, type `float64`, 41040 bytes
Y_pred               ndarray               171: 171 elems, type `int64`, 1368 bytes
Y_train              ndarray               398: 398 elems, type `int64`, 3184 bytes
Y_valid              ndarray               171: 171 elems, type `int64`, 1368 bytes
bestmodel            Pipeline              Pipeline(memory=None,\n  <...>se=0, warm_start=True))])
bestparam            dict                  n=1
cv_settings          dict                  n=6
fold_ids             ndarray               398: 398 elems, type `int64`, 3184 bytes
hyper                dict                  n=1
meta                 dict           