# Breast Cancer Cross-Validation
*This notebook compares several logistic regression models on the Breast Cancer dataset using 10-fold cross-validation, reporting each model's mean log loss on the held-out folds (lower is better) and the total elapsed time.*

## Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import peak_engines
import sklearn
from collections import defaultdict
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
import time
import warnings
warnings.simplefilter('ignore')

## Prepare Dataset

In [2]:
# Load the Breast Cancer dataset. return_X_y=True asks the loader for a
# (data, target) pair, which is unpacked here as X and y.
# X and y are notebook-level names: compute_score reads them directly
# rather than receiving them as arguments.
X, y = load_breast_cancer(return_X_y=True)

## Compute Cross-Validation Scores

In [3]:
def compute_score(cv, model):
    """Cross-validate ``model`` on the notebook-level ``X``/``y`` and time it.

    For every split produced by ``cv`` the features are standardized with a
    ``StandardScaler`` fitted on the training fold only, ``model`` is refitted
    on the training fold, and the negated log loss of its predicted
    probabilities on the held-out fold is recorded.

    Parameters
    ----------
    cv : object
        Splitter exposing ``split(X, y)`` that yields ``(train, test)``
        index arrays.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
        refitted in place on every fold.

    Returns
    -------
    tuple
        ``(mean_score, elapsed)`` where ``mean_score`` is the mean over folds
        of the negated log loss (higher is better) and ``elapsed`` is the
        number of seconds spent in the whole cross-validation loop.
    """
    # Imported explicitly: a bare ``import sklearn`` does not by itself
    # guarantee that the ``sklearn.metrics`` submodule has been loaded.
    from sklearn.metrics import log_loss

    # Give log_loss the full label set so that a held-out fold which happens
    # to contain a single class can still be scored.
    labels = np.unique(y)

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    start = time.perf_counter()
    results = []
    for train_indexes, test_indexes in cv.split(X, y):
        X_train, X_test = X[train_indexes], X[test_indexes]
        y_train, y_test = y[train_indexes], y[test_indexes]
        # Fit the scaler on the training fold only to avoid leaking
        # held-out statistics into the model.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)
        results.append(-log_loss(y_test, y_pred, labels=labels))
    elapsed = time.perf_counter() - start
    return np.mean(results), elapsed

## Compute Performance Matrix

In [4]:
def compute_performance_matrix(models, cv):
    """Score every model with ``compute_score`` and tabulate the results.

    Parameters
    ----------
    models : dict
        Mapping from a display name to the estimator to evaluate.
    cv : object
        Splitter forwarded unchanged to ``compute_score``.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by the model's name, with two columns:
        'Log Likelihood', the sign-flipped score returned by
        ``compute_score`` (i.e. the mean log loss, so lower is better), and
        'Elapse (sec)', the duration reported by ``compute_score``.
    """
    columns = {}
    for estimator in models.values():
        neg_loss, seconds = compute_score(cv, estimator)
        # compute_score reports the negated loss; flip the sign back.
        columns.setdefault('Log Likelihood', []).append(-neg_loss)
        columns.setdefault('Elapse (sec)', []).append(seconds)
    # Iterating a dict yields its keys in the same order as .values().
    return pd.DataFrame(columns, index=list(models))

## Compare Different Models

In [5]:
# Outer splitter shared by every candidate: 10 shuffled folds, fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=0)

# Candidates, keyed by the row label they receive in the results table.
models = {}
# Inner cross-validation left at the library default for `cv`.
models['sklearn-cv'] = LogisticRegressionCV(scoring='neg_log_loss', random_state=0)
# Same estimator with an explicit inner cv=10.
models['sklearn-cv-10'] = LogisticRegressionCV(scoring='neg_log_loss', cv=10, random_state=0)
# peak_engines models: default settings, then with penalty='bridge'.
models['aloocv-l2'] = peak_engines.LogisticRegressionModel()
models['aloocv-bridge'] = peak_engines.LogisticRegressionModel(penalty='bridge')

# Kept as the cell's last expression so the resulting DataFrame is displayed.
compute_performance_matrix(models, cv)

Unnamed: 0,Log Likelihood,Elapse (sec)
sklearn-cv,0.087839,5.775733
sklearn-cv-10,0.078453,11.237246
aloocv-l2,0.078782,0.129117
aloocv-bridge,0.076862,0.163828
