# Compare ALO Optimization to Grid Search

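*This notebook compares the regularization strength selected by ALO optimization (`peak_engines`) with the one selected by grid search (scikit-learn's `LogisticRegressionCV`) on several real-world data sets. Each choice is scored by its brute-force leave-one-out log-likelihood.*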

## Import Dependencies

In [1]:
import peak_engines
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
import warnings; warnings.simplefilter('ignore')

## Utility Functions for Data Processing

In [2]:
def to_indicator(df, name, drop_original=True):
    """Expand column ``name`` of ``df`` into 0.0/1.0 indicator columns, in place.

    One column ``<name>_<i>`` is added per distinct value of ``df[name]``,
    holding 1.0 where the row equals that value and 0.0 elsewhere.

    The index ``i`` follows the sorted order of the distinct values, so the
    column naming is reproducible across runs. If the values cannot be
    ordered (e.g. mixed types), the order of first appearance is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    name : str
        Name of the column to expand.
    drop_original : bool, default True
        If true, remove the original column after adding the indicators.

    Returns
    -------
    None
    """
    value = df[name]
    # Series.unique() is deterministic (first-appearance order), unlike set iteration.
    levels = list(value.unique())
    try:
        levels = sorted(levels)
    except TypeError:
        # Values are not mutually comparable; keep first-appearance order.
        pass
    for i, val in enumerate(levels):
        df[name + "_" + str(i)] = (value == val).astype(float)
    if drop_original:
        df.drop(labels=name, axis=1, inplace=True)

## Compute Leave-One-Out (LO) Log-Likelihood Via Brute Force

In [3]:
def compute_lo(X, y, C):
    """Return the mean leave-one-out log-likelihood of logistic regression.

    For every sample, a ``LogisticRegression(C=C)`` model is refit on all
    other samples. The log of the probability it assigns to the held-out
    sample's true label is accumulated, and the sum is divided by the number
    of samples.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed by row with integer index arrays.
    y : numpy.ndarray
        Labels, one per row of ``X``.
    C : float
        Inverse regularization strength passed to ``LogisticRegression``.

    Returns
    -------
    float
        Mean held-out log-likelihood; ``-inf`` if some held-out label does
        not appear among the classes the fold's model was trained on.
    """
    model = LogisticRegression(C=C)
    total_log_likelihood = 0.0
    for train_indexes, test_indexes in LeaveOneOut().split(X):
        model.fit(X[train_indexes], y[train_indexes])
        held_out_label = y[test_indexes][0]
        probabilities = model.predict_proba(X[test_indexes])[0]
        # predict_proba columns follow model.classes_, which only covers the
        # labels present in this fold's training data, so look the label up
        # there rather than in the label set of the full data.
        column = np.flatnonzero(model.classes_ == held_out_label)
        if column.size == 0:
            # The model never saw this class, i.e. it assigns it probability 0.
            return -np.inf
        total_log_likelihood += np.log(probabilities[column[0]])
    return total_log_likelihood / len(y)

## Fit Models

In [4]:
def fit_models(X, y):
    """Fit the grid-search and ALO models on ``(X, y)`` and print a summary of each.

    For each model one line is printed: a label (``grid`` or ``alo``), the
    value ``1 / (2*C)`` for the selected ``C``, and the leave-one-out score
    returned by ``compute_lo`` for that ``C``.
    """

    def report(label, estimator):
        # Fit, read the first selected C, and print it as lambda = 1 / (2*C).
        estimator.fit(X, y)
        selected_C = estimator.C_[0]
        print(label, 1 / (2*selected_C), compute_lo(X, y, selected_C))

    # grid search with default parameters
    report("grid", LogisticRegressionCV(random_state=0, scoring='neg_log_loss'))

    # ALO optimization
    report("alo", peak_engines.LogisticRegressionModel())

## Breast Cancer Dataset

In [5]:
# Standardize the breast cancer features, then compare both models on them.
breast_cancer = load_breast_cancer()
X = StandardScaler().fit_transform(breast_cancer.data)
y = breast_cancer.target
fit_models(X, y)

grid 1.391279701103563 -0.07704148045383749
alo 0.7512990318459845 -0.07490235610619664


## Cleveland Heart

In [6]:
# Load the processed Cleveland heart-disease table. "?" entries are parsed as
# missing values and the rows containing them are discarded.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
df = pd.read_csv(url, header=None, na_values="?").dropna()
df.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]

# Replace these columns with one indicator column per distinct value.
for indicator_column in ('cp', 'restecg', 'slope', 'thal'):
    to_indicator(df, indicator_column)

# Binary target: +1 where 'num' is positive, -1 otherwise.
y = 2 * (np.array(df['num']) > 0) - 1

# Every remaining column is a feature; standardize them.
df = df.drop(columns=["num"])
X = StandardScaler().fit_transform(np.array(df.values, dtype=float))

In [7]:
# Compare grid search and ALO on the Cleveland X, y prepared in the previous cell.
fit_models(X, y)

grid 1.391279701103563 -0.3846047408578833
alo 4.785417311146802 -0.37907062783574563


## Arcene

In [8]:
# ARCENE files from the UCI repository: whitespace-separated tables without a
# header row. The first column of each .labels file is used as the target.
url_X_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.data'
url_y_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.labels'
url_X_valid = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_valid.data'
url_y_valid = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/arcene_valid.labels'

# sep=r'\s+' replaces delim_whitespace=True, which pandas has deprecated.
X_train = pd.read_csv(url_X_train, sep=r'\s+', header=None).to_numpy()
y_train = pd.read_csv(url_y_train, sep=r'\s+', header=None).iloc[:, 0].to_numpy()
X_valid = pd.read_csv(url_X_valid, sep=r'\s+', header=None).to_numpy()
y_valid = pd.read_csv(url_y_valid, sep=r'\s+', header=None).iloc[:, 0].to_numpy()

# Pool the train and validation splits into one data set, then standardize.
X = np.vstack((X_train, X_valid))
y = np.concatenate((y_train, y_valid))
X = StandardScaler().fit_transform(X)

In [9]:
# Compare grid search and ALO on the pooled ARCENE X, y built in the previous cell.
fit_models(X, y)

grid 10.772173450159421 -0.3035087160885871
alo 7.229829676854908 -0.30324111580909746
