In [2]:

import numbers
import warnings
import sklearn
from sklearn.base import BaseEstimator
from sklearn.utils.multiclass import type_of_target
import numpy as np
import scipy.sparse as sp
from joblib import Parallel, delayed
from sklearn.base import clone, is_classifier
from sklearn.model_selection import KFold, StratifiedKFold, check_cv, GridSearchCV, BaseCrossValidator, RandomizedSearchCV
# TODO: conisder working around relying on sklearn implementation details
from sklearn.model_selection._validation import (_check_is_permutation,
                                                 _fit_and_predict)
from sklearn.exceptions import FitFailedWarning
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import indexable, check_random_state
from sklearn.utils.validation import _num_samples
from model_selection_utils import *

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection._search import BaseSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load data and create a model
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
class SearchEstimatorList(BaseEstimator):
    def __init__(self, estimator_list = ['linear', 'forest'], param_grid_list = 'auto', is_discrete=False, scoring=None,
                 n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
                 error_score=np.nan, return_train_score=False):

        self.estimator_list = get_complete_estimator_list(estimator_list, 'discrete' if is_discrete else 'continuous')

        if param_grid_list == 'auto':
            self.param_grid_list = auto_hyperparameters(estimator_list=self.estimator_list, is_discrete=is_discrete)
        elif (param_grid_list == None) and (param_grid_list == 'default'):
            self.param_grid_list = len(estimator_list) * [{}]
        else:
            self.param_grid_list = param_grid_list
        # self.categorical_indices = categorical_indices
        if scoring == None:
            if is_discrete:
                self.scoring = 'f1'
            else:
                self.scoring = 'mse'
            warnings.warn(f"No scoring value was given. Using default score method {self.scoring}.")
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
        self.error_score = error_score
        self.return_train_score = return_train_score
        return

    def select(self, X, y, *, scaling=True, sample_weight=None, groups=None):
        """
        Perform cross-validation on the estimator list.
        """
        self._search_list = []
        self.scaling = scaling
        if scaling:
            if is_data_scaled(X):
                warnings.warn("Data may already be scaled. Scaling twice may negatively affect results.", UserWarning)
            self.scaler = StandardScaler()
            self.scaler.fit(X)
            scaled_X = self.scaler.transform(X)

        for estimator, param_grid in zip(self.estimator_list, self.param_grid_list):
            try:
                temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring,
                                       n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose,
                                       pre_dispatch=self.pre_dispatch, error_score=self.error_score,
                                       return_train_score=self.return_train_score)
                if scaling: # is_linear_model(estimator) and
                    temp_search.fit(scaled_X, y, groups=groups) # , groups=groups, sample_weight=sample_weight
                    self._search_list.append(temp_search)
                else:
                    temp_search.fit(X, y,  groups=groups)
                    self._search_list.append(temp_search)
            except (ValueError, TypeError, FitFailedWarning) as e:
                # Raise a warning for the failed initialization
                warning_msg = f"Warning: {e} for estimator {estimator} and param_grid {param_grid}"
                warnings.warn(warning_msg, category=UserWarning)
        self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list])
        self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_
        self.best_score_ = self._search_list[self.best_ind_].best_score_
        self.best_params_ = self._search_list[self.best_ind_].best_params_
        return self
    
    def scaler_transform(self, X):
        if self.scaling:    
            return self.scaler.transform(X)
        
    def best_model(self):
        return self.best_estimator_
    
    def predict(self, X):
        if self.scaling:    
            if is_data_scaled(X):
                warnings.warn("Data may already be scaled. Scaling twice may negatively affect results.", UserWarning)
            return self.best_estimator_.predict(self.scaler.transform(X))
        return self.best_estimator_.predict(X)
    
    def predict_prob(self, X):
        if self.scaling:    
            if is_data_scaled(X):
                warnings.warn("Data may already be scaled. Scaling twice may negatively affect results.", UserWarning)
            return self.best_estimator_.predict(self.scaler.transform(X))
        return self.best_estimator_.predict_proba(X)

## Testing string inputs

In [4]:
search = SearchEstimatorList(estimator_list = 'linear', is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc)
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)





LogisticRegressionCV(Cs=1, solver='liblinear')
{'Cs': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy of test dataset: 0.8
Accuracy of test dataset: [1.         0.5        0.78571429]
(30,)


110 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1672, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none

In [5]:
search = SearchEstimatorList(estimator_list = 'poly', is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc)
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Pipeline(steps=[('poly', PolynomialFeatures()),
                ('linear', LogisticRegressionCV(solver='saga'))])
{'linear__Cs': 10, 'linear__max_iter': 100, 'linear__penalty': 'l2', 'linear__solver': 'saga', 'poly__degree': 2}
Accuracy of test dataset: 1.0
Accuracy of test dataset: [1. 1. 1.]
(30,)




In [5]:
search = SearchEstimatorList(estimator_list = 'gbf', is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc)
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)





GradientBoostingClassifier(learning_rate=0.01)
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Accuracy of test dataset: 1.0
Accuracy of test dataset: [1. 1. 1.]
(30,)


In [7]:
search = SearchEstimatorList(estimator_list = 'nnet', is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc) 
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)





MLPClassifier(alpha=0.001)
{'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}
Accuracy of test dataset: 1.0
Accuracy of test dataset: [1. 1. 1.]
(30,)




In [6]:
search = SearchEstimatorList(estimator_list = ['linear', 'forest'], is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc) 
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)



110 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1672, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none

RandomForestClassifier()
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy of test dataset: 1.0
Accuracy of test dataset: [1. 1. 1.]
(30,)


In [7]:
search = SearchEstimatorList(estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'], is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc) 
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)



110 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1672, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none

RandomForestClassifier(n_estimators=500)
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Accuracy of test dataset: 1.0
Accuracy of test dataset: [1. 1. 1.]
(30,)




In [None]:
search = SearchEstimatorList(estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'], is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc) 
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)



110 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1672, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none

RandomForestClassifier(n_estimators=500)
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Accuracy of test dataset: 1.0
Accuracy of test dataset: [1. 1. 1.]
(30,)




## Testing Model Objects

In [11]:
search = SearchEstimatorList(estimator_list = LogisticRegression(), is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc)
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)



LogisticRegression()
{}
Accuracy of test dataset: 1.0
Accuracy of test dataset: [1. 1. 1.]
(30, 3)




In [12]:
search = SearchEstimatorList(estimator_list = LogisticRegressionCV(), is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc)
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)





LogisticRegressionCV(Cs=1, solver='liblinear')
{'Cs': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy of test dataset: 0.8
Accuracy of test dataset: [1.         0.5        0.78571429]
(30, 3)


110 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1672, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none

In [5]:
search = SearchEstimatorList(estimator_list = LogisticRegressionCV(), is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred,average=None)
print("Accuracy of test dataset:", acc)
print("Accuracy of test dataset:", f1,)
print(search.predict_prob(X_test).shape)



{'linear__Cs': 10,
 'linear__penalty': 'l2',
 'linear__solver': 'saga',
 'poly__degree': 2}

## Edge Cases

In [3]:
search = SearchEstimatorList(estimator_list = [], is_discrete=True)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
y_pred = search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print(search.predict_prob(X_test))


ValueError: The list is empty

In [8]:
search = SearchEstimatorList(estimator_list='linear', is_discrete=True)
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X = scaler.transform(X_train)
search.select(X_train, y_train)
print(search.best_model())
print(search.best_params_)
scaled_x_test = scaler.transform(X_test)
y_pred = search.predict(scaled_x_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)



LogisticRegressionCV(Cs=1, solver='liblinear')
{'Cs': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.43333333333333335


110 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1672, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/anthonycampbell/miniforge3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none