In [1]:
%load_ext autoreload
%autoreload 2

In [7]:
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import ShuffleSplit
import numpy as np
from pprint import pprint
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.pipeline import Pipeline
# from estimator_selection import EstimatorSelectionHelper
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB
from sklearn.neural_network import MLPClassifier
import time
from itertools import product
import pickle
from sklearn.metrics import *
from sklearn.metrics.cluster import contingency_matrix
import seaborn as sns
import os

In [20]:
from sklearn.datasets import fetch_openml, load_iris
dataset = load_iris()
X = dataset['data']
y = dataset['target']

In [22]:
class EstimatorSelectionHelper:
    def __init__(self, models):
        self.models = models
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=None, n_jobs=4, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key][0]
            params = self.models[key][1]
            # gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
            #                   verbose=verbose, scoring=scoring, refit=refit,
            #                   return_train_score=True)
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params, times):
            d = {
                 'estimator': key,
                 # 'min_score': min(scores),
                 # 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'mean_fit_time': np.mean(times),
                 # 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            times = []
            cv = self.grid_searches[k].cv
            if type(self.grid_searches[k].cv) != int:
                r = self.grid_searches[k].cv_results_
                scores.append(r['mean_test_score'])
                times.append(r['mean_fit_time'])

            else:
                for i in range(cv):
                    key = "split{}_test_score".format(i)
                    r = self.grid_searches[k].cv_results_[key]
                    scores.append(r.reshape(len(params), 1))

            all_scores = np.hstack(scores)
            all_times = np.hstack(times)
            for p, s, t in zip(params,all_scores, all_times):
                rows.append((row(k, s, p, t)))

        df = pd.concat(rows, axis=1, sort=True).T.sort_values([sort_by], ascending=False)

        # columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = ['estimator', 'mean_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [23]:
models = {
    'LogisticRegression': (LogisticRegression(), 
                           [{'penalty': ['l1'], 'C': [0.1, 1.0, 5], 'solver': ['liblinear']},
                            {'penalty': ['l2'], 'C': [0.1, 1.0, 5], 'solver': ['lbfgs']}]),
    'ExtraTreesClassifier': (ExtraTreesClassifier(), 
                             {'n_estimators': [16, 32, 50] }),
    'RandomForestClassifier': (RandomForestClassifier(), 
                               {'n_estimators': [16, 32, 50] }),
    'AdaBoostClassifier': (AdaBoostClassifier(), 
                           {'n_estimators': [16, 32] }),
    'GradientBoostingClassifier': (GradientBoostingClassifier(), 
                                   {'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0]}),
    'SVC': (SVC(), [
            {'kernel': ['linear'], 'C': [1, 10]},
            {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        ]), 
    'MultinomialNB': (MultinomialNB(), 
                      {'alpha': [0, 0.5, 1.0]}),
    'MLPClassifier': (MLPClassifier(), {}),
    'CategoricalNB': (CategoricalNB(), {}),
    'GaussianNB': (GaussianNB(), {}),
    'SGDClassifier': (SGDClassifier(), {})
}

estimator_matrix = EstimatorSelectionHelper(models)
estimator_matrix.fit(X, y, scoring='f1', n_jobs=2, cv=ShuffleSplit(test_size=0.30, n_splits=1, random_state=0))

Running GridSearchCV for LogisticRegression.
Fitting 1 folds for each of 6 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    1.1s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.1s finished


Running GridSearchCV for ExtraTreesClassifier.
Fitting 1 folds for each of 3 candidates, totalling 3 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 1 folds for each of 3 candidates, totalling 3 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   4 out of   4 | elapsed:    0.0s finished


Running GridSearchCV for AdaBoostClassifier.
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Running GridSearchCV for GradientBoostingClassifier.
Fitting 1 folds for each of 4 candidates, totalling 4 fits
Running GridSearchCV for SVC.
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Running GridSearchCV for MultinomialNB.
Fitting 1 folds for each of 3 candidates, totalling 3 fits
Running GridSearchCV for MLPClassifier.
Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    0.0s finished
  'setting alpha = %.1e' % _ALPHA_MIN)
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished


Running GridSearchCV for CategoricalNB.
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Running GridSearchCV for GaussianNB.
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Running GridSearchCV for SGDClassifier.
Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.0s finished


In [25]:
#https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
results.style.applymap('green', subset=['mean_score'])
def make_pretty(styler):
    styler.set_caption("Results")
    styler.background_gradient(axis=None, vmin=0, vmax=1, cmap="YlGnBu")
    return styler
results.style.pipe(make_pretty)


TypeError: the first argument must be callable

<pandas.io.formats.style.Styler at 0x239b44d79b0>

In [24]:
results = helper1.score_summary()
results

Unnamed: 0,estimator,mean_score,C,alpha,gamma,kernel,learning_rate,mean_fit_time,n_estimators,penalty,solver
29,GaussianNB,1.0,,,,,,0.0,,,
15,GradientBoostingClassifier,0.977778,,,,,0.8,0.0566134,32.0,,
27,MLPClassifier,0.977778,,,,,,0.120044,,,
19,SVC,0.977778,10.0,,,linear,,0.0160034,,,
18,SVC,0.977778,1.0,,,linear,,0.0160034,,,
17,GradientBoostingClassifier,0.977778,,,,,1.0,0.0320001,32.0,,
16,GradientBoostingClassifier,0.977778,,,,,1.0,0.0239997,16.0,,
14,GradientBoostingClassifier,0.977778,,,,,0.8,0.0406134,16.0,,
13,AdaBoostClassifier,0.977778,,,,,,0.0556173,32.0,,
12,AdaBoostClassifier,0.977778,,,,,,0.0556171,16.0,,
