# In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from util import getData_tmp
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.ensemble import (BaggingClassifier,ExtraTreesClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score  # finding scores from different classifiers

from sklearn.tree import DecisionTreeClassifier  # bagging

class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate candidate scores.

    ``models`` maps a display name to an unfitted estimator; ``params`` maps
    the same names to the parameter grid (a dict, or a list of dicts) that
    is handed to ``GridSearchCV`` for that estimator.

    Raises:
        ValueError: if any key of ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = [name for name in models if name not in params]
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        # Snapshot the names so later mutation of ``models`` cannot change
        # which searches are run or reported.
        self.keys = list(models)
        self.grid_searches = {}

    def fit(self, x, y, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` for every registered estimator.

        All keyword arguments are forwarded unchanged to ``GridSearchCV``.
        Each fitted search object is stored in ``self.grid_searches`` under
        the estimator's name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(x, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter candidate), best first.

        For each candidate the min / mean / max / std of its per-fold test
        scores are computed. The frame is sorted by ``sort_by`` in descending
        order; the columns are ``estimator``, the four statistics, then
        every parameter name that appears in any grid (NaN where a parameter
        does not apply to a candidate).

        The scores are read from ``cv_results_``; the ``grid_scores_``
        attribute used previously no longer exists in scikit-learn >= 0.20.

        Raises:
            ValueError: if a search exposes no ``split<i>_test_score``
                entries (e.g. it was run with multi-metric scoring).
        """
        lead_columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']

        rows = []
        for key in self.keys:
            results = self.grid_searches[key].cv_results_
            split_names = [name for name in results
                           if name.startswith('split') and name.endswith('_test_score')]
            if not split_names:
                raise ValueError(
                    "No per-fold test scores found for %s; "
                    "score_summary requires single-metric scoring." % key)
            # Shape (n_folds, n_candidates): one column of fold scores per candidate.
            fold_scores = np.array([results[name] for name in split_names], dtype=float)
            for idx, candidate in enumerate(results['params']):
                scores = fold_scores[:, idx]
                rows.append({
                    **candidate,
                    'estimator': key,
                    'min_score': scores.min(),
                    'max_score': scores.max(),
                    'mean_score': scores.mean(),
                    'std_score': scores.std(),
                })

        if not rows:
            # Nothing was searched; return an empty table rather than failing.
            return pd.DataFrame(columns=lead_columns)

        df = pd.DataFrame(rows).sort_values([sort_by], ascending=False)
        columns = lead_columns + [c for c in df.columns if c not in lead_columns]
        return df[columns]


def main():
    """Grid-search several classifiers on PCA-reduced data and print results.

    Loads the data via ``getData_tmp``, shuffles it, takes the first 80% as
    the training set, fits PCA on the training set, runs a grid search per
    classifier on the reduced training data, then prints the best parameters
    of each classifier followed by the full score table.
    """
    X, Y = getData_tmp()  # per the original note: X is images, Y is labels
    X, Y = shuffle(X, Y)
    n_train = int(X.shape[0] * 0.8)
    X_Train, Y_Train = X[:n_train], Y[:n_train]  # training set
    X_Test, Y_Test = X[n_train:], Y[n_train:]  # held-out set (not evaluated below)

    # PCA keeps the smallest number of components that preserves 95% of the
    # training-set variance; it is fitted on the training split only.
    pca = PCA(n_components=0.95)
    pca.fit(X_Train)
    xtrain_pca = pca.transform(X_Train)  # changes the feature dimension
    xtest_pca = pca.transform(X_Test)  # kept for a later held-out evaluation

    models1 = {
        'LogisticRegression': LogisticRegression(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'RandomForestClassifier': RandomForestClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'SVC': SVC(class_weight='balanced'),
        # NOTE(review): max_samples=800 is an absolute sample count; confirm
        # every CV training fold has at least 800 rows.
        'BaggingClassifier': BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=800,
                                               bootstrap=True),
    }

    params1 = {
        'LogisticRegression': {'random_state': (1, 2)},
        'ExtraTreesClassifier': {'n_estimators': (10, 500)},  # number of trees: 10 or 500
        'RandomForestClassifier': {'n_estimators': (10, 500), 'random_state': [1], 'criterion': ['entropy']},
        'AdaBoostClassifier': {'n_estimators': (10, 500)},
        'GradientBoostingClassifier': {'n_estimators': (10, 500), 'learning_rate': [0.08, 1.0]},
        'SVC': [
            {'kernel': ['linear'], 'C': [1, 10]},
            {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        ],
        'BaggingClassifier': {'n_estimators': (10, 500)},
    }

    helper1 = EstimatorSelectionHelper(models1, params1)
    # NOTE(review): scoring='f1' is the binary F1 score; this assumes Y has
    # exactly two classes -- confirm against getData_tmp.
    helper1.fit(xtrain_pca, Y_Train, scoring='f1', n_jobs=-1, refit=True)

    best_params = {key: helper1.grid_searches[key].best_params_ for key in helper1.keys}

    print(best_params)
    print(helper1.score_summary(sort_by='min_score'))



# Run the model-selection experiment only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()

# Captured cell output:
# Running GridSearchCV for LogisticRegression.
# Fitting 3 folds for each of 2 candidates, totalling 6 fits
