In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from util import getData_tmp
from sklearn.utils import shuffle


class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate the per-fold scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    params : dict
        Maps the same names to the parameter grid (a dict, or a list of
        dicts) that is handed to ``GridSearchCV`` for that estimator.

    Raises
    ------
    ValueError
        If any key of ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        # Keep the models' own ordering so the error message is deterministic.
        missing_params = [key for key in models if key not in params]
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}  # name -> fitted GridSearchCV, filled by fit()

    def fit(self, X_Train, Y_Train, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` for every registered estimator.

        All keyword arguments are forwarded unchanged to ``GridSearchCV``.
        The fitted searches are stored in ``self.grid_searches`` under the
        estimator's name; nothing is returned.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X_Train, Y_Train)
            self.grid_searches[key] = gs

    @staticmethod
    def _fold_scores(cv_results):
        """Return an (n_candidates, n_folds) array built from ``split<i>_test_score``."""
        columns = []
        while 'split%d_test_score' % len(columns) in cv_results:
            columns.append(cv_results['split%d_test_score' % len(columns)])
        if not columns:
            # Multi-metric searches name these keys after the scorer instead.
            raise ValueError("cv_results_ has no 'split<i>_test_score' entries; "
                             "only single-metric scoring is supported")
        return np.column_stack(columns)

    def score_summary(self, sort_by='mean_score'):
        """Build a table with one row per (estimator, parameter combination).

        Each row holds the estimator name, the min / mean / max / std of the
        per-fold test scores, and the parameter values of that candidate
        (NaN where a parameter does not apply to the estimator).

        Parameters
        ----------
        sort_by : str
            Column used to sort the rows, in descending order.

        Returns
        -------
        pandas.DataFrame
            Columns ``estimator``, ``min_score``, ``mean_score``,
            ``max_score``, ``std_score`` first, then the parameter columns.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet, or the stored results do
            not contain single-metric per-fold scores.
        """
        rows = []
        for key in self.keys:
            search = self.grid_searches.get(key)
            if search is None:
                continue  # this estimator has not been fitted
            # ``cv_results_`` replaces ``grid_scores_``, which newer
            # scikit-learn releases no longer provide.
            results = search.cv_results_
            fold_scores = self._fold_scores(results)
            for candidate_params, scores in zip(results['params'], fold_scores):
                stats = {
                    'estimator': key,
                    'min_score': scores.min(),
                    'max_score': scores.max(),
                    'mean_score': scores.mean(),
                    'std_score': scores.std(),
                }
                # Statistics win over a parameter that happens to share a name.
                rows.append({**candidate_params, **stats})

        if not rows:
            raise ValueError("No grid search results available; call fit() first")

        df = pd.DataFrame(rows).sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]


def main():
    """Grid-search several classifiers on the ``getData_tmp`` data and print the scores.

    The data is shuffled, the first 80% is used for 3-fold cross-validated
    grid searches scored with ``'f1'``, and the resulting summary table is
    printed sorted by the worst per-fold score.
    """
    from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                                  AdaBoostClassifier, GradientBoostingClassifier)
    from sklearn.svm import SVC

    # get data
    X, Y = getData_tmp()  # per the original note: X is image, Y is labels
    X, Y = shuffle(X, Y)
    n_train = int(X.shape[0] * 0.8)
    # Training portion; the remaining 20% is left untouched (nothing below
    # evaluates on it, since the searches run with refit=False).
    X_Train, Y_Train = X[:n_train], Y[:n_train]

    # using it on classification
    models1 = {
        'ExtraTreesClassifier': ExtraTreesClassifier(class_weight='balanced'),
        'RandomForestClassifier': RandomForestClassifier(class_weight='balanced'),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'SVC': SVC(class_weight='balanced')
    }

    params1 = {
        'ExtraTreesClassifier': {'n_estimators': [16, 32]},  # n_estimators: number of rounds/trees
        'RandomForestClassifier': {'n_estimators': [16, 32]},
        'AdaBoostClassifier': {'n_estimators': [16, 32]},
        'GradientBoostingClassifier': {'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0]},
        'SVC': [
            {'kernel': ['linear'], 'C': [1, 10]},
            {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        ]
    }

    helper1 = EstimatorSelectionHelper(models1, params1)
    # NOTE(review): 'f1' is scikit-learn's binary F1 scorer -- confirm Y is binary.
    helper1.fit(X_Train, Y_Train, scoring='f1', n_jobs=-1)

    print(helper1.score_summary(sort_by='min_score'))

# Run the model comparison only when this file is executed directly.
if __name__ == "__main__":
    main()


Running GridSearchCV for ExtraTreesClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.7s finished


Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    3.4s finished


Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   13.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   13.3s finished


Running GridSearchCV for SVC.
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   27.0s finished


                     estimator min_score mean_score max_score    std_score  \
1         ExtraTreesClassifier  0.809717   0.822244  0.843501     0.015111   
3       RandomForestClassifier  0.807065   0.822372  0.845745    0.0167894   
0         ExtraTreesClassifier  0.804911   0.830614  0.850667    0.0191021   
2       RandomForestClassifier  0.802244   0.815268  0.835341     0.014402   
14                         SVC  0.791284   0.801061  0.806704   0.00694052   
12                         SVC  0.787472   0.790673   0.79496   0.00315179   
10                         SVC  0.782708   0.787987  0.796892   0.00633326   
15                         SVC  0.782609    0.78861  0.794489   0.00485092   
6   GradientBoostingClassifier  0.779221   0.804626  0.835165    0.0231257   
7   GradientBoostingClassifier  0.776623   0.804494  0.826733    0.0208411   
8   GradientBoostingClassifier  0.774194   0.793568  0.824691    0.0222268   
9   GradientBoostingClassifier  0.773562   0.793936  0.816525   

[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   51.7s finished
