In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from util import getData_tmp
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.svm import SVC


class EstimatorSelectionHelper:
    """Grid-search several estimators and tabulate their cross-validation scores.

    Args:
        models: Mapping of a display name to an unfitted estimator.
        params: Mapping of the same names to a ``GridSearchCV`` parameter grid
            (a dict, or a list of dicts).

    Raises:
        ValueError: If any key of ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = sorted(set(models) - set(params), key=str)
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        # Snapshot the names so the iteration order is fixed at construction time.
        self.keys = list(models)
        self.grid_searches = {}

    def fit(self, x, y, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        """Run one ``GridSearchCV`` per registered estimator on ``(x, y)``.

        The keyword arguments are forwarded unchanged to ``GridSearchCV``.
        Each fitted search object is stored in ``self.grid_searches`` under
        the estimator's name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(x, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return a DataFrame with one row per (estimator, parameter setting).

        Each row holds the min / mean / max / std of the per-fold test scores
        plus the parameter values of that candidate.

        Args:
            sort_by: Column used to sort the rows, in descending order.

        Returns:
            ``pandas.DataFrame`` whose first columns are ``estimator``,
            ``min_score``, ``mean_score``, ``max_score`` and ``std_score``,
            followed by one column per hyper-parameter.

        Raises:
            KeyError: If ``fit`` has not been run for a registered estimator.
            ValueError: If a search holds no ``split<i>_test_score`` entries
                (e.g. it was run with multi-metric scoring).
        """
        leading = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']

        rows = []
        for key in self.keys:
            # ``grid_scores_`` was removed from scikit-learn; ``cv_results_``
            # carries the same per-fold scores as ``split<i>_test_score``.
            results = self.grid_searches[key].cv_results_

            fold_scores = []
            while 'split%d_test_score' % len(fold_scores) in results:
                fold_scores.append(results['split%d_test_score' % len(fold_scores)])
            if not fold_scores:
                raise ValueError(
                    "No per-fold test scores found for %s; "
                    "score_summary supports single-metric scoring only." % key)

            # Rows of ``per_candidate`` are candidates, columns are CV folds.
            per_candidate = np.asarray(fold_scores, dtype=float).T
            for candidate_params, scores in zip(results['params'], per_candidate):
                row = dict(candidate_params)
                # Summary fields win over a hyper-parameter with the same name.
                row.update({
                    'estimator': key,
                    'min_score': scores.min(),
                    'max_score': scores.max(),
                    'mean_score': scores.mean(),
                    'std_score': scores.std(),
                })
                rows.append(row)

        if not rows:
            return pd.DataFrame(columns=leading)

        df = pd.DataFrame(rows).sort_values([sort_by], ascending=False)
        columns = leading + [c for c in df.columns if c not in leading]
        return df[columns]


def main():
    """Compare several classifiers on PCA-reduced data and print the CV summary.

    Loads the data with ``getData_tmp``, shuffles it, and keeps the first 80%
    as the training set. PCA is fitted on that training set, then every
    candidate classifier is grid-searched with 3-fold cross-validation using
    the ``'f1'`` scorer. The resulting table is printed sorted by worst-fold
    score.
    """
    X, Y = getData_tmp()  # X is image, Y is labels
    X, Y = shuffle(X, Y)

    # Train on the first 80% of the shuffled samples; the remaining 20% is a
    # hold-out that this script never touches.
    n_train = int(X.shape[0] * 0.8)
    X_Train, Y_Train = X[:n_train], Y[:n_train]

    # Dimensionality reduction: a float n_components makes PCA keep the
    # smallest number of components that preserves 95% of the training-set
    # variance.
    pca = PCA(n_components=0.95)
    pca.fit(X_Train)
    xtrain_pca = pca.transform(X_Train)  # fewer columns than X_Train

    # Candidate classifiers and the hyper-parameter grid searched for each.
    models1 = {
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'RandomForestClassifier': RandomForestClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'SVC': SVC(class_weight='balanced')
    }

    params1 = {
        'ExtraTreesClassifier': {'n_estimators': [16, 32]},
        'RandomForestClassifier': {'n_estimators': [16, 32]},
        'AdaBoostClassifier': {'n_estimators': [16, 32]},
        'GradientBoostingClassifier': {'n_estimators': [16, 32]},
        'SVC': [
            {'kernel': ['linear'], 'C': [1, 10]},
            {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        ]
    }

    helper1 = EstimatorSelectionHelper(models1, params1)
    # NOTE(review): 'f1' is presumably used because Y is binary -- confirm
    # against getData_tmp before reusing this with multi-class labels.
    helper1.fit(xtrain_pca, Y_Train, scoring='f1', n_jobs=-1)

    print(helper1.score_summary(sort_by='min_score'))


# Run the model comparison only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()


Running GridSearchCV for ExtraTreesClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.4s finished


Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.9s finished


Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.4s finished


Running GridSearchCV for SVC.
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.4s finished


                     estimator min_score mean_score max_score   std_score  \
7   GradientBoostingClassifier  0.785425   0.811067  0.836551   0.0208723   
12                         SVC  0.781503   0.793233  0.806005   0.0100298   
10                         SVC  0.773393   0.787112  0.798176   0.0102899   
4           AdaBoostClassifier  0.772432   0.788067   0.80916   0.0154831   
6   GradientBoostingClassifier   0.77095    0.79649   0.83727    0.029141   
13                         SVC  0.768539   0.785321  0.795895   0.0119994   
9                          SVC  0.767962   0.784722  0.801394   0.0136485   
8                          SVC  0.766157    0.78581  0.804651   0.0157254   
5           AdaBoostClassifier   0.76326    0.79272  0.828897    0.027214   
1         ExtraTreesClassifier  0.746224   0.784862  0.817927    0.029537   
11                         SVC  0.735471   0.741887  0.753036  0.00791361   
0         ExtraTreesClassifier  0.733533   0.756345  0.799431   0.0304842   

[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    7.2s finished
