In [1]:
# Notebook-wide IPython setup.
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension and enable mode 2 so that edits to imported
# modules (e.g. the local `pdata` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from pprint import pprint

In [3]:
import initdirs

from pdata.ml import run_gscv
from pdata.ml import gather_best_hyperparams
from pdata.ml import gather_best_scores_for_all_gscv
from pdata.ml import get_model_name
from pdata.ml import cross_val_for_estimators
from pdata.ml import MLData

In [4]:
# Load the wine dataset as pandas objects: X holds the features, y the class labels.
X, y = load_wine(return_X_y=True, as_frame=True)

In [5]:
# Wrap the feature frame in the project's MLData helper. Later cells read its
# `X_scaled` attribute and call `summarize_importances` on it.
# NOTE(review): presumably MLData standardises X on construction — confirm in pdata.ml.
mldata = MLData(X)

In [6]:
# Shuffle the scaled features and the labels together; the fixed seed keeps the
# resulting row order reproducible across runs.
X_shuffled, y_shuffled = shuffle(mldata.X_scaled, y, random_state=42)

In [7]:
# Search configuration: one entry per model family, each pairing an untuned
# estimator with the hyperparameter grid to explore for it. The insertion
# order (random_forest, adaboost, svm) determines the order of later listings.
#
# NOTE(review): every grid also sweeps `random_state`, i.e. the seed is tuned
# like a hyperparameter — confirm that this is intended.
setup = {
    'random_forest': dict(
        estimator=RandomForestClassifier(),
        param_grid=dict(
            n_estimators=[5, 10, 20],
            min_samples_split=(2, 4, 8, 16, 32),
            min_samples_leaf=(1, 2, 4, 8),
            criterion=['gini', 'entropy'],
            random_state=[1, 4, 42, 100],
        ),
    ),
    # NOTE(review): newer scikit-learn releases rename `base_estimator` to
    # `estimator` and deprecate 'SAMME.R' — verify against the pinned version
    # before upgrading.
    'adaboost': dict(
        estimator=AdaBoostClassifier(),
        param_grid=dict(
            base_estimator=[
                DecisionTreeClassifier(max_depth=1),
                DecisionTreeClassifier(max_depth=2),
            ],
            n_estimators=[10, 25, 50],
            learning_rate=[0.5, 1.0],
            algorithm=['SAMME', 'SAMME.R'],
            random_state=[1, 4, 42, 100],
        ),
    ),
    # NOTE(review): per the scikit-learn docs SVC only uses `random_state` when
    # probability=True, so this grid dimension likely repeats identical fits —
    # confirm before relying on the grid size.
    'svm': dict(
        estimator=SVC(),
        param_grid=dict(
            C=[0.5, 1.0, 1.5, 2.0],
            kernel=['linear', 'poly', 'rbf', 'sigmoid'],
            gamma=['scale', 'auto'],
            random_state=[1, 4, 42, 100],
        ),
    ),
}

In [8]:
# Pretty-print the whole search configuration as a visual sanity check.
pprint(setup)

{'adaboost': {'estimator': AdaBoostClassifier(),
              'param_grid': {'algorithm': ['SAMME', 'SAMME.R'],
                             'base_estimator': [DecisionTreeClassifier(max_depth=1),
                                                DecisionTreeClassifier(max_depth=2)],
                             'learning_rate': [0.5, 1.0],
                             'n_estimators': [10, 25, 50],
                             'random_state': [1, 4, 42, 100]}},
 'random_forest': {'estimator': RandomForestClassifier(),
                   'param_grid': {'criterion': ['gini', 'entropy'],
                                  'min_samples_leaf': (1, 2, 4, 8),
                                  'min_samples_split': (2, 4, 8, 16, 32),
                                  'n_estimators': [5, 10, 20],
                                  'random_state': [1, 4, 42, 100]}},
 'svm': {'estimator': SVC(),
         'param_grid': {'C': [0.5, 1.0, 1.5, 2.0],
                        'gamma': ['scale', 'auto'],
   

In [9]:
# Quick check of the project helper on one estimator; for the random forest it
# yields the class name as a string.
get_model_name(setup['random_forest']['estimator'])

'RandomForestClassifier'

In [10]:
# Collect the untuned estimator of every configured model, in the order the
# models were added to `setup`, and display the resulting list.
estimators = [setup[model_key]['estimator'] for model_key in setup]
estimators

[RandomForestClassifier(), AdaBoostClassifier(), SVC()]

In [11]:
# Baseline: cross-validate the untuned estimators on the shuffled data. The
# resulting table has one row per estimator with five per-fold scores and
# their mean.
# NOTE(review): rows are presumably in the same order as `estimators` — confirm.
# NOTE(review): the estimators are built without a random_state, so the random
# forest / AdaBoost rows may differ between runs.
cross_val_for_estimators(estimators, X_shuffled, y_shuffled)

Unnamed: 0,fold_1,fold_2,fold_3,fold_4,fold_5,cv_score_mean
0,1.0,1.0,0.944444,0.971429,1.0,0.983175
1,0.944444,0.888889,0.666667,0.914286,0.971429,0.877143
2,1.0,1.0,0.972222,0.971429,0.971429,0.983016


In [12]:
# Run the grid search for every entry in `setup`; the result is a dict keyed by
# model name whose values are GridSearchCV objects.
# NOTE(review): score_func=None presumably falls back to each estimator's
# default scoring — confirm in pdata.ml.
gscv_res = run_gscv(X_shuffled, y_shuffled, setup, score_func=None)

In [13]:
# Inspect the search object produced for each model.
gscv_res

{'random_forest': GridSearchCV(cv=4, estimator=RandomForestClassifier(),
              param_grid={'criterion': ['gini', 'entropy'],
                          'min_samples_leaf': (1, 2, 4, 8),
                          'min_samples_split': (2, 4, 8, 16, 32),
                          'n_estimators': [5, 10, 20],
                          'random_state': [1, 4, 42, 100]}),
 'adaboost': GridSearchCV(cv=4, estimator=AdaBoostClassifier(),
              param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                          'base_estimator': [DecisionTreeClassifier(max_depth=1),
                                             DecisionTreeClassifier(max_depth=2)],
                          'learning_rate': [0.5, 1.0],
                          'n_estimators': [10, 25, 50],
                          'random_state': [1, 4, 42, 100]}),
 'svm': GridSearchCV(cv=4, estimator=SVC(),
              param_grid={'C': [0.5, 1.0, 1.5, 2.0], 'gamma': ['scale', 'auto'],
                          'kernel': ['

In [14]:
# Tabulate the hyperparameter combination(s) that reached the best score in the
# AdaBoost search, one column per searched parameter.
gather_best_hyperparams(gscv_res['adaboost'])

Unnamed: 0,algorithm,base_estimator,learning_rate,n_estimators,random_state
0,SAMME.R,DecisionTreeClassifier(max_depth=2),1.0,10,1


In [15]:
# Summarise every search in one table: best score, number of grid points that
# reached it, total grid size, and the share of grid points with the best score.
gather_best_scores_for_all_gscv(gscv_res)

Unnamed: 0,best_score,n_with_best,grid_size,share_with_best
random_forest,0.994318,1,480,0.002083
adaboost,0.983207,1,96,0.010417
svm,0.983081,16,128,0.125


In [16]:
# Rank the features by the importances of the best random forest found by the
# grid search.
best_forest = gscv_res['random_forest'].best_estimator_
mldata.summarize_importances(best_forest.feature_importances_)

flavanoids                      0.201769
color_intensity                 0.185608
proline                         0.145691
hue                             0.105495
od280/od315_of_diluted_wines    0.100632
alcohol                         0.088317
magnesium                       0.039043
total_phenols                   0.038028
malic_acid                      0.035012
proanthocyanins                 0.024783
alcalinity_of_ash               0.022351
ash                             0.009788
nonflavanoid_phenols            0.003484
dtype: float64