In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import load_iris

iris = load_iris()

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [5]:
kernels = ['rbf', 'linear']
C = [1, 10, 20, 40]
avg_scores = {}

for kernel in kernels:
    for c in C:
        cv_scores = cross_val_score(SVC(C=c, kernel=kernel, gamma='auto'), iris.data, iris.target, cv=5)
        avg_scores[kernel + '_' + str(c)] = np.average(cv_scores)

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'rbf_40': 0.96,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666,
 'linear_40': 0.96}

In [6]:
# if we have numerous parameters, we will have to run multiple nested for loops, nasty
# this is where we use gridsearchcv from sklearn

from sklearn.model_selection import GridSearchCV

# format: = GridSearchCV(model_with_parameters, {parameters to be tuned in an iterateable list})
clf = GridSearchCV(SVC(gamma='auto'), {
    'C': [1, 10, 20, 40]
    , 'kernel' : ['rbf', 'linear']
}, cv=5, return_train_score=False)

# gridsearchcv still used cross score validation under the hood

clf.fit(iris.data, iris.target)

In [7]:
clf.cv_results_

{'mean_fit_time': array([0.00259609, 0.00201588, 0.00208087, 0.00189815, 0.00232625,
        0.00305533, 0.0021771 , 0.00207329]),
 'std_fit_time': array([1.27492507e-04, 8.65237284e-05, 1.51060732e-04, 6.40073919e-05,
        2.68345253e-04, 1.70150140e-03, 1.99366785e-04, 3.26401217e-04]),
 'mean_score_time': array([0.00173583, 0.00148396, 0.00135932, 0.00133877, 0.00146465,
        0.00159116, 0.00139985, 0.00153484]),
 'std_score_time': array([2.94160205e-04, 2.68602308e-04, 3.85983701e-05, 1.25919991e-04,
        1.08722051e-04, 2.83382618e-04, 8.14667040e-05, 2.23458357e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20, 40, 40],
              mask=[False, False, False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear',
                    'rbf', 'linear'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtyp

In [8]:
df = pd.DataFrame(clf.cv_results_)

In [9]:
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002596,0.000127,0.001736,0.000294,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.002016,8.7e-05,0.001484,0.000269,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.002081,0.000151,0.001359,3.9e-05,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001898,6.4e-05,0.001339,0.000126,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.002326,0.000268,0.001465,0.000109,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.003055,0.001702,0.001591,0.000283,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
6,0.002177,0.000199,0.0014,8.1e-05,40,rbf,"{'C': 40, 'kernel': 'rbf'}",1.0,0.966667,0.9,0.933333,1.0,0.96,0.038873,7
7,0.002073,0.000326,0.001535,0.000223,40,linear,"{'C': 40, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,7


In [11]:
specs = df[['param_C', 'param_kernel', 'mean_test_score']]
specs

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667
6,40,rbf,0.96
7,40,linear,0.96


In [12]:
clf.best_score_

0.9800000000000001

In [13]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [14]:
clf.best_estimator_

In [15]:
dir(clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_routed_params_for_fit',
 '_get_scorers',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run

In [16]:
# if we have a lot of parameter values, the computation cost will be very high, so we can run random parameter values

from sklearn.model_selection import RandomizedSearchCV

clf_ran = RandomizedSearchCV(SVC(gamma='auto'), {
        'C': [1, 10, 20, 40]
        , 'kernel' : ['rbf', 'linear']
    },
    cv=5,
    return_train_score=False,
    n_iter = 2 # will do only 2 iterations
    )

clf_ran.fit(iris.data, iris.target)

In [17]:
clf_ran.cv_results_

{'mean_fit_time': array([0.00305533, 0.00314875]),
 'std_fit_time': array([0.00097766, 0.00125115]),
 'mean_score_time': array([0.00251145, 0.00238371]),
 'std_score_time': array([0.00138482, 0.00101618]),
 'param_kernel': masked_array(data=['linear', 'linear'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array(data=[10, 20],
              mask=[False, False],
        fill_value=999999),
 'params': [{'kernel': 'linear', 'C': 10}, {'kernel': 'linear', 'C': 20}],
 'split0_test_score': array([1., 1.]),
 'split1_test_score': array([1., 1.]),
 'split2_test_score': array([0.9, 0.9]),
 'split3_test_score': array([0.96666667, 0.93333333]),
 'split4_test_score': array([1., 1.]),
 'mean_test_score': array([0.97333333, 0.96666667]),
 'std_test_score': array([0.03887301, 0.0421637 ]),
 'rank_test_score': array([1, 2], dtype=int32)}

In [18]:
rs_df = pd.DataFrame(clf_ran.cv_results_)
rs_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003055,0.000978,0.002511,0.001385,linear,10,"{'kernel': 'linear', 'C': 10}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,1
1,0.003149,0.001251,0.002384,0.001016,linear,20,"{'kernel': 'linear', 'C': 20}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,2


In [19]:
# this was about hyper parameter tuning, now let's go about choosing best model

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [23]:
model_params = {
    'svc': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20, 40],
            'kernel' : ['rbf', 'linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    },
    'gaussian_nb': {
        'model': GaussianNB(),
        'params': {}
    }
}

In [24]:
scores = []

for model, params in model_params.items():
    clf = GridSearchCV(params['model'], params['params'], cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    scores.append({
        'model': model,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [25]:
scores_df = pd.DataFrame(scores)
scores_df

Unnamed: 0,model,best_score,best_params
0,svc,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.953333,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}
3,gaussian_nb,0.953333,{}


# Digit Classification - GridSearch & Hyperparameters

In [26]:
from sklearn.datasets import load_digits
digits = load_digits()

In [27]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [35]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20, 40],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 5, 10, 40]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],

        }
    }
}

In [36]:
scores = []

for model, params in model_params.items():
    clf = GridSearchCV(params['model'], params['params'], cv=5, return_train_score=False)
    clf.fit(digits.data, digits.target)
    scores.append({
        'model': model,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [37]:
scores_df = pd.DataFrame(scores)
scores_df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.930458,{'n_estimators': 40}
2,logistic_regression,0.922114,{'C': 1}
3,naive_bayes_gaussian,0.806928,{}
4,naive_bayes_multinomial,0.87035,{}
5,decision_tree,0.81138,{'criterion': 'entropy'}
