## Finding a best model using GridSearchCV for Digitd dataset


In [20]:
import sklearn.datasets as ds
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier


In [65]:
X, y = ds.load_digits(return_X_y = True)


## **GridSearchCV**

**GridSearchCV on one model**

Tunes the model with taking all possible permutaions and combinations of the given hyperparams.
DrawBack: Time consuming and costly in case of big datasets. --> solution: **RandomizedSearchCV**

In [23]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(svm.SVC(gamma='auto'), {
    'C': [1,10,20],
    'kernel': ['rbf','linear']
}, cv=5, return_train_score=False)
clf.fit(X, y)
clf.cv_results_
df_cv = pd.DataFrame(clf.cv_results_)
df_cv



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.731197,0.071178,0.092975,0.003901,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.434066,0.40884,0.415042,0.487395,0.498592,0.448525,0.037134,6
1,0.056646,0.008415,0.013965,0.002605,1,linear,"{'C': 1, 'kernel': 'linear'}",0.964286,0.922652,0.966574,0.963585,0.929577,0.94936,0.019119,1
2,0.642705,0.07211,0.076764,0.01595,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.46978,0.433702,0.442897,0.507003,0.512676,0.473011,0.03217,4
3,0.062623,0.018825,0.016167,0.00414,10,linear,"{'C': 10, 'kernel': 'linear'}",0.964286,0.922652,0.966574,0.963585,0.929577,0.94936,0.019119,1
4,0.642273,0.093856,0.075583,0.012548,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.46978,0.433702,0.442897,0.507003,0.512676,0.473011,0.03217,4
5,0.068822,0.010148,0.018764,0.00171,20,linear,"{'C': 20, 'kernel': 'linear'}",0.964286,0.922652,0.966574,0.963585,0.929577,0.94936,0.019119,1


In [25]:
df_cv[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.448525
1,1,linear,0.94936
2,10,rbf,0.473011
3,10,linear,0.94936
4,20,rbf,0.473011
5,20,linear,0.94936


In [26]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [27]:
clf.best_score_

0.9493600445186422

**GridSearchCV on many models**

In [62]:
##Creating HyperParameter dictionary

param = {
    'logistic_regression' : {
        'model': LogisticRegression(solver = 'liblinear', multi_class = 'auto'),
        'params': {'C' : [1,5,10]}
    },
    
    'svc': {
        'model': svm.SVC(gamma = 'auto'),
        'params': {'C' : [1,5,10], 'kernel': ['rbf','linear'] }
    },
    
    'decisionTreeClassification' : {
        'model' : DecisionTreeClassifier(random_state = 0),
        'params' : {'criterion' : ['gini', 'entropy']}
    },
    
    'randomForestClassification': {
        'model' : RandomForestClassifier(random_state = 0),
        'params' : { 'n_estimators' : [100,200,300]}
    },
    
    'gaussianNB' : {
        'model' : GaussianNB(),
        'params' : {}
    },
    
    'mulitnomailNB' : {
        'model' : MultinomialNB(),
        'params' : {}
    }
}

scores = []

for model_name, mp in param.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df 



Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.920979,{'C': 1}
1,svc,0.94936,"{'C': 1, 'kernel': 'linear'}"
2,decisionTreeClassification,0.812465,{'criterion': 'entropy'}
3,randomForestClassification,0.937117,{'n_estimators': 200}
4,gaussianNB,0.806344,{}
5,mulitnomailNB,0.871452,{}


From the above table, I conclude that SVC model with params--> C = 1, and Kernel = linear is the best model than all others.

## **RandomizedSearchCV** 


Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [63]:
from sklearn.model_selection import RandomizedSearchCV
for model_name, mp in param.items():
    clf_rcv =  RandomizedSearchCV(mp['model'], mp['params'], cv=5, n_iter = 2, return_train_score=False)
    clf_rcv.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf_rcv.best_score_,
        'best_params': clf_rcv.best_params_
    })
    
df_rcv = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_rcv 



Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.920979,{'C': 1}
1,svc,0.94936,"{'C': 1, 'kernel': 'linear'}"
2,decisionTreeClassification,0.812465,{'criterion': 'entropy'}
3,randomForestClassification,0.937117,{'n_estimators': 200}
4,gaussianNB,0.806344,{}
5,mulitnomailNB,0.871452,{}
6,logistic_regression,0.918753,{'C': 5}
7,svc,0.473011,"{'kernel': 'rbf', 'C': 5}"
8,decisionTreeClassification,0.812465,{'criterion': 'entropy'}
9,randomForestClassification,0.937117,{'n_estimators': 200}
