# Finding the best model and tuning hyperparameters using GridSearchCV

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [2]:
# Load the digits dataset that ships with scikit-learn and list the
# fields available on the returned object.
digits = datasets.load_digits()
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [3]:
# Dimensions of the feature matrix as (rows, columns).
digits.data.shape

(1797, 64)

###### Importing different models to be checked

In [4]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

###### Making a dictionary of models with  parameters to be observed

In [5]:
# Candidate estimators and the hyperparameter grid to search for each one.
# Every entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the values to try for each hyperparameter of that estimator
params_model = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        # Fixed seed so repeated runs build the same forests and report
        # the same scores.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'gaussian_NB': {
        'model': GaussianNB(),
        'params': {
            'priors': [None],
            # Three distinct values; repeating a value would only refit
            # the same configuration without widening the search.
            'var_smoothing': [1e-8, 1e-9, 1e-10],
        },
    },
    'multinomial_NB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001),
        },
    },
    'decision_tree': {
        # Fixed seed so repeated runs grow the same trees.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'ccp_alpha': [0, 0.1, 0.2, 0.5, 1.0],
        },
    },
}

In [6]:
# Features as a DataFrame, labels as the dataset's target array.
x = pd.DataFrame(data=digits.data)
y = digits.target

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 20% of the samples for testing. The seed is fixed so the same
# rows land in each split on every run, which keeps the scores computed
# from this split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [8]:
# Baseline: a decision tree with default hyperparameters, fitted on the
# training split and scored on the held-out split. The seed is fixed so
# the fitted tree does not change from one run to the next.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.875

###### Tiresome and manual approach of tuning

In [9]:
# Manual tuning: cross-validate an SVC for every (kernel, C) pair and
# record the mean of the 5 fold scores under the key "<kernel>_<C>".
kernels = ['rbf', 'linear']
C = [1, 10, 20]

avg_scores = {
    kval + '_' + str(cval): np.average(
        cross_val_score(
            svm.SVC(kernel=kval, C=cval, gamma='auto'),
            digits.data,
            digits.target,
            cv=5,
        )
    )
    for kval in kernels
    for cval in C
}

avg_scores

{'rbf_1': 0.448545341999381,
 'rbf_10': 0.47636645001547506,
 'rbf_20': 0.47636645001547506,
 'linear_1': 0.9476973073351903,
 'linear_10': 0.9476973073351903,
 'linear_20': 0.9476973073351903}

# Approach 3: Use GridSearchCV
GridSearchCV does exactly the same thing as the for loop above, but in a single line of code.

In [10]:
from sklearn.model_selection import GridSearchCV

# Search the same C / kernel combinations as the manual loop, using
# 5-fold cross-validation, then show the per-combination results table.
svm_param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid=svm_param_grid,
    cv=5,
    return_train_score=False,
)
clf.fit(digits.data, digits.target)

df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.718687,0.03023,0.17639,0.007564,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.411111,0.45,0.454039,0.448468,0.479109,0.448545,0.021761,6
1,0.065932,0.009691,0.016885,0.000496,1,linear,"{'C': 1, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1
2,0.709198,0.025526,0.167439,0.007809,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.452778,0.469444,0.479109,0.479109,0.501393,0.476366,0.015784,4
3,0.073162,0.01125,0.016871,0.003922,10,linear,"{'C': 10, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1
4,0.73427,0.042622,0.169859,0.012928,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.452778,0.469444,0.479109,0.479109,0.501393,0.476366,0.015784,4
5,0.07164,0.007604,0.016797,0.000657,20,linear,"{'C': 20, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1


# How about different models with different hyperparameters?

Grid search is used to find the hyperparameter values of a model that give the most accurate predictions.

In [11]:
from sklearn.model_selection import GridSearchCV

###### Now using a for loop to iterate over different models

In [12]:
# Run a 5-fold grid search for every candidate in params_model and keep
# each candidate's best cross-validated score and the settings behind it.
scores = []

for name, spec in params_model.items():
    clf = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(digits.data, digits.target)

    best = {
        'model': name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(best)

# One row per candidate model.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.89708,{'n_estimators': 10}
2,logistic_regression,0.922114,{'C': 1}
3,gaussian_NB,0.819723,"{'priors': None, 'var_smoothing': 1e-08}"
4,multinomial_NB,0.870907,{'alpha': 0.1}
5,decision_tree,0.811366,"{'ccp_alpha': 0, 'criterion': 'entropy'}"


### Based on the results above, I can conclude that the SVM model with C=1 and kernel='linear' is the best of the models and hyperparameter values tried here for classifying the digits dataset.