## Finding best model and hyper parameter tunning using GridSearchCV
For iris flower dataset in sklearn library, we are going to find out best model and best hyper parameters using GridSearchCV

In [31]:
from sklearn import svm, datasets
iris = datasets.load_iris
import numpy as np

In [52]:
import pandas as pd
X= pd.read_csv('C:/Users/sai kiran Reddy/Desktop/ml practical/Iris.csv')
X.head()
data=X.drop(['Species'],axis='columns')
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,1,5.1,3.5,1.4,0.2
1,2,4.9,3.0,1.4,0.2
2,3,4.7,3.2,1.3,0.2
3,4,4.6,3.1,1.5,0.2
4,5,5.0,3.6,1.4,0.2


In [53]:
X.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [54]:
y=X['Species']

##  Approach 1: Use train_test_split and manually tune parameters by trial and error

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)

In [56]:
model = svm.SVC(kernel='rbf',C=30,gamma='auto')
model.fit(X_train,y_train)
model.score(X_test, y_test)

1.0

## Approach 2: Use K Fold Cross validation
Manually try suppling models with different parameters to cross_val_score function with 5 fold cross validation

In [26]:
from sklearn.model_selection import cross_val_score

In [57]:
cross_val_score(svm.SVC(kernel='linear',C=10,gamma='auto'),data, y, cv=5)

array([0.66666667, 1.        , 1.        , 1.        , 0.7       ])

In [58]:
cross_val_score(svm.SVC(kernel='rbf',C=10,gamma='auto'),data, y, cv=5)


array([0.46666667, 0.73333333, 0.73333333, 0.73333333, 0.53333333])

In [59]:
cross_val_score(svm.SVC(kernel='rbf',C=20,gamma='auto'),data, y, cv=5)


array([0.46666667, 0.73333333, 0.73333333, 0.73333333, 0.53333333])

###  Above approach is tiresome and very manual. We can use for loop as an alternative



In [61]:
kernels = ['rbf', 'linear']
C = [1,10,20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(svm.SVC(kernel=kval,C=cval,gamma='auto'),data, y, cv=5)
        avg_scores[kval + '_' + str(cval)] = np.average(cv_scores)

avg_scores

{'rbf_1': 0.6399999999999999,
 'rbf_10': 0.6399999999999999,
 'rbf_20': 0.6399999999999999,
 'linear_1': 0.8733333333333333,
 'linear_10': 0.8733333333333333,
 'linear_20': 0.8733333333333333}

### From above results we can say that rbf with C=1 or 10 or linear with C=1 will give best performance

## Approach 3: Use GridSearchSV
### GridSearchCV does exactly same thing as for loop above but in a single line of code

In [64]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(svm.SVC(gamma='auto'), {
    'C': [1,10,20],
    'kernel': ['rbf','linear']
}, cv=5, return_train_score=False)
clf.fit(data, y)
clf.cv_results_

{'mean_fit_time': array([0.00511379, 0.00513749, 0.00777326, 0.00246625, 0.00574389,
        0.00690794]),
 'std_fit_time': array([0.00464412, 0.00489102, 0.00183966, 0.00219289, 0.00569268,
        0.00846743]),
 'mean_score_time': array([0.00319147, 0.00101662, 0.00210476, 0.00335674, 0.00074167,
        0.0003993 ]),
 'std_score_time': array([0.0039249 , 0.00112848, 0.00042269, 0.00573293, 0.00091475,
        0.00048905]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [65]:
df = pd.DataFrame(clf.cv_results_)
df


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005114,0.004644,0.003191,0.003925,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.466667,0.733333,0.733333,0.733333,0.533333,0.64,0.116237,4
1,0.005137,0.004891,0.001017,0.001128,1,linear,"{'C': 1, 'kernel': 'linear'}",0.666667,1.0,1.0,1.0,0.7,0.873333,0.155492,1
2,0.007773,0.00184,0.002105,0.000423,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.466667,0.733333,0.733333,0.733333,0.533333,0.64,0.116237,4
3,0.002466,0.002193,0.003357,0.005733,10,linear,"{'C': 10, 'kernel': 'linear'}",0.666667,1.0,1.0,1.0,0.7,0.873333,0.155492,1
4,0.005744,0.005693,0.000742,0.000915,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.466667,0.733333,0.733333,0.733333,0.533333,0.64,0.116237,4
5,0.006908,0.008467,0.000399,0.000489,20,linear,"{'C': 20, 'kernel': 'linear'}",0.666667,1.0,1.0,1.0,0.7,0.873333,0.155492,1


In [66]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [67]:

clf.best_score_

0.8733333333333333

In [68]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_required_parameters',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score',
 'scorer_',
 'scoring',
 '

In [69]:
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(svm.SVC(gamma='auto'), {
        'C': [1,10,20],
        'kernel': ['rbf','linear']
    }, 
    cv=5, 
    return_train_score=False, 
    n_iter=2
)
rs.fit(data,y)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,rbf,0.64
1,1,rbf,0.64


##  How about different models with different hyperparameters?

In [70]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [71]:

scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(data,y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.873333,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,1.0,{'n_estimators': 10}
2,logistic_regression,0.933333,{'C': 10}
