# Hyper Parameter Tuning 

Hyperparameter tuning is the process of finding the hyperparameter values (for example `C` and `kernel` for an SVM) that give a model its best performance.

# Loading dataset

In [1]:
from sklearn import svm, datasets
import pandas as pd
import numpy as np
# Load the Iris dataset shipped with scikit-learn. The cells below read its
# .data, .target, .feature_names and .target_names attributes.
iris = datasets.load_iris()

In [2]:
# Wrap the raw feature matrix in a DataFrame, labelling each column with the
# corresponding feature name, and preview the first five rows.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# Add the species name of every sample as a new column. Indexing the array of
# class names with the integer label array maps all rows in one vectorized
# step, instead of first storing the integer codes and then remapping them
# row by row with apply().
df['flower_type'] = np.asarray(iris.target_names)[iris.target]

# Train Test Split

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. random_state pins the shuffle so
# that the split, and every score computed from it, is the same on each run.
X_train, X_test, Y_train, Y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

# Using SVM

In [5]:
# Train an RBF-kernel SVC on the training split (fit() hands back the fitted
# estimator), then report its score on the held-out split.
model = svm.SVC(C=30, gamma='auto', kernel='rbf').fit(X_train, Y_train)
model.score(X_test, Y_test)

0.9555555555555556

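The parameters above (`kernel='rbf'`, `C=30`, `gamma='auto'`) were picked arbitrarily; nothing guarantees they are the best choice.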

# K Fold Cross Validation

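A score from a single train/test split depends on which samples happen to land in the test set, so it is not a consistent measure. To compare parameter settings reliably we use `cross_val_score`, which scores the model on several different folds of the data.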

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
# Linear kernel, C=10: returns one score per fold of the 5-fold split.
cross_val_score(
    svm.SVC(kernel='linear', C=10, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [8]:
# RBF kernel, C=10: returns one score per fold of the 5-fold split.
cross_val_score(
    svm.SVC(kernel='rbf', C=10, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [9]:
# RBF kernel, C=20: returns one score per fold of the 5-fold split.
cross_val_score(
    svm.SVC(kernel='rbf', C=20, gamma='auto'),
    iris.data,
    iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

But the problem is: how many times do we have to repeat this check by hand? Doing it manually is redundant and inefficient.

We will do exactly the same thing, but using a for loop.

In [12]:
# Candidate values for the two hyperparameters being compared.
kernels = ['rbf', 'linear']
C = [1, 10, 20]

# Mean cross-validation score (cv=5) for every kernel/C pair,
# stored under the key "<kernel>_<C>".
avg_score = {}
for kvalue in kernels:
    for cvalue in C:
        cv_scores = cross_val_score(
            svm.SVC(kernel=kvalue, C=cvalue, gamma='auto'),
            iris.data,
            iris.target,
            cv=5,
        )
        avg_score[f'{kvalue}_{cvalue}'] = np.mean(cv_scores)

avg_score

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

This way we can find the optimal parameters. But there is another issue: in practice `SVC()` has many more parameters, and tuning all of them this way would need one more nested `for` loop per parameter, which quickly becomes unwieldy. Luckily, sklearn provides an API called `GridSearchCV` that does exactly what the loop above does.

# GridSearchCV

In [15]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every C/kernel pair, each scored with cv=5
# cross-validation. Training-set scores are not recorded.
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid={
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
)

In [16]:
# Run the grid search on the full dataset (fit() returns the search object
# itself) and display the raw dictionary of per-candidate results.
clf.fit(iris.data, iris.target).cv_results_

{'mean_fit_time': array([0.00080113, 0.00113196, 0.00100942, 0.00084486, 0.00039163,
        0.00040555]),
 'std_fit_time': array([7.51665297e-04, 5.47890447e-04, 1.29005383e-05, 4.36897154e-04,
        4.79750355e-04, 4.96809036e-04]),
 'mean_score_time': array([0.00070033, 0.00019999, 0.00079575, 0.00091381, 0.00060554,
        0.00039854]),
 'std_score_time': array([0.00060135, 0.00039997, 0.0003979 , 0.00049466, 0.00049459,
        0.00048811]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20

The raw dictionary is too messy to read comfortably.
# Using a pandas DataFrame to inspect the results

In [17]:
# Tabulate the search results: one row per parameter combination.
# NOTE: this rebinds `df`, which earlier held the iris feature table.
df = pd.DataFrame(data=clf.cv_results_)
df.head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000801,0.000752,0.0007,0.000601,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001132,0.000548,0.0002,0.0004,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001009,1.3e-05,0.000796,0.000398,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000845,0.000437,0.000914,0.000495,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000392,0.00048,0.000606,0.000495,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5


In [18]:
# Keep only the columns needed to compare candidates: the parameters tried,
# their mean test score, and the resulting rank.
df.loc[:, ['param_C', 'param_kernel', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score,rank_test_score
0,1,rbf,0.98,1
1,1,linear,0.98,1
2,10,rbf,0.98,1
3,10,linear,0.973333,4
4,20,rbf,0.966667,5
5,20,linear,0.966667,6


In [21]:
# Score achieved by the top-ranked parameter combination of the grid search.
clf.best_score_

0.9800000000000001

In [22]:
# Parameter combination that the grid search selected as best.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

We saw that GridSearchCV is more convenient than writing the loops ourselves. But there is another issue here: what if we have millions of data points?
In that case GridSearchCV fits a model for every parameter combination on every fold, so the computation cost becomes very high and the search inefficient.

To tackle this problem, use `RandomizedSearchCV`. Instead of trying every combination of parameter values, it evaluates only a fixed number (`n_iter`) of randomly chosen combinations.

# RandomizedSearchCV

In [23]:
from sklearn.model_selection import RandomizedSearchCV

# Evaluate only n_iter=2 randomly drawn C/kernel combinations instead of the
# whole grid. random_state fixes which combinations are drawn, so the table
# below is the same every time the cell is run.
rs = RandomizedSearchCV(svm.SVC(gamma='auto'), {
    'C': [1,10,20],
    'kernel':['rbf', 'linear']
}, cv=5, return_train_score=False, n_iter=2, random_state=42)

rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,20,rbf,0.966667


# Choosing Best Model

In [24]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Describing the candidate models and their parameter grids in a JSON-like Python dictionary

In [25]:
# Candidate models to compare. Each entry maps a display name to
#   'model'  : an unfitted estimator instance, and
#   'params' : the parameter grid to search for that estimator.
model_params = {
    'svm':{
        'model':svm.SVC(gamma='auto'),
        'params':{
            'C':[1,10,20],
            'kernel':['rbf','linear']
        }
    },

    'random_forest':{
        # random_state makes the forest deterministic; without it the best
        # score and best n_estimators can change from one run to the next.
        'model': RandomForestClassifier(random_state=42),
        'params':{
            'n_estimators':[1,5,10]
        }
    },

    'logistic_regression':{
        # multi_class='auto' was dropped: it equals the default and the
        # argument is deprecated in recent scikit-learn releases.
        'model':LogisticRegression(solver='liblinear'),
        'params':{
            'C':[1,5,10]
        }
    }
}

In [28]:
# Grid-search every candidate model on the full dataset and collect, per
# model, the best cross-validated score and the parameters that produced it.
scores = []

for model_name, mps in model_params.items():
    clf = GridSearchCV(
        estimator=mps['model'],
        param_grid=mps['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(iris.data, iris.target)

    best_result = {
        'model_type': model_name,
        'best_score': clf.best_score_,
        'best_parameters': clf.best_params_,
    }
    scores.append(best_result)

# One row per model, columns in a fixed order for display.
df = pd.DataFrame(scores, columns=['model_type', 'best_score', 'best_parameters'])
df

Unnamed: 0,model_type,best_score,best_parameters
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


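Based on the results above, I conclude that an SVM with C=1 and kernel='rbf' is the best model for iris flower classification: it reached the highest mean cross-validation score (0.98). Note that in the grid search two other SVM settings (C=1 with a linear kernel, and C=10 with an rbf kernel) tied at 0.98, so those are equally good choices on this data.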