#### Hyperparameter Tuning
    1. Which model should we use?
    2. Given a model, how do we choose the best hyperparameters?

In [35]:
from sklearn.datasets import load_iris

# Load scikit-learn's bundled iris dataset. Later cells read its
# .data, .target, .feature_names and .target_names attributes.
iris = load_iris()

In [36]:
import pandas as pd

# Tabular view of the measurements, with the integer class codes in
# iris.target translated to their species names for readability.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['flower'] = [iris.target_names[code] for code in iris.target]
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [37]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state pins the shuffle
# so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=10,
)

# Show the shape of each piece as a quick sanity check.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(120, 4) (30, 4) (120,) (30,)


In [38]:
from sklearn.svm import SVC

# Baseline: a default-parameter SVC trained on the training split.
# fit() returns the estimator itself, so construction and training chain.
model = SVC().fit(X_train, Y_train)

# Mean accuracy on the held-out test split.
model.score(X_test, Y_test)

0.9666666666666667

In [39]:
from sklearn.model_selection import KFold

# Manual 5-fold cross-validation of a default SVC.
# KFold does not shuffle unless asked to, so each test fold is a
# consecutive block of rows.
# NOTE(review): the iris rows appear to be stored grouped by species, which
# would explain the uneven per-fold scores here compared with the stratified
# folds that cross_val_score uses for classifiers — confirm before relying
# on these numbers.
kf = KFold(n_splits=5)
scores_svm = []

for train_index, test_index in kf.split(iris.data):
    # Fold-local names: reusing X_train/X_test/Y_train/Y_test here would
    # overwrite the hold-out split made by train_test_split, and reusing
    # `model` would replace the classifier fitted on that split.
    X_fold_train, X_fold_test = iris.data[train_index], iris.data[test_index]
    Y_fold_train, Y_fold_test = iris.target[train_index], iris.target[test_index]

    fold_model = SVC().fit(X_fold_train, Y_fold_train)
    scores_svm.append(fold_model.score(X_fold_test, Y_fold_test))

print(scores_svm)

[1.0, 1.0, 0.8333333333333334, 0.9333333333333333, 0.7]


In [40]:
# cross_val_score runs the split/fit/score loop in a single call

from sklearn.model_selection import cross_val_score

# SVM with default parameters, scored on 5 folds
svm_cv_scores = cross_val_score(SVC(), iris.data, iris.target, cv=5)
print(svm_cv_scores)

[0.96666667 0.96666667 0.96666667 0.93333333 1.        ]


In [41]:
# Hyperparameter tuning means finding the best parameter values;
# GridSearchCV does this by cross-validating every combination in a grid.
from sklearn.model_selection import GridSearchCV

model = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid={
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
)
model.fit(iris.data, iris.target)

# cv_results_ holds the timings and per-fold scores for each combination;
# a DataFrame makes it readable as a table.
df = pd.DataFrame(model.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002519,0.000574,0.00202,0.000335,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001865,0.000835,0.001325,0.000365,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.00147,0.00059,0.001085,0.000162,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001402,0.000486,0.001004,0.000637,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.001169,0.000343,0.001003,8e-06,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.001608,0.000581,0.000823,0.000236,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [42]:
# Parameter combination that the grid search ranked best by mean test score.
model.best_params_

{'C': 1, 'kernel': 'rbf'}

In [43]:
# When the parameter space is too large to try every combination, use
# RandomizedSearchCV: it samples a fixed number of combinations (n_iter)
# instead of evaluating the whole grid.
from sklearn.model_selection import RandomizedSearchCV

model = RandomizedSearchCV(
    SVC(gamma='auto'),
    {
        # candidate values to sample from
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    n_iter=3,
    cv=5,
    return_train_score=False,
    # Fix the sampler's seed so the same 3 combinations are drawn on every
    # run; without it the results table changes each time the cell runs.
    random_state=10,
)

model.fit(iris.data, iris.target)

# One row per sampled combination, with per-fold and mean test scores.
df = pd.DataFrame(model.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002678,0.000499,0.001408,0.000492,rbf,10,"{'kernel': 'rbf', 'C': 10}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.002031,0.000587,0.000428,0.000526,rbf,20,"{'kernel': 'rbf', 'C': 20}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,3
2,0.001458,0.000904,0.00109,0.000112,linear,10,"{'kernel': 'linear', 'C': 10}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,2


In [44]:
# Compare several model families, each tuned over its own small grid
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Each entry pairs an estimator with the hyperparameter grid to search.
model_params = {
    'svm': {
        'model': svm.SVC(gamma="auto"),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Seeded so the forest, and therefore its cross-validation score and
        # chosen n_estimators, is the same on every run.
        'model': RandomForestClassifier(random_state=10),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        # NOTE(review): multi_class is reported as deprecated in recent
        # scikit-learn releases — confirm against the installed version.
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

# One summary record per model family: its best CV score and parameters.
scores = []

for model_name, config in model_params.items():
    model = GridSearchCV(config['model'], config['params'], cv=5, return_train_score=False)
    model.fit(iris.data, iris.target)
    scores.append({
        'model': model_name,
        'best_score': model.best_score_,
        # Key spelling kept as-is: it becomes a column name in the table.
        'best_parmeters': model.best_params_
    })

df = pd.DataFrame(scores)
df

Unnamed: 0,model,best_score,best_parmeters
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.973333,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}
