In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import datasets, svm
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()

df = pd.DataFrame(iris.data, columns = iris.feature_names)
df['flower'] = iris.target

df['flower'] = df['flower'].apply(lambda x: iris.target_names[x])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2)

In [13]:
model = svm.SVC(kernel ='poly')
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.9666666666666667

In [14]:
from sklearn.model_selection import cross_val_score

cross_val_score(svm.SVC(kernel='linear'),iris.data, iris.target)


array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [15]:
cross_val_score(svm.SVC(kernel='rbf'),iris.data, iris.target)

array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])

In [16]:
cross_val_score(svm.SVC(kernel='poly'),iris.data, iris.target)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [18]:
## lets take down in simple steps

kernel = ['poly','rbf','linear']
C = [1,10,20]
avg_scores = {}

for k in kernel:
    for i in C:
        score = cross_val_score(svm.SVC(kernel=k,C=i),iris.data,iris.target)
        avg_scores[k+"_"+str(i)] = np.average(score)

avg_scores

{'poly_1': 0.9800000000000001,
 'poly_10': 0.9666666666666666,
 'poly_20': 0.9666666666666666,
 'rbf_1': 0.9666666666666666,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9800000000000001,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

## Best Way: GridSearchCV

In [27]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(gamma='auto'),{'C':[1,10,20],'kernel':['poly','rbf','linear']},cv=5,return_train_score=False)
clf.fit(X_train,y_train)
k = clf.cv_results_
df = pd.DataFrame(k)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001622,0.001215,0.000606,0.000495,1,poly,"{'C': 1, 'kernel': 'poly'}",0.958333,1.0,0.958333,1.0,0.958333,0.975,0.020412,1
1,0.000808,0.000404,0.000413,0.000506,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.958333,0.916667,0.958333,1.0,1.0,0.966667,0.03118,5
2,0.0,0.0,0.0,0.0,1,linear,"{'C': 1, 'kernel': 'linear'}",0.958333,0.916667,0.958333,1.0,0.958333,0.958333,0.026352,7
3,0.001977,0.003954,0.001596,0.003193,10,poly,"{'C': 10, 'kernel': 'poly'}",0.958333,1.0,0.958333,1.0,0.916667,0.966667,0.03118,5
4,0.0016,0.0032,0.0,0.0,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.916667,0.958333,0.958333,1.0,0.958333,0.958333,0.026352,7
5,0.0,0.0,0.0,0.0,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,0.958333,0.958333,1.0,0.958333,0.975,0.020412,1
6,0.001601,0.003201,0.001599,0.003199,20,poly,"{'C': 20, 'kernel': 'poly'}",0.958333,1.0,0.958333,1.0,0.958333,0.975,0.020412,1
7,0.0,0.0,0.0,0.0,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.916667,0.958333,0.958333,1.0,0.958333,0.958333,0.026352,7
8,0.0,0.0,0.0016,0.0032,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.916667,1.0,0.958333,0.975,0.033333,1


In [28]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,poly,0.975
1,1,rbf,0.966667
2,1,linear,0.958333
3,10,poly,0.966667
4,10,rbf,0.958333
5,10,linear,0.975
6,20,poly,0.975
7,20,rbf,0.958333
8,20,linear,0.975


In [29]:
clf.best_params_

{'C': 1, 'kernel': 'poly'}

In [30]:
clf.best_score_

0.975

Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [32]:
from sklearn.model_selection import RandomizedSearchCV


clf = RandomizedSearchCV(svm.SVC(gamma='auto'),{'C':[1,10,20],'kernel':['poly','rbf','linear']},cv=5,return_train_score=False,n_iter = 5)
clf.fit(X_train,y_train)
k = clf.cv_results_
df = pd.DataFrame(k)
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.958333
1,20,linear,0.975
2,1,poly,0.975
3,10,rbf,0.958333
4,1,rbf,0.966667


In [34]:
## lets see on different models with many parameters

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

model_parameters  ={
    'svm':{
        'model':svm.SVC(gamma='auto'),
        'params':{
            'C':[1,10,20],
            'kernel':['poly','rbf','linear']
        }
    },


    'random_forest':{
        'model' : RandomForestClassifier(),
        'params':{
            'n_estimators' :[1,5,10,20,50,80,100]
        }
    },


    'logistic_reg':{
        'model':LogisticRegression(),
        'params':{
            'C':[1,5,10]
        }
    } 
}




In [36]:
scores = []

for model_name,model_params in model_parameters.items():
    clf = GridSearchCV(model_params['model'],model_params['params'],cv=5,return_train_score=False)
    clf.fit(X_train,y_train)
    scores.append({
        'model':model_name,
        'score':clf.best_score_,
        'best_params':clf.best_params_
    })

df = pd.DataFrame(scores,columns=['model','score','best_params'])
df    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Unnamed: 0,model,score,best_params
0,svm,0.975,"{'C': 1, 'kernel': 'poly'}"
1,random_forest,0.933333,{'n_estimators': 10}
2,logistic_reg,0.975,{'C': 5}
