In [27]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

In [3]:
# Load the bundled iris dataset and list the attributes exposed by the
# returned object (data, target, feature_names, target_names, ...).
flowers = load_iris()
dir(flowers)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [4]:
# Put the raw measurements in a DataFrame, one named column per feature.
df = pd.DataFrame(
    data=flowers.data,
    columns=flowers.feature_names,
)
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Append the integer class code of every sample as a `flower` column.
df = df.assign(flower=flowers.target)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
# Replace the integer class codes with their species names.
# The names are looked up from `flowers.target` rather than from the current
# contents of `df.flower`, so this cell can be re-run safely: the previous
# version indexed `target_names` with whatever was already in the column and
# raised once the column held strings.
df['flower'] = flowers.target_names[flowers.target]
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [12]:
# Hold out 20% of the rows for testing.
# `random_state` makes the split (and every score derived from it) repeatable
# across kernel restarts; `stratify` keeps the class proportions the same in
# the train and test subsets.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='flower'),
    df.flower,
    test_size=0.2,
    random_state=42,
    stratify=df.flower,
)
x_train.shape, x_test.shape

((120, 4), (30, 4))

In [13]:
# Baseline: one RBF-kernel SVC with hand-picked C, scored on the held-out rows.
# `fit` returns the estimator itself, so construction and training can be chained.
model = SVC(C=30, kernel='rbf', gamma='auto').fit(x_train, y_train)
model.score(x_test, y_test)

0.9666666666666667

In [14]:
# Manual trial 1: 5-fold cross-validated accuracy of an RBF kernel with C=10.
svc_rbf_c10 = SVC(C=10, kernel='rbf', gamma='auto')
cross_val_score(svc_rbf_c10, X=flowers.data, y=flowers.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [15]:
# Manual trial 2: 5-fold cross-validated accuracy of a linear kernel with C=10.
svc_linear_c10 = SVC(C=10, kernel='linear', gamma='auto')
cross_val_score(svc_linear_c10, X=flowers.data, y=flowers.target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [16]:
# Manual trial 3: 5-fold cross-validated accuracy of an RBF kernel with C=20.
svc_rbf_c20 = SVC(C=20, kernel='rbf', gamma='auto')
cross_val_score(svc_rbf_c20, X=flowers.data, y=flowers.target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

### Trying parameter combinations by hand, as above, is manual and repetitive. GridSearchCV automates hyperparameter tuning by cross-validating every combination in a parameter grid

In [22]:
# Exhaustive search: every (C, kernel) pair below is scored with 5-fold CV.
svc_param_grid = {
    'C': [1, 10, 20, 100],
    'kernel': ['rbf', 'linear', 'poly'],
}
classifier = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid=svc_param_grid,
    cv=5,
    return_train_score=False,
)
classifier.fit(flowers.data, flowers.target)


In [23]:
# Tabulate the grid-search results and show only the parameter set and its
# mean cross-validated score.
results = pd.DataFrame(classifier.cv_results_)
summary_columns = ['params', 'mean_test_score']
results[summary_columns]

Unnamed: 0,params,mean_test_score
0,"{'C': 1, 'kernel': 'rbf'}",0.98
1,"{'C': 1, 'kernel': 'linear'}",0.98
2,"{'C': 1, 'kernel': 'poly'}",0.966667
3,"{'C': 10, 'kernel': 'rbf'}",0.98
4,"{'C': 10, 'kernel': 'linear'}",0.973333
5,"{'C': 10, 'kernel': 'poly'}",0.966667
6,"{'C': 20, 'kernel': 'rbf'}",0.966667
7,"{'C': 20, 'kernel': 'linear'}",0.966667
8,"{'C': 20, 'kernel': 'poly'}",0.953333
9,"{'C': 100, 'kernel': 'rbf'}",0.96


In [24]:
# Highest mean cross-validated score found by the grid search.
classifier.best_score_

np.float64(0.9800000000000001)

In [25]:
# Parameter combination that achieved the best score above.
classifier.best_params_

{'C': 1, 'kernel': 'rbf'}

### GridSearchCV is computationally costly because it tries every combination of parameter values. RandomizedSearchCV instead evaluates only a fixed number (`n_iter`) of randomly sampled combinations

In [30]:
# Randomized search: score only `n_iter` randomly drawn (C, kernel) pairs
# instead of the full grid, each with 5-fold CV.
# `random_state` pins which combinations are drawn, so the results table and
# the reported best parameters are the same on every run.
rs = RandomizedSearchCV(
    SVC(gamma='auto'),
    param_distributions={
        'C': [1, 10, 20, 100],
        'kernel': ['rbf', 'linear', 'poly'],
    },
    n_iter=5,
    cv=5,
    return_train_score=False,
    random_state=42,
)
rs.fit(flowers.data, flowers.target)

In [31]:
# Tabulate the randomized-search results, keeping only the sampled parameter
# set and its mean cross-validated score.
results = pd.DataFrame(rs.cv_results_)
results.filter(items=['params', 'mean_test_score'])

Unnamed: 0,params,mean_test_score
0,"{'kernel': 'poly', 'C': 20}",0.953333
1,"{'kernel': 'linear', 'C': 100}",0.966667
2,"{'kernel': 'poly', 'C': 100}",0.946667
3,"{'kernel': 'linear', 'C': 1}",0.98
4,"{'kernel': 'linear', 'C': 10}",0.973333


In [33]:
# Highest mean cross-validated score among the sampled combinations.
rs.best_score_

np.float64(0.9800000000000001)

In [34]:
# Sampled parameter combination that achieved the best score above.
rs.best_params_

{'kernel': 'linear', 'C': 1}

### How to choose the best model?

In [35]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Candidate estimators and the hyperparameter grid to search for each one.
# Each key is a display label; each value holds an unfitted 'model' and the
# 'params' grid handed to GridSearchCV.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20, 100],
            'kernel': ['rbf', 'linear', 'poly'],
        },
    },
    'random_forest': {
        # Seeded so the forest, and therefore its cross-validated score, is
        # the same on every run.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10, 20],
        },
    },
    'logistic_regression': {
        # `multi_class` is deprecated in recent scikit-learn releases and
        # 'auto' was already its default, so it is omitted here: same
        # behaviour, no FutureWarning.
        # NOTE(review): newer releases also appear to deprecate 'liblinear'
        # for multiclass targets — confirm against the installed version.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
}

In [37]:
# Run a 5-fold grid search for every candidate model and record the best
# score and parameter set each one achieved.
scores = []
for model_name, m_params in model_params.items():
    classifier = GridSearchCV(
        estimator=m_params['model'],
        param_grid=m_params['params'],
        cv=5,
        return_train_score=False,
    )
    classifier.fit(flowers.data, flowers.target)

    best_for_model = {
        'model': model_name,
        'best_score': classifier.best_score_,
        'best_params': classifier.best_params_,
    }
    scores.append(best_for_model)



In [38]:
# One entry per model: its label, best mean CV score, and winning parameters.
scores

[{'model': 'svm',
  'best_score': np.float64(0.9800000000000001),
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'model': 'random_forest',
  'best_score': np.float64(0.96),
  'best_params': {'n_estimators': 20}},
 {'model': 'logistic_regression',
  'best_score': np.float64(0.9666666666666668),
  'best_params': {'C': 5}}]