In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,cross_val_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.datasets import load_iris

# Here we have Iris Dataset

In [2]:
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [3]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# Normalization

In [4]:
scaler = MinMaxScaler()
df2 = pd.DataFrame(scaler.fit_transform(df),columns=df.columns)
df2.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


# Choosing The Best Base Algorithm for Iris Dataset Using Cross_Val_Score
- Random Forest came out to be the best algorithm for iris classification
- But For Demonsration purposes we are taking SVM as the base classifier as it has the most variations of parameters

In [13]:
classifier_names = ['Logistic_Regression','SVM','Decision_tree','Random_Forest','Gaussian_NB','Multinomial_NB','Bernaulli_NB']

mean_scores = [cross_val_score(LogisticRegression(),df2,iris.target,cv = 8).mean(),
cross_val_score(SVC(),df2,iris.target,cv = 8).mean(),
cross_val_score(DecisionTreeClassifier(),df2,iris.target,cv = 8).mean(),
cross_val_score(RandomForestClassifier(),df2,iris.target,cv = 8).mean(),
cross_val_score(GaussianNB(),df2,iris.target,cv = 8).mean(),
cross_val_score(MultinomialNB(),df2,iris.target,cv = 8).mean(),
cross_val_score(BernoulliNB(),df2,iris.target,cv = 8).mean()]

mean_score_list = pd.DataFrame({})
mean_score_list['Classifires'] = classifier_names
mean_score_list['mean_scores'] = mean_scores
mean_score_list

Unnamed: 0,Classifires,mean_scores
0,Logistic_Regression,0.927632
1,SVM,0.953947
2,Decision_tree,0.960526
3,Random_Forest,0.960526
4,Gaussian_NB,0.953947
5,Multinomial_NB,0.714547
6,Bernaulli_NB,0.35307


# But SVM has a lot of parameters, How will we know the optimal parameter to Use?

## # We can use looping to try every comination of parameters and calculate the cross val scores

- From Lower loop we can see that SVm with, kernel = 'rbf' and C = 3 gives the best result

In [16]:
C = [1,2,3,4,5]
Kernel = ['rbf','linear','poly','sigmoid']

for j in Kernel:
    for i in C:
        print(f'{j}__{i} : {cross_val_score(SVC(kernel=j,C = i),df2,iris.target,cv = 5).mean()}')

rbf__1 : 0.9600000000000002
rbf__2 : 0.9666666666666668
rbf__3 : 0.9733333333333334
rbf__4 : 0.9733333333333334
rbf__5 : 0.9733333333333334
linear__1 : 0.9666666666666666
linear__2 : 0.9733333333333334
linear__3 : 0.9666666666666668
linear__4 : 0.9666666666666668
linear__5 : 0.9666666666666668
poly__1 : 0.96
poly__2 : 0.9533333333333334
poly__3 : 0.96
poly__4 : 0.9533333333333334
poly__5 : 0.9533333333333334
sigmoid__1 : 0.28
sigmoid__2 : 0.24
sigmoid__3 : 0.22666666666666666
sigmoid__4 : 0.22666666666666666
sigmoid__5 : 0.22666666666666666


# GridSearchCV - does the work of driver code for us
- it uses cross val score

In [17]:
from sklearn.model_selection import GridSearchCV

In [19]:
gscv_svm = GridSearchCV(SVC(),{
    'C' : [1,2,3,4,5],
    'kernel' : ['rbf','linear','poly','sigmoid']
}, cv = 5)

gscv_svm.fit(df2,iris.target)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 2, 3, 4, 5],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']})

In [23]:
pd.DataFrame(gscv_svm.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.96
1,1,linear,0.966667
2,1,poly,0.96
3,1,sigmoid,0.28
4,2,rbf,0.966667
5,2,linear,0.973333
6,2,poly,0.953333
7,2,sigmoid,0.24
8,3,rbf,0.973333
9,3,linear,0.966667


In [27]:
# best score
gscv_svm.best_score_

0.9733333333333334

In [28]:
# best parameters
gscv_svm.best_params_

{'C': 2, 'kernel': 'linear'}

# RandomizedSearchCV
- GridSearchCV used every possible permutation and combination of our paramteres and finds out the mean score
- But when our parameters are numerous and their individual sizes are also huge, GridSearchCV will use a lot of computing power and time to find out the best possible HyperParameters, which is completely impractical
- To speed up the process, we use RandomizedSearchCV
- It does the same work as GridSearchCV, but instead of trying out all possible combinations of tha data, it takes random values of the parameters and finds mean scors of them

- n_iter parameter determines the number of times RandomizedSearchCV tries out different combinations

In [31]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
rscv_svm = RandomizedSearchCV(SVC(),{
    'C' : np.arange(1,10),
    'kernel' : ['rbf','linear','poly','sigmoid']
}, cv = 10, n_iter=10)

rscv_svm.fit(df2, iris.target)

RandomizedSearchCV(cv=10, estimator=SVC(),
                   param_distributions={'C': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'kernel': ['rbf', 'linear', 'poly',
                                                   'sigmoid']})

In [36]:
pd.DataFrame(rscv_svm.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,2,linear,0.973333
1,9,linear,0.973333
2,3,linear,0.973333
3,6,linear,0.973333
4,8,linear,0.973333
5,6,rbf,0.966667
6,4,sigmoid,0.22
7,6,sigmoid,0.22
8,2,rbf,0.966667
9,1,linear,0.966667


In [37]:
rscv_svm.best_score_

0.9733333333333334

In [38]:
rscv_svm.best_params_

{'kernel': 'linear', 'C': 2}

# Choosing The Best model and Doing Hyper Tuning Simultaneously

In [39]:
algos = {
    'Logistic_Regression' : [LogisticRegression(max_iter=1000),{
        'C' : np.arange(1,10)
    }],
    'SVM' : [SVC(),{
        'C' : np.arange(1,10),
        'kernel' : ['rbf','linear','poly','sigmoid']
    }],
    'DescisionTree' : [DecisionTreeClassifier(),{
        'criterion' : ['gini','entropy']
    }],
    'RandomForest' : [RandomForestClassifier(),{
        'criterion' : ['gini','entropy'],
        'n_estimators' : np.arange(100,500,50)
    }] 
}

In [45]:
algorithm = []
best_parameters = []
best_score = []

for i,j in algos.items():
    gscv = GridSearchCV(j[0],j[1],cv = 10)
    gscv.fit(df2,iris.target)
    
    algorithm.append(i)
    best_score.append(gscv.best_score_)
    best_parameters.append(gscv.best_params_)

In [46]:
for i in range(4):
    print(f'{algorithm[i]}---{best_parameters[i]}---{best_score[i]}')

Logistic_Regression---{'C': 3}---0.96
SVM---{'C': 1, 'kernel': 'poly'}---0.9733333333333334
DescisionTree---{'criterion': 'gini'}---0.9533333333333334
RandomForest---{'criterion': 'gini', 'n_estimators': 200}---0.9666666666666666
