In [348]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

### iris dataset from sklearn.datasets

In [349]:
#object of dataset
iris=load_iris()

In [350]:
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [351]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [352]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [353]:
df=pd.DataFrame(iris.data,columns=iris.feature_names)

In [354]:
df['target']=iris.target

In [355]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [356]:
# mapping target name with the dataframe on the basis of target column
df['target_name']=list(map(lambda x:iris.target_names[x],df['target']))

In [357]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


### Dataset splitting

In [358]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df[['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']],df['target_name'])

## K Fold

In [359]:
from sklearn.model_selection import KFold
kf=KFold(n_splits=6)

In [373]:
for train_index,test_index in kf.split(df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name']):
    
    print(train_index,test_index) # array of index of train and test dataset

[ 25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42
  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60
  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  50  51  52  53  54  55  56  57  58  59  60
  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
 115 116 117 118 119 120 121 122 123 124 125 126 

In [374]:
#fuction which return score of different function on different test set(K FOLD)
x_train,x_test,y_train,y_test
def get_score_kf(model):
    for train_index,test_index in kf.split(df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name']):
        x_train=df.iloc[train_index,0:4]
        x_test=df.iloc[test_index,0:4]
        y_train=df.iloc[train_index,5]
        y_test=df.iloc[test_index,5]
        model.fit(x_train,y_train)
        #print(y_train)
        print(model.score(x_test,y_test))

In [375]:
#passing model to function
model1=RandomForestClassifier()
get_score_kf(model1)

1.0
1.0
0.92
0.92
0.92
0.84


## Stratified K fold

In [376]:
from sklearn.model_selection import StratifiedKFold
skf=StratifiedKFold(n_splits=3)

In [377]:
#fuction which return score of different function on different test set(stratified K FOLD)
x_train,x_test,y_train,y_test
def get_score_skf(model):
    for train_index,test_index in skf.split(df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name']):
        x_train=df.iloc[train_index,0:4]
        x_test=df.iloc[test_index,0:4]
        y_train=df.iloc[train_index,5]
        y_test=df.iloc[test_index,5]
        model.fit(x_train,y_train)
        #print(y_train)
        print(model.score(x_test,y_test))

In [378]:
model2=RandomForestClassifier()
get_score_skf(model2)

0.98
0.94
0.98


# OR
### By using cross_val_score(Cross Validation) function

In [379]:
from sklearn.model_selection import cross_val_score

In [380]:
# random forest performance using cross_val_score
cross_val_score(RandomForestClassifier(n_estimators=100),df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name'],cv=3)

array([0.98, 0.94, 0.96])

In [381]:
# Logistic regression model performance using cross_val_score
cross_val_score(LogisticRegression(),df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name'],cv=3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.98, 0.96, 0.98])

In [382]:
# svm model performance using cross_val_score
cross_val_score(SVC(),df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name'],cv=3)

array([0.96, 0.98, 0.94])

## Parameter tunning using k fold cross validation

In [383]:
res1=cross_val_score(RandomForestClassifier(n_estimators=3),df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name'],cv=3)
np.average(res1)

0.9666666666666667

In [384]:
res2=cross_val_score(RandomForestClassifier(n_estimators=5),df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name'],cv=3)
np.average(res2)

0.96

In [385]:
res1=cross_val_score(RandomForestClassifier(n_estimators=10),df[['sepal length (cm)','sepal width (cm)','petal length (cm)'\
                                           ,'petal width (cm)']],df['target_name'],cv=3)
np.average(res1)

0.9533333333333333

* we can find out which value of n_estimators gives good score