#### Use the iris dataset to build a k-NN model using k-fold cross-validation and stratified k-fold cross-validation. Compare the results of the train_test_split, k-fold, and stratified k-fold validation techniques.

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the iris CSV from its hosted location and preview the first five rows.
iris_csv_url = 'https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/iris.csv'
data = pd.read_csv(iris_csv_url)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Summarize the frame: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
# Target: the species label column. Features: every remaining column.
y = data['species']
X = data.drop(columns='species')

In [5]:
# Hold out 20% of the rows as a final test set; the fixed random_state makes
# the split reproducible across kernel restarts.
# (train_test_split is imported in the import cell at the top of the notebook.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# Hyper-parameter search for k-NN.
# Candidates: odd neighbor counts 3..11, tree leaf sizes 1..4, and the
# Minkowski power parameter p (1 or 2).
params = {
    'n_neighbors': np.arange(3, 12, 2),
    'leaf_size': [1, 2, 3, 4],
    'p': [1, 2],
}

# Base estimator; n_jobs=-1 is forwarded to the classifier itself.
knn = KNeighborsClassifier(n_jobs=-1)

# Every parameter combination is scored with micro-averaged F1 over 5 folds.
gs = GridSearchCV(knn, params, scoring='f1_micro', cv=5)

In [7]:
# Run the grid search on the training split only; X_test / y_test are not passed here.
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(n_jobs=-1),
             param_grid={'leaf_size': [1, 2, 3, 4],
                         'n_neighbors': array([ 3,  5,  7,  9, 11]),
                         'p': [1, 2]},
             scoring='f1_micro')

In [8]:
# Show the estimator (with its hyper-parameters) that the grid search selected.
print(gs.best_estimator_)

KNeighborsClassifier(leaf_size=1, n_jobs=-1, n_neighbors=3, p=1)


## Compare the results of the train_test_split, k-fold, and stratified k-fold validation techniques.

### 1. train_test_split

In [9]:
# Predict on the held-out split with the fitted grid search, then report
# the micro-averaged F1 score against the true labels.
y_pred = gs.predict(X_test)
holdout_f1 = metrics.f1_score(y_test, y_pred, average='micro')
print("Testset F1-Score:", holdout_f1)

Testset F1-Score: 1.0


### 2. k-fold

In [10]:
# Shuffled 5-fold CV over the training split. Because the estimator passed in
# is the GridSearchCV object itself, the hyper-parameter search is repeated
# inside every outer fold (nested cross-validation).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kfold_scores = cross_val_score(gs, X_train, y_train, scoring='f1_micro', cv=kf)
print("KFold F1-Score:", kfold_scores.mean())

KFold F1-Score: 0.95


### 3. Stratified k-fold

In [11]:
# Same evaluation as the k-fold cell, but with a shuffled stratified 5-fold
# splitter. The GridSearchCV object is again the estimator, so the search is
# repeated inside every outer fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skfold_scores = cross_val_score(gs, X_train, y_train, scoring='f1_micro', cv=skf)
print("StratifiedKFold F1-Score:", skfold_scores.mean())

StratifiedKFold F1-Score: 0.9333333333333333
