## Step 0: Setup

In [10]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

## Step 1: Load dataset

In [11]:
# Load scikit-learn's built-in iris dataset (a dict-like container of arrays and metadata).
iris = load_iris()

In [12]:
# List the fields available on the loaded dataset, one per line.
print(*iris.keys(), sep="\n")

data
target
frame
target_names
DESCR
feature_names
filename
data_module


In [13]:
# Feature matrix and class labels used for training and evaluation below.
X, y = iris.data, iris.target

In [14]:
# View the iris data as a DataFrame: one column per feature plus the class label.
# Built in a single expression so re-running the cell never mutates an existing frame.
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Step 2: initialise classification model

In [15]:
# Classifier that is evaluated with cross-validation in the following steps.
# max_iter is set to 1000 (scikit-learn's documented default is 100) — presumably to
# give the solver room to converge on the unscaled features; TODO confirm.
model = LogisticRegression(max_iter = 1000)

## Step 3: Stratified K fold

In [16]:
# 5-fold stratified splitter: shuffling is enabled and the seed is fixed so the
# same folds are produced on every run.
stratified_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

The cell below uses the utility function `cross_val_score`, which automates training and testing across the folds and returns one performance score per fold.

In [17]:
# Evaluate `model` on the samples `X` / targets `y`, using `stratified_kf` as the
# cross-validation method and accuracy as the performance metric.
scores = cross_val_score(model, X, y, cv=stratified_kf, scoring="accuracy")

## Step 4: Result

In [18]:
# Show the per-fold scores as an array, then once more with one value per line.
print("Cross Validation Scores : ", scores)
print(*scores, sep="\n")

Cross Validation Scores :  [1.         0.96666667 0.9        0.96666667 0.96666667]
1.0
0.9666666666666667
0.9
0.9666666666666667
0.9666666666666667


In [19]:
# Collapse the per-fold accuracies into a single summary figure.
mean_accuracy = scores.mean()
print("Mean Accuracy : ", mean_accuracy)

Mean Accuracy :  0.96


So the mean accuracy with Stratified K-Fold is **0.96**.

## Question: why is the accuracy better here than with K-Fold CV (01 ipynb)?

**Answer** : 
* In both notebooks k = 5, and iris is a balanced dataset with 50 samples in each of its 3 classes.
* Since k = 5 (each chunk = 30 data points) and shuffling = True, plain **K-Fold** does not guarantee that the class proportions of 1/3 each are maintained in every chunk, so the chunks may be disproportionate.
* In a severe case, one of the classes may not be represented at all among the 30 points of a test chunk.
* *Whereas* in **Stratified K-Fold**, the class proportions (1/3 each) are maintained in every chunk.
* As a result, the fold-to-fold variance of the scores is lower and the mean accuracy comes out higher here (on a dataset this small, part of the difference can also come from the particular random split).