In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Cross Validation
<img src="https://scikit-learn.org/stable/_images/grid_search_workflow.png" width="300px" height="300px">

- __Cross validation__ bertujuan : 
    - Menvalidasi model, dengan membagi training data
    - Mencari nilai parameter terbaik


- Step-by-step cross validation :
    1. Splitting dataset: training & testing
    2. Data training displit : training & validasi
    3. Misal data training displit menjadi 3 bagian: A, B, C
        - Buat model dengan data train : AB lalu divalidasi C, kmd cek evaluasi metrik misal akurasinya
        - Buat model dengan data train : AC lalu divalidasi B, kmd cek evaluasi metrik misal akurasinya
        - Buat model dengan data train : BC lalu divalidasi A, kmd cek evaluasi metrik misal akurasinya
    4. Didapatkan 3 model dengan 3 nilai akurasi, hitung rata-rata akurasinya
    
<img src="https://scikit-learn.org/stable/_images/grid_search_cross_validation.png" width="350px" height="350px">

<hr>

### K-Fold CV (Cross Validation)

In [5]:
from sklearn.model_selection import KFold

In [6]:
x = np.arange(1,10)
x

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

__K-Fold CV__ dengan $k = 3$ memiliki arti:
  - Data $x$ akan displit/fold menjadi 3 lipatan
  - 2 bagian secara bergantian akan digunakan sebagai data training
  - 1 bagian secara bergantian akan digunakan sebagai data validasi
    

In [7]:
k = KFold(n_splits = 3)
k.split(x) # hasilnya berupa object yang dapat dilakukan looping 

<generator object _BaseKFold.split at 0x0000025E6619D970>

In [9]:
for train, validate in k.split(x):
    print(train,validate) # hasil berupa index

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


Hasil K-Fold CV $k = 3$ pada $x$ :
- Split 1 : training = [4,5,6,7,8,9] validasi: [1,2,3]
- Split 2 : training = [1,2,3,4,5,6] validasi: [4,5,6]
- Split 3 : training = [1,2,3,7,8,9] validasi: [7,8,9]

In [10]:
for train, validate in k.split(x):
    print([x[i] for i in train], [x[i] for i in validate])

[4, 5, 6, 7, 8, 9] [1, 2, 3]
[1, 2, 3, 7, 8, 9] [4, 5, 6]
[1, 2, 3, 4, 5, 6] [7, 8, 9]


In [11]:
# k maksimum = sejumlah value/element

k = KFold(n_splits = len(x))
for train, validate in k.split(x):
    print([x[i] for i in train], [x[i] for i in validate])

[2, 3, 4, 5, 6, 7, 8, 9] [1]
[1, 3, 4, 5, 6, 7, 8, 9] [2]
[1, 2, 4, 5, 6, 7, 8, 9] [3]
[1, 2, 3, 5, 6, 7, 8, 9] [4]
[1, 2, 3, 4, 6, 7, 8, 9] [5]
[1, 2, 3, 4, 5, 7, 8, 9] [6]
[1, 2, 3, 4, 5, 6, 8, 9] [7]
[1, 2, 3, 4, 5, 6, 7, 9] [8]
[1, 2, 3, 4, 5, 6, 7, 8] [9]


<hr>

### Cross Validation Example : Classification with Iris DataSet


__1. Splitting dataset: training & testing (0.8)__

In [12]:
from sklearn.datasets import load_iris

In [15]:
data = load_iris()

dfIris = pd.DataFrame(
    data['data'],
    columns = ['SL','SW','PL','PW']
)
dfIris['target'] = data['target']
dfIris.head()

Unnamed: 0,SL,SW,PL,PW,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [24]:
from sklearn.model_selection import train_test_split
xtr,xts,ytr,yts = train_test_split(dfIris[['SL','SW','PL','PW']], dfIris['target'], train_size=.8)

__2. K-fold CV dengan $k = 6$ diterapkan ke Logistic Regression__

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
# Gabungkan xtr + ytr dan xts + yts

dtrain = pd.concat([xtr,ytr], axis=1)
dtes = pd.concat([xts,yts], axis=1)

print(len(dtrain), len(dtes))

120 30


In [29]:
# data training di-folding dengan k-6, diterapkan ke LogisticRegression
k = KFold(n_splits = 6)
score = []
for train,validate in k.split(dtrain):
    xtrain = dtrain.iloc[train][['SL','SW','PL','PW']]
    ytrain = dtrain.iloc[train]['target']
    xvalid = dtrain.iloc[validate][['SL','SW','PL','PW']]
    yvalid = dtrain.iloc[validate]['target']
    #display(xtrain)

    # train model dengan xtrain, ytrain
    model = LogisticRegression()
    model.fit(xtrain,ytrain)
    
    # check score model dengan xvalid,yvalid
    score.append(model.score(xvalid,yvalid))

print(score)
print(np.mean(score))

# kalau data ga balance bisa menggunakan balance acccuracy score

[0.95, 1.0, 1.0, 0.9, 0.95, 1.0]
0.9666666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

<hr>

__3. Sklearn ```cross_val_score()```__


In [30]:
from sklearn.model_selection import cross_val_score

In [33]:
cv_score = cross_val_score(
    LogisticRegression(),
    xtr,
    ytr,
    cv = 6
)
print(cv_score)
print(np.mean(cv_score))

[0.95 1.   1.   0.95 1.   0.95]
0.9750000000000001


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt