# K-Fold Cross Validation

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('breast_cancer_wisconsin_processed_ok.csv')
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1.0


In [3]:
data = df.copy().values
r = np.random.RandomState(42)
r.shuffle(data)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1.0


# K-Fold Manual

In [4]:
# definição de índices de cada fold
k = 3
fold_size = data.shape[0]//k
curr = 0
all_idx = np.arange(0, data.shape[0])
p =  [None] * k
idx_train = [None] * k
idx_val  = [None] * k

for i in range(k):
    prev = curr
    curr += fold_size
    if (curr < data.shape[0]):
        curr += 1
    p[i] = np.arange(prev, curr)
    idx_val[i] = p[i]
    diff = np.setdiff1d(all_idx, p[i])
    idx_train[i] = diff
    n = idx_train[i].shape[0] + idx_val[i].shape[0]
    print(idx_train[i].shape, idx_val[i].shape, n)

(379,) (190,) 569
(379,) (190,) 569
(380,) (189,) 569


In [5]:
X_train = [None] * k
y_train = [None] * k
X_val =  [None] * k
y_val =  [None] * k

In [21]:
%%time
model =  [None] * k
y_pred =  [None] * k
acc =  [None] * k
for i in range(k):
    print(idx_train[i].shape, idx_val[i].shape)
    X_train[i] = data[idx_train[i], :-1]
    y_train[i] = data[idx_train[i], -1]
    X_val[i]  = data[idx_val[i], :-1]
    y_val[i]  = data[idx_val[i], -1]
    s = StandardScaler()
    X_train[i] = s.fit_transform(X_train[i])
    X_val[i] = s.transform(X_val[i])
    model[i] = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=42)
    model[i].fit(X_train[i], y_train[i])
    y_pred[i] = model[i].predict(X_val[i])
    acc[i] = accuracy_score(y_val[i], y_pred[i])

(379,) (190,)
(379,) (190,)
(380,) (189,)
CPU times: user 78.4 ms, sys: 3.63 ms, total: 82 ms
Wall time: 48 ms


In [7]:
acc

[0.9789473684210527, 0.9789473684210527, 0.9629629629629629]

In [8]:
np.mean(acc)

0.9736192332683561

## Usando KFold + cross_val_score do Scikit

### Sem Standardização

In [9]:
model2 = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=42)

In [10]:
data2 = df.values

In [11]:
X = data2[:, :-1]
y = data2[:, -1]

In [12]:
cv = KFold(n_splits=3, shuffle=True, random_state=42)

In [13]:
for idx_train2, idx_val2 in cv.split(X, y):
    print(idx_train2.shape, idx_val2.shape)

(379,) (190,)
(379,) (190,)
(380,) (189,)


In [19]:
%%time
acc2 = cross_val_score(model2, X, y, cv=cv, scoring='accuracy')
print(acc2)

[0.96842105 0.94736842 0.93121693]
CPU times: user 3.48 s, sys: 25.5 ms, total: 3.51 s
Wall time: 1.87 s


In [15]:
np.mean(acc2)

0.9490021349670471

### Com standadização: usando Pipeline com StandardScaler

In [16]:
s = StandardScaler()
pipeline = Pipeline([('transformer', s), ('estimator', model2)])
acc3 = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
acc3

array([0.97894737, 0.97894737, 0.96296296])

In [17]:
np.mean(acc3)

0.9736192332683561