### k-Fold

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

# Twelve samples with two features each. Row order matters for the KFold
# cells that follow, because an unshuffled KFold splits by row position.
x_data = np.array([
    [2, 1], [3, 2], [3, 4],
    [5, 5], [7, 5], [2, 5],
    [8, 9], [9, 10], [6, 12],
    [9, 2], [6, 10], [2, 4],
])

In [2]:
# Five folds taken in row order (shuffling is off, which is also the default).
kf = KFold(n_splits=5, shuffle=False)

In [3]:
# Show, fold by fold, which row positions are used for training and which are held out.
for fold_indices in kf.split(x_data):
    train_index, test_index = fold_indices
    print('train_index : ', train_index)
    print('test_index : ', test_index)

train_index :  [ 3  4  5  6  7  8  9 10 11]
test_index :  [0 1 2]
train_index :  [ 0  1  2  6  7  8  9 10 11]
test_index :  [3 4 5]
train_index :  [ 0  1  2  3  4  5  8  9 10 11]
test_index :  [6 7]
train_index :  [ 0  1  2  3  4  5  6  7 10 11]
test_index :  [8 9]
train_index :  [0 1 2 3 4 5 6 7 8 9]
test_index :  [10 11]


#### K-Fold 교차검증 -> 보통 회귀 문제에서 사용됨
- 학습 데이터와 테스트 데이터를 K개의 세트로 나누어 검증하는 방법
- 데이터셋이 굉장히 적을 때 훈련데이터를 어떻게든 최대한 늘려보려고 사용되기도 하는 방법
- 여러 개의 훈련 테스트 짝으로 검증과정을 거침

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#### 2.데이터 수집

In [5]:
# Twelve samples with two features each ...
x_data = np.array([
    [2, 1], [3, 2], [3, 4],
    [5, 5], [7, 5], [2, 5],
    [8, 9], [9, 10], [6, 12],
    [9, 2], [6, 10], [2, 4],
])

# ... and one numeric regression target per sample, in the same row order.
y_data = np.array([
    3, 5, 7,
    10, 12, 7,
    13, 13, 12,
    13, 12, 6,
])

#### 3.데이터 전처리
#### 4.EDA
#### 5-7.모델링(모델선택, 학습, 평가)

In [6]:
lr = LinearRegression()  # NOTE(review): never used in the visible cells; the loop fits model_kf instead

# Per-fold score (LinearRegression.score, i.e. R^2) on the training part
# and on the held-out part of each split.
train_scores = []
test_scores = []

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(x_data):  # 5 folds -> 5 validation rounds
    # x_data / y_data are already ndarrays, so they can be indexed directly.
    x_train, x_test = x_data[train_index], x_data[test_index]
    y_train, y_test = y_data[train_index], y_data[test_index]

    # A fresh estimator per fold, so nothing carries over between folds.
    model_kf = LinearRegression().fit(x_train, y_train)

    train_scores.append(model_kf.score(x_train, y_train))
    # NOTE(review): each held-out fold has only 2-3 rows, so its R^2 is very
    # noisy (one fold's targets are both 13, which makes R^2 degenerate there).
    test_scores.append(model_kf.score(x_test, y_test))

In [7]:
# One score per fold: the training scores on the first line, the held-out scores on the second.
print(train_scores, test_scores, sep='\n')

[0.9522707858769932, 0.9469593697441799, 0.9446524178499608, 0.9232432525564045, 0.9166499001004778]
[-1.1475590101753324, 0.56847222331606, 0.0, -11.7747639790487, 0.9602035173350366]


In [8]:
# Average each score list across the folds (training first, then held-out).
for fold_scores in (train_scores, test_scores):
    print(np.mean(fold_scores))

0.9367551452256032
-2.278729449714587


#### cross_validate

In [9]:
from sklearn.model_selection import cross_validate

In [10]:
# Unfitted linear-regression estimator; the following cell passes it to cross_validate.
model = LinearRegression()

In [11]:
# Arguments spelled out by keyword. cv=5 is what the implicit default resolved to
# here: the recorded mean test score equals the mean of the manual 5-fold loop.
cv_results = cross_validate(estimator=model, X=x_data, y=y_data, cv=5)

In [12]:
# Mean of the per-fold test scores returned by cross_validate.
print(np.mean(cv_results['test_score']))

-2.278729449714587


In [13]:
# Tabulate the per-fold results, best test score first.
# The frame is only stored here; this cell does not display it.
df = pd.DataFrame(cv_results).sort_values(by='test_score', ascending=False)

#### cross_val_score

In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
model = LinearRegression()
# Fit once on the full data set; as the last expression, the fitted estimator
# is what this cell displays.
# NOTE(review): cross_val_score is documented to clone and refit the estimator
# per fold, so this fit should not influence the CV score — confirm and consider dropping.
model.fit(X=x_data, y=y_data)

LinearRegression()

In [16]:
# 5-fold cross-validation; returns one test score per fold.
cv_score = cross_val_score(estimator=model, X=x_data, y=y_data, cv=5)

In [17]:
# Average of the per-fold scores from cross_val_score.
print('cv mean score : ', np.mean(cv_score))

cv mean score :  -2.278729449714587


### 분류

In [18]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [19]:
# Twelve samples with two features each ...
x_data = np.array([
    [2, 1], [3, 2], [3, 4],
    [5, 5], [7, 5], [2, 5],
    [8, 9], [9, 10], [6, 12],
    [9, 2], [6, 10], [2, 4],
])

# ... and one integer class id (0, 1 or 2) per sample, in the same row order.
y_data = np.array([
    2, 2, 2,
    1, 1, 2,
    0, 0, 0,
    1, 0, 2,
])

# Display names for the classes — presumably labels[i] names class id i;
# not used in the visible cells, confirm against later usage.
labels = ['A', 'B', 'C']

In [20]:
model = LogisticRegression()
# cross_validate takes care of fitting on every fold by itself;
# return_estimator=True additionally keeps each fold's fitted model in the result.
# NOTE(review): class 1 has only 3 samples, fewer than the 5 folds —
# scikit-learn is expected to warn about that here; confirm.
cv_results = cross_validate(
    estimator=model,
    X=x_data,
    y=y_data,
    cv=5,
    return_estimator=True,
)
print(np.mean(cv_results['test_score']))

0.9333333333333332




In [21]:
# Per-fold results as a table, best test score first; the trailing bare
# expression makes the notebook render the frame.
df = pd.DataFrame(cv_results).sort_values(by='test_score', ascending=False)
df

Unnamed: 0,fit_time,score_time,estimator,test_score
1,0.003535,0.0,LogisticRegression(),1.0
2,0.003034,0.0,LogisticRegression(),1.0
3,0.00299,0.000995,LogisticRegression(),1.0
4,0.00299,0.0,LogisticRegression(),1.0
0,0.003996,0.0,LogisticRegression(),0.666667


### 계층적 k-겹 교차검증(Stratified k-fold cross validation)
- 분류 모델에 적용
- 계층적 k-겹 교차검증은 일반 k-fold가 원본 데이터 집합의 레이블 분포를 학습 및 검증 데이터 세트에 제대로 분배하지 못하는 문제를 해결해줌
- target값(정답값 = 레이블)의 클래스별 개수가 각 폴드에 골고루 들어가도록 나누어 줌