# 과적합, 편향-분산 트레이드오프, 교차 검증

In [1]:
#필요한 라이브러리 임포트

import numpy as np
import pandas as pd

# 교차검증

## 1. 사이킷런의 model_selection의 KFold()를 사용하는 경우(For loop 사용)

#### 폴드를 분리할 객체 생성

In [2]:
from sklearn.model_selection import KFold
# Splitter that partitions the rows into 5 folds; no shuffle option is passed.
kfold = KFold(n_splits=5)

#### 데이터를 준비하고 회귀 모형 객체를 생성

In [3]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

# Load the diabetes dataset and pull out the feature matrix and the target.
diab = load_diabetes()
X, y = diab.data, diab.target

# Linear regression estimator to be evaluated.
lr = LinearRegression()

#### split() 함수를 호출하여 폴드별 학습/테스트 행 인덱스를 구하고, 폴드마다 모형을 적합해 R2를 계산

In [7]:
from sklearn.metrics import r2_score

# One R2 score per fold, in the order the folds are produced.
r2_scores = []

for train_idx, test_idx in kfold.split(X):
    # split() yields a pair of row-index arrays; slice out this fold's data.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Fit on the training rows only, then predict the held-out rows.
    reg = lr.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    r2_scores.append(r2_score(y_test, y_pred))

In [9]:
import numpy as np

# Report each fold's R2 (folds numbered from 1), then the mean over all folds.
for i, r2 in enumerate(r2_scores):
    print(i + 1, f": R2 - {r2:.3f}")
print("average R2: ", np.round(np.mean(r2_scores), 3))

1 : R2 - 0.430
2 : R2 - 0.523
3 : R2 - 0.483
4 : R2 - 0.427
5 : R2 - 0.550
average R2:  0.482


## 2. 사이킷런의 cross_val_score 함수를 사용하여 K폴드 교차 검증 수행 (셔플 없음)
- for loop 필요 없음

In [10]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Rebuild the data and the model from scratch for this section.
diab = load_diabetes()
X, y = diab.data, diab.target

lr = LinearRegression()

# cross_val_score runs the split/fit/score cycle internally, so no explicit
# loop is needed; cv=5 requests 5 folds and one score per fold is returned.
r2_scores = cross_val_score(lr, X, y, cv=5)

print("R2: ", np.round(r2_scores, 3))
print("average R2: ", np.round(np.mean(r2_scores), 3))

R2:  [0.43  0.523 0.483 0.427 0.55 ]
average R2:  0.482


## 3. 사이킷런의 cross_val_score 함수를 사용하여 K폴드 교차 검증 수행 (셔플 적용)

In [11]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Rebuild the data and the model from scratch for this section.
diab = load_diabetes()
X, y = diab.data, diab.target

lr = LinearRegression()

# Pass a KFold object instead of an integer to control the split: 3 folds,
# shuffling enabled, and a fixed random_state so the split is repeatable.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
r2_scores = cross_val_score(lr, X, y, cv=kfold)

print("R2: ", np.round(r2_scores, 3))
print("average R2: ", np.round(np.mean(r2_scores), 3))

R2:  [0.404 0.521 0.544]
average R2:  0.49
