In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import statsmodels.api as sm # statsmodel 기본 import
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms 
import sklearn as sk
from patsy import dmatrix

  from pandas.core import datetools


##### Scikit-Learn의 교차 검증 기능

In [None]:
# Simple train/test split of a small data set

In [3]:
# Toy feature matrix: the integers 0..9 laid out as 5 samples x 2 features.
X = np.arange(10).reshape(5, 2)
X

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [4]:
# Toy target vector: one integer label (0..4) per sample.
y = np.arange(0, 5)
y

array([0, 1, 2, 3, 4])

In [5]:
from sklearn.model_selection import train_test_split

# Randomly hold out about a third of the samples as a test set;
# the fixed random_state makes the split reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Display the feature rows that train_test_split assigned to the training set.
X_train

array([[4, 5],
       [0, 1],
       [6, 7]])

In [7]:
# Display the targets that train_test_split assigned to the training set.
y_train

array([2, 0, 3])

In [8]:
# Display the feature rows that train_test_split held out as the test set.
X_test

array([[2, 3],
       [8, 9]])

In [9]:
# Display the targets that train_test_split held out as the test set.
y_test

array([1, 4])

##### Cross Validation

In [None]:
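# K-fold CV: split the data set into K subsets (folds).
# For each fold in turn, hold that fold out as the test set and fit the model
# on the remaining K-1 folds, so K models are estimated in total.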

In [10]:
# Number of samples per class label.
N = 5

# 4 * N samples with 2 features each: consecutive multiples of 10, two per row.
X = 10 * np.arange(8 * N).reshape(-1, 2)

# Float labels 1..4, each repeated N times in order.
y = np.repeat([1.0, 2.0, 3.0, 4.0], N)

print("X:\n", X, sep="")
print("y:\n", y, sep="")

X:
[[  0  10]
 [ 20  30]
 [ 40  50]
 [ 60  70]
 [ 80  90]
 [100 110]
 [120 130]
 [140 150]
 [160 170]
 [180 190]
 [200 210]
 [220 230]
 [240 250]
 [260 270]
 [280 290]
 [300 310]
 [320 330]
 [340 350]
 [360 370]
 [380 390]]
y:
[ 1.  1.  1.  1.  1.  2.  2.  2.  2.  2.  3.  3.  3.  3.  3.  4.  4.  4.
  4.  4.]


In [11]:
from sklearn.model_selection import KFold

# Three shuffled folds; the fixed random_state keeps fold membership reproducible.
cv = KFold(n_splits=3, shuffle=True, random_state=0)

dotted_rule = "." * 80
solid_rule = "=" * 80

# Print the row indices that make up the test and training part of every split.
for train_index, test_index in cv.split(X):
    print(
        f"test_index : {test_index}",
        dotted_rule,
        f"train_index: {train_index}",
        solid_rule,
        sep="\n",
    )

test_index : [ 1  6  8 10 17 18 19]
................................................................................
train_index: [ 0  2  3  4  5  7  9 11 12 13 14 15 16]
test_index : [ 2  4  5  7  9 13 14]
................................................................................
train_index: [ 0  1  3  6  8 10 11 12 15 16 17 18 19]
test_index : [ 0  3 11 12 15 16]
................................................................................
train_index: [ 1  2  4  5  6  7  8  9 10 13 14 17 18 19]


In [None]:
# Leave-One-Out (LOO): each split holds out exactly one sample as the test set.

In [12]:
from sklearn.model_selection import LeaveOneOut

cv = LeaveOneOut()

dotted_rule = "." * 80
solid_rule = "=" * 80

# Show the held-out features and target of every split (train_index is not used here).
for train_index, test_index in cv.split(X):
    held_out_X = X[test_index]
    held_out_y = y[test_index]
    print("test X:", held_out_X)
    print(dotted_rule)
    print("test y:", held_out_y)
    print(solid_rule)

test X: [[ 0 10]]
................................................................................
test y: [ 1.]
test X: [[20 30]]
................................................................................
test y: [ 1.]
test X: [[40 50]]
................................................................................
test y: [ 1.]
test X: [[60 70]]
................................................................................
test y: [ 1.]
test X: [[80 90]]
................................................................................
test y: [ 1.]
test X: [[100 110]]
................................................................................
test y: [ 2.]
test X: [[120 130]]
................................................................................
test y: [ 2.]
test X: [[140 150]]
................................................................................
test y: [ 2.]
test X: [[160 170]]
......................................................................

In [None]:
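# ShuffleSplit: every split is an independent random shuffle, so the same sample
# may appear in the test sets of several splits (the test sets can overlap).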

In [13]:
from sklearn.model_selection import ShuffleSplit

# Five random splits, each holding out half of the samples; seeded for reproducibility.
cv = ShuffleSplit(n_splits=5, test_size=.5, random_state=0)

separator = "=" * 20

# Print the held-out feature rows of every split.
for train_index, test_index in cv.split(X):
    test_rows = X[test_index]
    print("test X:\n", test_rows)
    print(separator)

test X:
 [[360 370]
 [ 20  30]
 [380 390]
 [160 170]
 [200 210]
 [340 350]
 [120 130]
 [260 270]
 [ 80  90]
 [ 40  50]]
test X:
 [[220 230]
 [ 20  30]
 [360 370]
 [340 350]
 [ 40  50]
 [240 250]
 [380 390]
 [320 330]
 [200 210]
 [  0  10]]
test X:
 [[300 310]
 [260 270]
 [240 250]
 [100 110]
 [220 230]
 [ 40  50]
 [160 170]
 [120 130]
 [ 60  70]
 [340 350]]
test X:
 [[360 370]
 [  0  10]
 [260 270]
 [ 40  50]
 [ 60  70]
 [340 350]
 [140 150]
 [240 250]
 [280 290]
 [320 330]]
test X:
 [[140 150]
 [ 20  30]
 [ 40  50]
 [380 390]
 [100 110]
 [360 370]
 [160 170]
 [340 350]
 [300 310]
 [320 330]]


##### 교차평가시행 (cross_val_score()명령)

In [15]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Synthetic one-feature regression problem with Gaussian noise; random_state
# makes it reproducible. coef=True also returns the generating coefficient.
X, y, coef = make_regression(n_samples=1000, n_features=1, noise=20, coef=True, random_state=0)

model = LinearRegression()
cv = KFold(10)

# Size the score buffer from the splitter itself instead of repeating the
# literal fold count, so changing KFold(...) cannot leave the buffer too short
# (IndexError) or too long (trailing zeros that look like real scores).
scores = np.zeros(cv.get_n_splits(X))

# Manual cross-validation: fit on each training part, score R^2 on the held-out part.
for i, (train_index, test_index) in enumerate(cv.split(X)):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores[i] = r2_score(y_test, y_pred)

scores



array([ 0.95636425,  0.94908323,  0.93880683,  0.92906829,  0.93119768,
        0.95362566,  0.93217768,  0.94308775,  0.94579305,  0.94749884])

In [16]:
from sklearn.model_selection import cross_val_score

# Let scikit-learn run the fit/score loop: one R^2 value per split of `cv`.
fold_r2 = cross_val_score(estimator=model, X=X, y=y, cv=cv, scoring="r2")
fold_r2

array([ 0.95636425,  0.94908323,  0.93880683,  0.92906829,  0.93119768,
        0.95362566,  0.93217768,  0.94308775,  0.94579305,  0.94749884])