<font size=6><b>lec02. 모델 검증 (Validation)
* ref : https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py
* ref : https://python-data-science.readthedocs.io/en/latest/evaluation.html    

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


sns.set()

#-------------------- 차트 관련 속성 (한글처리, 그리드) -----------
plt.rcParams['font.family']= 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

#-------------------- 주피터 , 출력결과 넓이 늘리기 ---------------
# from IPython.core.display import display, HTML
from IPython.display import display, HTML

display(HTML("<style>.container{width:100% !important;}</style>"))
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
from sklearn.metrics  import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [40]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, KFold, StratifiedKFold

In [4]:
from sklearn.datasets import load_iris

# Data Load

In [5]:
dataset = load_iris()
dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [6]:
dataset['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [7]:
dataset['target'].shape, dataset['target']

((150,),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))

In [8]:
dataset['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [9]:
dataset['data'].shape, dataset['data'][:5]

((150, 4),
 array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]))

In [10]:
df = pd.DataFrame(dataset['data'], 
                  columns = ['sl','sw','pl','pw']
                  #columns = dataset['feature_names']
                 )
df['target'] = dataset['target']
df.head(2)

Unnamed: 0,sl,sw,pl,pw,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sl      150 non-null    float64
 1   sw      150 non-null    float64
 2   pl      150 non-null    float64
 3   pw      150 non-null    float64
 4   target  150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [12]:
df['target'].value_counts()

0    50
1    50
2    50
Name: target, dtype: int64

# 학습 & 평가

In [13]:
y = df['target']
X = df.drop('target', axis=1)
X_train, X_test, y_train, y_test  = train_test_split(X,y, test_size=0.2, random_state=11)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [14]:
model = DecisionTreeClassifier(random_state=11)
model.fit(X_train, y_train)
pred = model.predict(X_test)
score = accuracy_score(y_test, pred)
print(score)

0.9333333333333333


In [15]:
X_train.index, X_test.index

(Int64Index([  0, 120, 122,  49,  29, 118, 105,  77,  36,  83,
             ...
              92, 109,  24,  82,  71,  76,  13,  81,  91,  80],
            dtype='int64', length=120),
 Int64Index([112, 145, 133,  56, 111,   9,  65,  15,  30,  63, 119,  62,  84,
             113, 139,  41, 103,  66, 110, 142,  69,   2,  22,  51,  12,  11,
             149,  75,  16,  85],
            dtype='int64'))

In [16]:
y_train.value_counts(), y_test.value_counts()

(0    41
 1    40
 2    39
 Name: target, dtype: int64,
 2    11
 1    10
 0     9
 Name: target, dtype: int64)

# 교차검증(cross validation)

ref : https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py

<table border=1 width=800>
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_006.png">
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_009.png">
    </td>
</tr>
<tr>
    <td>
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_007.png">
        <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_013.png">
    </td>
</tr>     
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_008.png">
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_012.png">
    </td>
</tr>
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_010.png">
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_011.png">
    </td>
</tr>   
</table>


In [17]:
# 0.9333333333333333  : 기본
# 0.9133333333333333  : KFold
# 0.9333333333333333  : StratifiedKFold

## KFold()
* <font color=red>회귀모델
<pre>
class sklearn.model_selection.KFold(n_splits=5, 
                                    shuffle=False, random_state=None)


In [18]:
# age = 20
# gen = "남"
# print("나이", 20 , "세" , "성별", gen)
# print(f"나이:{age}세  성별:{gen}")

* y = df['target']
* X = df.drop('target', axis=1)

In [19]:
score_list = []

kf = KFold(n_splits=5, shuffle=True, random_state=11)
for i,  (tridx, teidx) in enumerate(kf.split(X)):
    print(f"Fold {i} : ", end=" ")
    
    # X_train, X_test, y_train, y_test  = train_test_split(X,y, test_size=0.2, random_state=11)
    X_train = X.loc[tridx] 
    y_train = y.loc[tridx] 
    X_test  = X.loc[teidx]  
    y_test  = y.loc[teidx]  
    
#     X_train, y_train  = X.loc[tridx], y.loc[tridx] 
#     X_test,  y_test   = X.loc[teidx] , y.loc[teidx] 
# -------------------------------------------------------
#     X_train, X_test = X.loc[tridx],  X.loc[teidx]  
#     y_train, y_test = y.loc[tridx],  y.loc[teidx]
    
    #print(f"  Train index={tridx}")
    # print(f"  Test  index={teidx}")
    # print(df.loc[teidx, 'target'].value_counts())
    
    model = DecisionTreeClassifier(random_state=11)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    score = accuracy_score(y_test, pred)
    print(score)
    score_list.append(score)
    print("--"*20)
print( score_list )
print(np.mean(score_list))    

Fold 0 :  0.9333333333333333
----------------------------------------
Fold 1 :  0.9
----------------------------------------
Fold 2 :  1.0
----------------------------------------
Fold 3 :  0.9666666666666667
----------------------------------------
Fold 4 :  0.9333333333333333
----------------------------------------
[0.9333333333333333, 0.9, 1.0, 0.9666666666666667, 0.9333333333333333]
0.9466666666666667


## StratifiedKFold()
* <font color=red>분류모델
<pre>
class sklearn.model_selection.StratifiedKFold(n_splits=5, 
                                              shuffle=False, random_state=None)


In [20]:
# skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
# for i,  (train_index, test_index) in enumerate(skf.split(X, y)):
#     print(f"Fold {i}")
#     #rint(f"  Train index={train_index}")
#     print(f"  Test  index={test_index}")
#     print(df.loc[test_index, 'target'].value_counts())
#     print("----"*20)

In [21]:
score_list = []

# kf = KFold(n_splits=5, shuffle=True, random_state=11)
# for i,  (tridx, teidx) in enumerate(kf.split(X)):
    
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)
for i,  (tridx, teidx) in enumerate(kf.split(X,y)):
    
    print(f"Fold {i} : ", end=" ")
    
    # X_train, X_test, y_train, y_test  = train_test_split(X,y, test_size=0.2, random_state=11)
    X_train = X.loc[tridx] 
    y_train = y.loc[tridx] 
    X_test  = X.loc[teidx]  
    y_test  = y.loc[teidx]  
    
#     X_train, y_train  = X.loc[tridx], y.loc[tridx] 
#     X_test,  y_test   = X.loc[teidx] , y.loc[teidx] 
# -------------------------------------------------------
#     X_train, X_test = X.loc[tridx],  X.loc[teidx]  
#     y_train, y_test = y.loc[tridx],  y.loc[teidx]
    
    #print(f"  Train index={tridx}")
    # print(f"  Test  index={teidx}")
    # print(df.loc[teidx, 'target'].value_counts())
    
    model = DecisionTreeClassifier(random_state=11)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    score = accuracy_score(y_test, pred)
    print(score)
    score_list.append(score)
    print("--"*20)
print(np.mean(score_list))    

Fold 0 :  0.9333333333333333
----------------------------------------
Fold 1 :  0.9666666666666667
----------------------------------------
Fold 2 :  0.8666666666666667
----------------------------------------
Fold 3 :  0.9666666666666667
----------------------------------------
Fold 4 :  0.9666666666666667
----------------------------------------
0.9400000000000001


## cross_val_score()
* <font color=red> test_score만 리스트로 준다
     
<pre>
<b>sklearn.model_selection.cross_val_score(estimator, <font color=red>X, y=None, </font>
scoring=None, <font color=red>cv=None</font>, fit_params=None</b>
,n_jobs=None, verbose=0, groups=None, , pre_dispatch='2*n_jobs', error_score=nan)


In [22]:
model = DecisionTreeClassifier(random_state=11)
score_list = cross_val_score(model, X, y, scoring='accuracy' , cv=5)
print( score_list )
print( np.mean(score_list) )

[0.96666667 0.96666667 0.9        0.96666667 1.        ]
0.9600000000000002


In [23]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)
model = DecisionTreeClassifier(random_state=11)
score_list = cross_val_score(model, X, y, scoring='accuracy' , cv=kf)
print( score_list )
print( np.mean(score_list) )

[0.93333333 0.96666667 0.86666667 0.96666667 0.96666667]
0.9400000000000001


## cross_validate()
* <font color=red> test_score 이외 딕셔너리로 준다

    * 점수 : train_score, test_score 
    * 시간 : fit_time, score_time

In [45]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)
model = DecisionTreeClassifier(random_state=11)
# score_list = cross_val_score(model, X, y, scoring='accuracy' , cv=kf)
score_dict = cross_validate(model, X, y, scoring='accuracy' , cv=kf, return_train_score=True)
print( score_dict )
print( np.mean(score_dict['test_score']) )

{'fit_time': array([0.0029912 , 0.00299263, 0.00299287, 0.00299048, 0.0029912 ]), 'score_time': array([0.00099754, 0.00099826, 0.00299144, 0.00299239, 0.00099707]), 'test_score': array([0.93333333, 0.96666667, 0.86666667, 0.96666667, 0.96666667]), 'train_score': array([1., 1., 1., 1., 1.])}
0.9400000000000001


## GridSearchCV()
* <font color=red> S.KFold() + 모델 하이퍼파라미터 튜닝
    
<pre>
<b>class sklearn.model_selection.GridSearchCV(estimator,   
scoring=None,  <font color=red>cv=None</font>, param_grid,        refit=True, </b>
n_jobs=None, verbose=0, pre_dispatch='2*n_jobs', 
error_score=nan, return_train_score=False)


*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0

In [27]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)
model = DecisionTreeClassifier(random_state=11)  #min_samples_split=2, min_samples_leaf=1,
mydic = {"min_samples_split":[1,2,3], 
         "min_samples_leaf":[1,2,3]}
models = GridSearchCV(model, scoring="accuracy", cv=kf ,  param_grid = mydic)
models.fit(X_train, y_train)

In [31]:
# ss = pd.DataFrame(models.cv_results_)
# ss

In [32]:
models.best_estimator_

In [33]:
models.best_score_

0.9666666666666668

In [34]:
models.best_params_

{'min_samples_leaf': 3, 'min_samples_split': 1}

In [38]:
# model = DecisionTreeClassifier(random_state=11, 
#                                min_samples_leaf=3, 
#                                min_samples_split=1)
# pred = models.predict(testcsv) --> 공모전문제
# pred답안제출

## 그외 (LOOCV)