<font size=6><b>lec02. 모델 검증 (Validation)</b></font>
* ref : https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py
* ref : https://python-data-science.readthedocs.io/en/latest/evaluation.html    

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


# Apply seaborn's default theme first; the matplotlib overrides below are
# deliberately set afterwards so the theme does not replace them.
sns.set()

# ---- Chart settings: font able to render Korean labels, ASCII minus sign ----
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# ---- Jupyter: stretch the notebook container to the full page width ----
from IPython.display import display, HTML

display(HTML("<style>.container{width:100% !important;}</style>"))

# ---- pandas: show up to 100 rows/columns and never truncate cell text ----
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)

# NOTE: this silences every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics  import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [40]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, KFold, StratifiedKFold

In [4]:
from sklearn.datasets import load_iris

# Data Load

In [5]:
# Load the iris dataset shipped with scikit-learn and list the fields it exposes.
dataset = load_iris()
dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [6]:
# Class labels, in the order of their integer codes.
dataset.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [7]:
# Shape of the label vector followed by the raw integer labels.
dataset.target.shape, dataset.target

((150,),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))

In [8]:
# Full names of the four measurement columns.
dataset.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [9]:
# Shape of the feature matrix and a peek at its first five rows.
dataset.data.shape, dataset.data[:5]

((150, 4),
 array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]))

In [10]:
# Wrap the feature matrix in a DataFrame under short column aliases
# (sl/sw = sepal length/width, pl/pw = petal length/width; the full names are
# available as dataset['feature_names']) and append the class label as `target`.
df = pd.DataFrame(dataset['data'], columns=['sl', 'sw', 'pl', 'pw']).assign(
    target=dataset['target']
)
df.head(2)

Unnamed: 0,sl,sw,pl,pw,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [11]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sl      150 non-null    float64
 1   sw      150 non-null    float64
 2   pl      150 non-null    float64
 3   pw      150 non-null    float64
 4   target  150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [12]:
# Count the rows belonging to each class label.
df['target'].value_counts()

0    50
1    50
2    50
Name: target, dtype: int64

# 학습 & 평가

In [13]:
# Separate the feature columns from the label column.
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

# Shapes of the four resulting pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [14]:
# Fit a seeded decision tree on the training split.
model = DecisionTreeClassifier(random_state=11).fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
pred = model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=pred)
print(score)

0.9333333333333333


In [15]:
# Show which original row indices ended up in the train and test splits.
X_train.index, X_test.index

(Int64Index([  0, 120, 122,  49,  29, 118, 105,  77,  36,  83,
             ...
              92, 109,  24,  82,  71,  76,  13,  81,  91,  80],
            dtype='int64', length=120),
 Int64Index([112, 145, 133,  56, 111,   9,  65,  15,  30,  63, 119,  62,  84,
             113, 139,  41, 103,  66, 110, 142,  69,   2,  22,  51,  12,  11,
             149,  75,  16,  85],
            dtype='int64'))

In [16]:
# Compare the per-class row counts of the train and test labels.
y_train.value_counts(), y_test.value_counts()

(0    41
 1    40
 2    39
 Name: target, dtype: int64,
 2    11
 1    10
 0     9
 Name: target, dtype: int64)

# 교차검증(cross validation)

ref : https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py

<table border=1 width=800>
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_006.png">
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_009.png">
    </td>
</tr>
<tr>
    <td>
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_007.png">
        <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_013.png">
    </td>
</tr>     
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_008.png">
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_012.png">
    </td>
</tr>
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_010.png">
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_011.png">
    </td>
</tr>   
</table>


In [17]:
# Accuracy recorded for each validation approach:
# 0.9333333333333333  : baseline (single train/test split)
# 0.9133333333333333  : KFold
# 0.9333333333333333  : StratifiedKFold

In [None]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 11)
for i , 