In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# [3월 2일]
---

## # 머신러닝 기초
---
- 사이킷런 활용
- 붓꽃 품종 예측

In [3]:
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [6]:
# 독립 변수는 label 제외 전부

iris_data = iris.data
iris_label = iris.target
iris_df = pd.DataFrame(data = iris_data, columns = iris.feature_names)
iris_df['label'] = iris.target
display(iris_df.head())
iris_df['label'].value_counts()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


0    50
1    50
2    50
Name: label, dtype: int64

### # 훈련 모델과 예측
---

In [21]:
# 학습, 검증 데이터 분리
# X = 독립 변수, y = 종속 변수

X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size = 0.2, random_state = 11)

In [164]:
# DT(Decision Tree) 객체 생성
dt_clf = DecisionTreeClassifier()

# 학습 수행
dt_clf.fit(X_train, y_train)

# 예측 수행
pred = dt_clf.predict(X_test)

# 예측 평가
print('예측 정확도 :', accuracy_score(y_test, pred))

예측 정확도 : 0.8


### # Model Selection 모듈
---

#### # 교차 검증
---
- K Fold
- Stratified K Fold
- cross_val_score

##### # KFold
---

In [331]:
from sklearn.model_selection import KFold

iris = load_iris()
features = iris.data
label = iris.target
dt_clf = DecisionTreeClassifier(random_state = 156)

# 5개의 폴드 세트를 분리
kfold = KFold(n_splits = 5)
cv_accuracy = []

n_iter = 0
for train_index, test_index in kfold.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]
    
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print('----------------------------------------------------------------------------')
    print(f'{n_iter}회차 / 정확도 : {accuracy},  학습용 데이터 : {train_size}, 검증용 데이터 : {test_size})')
    print(f'검증세트 인덱스 : \n{test_index}')
    print('----------------------------------------------------------------------------\n')
    cv_accuracy.append(accuracy)
    
print(f'<평균 검증 정확도> : {np.mean(cv_accuracy)}')

----------------------------------------------------------------------------
1회차 / 정확도 : 1.0,  학습용 데이터 : 120, 검증용 데이터 : 30)
검증세트 인덱스 : 
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
----------------------------------------------------------------------------

----------------------------------------------------------------------------
2회차 / 정확도 : 0.9667,  학습용 데이터 : 120, 검증용 데이터 : 30)
검증세트 인덱스 : 
[30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
 54 55 56 57 58 59]
----------------------------------------------------------------------------

----------------------------------------------------------------------------
3회차 / 정확도 : 0.8667,  학습용 데이터 : 120, 검증용 데이터 : 30)
검증세트 인덱스 : 
[60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
 84 85 86 87 88 89]
----------------------------------------------------------------------------

-------------------------------------------------------------------------

In [101]:
for i, j in kfold.split(features):
    display(i)
    print(features[[50, 51]])


array([ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
        63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
        76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
        89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
       102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
       115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
       128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
       141, 142, 143, 144, 145, 146, 147, 148, 149])

[[7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]]


array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49, 100, 101,
       102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
       115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
       128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
       141, 142, 143, 144, 145, 146, 147, 148, 149])

[[7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]]


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

[[7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]]


##### # KFold의 문제점
---
- 데이터가 편향될 수 있다.

In [120]:
kfold = KFold(n_splits = 3)
n_iter = 0
for train_index, test_index in kfold.split(iris_df):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print('------------------------------')
    print(f'<{n_iter}회차 교차검증>\n')
    print(f'학습 레이블 데이터 분포 : \n{label_train.value_counts()}', '\n')
    print(f'검증 레이블 데이터 분포 : \n{label_test.value_counts()}')
    print('-------------------------------\n')

------------------------------
<1회차 교차검증>

학습 레이블 데이터 분포 : 
1    50
2    50
Name: label, dtype: int64 

검증 레이블 데이터 분포 : 
0    50
Name: label, dtype: int64
-------------------------------

------------------------------
<2회차 교차검증>

학습 레이블 데이터 분포 : 
0    50
2    50
Name: label, dtype: int64 

검증 레이블 데이터 분포 : 
1    50
Name: label, dtype: int64
-------------------------------

------------------------------
<3회차 교차검증>

학습 레이블 데이터 분포 : 
0    50
1    50
Name: label, dtype: int64 

검증 레이블 데이터 분포 : 
2    50
Name: label, dtype: int64
-------------------------------



##### # Stratified K Fold
---
- 원본 데이터의 레이블 분포를 먼저 고려한 뒤 분포와 동일한 비율로 학습과 검증 데이터 세트를 분배해준다.

In [135]:
from sklearn.model_selection import StratifiedKFold

iris_df_id = iris_df.drop(['label'], axis = 1)
skf = StratifiedKFold(n_splits = 3)
n_iter = 0

# for train_index, test_index in skf.split(iris_df_id, iris_df['label']):
for train_index, test_index in skf.split(iris_df, iris_df['label']): # 종속 변수를 적으면 독립 변수를 알아서 처리해준다.
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print('------------------------------')
    print(f'<{n_iter}회차 교차검증>\n')
    print(f'학습 레이블 데이터 분포 : \n{label_train.value_counts()}', '\n')
    print(f'검증 레이블 데이터 분포 : \n{label_test.value_counts()}')
    print('-------------------------------\n')

------------------------------
<1회차 교차검증>

학습 레이블 데이터 분포 : 
2    34
0    33
1    33
Name: label, dtype: int64 

검증 레이블 데이터 분포 : 
0    17
1    17
2    16
Name: label, dtype: int64
-------------------------------

------------------------------
<2회차 교차검증>

학습 레이블 데이터 분포 : 
1    34
0    33
2    33
Name: label, dtype: int64 

검증 레이블 데이터 분포 : 
0    17
2    17
1    16
Name: label, dtype: int64
-------------------------------

------------------------------
<3회차 교차검증>

학습 레이블 데이터 분포 : 
0    34
1    33
2    33
Name: label, dtype: int64 

검증 레이블 데이터 분포 : 
1    17
2    17
0    16
Name: label, dtype: int64
-------------------------------



##### # cross_val_score()
---
- 교차 검증을 보다 간편하게 수행
- 내부적으로 Straitied K Fold를 적용

In [38]:
from sklearn.model_selection import cross_val_score, cross_validate

iris_data = load_iris()
dt_clf = DecisionTreeClassifier(random_state = 11)

data = iris_data.data
label = iris_data.target

scores = cross_val_score(dt_clf, data, label, scoring = 'accuracy', cv = 3)
scores

array([0.98, 0.92, 0.98])

#### # GridSearchCV
---

In [183]:
from sklearn.model_selection import GridSearchCV

iris_data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, 
                                                    test_size = 0.2, random_state = 121)

dtree = DecisionTreeClassifier()
parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}
grid_dtree = GridSearchCV(dtree, param_grid = parameters, cv = 3, refit = True)

grid_dtree.fit(X_train, y_train)
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1


In [185]:
# 훈련 데이터의 결과

print('GridSearchCV 최적 파라미터 :', grid_dtree.best_params_)
print(f'GridSearchCV 최고 정확도 : {grid_dtree.best_score_}') 

GridSearchCV 최적 파라미터 : {'max_depth': 3, 'min_samples_split': 2}
GridSearchCV 최고 정확도 : 0.975


In [189]:
# 가장 정확도가 높은 훈련 데이터를 사용하여 검증을 실행

estimator = grid_dtree.best_estimator_
pred = estimator.predict(X_test)
print(f'테스트 데이터 정확도 : {accuracy_score(y_test, pred)}')

테스트 데이터 정확도 : 0.9666666666666667


### # 데이터 인코딩
---

In [269]:
# 레이블 인코더 : 숫자에 민감하지 않은 데이터에 사용

from sklearn.preprocessing import LabelEncoder

items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '믹서', '믹서']
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
print(labels)

# LabelEncoder().fit(items).classes_
# encoder.fit(items).classes_
print(encoder.classes_)

[0 1 4 5 3 2 2]
['TV' '냉장고' '믹서' '선풍기' '전자렌지' '컴퓨터']


In [276]:
# 원핫 인코더 : 숫자에 민감한 데이터에 사용
# 희소행렬

from sklearn.preprocessing import OneHotEncoder

items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '믹서', '믹서']

encoder = LabelEncoder()
labels = encoder.fit_transform(items).reshape(-1, 1)
print(labels, '\n')

oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels)
print(oh_labels.toarray(), '\n')
print(oh_encoder.get_feature_names_out())

df = pd.DataFrame({'item': items})
pd.get_dummies(df)

[[0]
 [1]
 [4]
 [5]
 [3]
 [2]
 [2]] 

[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]] 

['x0_0' 'x0_1' 'x0_2' 'x0_3' 'x0_4' 'x0_5']


Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자렌지,item_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,1,0,0,0
6,0,0,1,0,0,0


### # 피처 스케일링과 정규화
---

#### # StandardScaler
---

In [290]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [310]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

iris_scaled = scaler.fit_transform(iris_df)
iris_scaled_df = pd.DataFrame(data = iris_scaled, columns = iris_df.columns)
display(iris_scaled_df.head())
print(f'평균 : \n{iris_scaled_df.mean()}', '\n')
print(f'분산 : \n{iris_scaled_df.var()}')
print(f'최대값 : \n{iris_scaled_df.max()}', '\n')
print(f'최소값 : \n{iris_scaled_df.min()}', '\n')

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,-0.900681,1.019004,-1.340227,-1.315444,-1.224745
1,-1.143017,-0.131979,-1.340227,-1.315444,-1.224745
2,-1.385353,0.328414,-1.397064,-1.315444,-1.224745
3,-1.506521,0.098217,-1.283389,-1.315444,-1.224745
4,-1.021849,1.249201,-1.340227,-1.315444,-1.224745


평균 : 
sepal length (cm)   -4.736952e-16
sepal width (cm)    -7.815970e-16
petal length (cm)   -4.263256e-16
petal width (cm)    -4.736952e-16
label               -1.421085e-16
dtype: float64 

분산 : 
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
label                1.006711
dtype: float64
최대값 : 
sepal length (cm)    2.492019
sepal width (cm)     3.090775
petal length (cm)    1.785832
petal width (cm)     1.712096
label                1.224745
dtype: float64 

최소값 : 
sepal length (cm)   -1.870024
sepal width (cm)    -2.433947
petal length (cm)   -1.567576
petal width (cm)    -1.447076
label               -1.224745
dtype: float64 



#### # MinMaxScaler
---

In [321]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

iris_scaled = scaler.fit_transform(iris_df)
iris_scaled_df = pd.DataFrame(data = iris_scaled, columns = iris_df.columns)
display(iris_scaled_df.head())
print(f'평균 : \n{iris_scaled_df.mean()}', '\n')
print(f'분산 : \n{iris_scaled_df.var()}', '\n')
print(f'최대값 : \n{iris_scaled_df.max()}', '\n')
print(f'최소값 : \n{iris_scaled_df.min()}', '\n')

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,0.222222,0.625,0.067797,0.041667,0.0
1,0.166667,0.416667,0.067797,0.041667,0.0
2,0.111111,0.5,0.050847,0.041667,0.0
3,0.083333,0.458333,0.084746,0.041667,0.0
4,0.194444,0.666667,0.067797,0.041667,0.0


평균 : 
sepal length (cm)    0.428704
sepal width (cm)     0.440556
petal length (cm)    0.467458
petal width (cm)     0.458056
label                0.500000
dtype: float64 

분산 : 
sepal length (cm)    0.052908
sepal width (cm)     0.032983
petal length (cm)    0.089522
petal width (cm)     0.100869
label                0.167785
dtype: float64 

최대값 : 
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
label                1.0
dtype: float64 

최소값 : 
sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
label                0.0
dtype: float64 



In [326]:
# scaler.fit_transform()
iris_df['label']

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: label, Length: 150, dtype: int32