# 교차검증

In [2]:
# 테스트 세트를 사용하지 않고 모델이 과적합인지 과소적합인지 즉 과적합을 판단방법

In [4]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/bigdataleeky/python/main/wine.csv")
df.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [5]:
x = df.iloc[:,:-1]
x.head()

Unnamed: 0,alcohol,sugar,pH
0,9.4,1.9,3.51
1,9.8,2.6,3.2
2,9.8,2.3,3.26
3,9.8,1.9,3.16
4,9.4,1.9,3.51


In [8]:
y = df.iloc[:,-1]
y = y.astype(int)
y.value_counts()

1    4898
0    1599
Name: class, dtype: int64

In [11]:
# 최종테스트 데이터
from sklearn.model_selection import train_test_split
x_train,x_target,y_train,y_target = train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape, y_train.shape

((5197, 3), (5197,))

In [10]:
# 훈련용데이터를 - 훈련용 + 검증용  8:2
x_train2,x_target2,y_train2,y_target2 =  train_test_split(x_train,y_train,test_size=0.2,random_state=42)

In [12]:
x_train2.shape, y_train2.shape

((4157, 3), (4157,))

In [13]:
from sklearn.tree import DecisionTreeClassifier
dc =  DecisionTreeClassifier(random_state=42)
dc.fit(x_train2, y_train2)
dc.score(x_train2, y_train2), dc.score(x_target2, y_target2)

(0.9971133028626413, 0.864423076923077)

In [14]:
# GridSearchCV --- > 하이퍼 파라메터 튜닝....  model_selection
from sklearn.model_selection import cross_validate
score =  cross_validate(dc,x_train, y_train)
score

{'fit_time': array([0.00698042, 0.00698304, 0.0069809 , 0.00600886, 0.0059824 ]),
 'score_time': array([0.0009973 , 0.00199366, 0.00199485, 0.0010016 , 0.00099802]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [17]:
score['test_score']

array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])

In [18]:
score['test_score'].mean()

0.855300214703487

In [20]:
import numpy as np
np.mean(score['test_score'])

0.855300214703487

# 교차검증의 장점
## 모든데이터 셋을 훈련과 평가에 활용
## 정확도를 향상
## 데이터가 부족할때 이로인한 과적합
## 데이터 편중을 막을수 있다
## 좀더 일반화된 모델을

# 단점
## 시간이 과다하게 소요된다.

In [24]:
# Kfold 방식되 거의 유사 
# 분류기를 통한 교차검증
from sklearn.model_selection import StratifiedKFold
score =  cross_validate(dc,x_train, y_train, cv=StratifiedKFold() )
score['test_score'].mean()

0.855300214703487

In [27]:
dc.feature_names_in_, dc.feature_importances_

(array(['alcohol', 'sugar', 'pH'], dtype=object),
 array([0.23614177, 0.50084785, 0.26301038]))

In [33]:
dc.feature_importances_

array([0.23614177, 0.50084785, 0.26301038])

In [35]:
np.sort(dc.feature_importances_)[::-1]

array([0.50084785, 0.26301038, 0.23614177])

In [36]:
# 결정트리의 사용 이유 : 정확한 분류..
# 부가적인 기능 : 피어의 중요도를 알수 있다..
# 만약에 피처가 너무 많다면.... feature_importances_의 값을 내람차순으로 정렬한후 상위 몇개의 feature만 추출해서
# 원하는 머신러닝에 적용

In [41]:
# 교차검증으로 과적합을 피할수 있다면..... 성능을 높이기 위해서 하이퍼 파라메터 튜닝을통해 성능 확인
dc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [46]:
p ={
    'min_impurity_decrease':[0.0001,0.001,0.01,0.1,1,10,100]
}

In [47]:
from sklearn.model_selection import GridSearchCV

In [50]:
gs = GridSearchCV(dc, param_grid=p)

In [51]:
gs.fit(x_train,y_train)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'min_impurity_decrease': [0.0001, 0.001, 0.01, 0.1, 1,
                                                   10, 100]})

In [53]:
gs.best_params_

{'min_impurity_decrease': 0.0001}

In [54]:
model = gs.best_estimator_

In [55]:
model.score(x_target2,y_target2)

0.9673076923076923

In [57]:
model.score(x_train2, y_train2), model.score(x_target2, y_target2)

(0.9600673562665384, 0.9673076923076923)

In [63]:
params = {
    'min_impurity_decrease' : np.arange(0.0001, 0.1,0.0001),
    'max_depth' : np.arange(1,10),
    'min_samples_leaf': np.arange(1,10),
    'min_samples_split': np.arange(1,10)
}

In [None]:
gs = GridSearchCV(dc, param_grid=params)
gs.fit(x_train,y_train)
gs.best_params_