# 분류분석
- 실제 분류와 예측 분류가 얼마나 일치하는가를 기반으로 알고리즘 성능 평가
## 정확도
- 실제 데이터에서 예측 데이터가 얼마나 같은지 판단하는 지표
- 데이터 구성에 따라 머신러닝 모델의 성능을 왜곡할 가능성 존재

In [85]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings 

In [86]:
warnings.filterwarnings('ignore')

In [87]:
iris_data = load_iris()

df = pd.DataFrame(iris_data['data'], columns=iris_data['feature_names'])

In [88]:
df['class'] = iris_data['target']
df.head(1)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   class              150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [90]:
# train, test 데이터 분할
x_train, x_test, y_train, y_test = train_test_split(
    df.drop('class', axis=1),
    df['class'],
    test_size=0.3,
    stratify=df['class'],
    random_state=42
)

In [91]:
# 분류분석 모델 : 의사결정나무
from sklearn.tree import DecisionTreeClassifier

In [92]:
dtree_7 = DecisionTreeClassifier(max_depth=7, random_state=42)
dtree_5 = DecisionTreeClassifier(max_depth=5, random_state=42)
dtree_3 = DecisionTreeClassifier(max_depth=3, random_state=42)
dtree_1 = DecisionTreeClassifier(max_depth=1, random_state=42)

In [93]:
type(dtree_1)

sklearn.tree._classes.DecisionTreeClassifier

In [94]:
# 교차검증
from sklearn.model_selection import cross_val_score

- 사용할 모델
- cv : 반복 횟수
- scoring : 확인할 평가지표

In [95]:
scores = cross_val_score(dtree_7, x_train, y_train, cv=10, scoring='accuracy')
print('교차검증의 정확도: ', np.round(scores, 3))
print('평균검증의 정확도: ', np.mean(scores))

교차검증의 정확도:  [1.    0.909 1.    0.909 1.    0.8   0.9   0.8   1.    1.   ]
평균검증의 정확도:  0.9318181818181819


In [96]:
scores = cross_val_score(dtree_5, x_train, y_train, cv=10, scoring='accuracy')
print('교차검증의 정확도: ', np.round(scores, 3))
print('평균검증의 정확도: ', np.mean(scores))

교차검증의 정확도:  [1.    0.909 1.    0.909 1.    0.8   0.9   0.8   1.    1.   ]
평균검증의 정확도:  0.9318181818181819


In [97]:
scores = cross_val_score(dtree_3, x_train, y_train, cv=10, scoring='accuracy')
print('교차검증의 정확도: ', np.round(scores, 3))
print('평균검증의 정확도: ', np.mean(scores))

교차검증의 정확도:  [1.    0.909 1.    0.909 1.    0.9   1.    0.8   1.    1.   ]
평균검증의 정확도:  0.9518181818181819


In [98]:
scores = cross_val_score(dtree_1, x_train, y_train, cv=10, scoring='accuracy')
print('교차검증의 정확도: ', np.round(scores, 3))
print('평균검증의 정확도: ', np.mean(scores))

교차검증의 정확도:  [0.636 0.636 0.636 0.636 0.636 0.7   0.7   0.7   0.7   0.7  ]
평균검증의 정확도:  0.6681818181818182


In [99]:
# 깊이가 3인 트리가 적합(정확도 최대)
# 학습
dtree_3.fit(x_train, y_train)

In [100]:
# 예측
pred = dtree_3.predict(x_test)

In [101]:
# 실제값 예측값 비교
pd.concat(
    [y_test.reset_index(drop=True), pd.Series(pred)],
    axis=1
)

Unnamed: 0,class,0
0,2,2
1,1,1
2,2,2
3,1,1
4,2,2
5,2,2
6,1,1
7,1,1
8,0,0
9,2,2


In [102]:
# 예측 정확도를 계산하는 함수 로드
from sklearn.metrics import accuracy_score

In [103]:
accuracy_score(y_test, pred)

0.9777777777777777

### 분류분석 연습
1. sklearn에 있는 datasets에서 wine 데이터를 불러온다
2. train, test로 데이터 분할 (75, 25)
3. 교차검증을 이용해 깊이가 1, 3, 5인 의사결정나무 정확도 평균을 구한다
4. 정확도가 높은 깊이를 선택해서 학습 -> 예측
5. 정확도 계산해서 출력
6. y_test와 pred의 값들은 데이터프레임으로 확인

In [104]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [105]:
wine_load = load_wine()

In [106]:
df = pd.DataFrame( wine_load['data'], columns=wine_load['feature_names'])

In [107]:
target_data = wine_load['target']
# df['class']가 target_data

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

In [109]:
# train, test 데이터 분할
x_train, x_test, y_train, y_test = train_test_split(
    df,
    target_data,
    test_size=0.25,
    stratify=target_data,
    random_state=42
)

In [110]:
dtree_5 = DecisionTreeClassifier(max_depth=5, random_state=42)
dtree_3 = DecisionTreeClassifier(max_depth=3, random_state=42)
dtree_1 = DecisionTreeClassifier(max_depth=1, random_state=42)

In [111]:
scores = cross_val_score(dtree_5, x_train, y_train, cv=10, scoring='accuracy')
print('교차검증의 정확도: ', np.round(scores, 3))
print('평균검증의 정확도: ', np.mean(scores))

교차검증의 정확도:  [1.    0.929 0.786 0.846 1.    0.769 0.923 0.923 0.923 0.769]
평균검증의 정확도:  0.8868131868131869


In [112]:
scores = cross_val_score(dtree_3, x_train, y_train, cv=10, scoring='accuracy')
print('교차검증의 정확도: ', np.round(scores, 3))
print('평균검증의 정확도: ', np.mean(scores))

교차검증의 정확도:  [1.    1.    0.786 0.769 1.    0.769 0.923 0.923 0.923 0.846]
평균검증의 정확도:  0.893956043956044


In [113]:
scores = cross_val_score(dtree_1, x_train, y_train, cv=10, scoring='accuracy')
print('교차검증의 정확도: ', np.round(scores, 3))
print('평균검증의 정확도: ', np.mean(scores))

교차검증의 정확도:  [0.714 0.643 0.571 0.538 0.769 0.538 0.692 0.615 0.615 0.538]
평균검증의 정확도:  0.6236263736263735


In [114]:
# 깊이가 3인 트리가 적합(정확도 최대)
# 학습
dtree_3.fit(x_train, y_train)

In [115]:
# 예측
pred = dtree_3.predict(x_test)

In [116]:
accuracy_score(y_test, pred)

0.9555555555555556

In [117]:
pred

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 0, 1, 1, 0, 0, 2, 2, 2,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 2, 1, 2, 2, 1, 0, 2, 1, 1, 2, 1, 0, 1,
       1])

In [118]:
y_test

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 0, 1, 1, 0, 0, 2, 2, 2,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 2, 1, 2, 2, 1, 0, 2, 2, 1, 2, 1, 0, 1,
       1])

In [119]:
# 실제값 예측값 비교
pd.concat(
    [
        pd.Series(y_test),
        pd.Series(pred)
    ],
    axis=1
)

Unnamed: 0,0,1
0,0,0
1,1,1
2,0,0
3,0,0
4,1,1
5,0,0
6,0,0
7,1,1
8,1,1
9,2,2


In [120]:
df['alcohol']

0      14.23
1      13.20
2      13.16
3      14.37
4      13.24
       ...  
173    13.71
174    13.40
175    13.27
176    13.17
177    14.13
Name: alcohol, Length: 178, dtype: float64

In [121]:
drinks = pd.read_csv('../../csv/drinks.csv')
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     170 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


In [125]:
drinks['continent'].dtype == 'O'

True

In [None]:
for col in drinks.columns:
    if drinks[col].dtype != 'O':		# 컬럼의 dtype이 object가 아닌 경우
        print(col)

beer_servings
spirit_servings
wine_servings
total_litres_of_pure_alcohol


In [141]:
for col in drinks.columns:
    if drinks[col].dtype in ['int64', 'float64']:		# 컬럼의 dtype이 int64이거나 float64인 경우
        print(col)

beer_servings
spirit_servings
wine_servings
total_litres_of_pure_alcohol


In [142]:
# 특정 타입(dtype)의 컬럼(columns)을 선택(select)하는 함수
# select_dtypes()
	# include 매개변수 : 포함시킬 데이터의 타입
	# exclude 매개변수 : 제외할 데이터의 타입

drinks.select_dtypes(include = ['int64', 'float64'])

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9
...,...,...,...,...
188,333,100,3,7.7
189,111,2,1,2.0
190,6,0,0,0.1
191,32,19,4,2.5


In [None]:
# 숫자 형태인 모든 타입(정수, 실수, 허수) 보여줌
drinks.select_dtypes(include=['number'])

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9
...,...,...,...,...
188,333,100,3,7.7
189,111,2,1,2.0
190,6,0,0,0.1
191,32,19,4,2.5
