In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve, roc_curve

# 이진 분류 모델의 성능지표
def get_eval_score(test_y, pred, pred_proba_c1= None):
    
    # 혼동행렬(오차행렬)
    confusion = confusion_matrix(test_y, pred)
    # 정밀도(precision)
    precision = precision_score(test_y, pred)
    # 정확도(accuracy_score)
    accuracy = accuracy_score(test_y, pred)
    # 재현율(recall)
    recall = recall_score(test_y, pred)
    # F1 score
    f1 = f1_score(test_y, pred)
    # G-measure -> 정밀도와 재현율의 기하평균 -> np.sqrt(recall_socre*precision_score)
    g = np.sqrt(recall_score(test_y, pred)*precision_score(test_y, pred))

    print(f'confusion matrix:\n{confusion}\n')
    print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}',end=' ')
    print(f'F1: {f1:.4f}, G: {g:.4f}')
    if pred_proba_c1 is not None:
        auc = roc_auc_score(test_y, pred_proba_c1)
        print(f'auc: {auc:.4f}')

def get_eval_score2(test_y, pred, pred_proba = None):
    
    # 혼동행렬(오차행렬)
    confusion = confusion_matrix(test_y, pred)
    # 정밀도(precision)
    precision = precision_score(test_y, pred, average='macro')
    # 정확도(accuracy_score)
    accuracy = accuracy_score(test_y, pred)
    # 재현율(recall)
    recall = recall_score(test_y, pred, average='macro')
    # F1 score
    f1 = f1_score(test_y, pred, average='macro')
    # G-measure -> 정밀도와 재현율의 기하평균 -> np.sqrt(recall_socre*precision_score)
    g = np.sqrt(recall * precision)

    print(f'confusion matrix:\n{confusion}\n')
    print(f'accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}',end=' ')
    print(f'F1: {f1:.4f}, G: {g:.4f}')
    if pred_proba is not None:
        auc = roc_auc_score(test_y, pred_proba, average='macro', multi_class='ovo')
        print(f'auc: {auc:.4f}')
    print()

In [4]:
# 이진 분류
# 결측치 처리
def fill_na(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    return df

# 레이블 인코딩
def encode_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    for ftr in ['Sex','Cabin','Embarked']: 
        encoder = LabelEncoder()
        encoder.fit(df[ftr])
        df[ftr] = encoder.transform(df[ftr])
    return df

# 원핫인코딩
def encode_features2(df):
    df['Cabin'] = df['Cabin'].str[:1]
    ftrs = ['Sex','Cabin','Embarked']
    dummy = pd.get_dummies(df[ftrs], dtype='int')
    df.drop(ftrs, axis=1,inplace=True)
    df = pd.concat([df, dummy], axis=1)
    return df
    
# 불필요 열 삭제
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# 전처리 함수 / 라벨인코딩 적용
def preprocessing_features(df):
    df = fill_na(df)
    df = drop_features(df)
    df = encode_features(df)
    return df

# 전처리 함수 / 원핫인코딩 적용
def preprocessing_features2(df):
    df = fill_na(df)
    df = drop_features(df)
    df = encode_features2(df)
    return df

-------------------------------------------------

### 연습문제.

1. 타이타닉 데이터셋에 대하여 다음의 예측기들을 사용한 softvoting과 hardvoting을 적용하여 학습하고 성능을 평가하시오.

- 원핫인코딩으로 진행
- KNN: 이웃의 수 =5
- 로지스틱회귀
- 결정트리

In [48]:
def voting(x,y,voting, estimators,n=5):
    # 모델 객체 생성 및 데이터 준비
    KNN = KNeighborsClassifier(n_neighbors=n)
    lr_clf = LogisticRegression(random_state=0)
    dt_clf = DecisionTreeClassifier(random_state=0)
    train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.2, random_state=156)
    vo_clf = VotingClassifier(estimators= estimators, voting=voting)
    classes = len(np.unique(y))
    # 학습 / 예측 / 평가
    vo_clf.fit(train_x, train_y)
    pred = vo_clf.predict(test_x)
    print(f'{voting}')
    if classes < 2:
        get_eval_score(test_y, pred)
    else:
        get_eval_score2(test_y, pred)
    print()
    print('-------------------\n')
    
    # 개별 모델 평가
    classifiers = [i[1] for i in estimators]
    for clf in classifiers:
        clf.fit(train_x, train_y)
        pred_y = clf.predict(test_x)
        if classes < 2:
            pred_proba = clf.predict_proba(test_x)[:,1]
            print(clf)
            get_eval_score(test_y, pred_y, pred_proba)
        else:
            pred_proba = clf.predict_proba(test_x)
            print(clf)
            get_eval_score2(test_y, pred_y, pred_proba)
        print('-------------------\n')

In [39]:
# 데이터 로딩 및 전처리, 분리
titanic = pd.read_csv('data/titanic/train.csv')
titanic_cl= preprocessing_features2(titanic)

x = titanic_cl.drop('Survived', axis=1)
y = titanic_cl.Survived

est = [('LR',lr_clf),('KNN', knn_clf),('DT',dt_clf)]

In [49]:
# voting = 'hard'
voting(x,y,voting='hard',estimators=est,n=5)

hard
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000


-------------------

LogisticRegression(random_state=0)
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000
auc: 1.0000

-------------------

KNeighborsClassifier()
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000
auc: 1.0000

-------------------

DecisionTreeClassifier(random_state=0)
confusion matrix:
[[ 9  0  0]
 [ 0  8  1]
 [ 0  0 12]]

accuracy: 0.9667, precision: 0.9744, recall: 0.9630 F1: 0.9671, G: 0.9686
auc: 0.9722

-------------------



In [50]:
# voting = 'soft'
voting(x,y,voting='soft',estimators=est, n=5)

soft
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000


-------------------

LogisticRegression(random_state=0)
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000
auc: 1.0000

-------------------

KNeighborsClassifier()
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000
auc: 1.0000

-------------------

DecisionTreeClassifier(random_state=0)
confusion matrix:
[[ 9  0  0]
 [ 0  8  1]
 [ 0  0 12]]

accuracy: 0.9667, precision: 0.9744, recall: 0.9630 F1: 0.9671, G: 0.9686
auc: 0.9722

-------------------



2. 붓꽃 데이터셋에 대하여 다음의 예측기들을 사용한 softvoting과 hardvoting을 적용하여 학습하고 성능을 평가하시오.

- KNN: 이웃의 수 = 8
- 로지스틱회귀
- 결정트리

In [53]:
iris = load_iris()

x = iris.data
y = iris.target

est = [('LR',lr_clf),('KNN', knn_clf),('DT',dt_clf)]

In [54]:
# voting = 'hard'
voting(x,y,voting='hard',estimators=est,n=8)

hard
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000


-------------------

LogisticRegression(random_state=0)
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000
auc: 1.0000

-------------------

KNeighborsClassifier()
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000
auc: 1.0000

-------------------

DecisionTreeClassifier(random_state=0)
confusion matrix:
[[ 9  0  0]
 [ 0  8  1]
 [ 0  0 12]]

accuracy: 0.9667, precision: 0.9744, recall: 0.9630 F1: 0.9671, G: 0.9686
auc: 0.9722

-------------------



In [52]:
# voting = 'soft'
voting(x,y,voting='soft',estimators=est,n=8)

soft
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000


-------------------

LogisticRegression(random_state=0)
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000
auc: 1.0000

-------------------

KNeighborsClassifier()
confusion matrix:
[[ 9  0  0]
 [ 0  9  0]
 [ 0  0 12]]

accuracy: 1.0000, precision: 1.0000, recall: 1.0000 F1: 1.0000, G: 1.0000
auc: 1.0000

-------------------

DecisionTreeClassifier(random_state=0)
confusion matrix:
[[ 9  0  0]
 [ 0  8  1]
 [ 0  0 12]]

accuracy: 0.9667, precision: 0.9744, recall: 0.9630 F1: 0.9671, G: 0.9686
auc: 0.9722

-------------------

