# 분류기 만들기  
   
타이타닉 데이터의 생존여부 분류   
- 규칙 : 성별(Sex) = 1 생존하지 않은 것으로 분류

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
titanic_df = pd.read_csv('./data/titanic.csv')
# df.head()
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis = 1)

전처리

In [4]:
from sklearn.preprocessing import LabelEncoder

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행 함수
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df) 
    df = drop_features(df)
    df = format_features(df)
    return df

In [5]:
X_titanic_df = transform_features(X_titanic_df)

# 데이터셋 분할
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df
                                                    , y_titanic_df
                                                    , test_size=0.2
                                                    , random_state=0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [6]:
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    def predict(self, X):
        pred = np.zeros((X.shape[0],1))
        for i in range(X.shape[0]):
            if X['Sex'].iloc[i] == 1:
                pred[i] = 0
            else:
                pred[i] = 1
        return pred

In [7]:
my_clf = MyDummyClassifier()
my_clf.fit(X_train, y_train)
my_pred = my_clf.predict(X_test)
acc = accuracy_score(y_test, my_pred)
print(f'정확도 : {acc:.5f}')

정확도 : 0.78771


# 혼동 행렬

In [8]:
# 혼동 행렬
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, my_pred)

array([[92, 18],
       [20, 49]])

In [9]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, my_pred), recall_score(y_test, my_pred)

(np.float64(0.7313432835820896), np.float64(0.7101449275362319))

# 로지스틱회귀, 랜덤포레스트, KNN의 정밀도, 재연률 비교    
정확도(accuracy) : 전체 중 맞춘 비율   
정밀도(precision) : 모델이 Positive로 예측한 값 중 실제로 Positive인 비율, 양성 예측도   
재연률(recall) : 실제 값이 Positive인 데이터 중 실제 값이 Positive로 일치한 데이터의 비율, 즉 실제 Positive 중 모델이 맞춘 비율로 “민감도 Sensitivity” 또는 TPR(True Positive Rate)라고 한다.

In [10]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred) # 혼동행렬
    accuracy = accuracy_score(y_test, pred) # 정확도
    precision = precision_score(y_test, pred) # 정밀도
    recall = recall_score(y_test, pred) # 재현율

    print(confusion)
    print('*'*20)
    print(accuracy, precision, recall)

In [11]:
# 로지스틱회귀 분류모델 생성
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=2000)
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

# 정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[92 18]
 [16 53]]
********************
0.8100558659217877 0.7464788732394366 0.7681159420289855


In [12]:
# 로지스틱회귀 분류모델 생성
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=2000)
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

# 정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[92 18]
 [16 53]]
********************
0.8100558659217877 0.7464788732394366 0.7681159420289855


In [13]:
# 랜덤포레스트 분류모델 생성
from sklearn.ensemble import RandomForestClassifier

RF_clf = RandomForestClassifier()
RF_clf.fit(X_train, y_train)
pred = RF_clf.predict(X_test)

# 정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[100  10]
 [ 21  48]]
********************
0.8268156424581006 0.8275862068965517 0.6956521739130435


In [14]:
# KNN 분류모델 생성
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
pred = knn_clf.predict(X_test)

# 정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[94 16]
 [31 38]]
********************
0.7374301675977654 0.7037037037037037 0.5507246376811594


In [37]:
pred_proba = lr_clf.predict_proba(X_test)
pos_proba = pred_proba[:,1] # 양성클래스일 확률

threshold = 0.4 # 임계치
custom_proba = (pos_proba >= threshold).astype(int) # 임계치보다 크면 1
custom_proba

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0])

In [26]:
pred_proba

array([[0.85354405, 0.14645595],
       [0.89028109, 0.10971891],
       [0.92550402, 0.07449598],
       [0.0579584 , 0.9420416 ],
       [0.32222991, 0.67777009],
       [0.49175727, 0.50824273],
       [0.08753172, 0.91246828],
       [0.06751399, 0.93248601],
       [0.41957946, 0.58042054],
       [0.30545737, 0.69454263],
       [0.90914196, 0.09085804],
       [0.27527566, 0.72472434],
       [0.87931998, 0.12068002],
       [0.09297023, 0.90702977],
       [0.03697431, 0.96302569],
       [0.23659886, 0.76340114],
       [0.85876081, 0.14123919],
       [0.7547604 , 0.2452396 ],
       [0.91048857, 0.08951143],
       [0.6404966 , 0.3595034 ],
       [0.66523293, 0.33476707],
       [0.05638662, 0.94361338],
       [0.87932168, 0.12067832],
       [0.56468347, 0.43531653],
       [0.30323849, 0.69676151],
       [0.10937889, 0.89062111],
       [0.89948781, 0.10051219],
       [0.30380618, 0.69619382],
       [0.17264096, 0.82735904],
       [0.380422  , 0.619578  ],
       [0.

In [16]:
confusion_matrix(y_test, custom_proba)
get_clf_eval(y_test, custom_proba)

[[86 24]
 [13 56]]
********************
0.7932960893854749 0.7 0.8115942028985508


# 정밀도와 재현율의 변화   
   
정미도와 재현율의 불균형이 심할 때,   
혹은 비지니스의 요구사항이 있을 때   
임계치를 조정해야 한다.    
     
임계치를 낮추면, 정밀도는 낮아지고, 재현율은 올라간다.

## 평가 결과 확인하기


In [17]:
from sklearn.metrics import f1_score, classification_report
f1_score(y_test, pred) #정밀도와 재현율의 평균

np.float64(0.6178861788617886)

In [18]:
print(classification_report(y_test, pred))  #평가보고서

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       110
           1       0.70      0.55      0.62        69

    accuracy                           0.74       179
   macro avg       0.73      0.70      0.71       179
weighted avg       0.73      0.74      0.73       179



In [21]:
import pandas as pd
pd.Series(lr_clf.coef_[0]).sort_values() # 피처의 중요도는 계수

1   -2.593416
0   -0.901628
3   -0.368137
7   -0.107352
4   -0.059052
6   -0.058762
2   -0.042756
5    0.001286
dtype: float64