# 분류기 만들기

In [30]:
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## 타이타닉 데이터의 생존여부 분류
- 규칙 : 성별(sex) = 1 생존하지 않은 것으로 분류

In [3]:
from sklearn.base import BaseEstimator
import numpy as np
class MyDummyClassifier(BaseEstimator):
    def fit(self, x, y):
        pass
    
    def predict(self, x):
        pred = np.zeros((x.shape[0],1))
        for i in range(x.shape[0]):
            if x['Sex'].iloc[i] == 1:
                pred[i]=0
            else :
                pred[i]=1
        return pred
    

## 타이타닉 데이터 가져오기

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
titanic_df = pd.read_csv('./data/titanic.csv')
y_titanic_df = titanic_df['Survived']
x_titanic_df = titanic_df.drop('Survived', axis=1)

In [5]:
from sklearn.preprocessing import LabelEncoder

#Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

#머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

#레이블 인코딩 수행 함수
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

#앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df) 
    df = drop_features(df)
    df = format_features(df)
    return df

In [6]:
x_titanic_df = transform_features(x_titanic_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [7]:
# 데이터셋 분할
x_train, x_test, y_train, y_test = train_test_split(x_titanic_df,
                                                    y_titanic_df,
                                                    test_size=0.2,
                                                    random_state=0)

In [8]:
myclf = MyDummyClassifier()
myclf.fit(x_train,y_train)

In [9]:
my_pred = myclf.predict(x_test)
accuracy_score(y_test, my_pred)

0.7877094972067039

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, my_pred)

array([[92, 18],
       [20, 49]])

In [11]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, my_pred), recall_score(y_test,my_pred)

(np.float64(0.7313432835820896), np.float64(0.7101449275362319))

# 로지스틱회귀, 랜덤포레스트, KNN의 정밀도, 재현율 비교하기

In [12]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(confusion)
    print('*'*20)
    print(accuracy, precision, recall)

In [13]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=2000)
lr_clf.fit(x_train,y_train)
pred = lr_clf.predict(x_test)

#정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[92 18]
 [16 53]]
********************
0.8100558659217877 0.7464788732394366 0.7681159420289855


In [14]:
pred_proba = lr_clf.predict_proba(x_test)
pos_proba = pred_proba[:,1] # 양성클래스일 확률

threshold = 0.4 #임계치
custom_proba = (pos_proba>=threshold).astype(int) # 임계치보다 크면 1
confusion_matrix(y_test, custom_proba)
get_clf_eval(y_test, custom_proba)

[[86 24]
 [13 56]]
********************
0.7932960893854749 0.7 0.8115942028985508


# 정밀도와 재현율의 변화
정밀도와 재현율의 불균형이 심할 때,
혹은 비즈니스의 요구사항이 있을 때
임계치를 조정해야한다.

임계치를 낮추면, 정밀도는 낮아지고, 재현율은 올라간다.

In [15]:
from sklearn.metrics import f1_score,classification_report
f1_score(y_test, pred)

np.float64(0.7571428571428571)

In [16]:
print(classification_report(y_test,pred))   # 평가보고서

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       110
           1       0.75      0.77      0.76        69

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [17]:
pd.Series(lr_clf.coef_[0]).sort_values()

1   -2.593416
0   -0.901628
3   -0.368137
7   -0.107352
4   -0.059052
6   -0.058762
2   -0.042756
5    0.001286
dtype: float64