In [2]:
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator): #BaseEstimator 클래스의 상속으로 커스텀 Estimator를 제작할 수 있음
    
    def fit(self, X, y=None):
        pass
    
    def predict(self, X):
        pred = np.zeros((X.shape[0],1))
        for i in range (X.shape[0]):
            if X['Sex'].iloc[i] == 1 :
                pred[i] = 0
            else :
                pred[i] = 1
        return pred

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N',inplace=True)
    df['Embarked'].fillna('N',inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin','Sex','Embarked']
    for feature in features:
        encoder = LabelEncoder()
        encoder = encoder.fit(df[feature])
        df[feature] = encoder.transform(df[feature])
     
    return df

def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

titanic_df = pd.read_csv('../../data/titanic/train.csv')
y = titanic_df['Survived']
X = titanic_df.drop('Survived',axis=1)
X = transform_features(X)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [6]:
my_classifier = MyDummyClassifier()
my_classifier.fit(X_train,y_train)

predictions = my_classifier.predict(X_test)
print('Dummy Classifier의 정확도 : {0:.4f}'.format(accuracy_score(y_test,predictions)))

Dummy Classifier의 정확도 : 0.7877


In [7]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd


class MyFakeClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    def predict(self,X):
        return np.zeros( (len(X),1), dtype=bool)


In [8]:
digits = load_digits()
y = (digits.target == 7).astype(int)
X_train, X_test, y_train, y_test = train_test_split(digits.data,y, random_state=11)

In [9]:
pd.Series(y_test).value_counts()

0    405
1     45
Name: count, dtype: int64

In [10]:
fake_clf = MyFakeClassifier()
fake_clf.fit(X_train,y_train)

In [11]:
fakepred = fake_clf.predict(X_test)
print('모든 예측을 0으로 하여도 정확도는: {:0.4f}'.format(accuracy_score(fakepred,y_test)))


모든 예측을 0으로 하여도 정확도는: 0.9000


In [12]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,fakepred)

array([[405,   0],
       [ 45,   0]], dtype=int64)

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

def get_clf_eval(y_test,pred):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test,pred)
    print('오차 행렬')
    print(confusion)
    print(f'정확도: {accuracy} , Precision :,{precision} recall : ,{recall }')


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

titanic_df = pd.read_csv('../../data/titanic/train.csv')
y = titanic_df['Survived']
X = titanic_df.drop('Survived',axis=1)
X = transform_features(X)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=11)
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train,y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test,pred)

오차 행렬
[[108  10]
 [ 14  47]]
정확도: 0.8659217877094972 , Precision :,0.8245614035087719 recall : ,0.7704918032786885


In [15]:
pred_proba = lr_clf.predict_proba(X_test)
pred = lr_clf.predict(X_test)
print('pred_proba()결과 shape : {}'.format(pred_proba.shape))
print(pred_proba[:3])

pred_proba()결과 shape : (179, 2)
[[0.44935227 0.55064773]
 [0.86335512 0.13664488]
 [0.86429645 0.13570355]]


In [16]:
from sklearn.preprocessing import Binarizer

X = [[-1,-1,2],
     [2,0,0],
     [0,1.1,1.2]]


# X의 개별 원소들이 threshold값보다 같거나 작으면 0을, 크면 1을 반환
binarizer = Binarizer(threshold=1.1)
print(binarizer.fit_transform(X))   

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [19]:
from sklearn.preprocessing import Binarizer

custom_threshold = 0.5
pred_proba_1 = pred_proba[:,1].reshape(-1,1)  #positive class의 확률만 추출
binarizer = Binarizer(threshold = custom_threshold).fit(pred_proba_1)
custom_predict = binarizer.transform(pred_proba_1)

get_clf_eval(y_test,custom_predict)

오차 행렬
[[108  10]
 [ 14  47]]
정확도: 0.8659217877094972 , Precision :,0.8245614035087719 recall : ,0.7704918032786885


predict() 는 predict_proba()에 기반함을 알 수 있다.

threshold 변경

In [23]:
custom_threshold = 0.4
pred_proba_1 = pred_proba[:,1].reshape(-1,1)  #positive class만 취한다.
binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1)  
custom_predict = binarizer.transform(pred_proba_1) #threshold기반하여 0,1 분류(predict)

get_clf_eval(y_test, custom_predict)  #precision, recall trade-off가 발생하는것을 알 수 있음. (threshold를 낮추면 recall은 상승하지만 precision은 감소)

오차 행렬
[[97 21]
 [11 50]]
정확도: 0.8212290502793296 , Precision :,0.704225352112676 recall : ,0.819672131147541


In [28]:
#threshold를 0.4에서 0.6까지 0.05증가시키며 평가지표 조사
thresholds = [0.4,0.45, 0.50, 0.55, 0.60]

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    for threshold in thresholds:
        binarizer = Binarizer(threshold=threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임곗값:',custom_threshold)
        get_clf_eval(y_test, custom_predict)

get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds)

임곗값: 0.4
오차 행렬
[[97 21]
 [11 50]]
정확도: 0.8212290502793296 , Precision :,0.704225352112676 recall : ,0.819672131147541
임곗값: 0.4
오차 행렬
[[105  13]
 [ 13  48]]
정확도: 0.8547486033519553 , Precision :,0.7868852459016393 recall : ,0.7868852459016393
임곗값: 0.4
오차 행렬
[[108  10]
 [ 14  47]]
정확도: 0.8659217877094972 , Precision :,0.8245614035087719 recall : ,0.7704918032786885
임곗값: 0.4
오차 행렬
[[111   7]
 [ 16  45]]
정확도: 0.8715083798882681 , Precision :,0.8653846153846154 recall : ,0.7377049180327869
임곗값: 0.4
오차 행렬
[[113   5]
 [ 17  44]]
정확도: 0.8770949720670391 , Precision :,0.8979591836734694 recall : ,0.7213114754098361


In [40]:
#precision_recall_curve API를 이용해봅니다.
from sklearn.metrics import precision_recall_curve

pred_proba_class1 = lr_clf.predict_proba(X_test)[:,1]  

precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_class1)
print(thresholds.shape)


(165,)
