In [8]:
from sklearn.base import BaseEstimator
import numpy as np

In [11]:
class MyDummyClassifier(BaseEstimator):
    """Rule-based baseline that predicts survival from the ``Sex`` column alone.

    Nothing is learned from the training data: rows whose ``Sex`` equals 1
    (male, per the original author's note) are predicted as 0 (died) and
    every other row is predicted as 1 (survived).
    """

    def fit(self, x, y):
        """Accept training data for API compatibility; nothing is learned.

        Args:
            x: Training features (ignored).
            y: Training labels (ignored).

        Returns:
            ``self``, following the scikit-learn estimator convention so that
            calls can be chained (``clf.fit(x, y).predict(x)``).
        """
        return self

    def predict(self, x):
        """Predict 0 where ``x['Sex'] == 1`` and 1 everywhere else.

        Args:
            x: Object supporting ``x['Sex']`` column access (a pandas
                DataFrame in this notebook).

        Returns:
            Float ndarray of shape ``(n_rows, 1)`` holding 0.0 / 1.0, the
            same shape and dtype the loop-based implementation produced.
        """
        sex = np.asarray(x['Sex'])
        # Vectorized replacement for the per-row ``.iloc`` loop: any value
        # other than 1 (including a missing value) maps to 1.0.
        return (sex != 1).astype(float).reshape(-1, 1)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
def fillna(df):
    """Fill missing values in the ``Age``, ``Cabin``, ``Embarked`` and ``Fare`` columns.

    ``Age`` gets the column mean, ``Cabin`` and ``Embarked`` get the
    placeholder ``'N'``, and ``Fare`` gets 0.

    Args:
        df: DataFrame containing the four columns above. It is modified in
            place.

    Returns:
        The same ``df`` object, with the four columns filled.
    """
    # Assign the filled column back explicitly. The previous
    # ``df[col].fillna(..., inplace=True)`` form operates on an intermediate
    # Series: it raises a FutureWarning on recent pandas and leaves ``df``
    # untouched once Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

def drop_features(df):
    """Remove the ``PassengerId``, ``Name`` and ``Ticket`` columns from ``df``.

    The frame is modified in place and the same object is returned.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(labels=unused_columns, axis=1, inplace=True)
    return df

def format_features(df):
    """Shorten ``Cabin`` to its first character and label-encode the categorical columns.

    ``Cabin``, ``Sex`` and ``Embarked`` are each replaced by the output of a
    freshly fitted ``LabelEncoder``. The frame is modified in place and the
    same object is returned.
    """
    from sklearn.preprocessing import LabelEncoder

    # Keep only the leading character of the cabin value.
    df['Cabin'] = df['Cabin'].str.slice(0, 1)
    for column in ('Cabin', 'Sex', 'Embarked'):
        # One independent encoder per column.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

def transform_features(df):
    """Run the preprocessing steps in order: fillna, drop_features, format_features.

    Each step receives the result of the previous one; the final result is
    returned.
    """
    for step in (fillna, drop_features, format_features):
        df = step(df)
    return df

In [6]:
# Load the training data; `Survived` is the label and every other column is a feature.
df = pd.read_csv('titanic_train.csv')
y = df['Survived']
x = transform_features(df.drop(columns='Survived'))

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11
)

In [12]:
# Score the rule-based baseline on the held-out split.
myclf = MyDummyClassifier()
myclf.fit(x_train,y_train)  # no learning happens here; called for API symmetry
pred = myclf.predict(x_test)
# Last expression of the cell: the accuracy is shown as the cell output.
accuracy_score(y_test,pred)

0.8324022346368715

In [13]:
from sklearn.metrics import confusion_matrix

In [14]:
# Confusion matrix of the held-out labels against the current `pred`.
confusion_matrix(y_test,pred)

array([[103,  15],
       [ 15,  46]], dtype=int64)

In [18]:
def get_clf_eval(y_test,pred):
    """Print the confusion matrix, accuracy, precision and recall of ``pred``.

    Args:
        y_test: True labels.
        pred: Predicted labels to compare against ``y_test``.

    Returns:
        None. The report is written to stdout with Korean labels
        (오차행렬 = confusion matrix, 정확도 = accuracy, 정밀도 = precision,
        재현율 = recall); the three scores are printed with four decimals.
    """
    from sklearn import metrics

    confusion = metrics.confusion_matrix(y_test, pred)
    accuracy = metrics.accuracy_score(y_test, pred)
    precision = metrics.precision_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)

    # Header and matrix on separate lines, then the one-line score summary.
    print('오차행렬', confusion, sep='\n')
    print(f'정확도:{accuracy:.4f},정밀도:{precision:.4f},재현율:{recall:.4f}')

In [19]:
# Print confusion matrix, accuracy, precision and recall for the current `pred`.
get_clf_eval(y_test,pred)

오차행렬
[[103  15]
 [ 15  46]]
정확도:0.8324,정밀도:0.7541,재현율:0.7541


In [20]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Fit a logistic regression (max_iter set to 200) and evaluate it on the held-out split.
# NOTE: this rebinds `pred`, replacing whatever predictions it held before.
lr_clf = LogisticRegression(max_iter=200).fit(x_train, y_train)
pred = lr_clf.predict(x_test)
get_clf_eval(y_test, pred)

오차행렬
[[104  14]
 [ 13  48]]
정확도:0.8492,정밀도:0.7742,재현율:0.7869


In [27]:
# Class probabilities from the fitted logistic regression for the held-out rows.
pred_proba = lr_clf.predict_proba(x_test)
# Show the probabilities side by side with the predicted label as a third column.
# In the printed output, rows whose second probability exceeds 0.5 carry label 1.
np.concatenate([pred_proba,pred.reshape(-1,1)],axis=1)

array([[0.46225822, 0.53774178, 1.        ],
       [0.8787576 , 0.1212424 , 0.        ],
       [0.87718656, 0.12281344, 0.        ],
       [0.88246099, 0.11753901, 0.        ],
       [0.85525789, 0.14474211, 0.        ],
       [0.8821376 , 0.1178624 , 0.        ],
       [0.88849152, 0.11150848, 0.        ],
       [0.20874993, 0.79125007, 1.        ],
       [0.78280989, 0.21719011, 0.        ],
       [0.36908393, 0.63091607, 1.        ],
       [0.89970505, 0.10029495, 0.        ],
       [0.87511886, 0.12488114, 0.        ],
       [0.87718062, 0.12281938, 0.        ],
       [0.88844595, 0.11155405, 0.        ],
       [0.4369463 , 0.5630537 , 1.        ],
       [0.8590793 , 0.1409207 , 0.        ],
       [0.9037593 , 0.0962407 , 0.        ],
       [0.73330718, 0.26669282, 0.        ],
       [0.72472845, 0.27527155, 0.        ],
       [0.1714978 , 0.8285022 , 1.        ],
       [0.75348128, 0.24651872, 0.        ],
       [0.61884699, 0.38115301, 0.        ],
       [0.

In [28]:
from sklearn.preprocessing import Binarizer

In [30]:
# Toy demonstration of Binarizer: per the output below, values strictly above the
# threshold become 1 and everything else (including a value equal to it) becomes 0.
# The sample has its own name so the feature DataFrame `x` is not overwritten.
sample_values = [[1, -1, 2],
                 [2, 0, 0],
                 [0, 1.1, 1.2]]
binarizer = Binarizer(threshold=1.1)
binarizer.fit_transform(sample_values)

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [37]:
# Turn the second probability column into 0/1 predictions using a 0.6 threshold.
# Indexing with [1] keeps the column two-dimensional, as Binarizer expects.
binarizer = Binarizer(threshold=0.6)
pred_1 = binarizer.fit_transform(pred_proba[:, [1]])

In [38]:
get_clf_eval(y_test,pred_1)
# Results pasted from earlier runs, kept for comparison with this cell's output.
# The first block shows the same figures as the `lr_clf.predict` evaluation earlier in the notebook.
# confusion matrix
# [[104  14]
#  [ 13  48]]
# accuracy:0.8492,precision:0.7742,recall:0.7869

# When 0.4 (presumably the Binarizer threshold; confirm)
# confusion matrix
# [[98 20]
#  [10 51]]
# accuracy:0.8324,precision:0.7183,recall:0.8361

오차행렬
[[112   6]
 [ 16  45]]
정확도:0.8771,정밀도:0.8824,재현율:0.7377
