In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the Titanic training set and preview its first three rows.
titanic_df = pd.read_csv('./datasets/titanic_train.csv')
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


## sklearn.metrics.roc_curve
* sklearn.metrics.roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True)

    Returns:
    fpr:ndarray of shape (>2,)
    Increasing false positive rates such that element i is the false positive rate of predictions with score >= thresholds[i].

    tpr:ndarray of shape (>2,)
    Increasing true positive rates such that element i is the true positive rate of predictions with score >= thresholds[i].

    thresholds:ndarray of shape = (n_thresholds,)
    Decreasing thresholds on the decision function used to compute fpr and tpr. thresholds[0] represents no instances being predicted positive; scikit-learn >= 1.3 sets it to np.inf, while earlier releases arbitrarily set it to max(y_score) + 1.

In [2]:
def fillna_features(df):
    """Fill missing ``Age`` values with the mean of the column.

    The column is reassigned instead of being filled through a chained
    ``df['Age'].fillna(..., inplace=True)`` call. The chained form operates
    on an intermediate Series: it raises a FutureWarning on pandas 2.x and
    leaves ``df`` untouched once copy-on-write is active.

    Args:
        df: DataFrame with a numeric ``Age`` column. Modified in place.

    Returns:
        The same DataFrame object, with the gaps in ``Age`` filled. If every
        ``Age`` value is missing the mean is NaN and nothing can be filled.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    return df

def drop_features(df):
    """Remove the columns that are not kept as model inputs.

    Args:
        df: DataFrame containing every column listed below. Modified in place.

    Returns:
        The same DataFrame object with those columns removed.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    unused_columns = [
        'PassengerId', 'Name', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked',
    ]
    df.drop(columns=unused_columns, inplace=True)
    return df

def format_features(df):
    """Label-encode the ``Sex`` column.

    Args:
        df: DataFrame with a ``Sex`` column. Modified in place.

    Returns:
        The same DataFrame object, with ``Sex`` replaced by the labels
        produced by a ``LabelEncoder`` fitted on that column.
    """
    # fit() returns the encoder itself, so construction and fitting chain.
    sex_encoder = LabelEncoder().fit(df['Sex'])
    df['Sex'] = sex_encoder.transform(df['Sex'])
    return df

def transform_features(df):
    """Run the preprocessing steps on ``df`` in a fixed order.

    The steps are: fill missing ages, drop the unused columns, then encode
    the ``Sex`` column.

    Args:
        df: Raw feature DataFrame; passed through each step in turn.

    Returns:
        The DataFrame returned by the last step.
    """
    for step in (fillna_features, drop_features, format_features):
        df = step(df)
    return df

In [3]:
# Target column, kept separate from the features.
y_titanic_df = titanic_df['Survived']
# Features: every other column, passed through the preprocessing pipeline.
# drop() returns a new frame, so titanic_df itself is left untouched.
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=11
)

In [4]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
# The ROC / AUC evaluation needs the score of the positive class. Of the two
# labels (0 and 1) the positive one is 1, and it sits in column 1 of
# predict_proba, hence the [:, 1] slice.
pred_proba_class1 = lr_clf.predict_proba(X_test)[:, 1]

In [None]:
# Preview the first few rows only (one probability column per class) instead
# of dumping the full array into the notebook output.
lr_clf.predict_proba(X_test)[:5]

In [None]:
# Preview the true labels of the held-out rows.
y_test.head()

In [None]:
# False-positive rates, true-positive rates and the score thresholds at which
# they were measured.
fprs, tprs, thresholds = roc_curve(y_true=y_test, y_score=pred_proba_class1)

In [None]:
thresholds # thresholds[0] is max(y_score)+1 (NOTE(review): scikit-learn >= 1.3 reports np.inf here instead - confirm against the installed version).

In [None]:
def roc_curve_plot(y_test, pred_proba_c1):
    """Plot the ROC curve of a binary classifier next to the chance diagonal.

    Args:
        y_test: True binary labels.
        pred_proba_c1: Scores for the positive class, e.g. the class-1 column
            of ``predict_proba``.

    Returns:
        None. The curve is drawn on the current matplotlib axes and shown.
    """
    # The thresholds are not needed to draw the curve.
    fprs, tprs, _ = roc_curve(y_test, pred_proba_c1)

    ax = plt.gca()
    ax.plot(fprs, tprs, label='ROC')
    # 'k--' draws a black dashed line: the reference diagonal from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], 'k--', label='Random')

    # Place the FPR ticks on exact 0.1 steps. Deriving them from the
    # autoscaled x-limits (which include padding around the data) started the
    # grid at a negative offset and produced ticks such as 0.05, 0.15, ...
    ax.set_xticks(np.arange(11) / 10)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR(Recall)')
    ax.legend()
    plt.show()

# Reuse the class-1 probabilities computed earlier for the test set.
roc_curve_plot(y_test, pred_proba_class1)

## sklearn.metrics.roc_auc_score
* sklearn.metrics.roc_auc_score(y_true, y_score, *, average='macro', sample_weight=None, max_fpr=None, multi_class='raise', labels=None)

In [None]:
# Area under the ROC curve, computed from the class-1 probabilities.
roc_score = roc_auc_score(y_test, pred_proba_class1)
roc_score