### sklearn.metrics.roc_curve
`sklearn.metrics.roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True)` [[site Link]](https://github.com/scikit-learn/scikit-learn/blob/7db5b6a98/sklearn/metrics/_ranking.py#L904) [¶](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve "Permalink to this definition")



Returns:

**fpr** : ndarray of shape (>2,)

Increasing false positive rates such that element `i` is the false positive rate of predictions with score >= `thresholds[i]`.

**tpr** : ndarray of shape (>2,)

Increasing true positive rates such that element `i` is the true positive rate of predictions with score >= `thresholds[i]`.

**thresholds** : ndarray of shape (n_thresholds,)

Decreasing thresholds on the decision function used to compute fpr and tpr. `thresholds[0]` represents no instances being predicted and is arbitrarily set to `max(y_score) + 1`.


thresholds : 기준값

In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Load the Titanic training and test feature tables.
train_df = pd.read_csv('../ml/titanic_train.csv')
test_df = pd.read_csv('../ml/titanic_test.csv')

# Use the 'Survived' column of the submission file as the test-set labels.
# NOTE(review): if this is Kaggle's sample submission, these are baseline
# predictions rather than ground truth - confirm before trusting the scores.
y_test = pd.read_csv('../ml/gender_submission.csv')['Survived']

# Null-handling helper.
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    'Age' is filled with the column mean, 'Cabin' and 'Embarked' with the
    placeholder 'N', and 'Fare' with 0. The frame is modified in place and
    the same object is returned.

    Args:
        df: DataFrame containing 'Age', 'Cabin', 'Embarked' and 'Fare'.

    Returns:
        The same DataFrame object with those four columns filled.

    Raises:
        KeyError: If one of the four columns is missing.
    """
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place call on a selected
    # column is chained assignment, which pandas deprecates and which does
    # not update ``df`` once Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df
# Remove attributes that are not used as model features.
def drop_features(df):
    """Drop the unused columns from ``df`` in place and return the same frame.

    Args:
        df: DataFrame containing the columns listed in ``unused_columns``.

    Returns:
        The same DataFrame object without those columns.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Fare']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Label encoding
def format_features(df):
    """Integer-encode the categorical columns of ``df`` in place.

    'Cabin' is first reduced to its leading character; 'Cabin', 'Sex' and
    'Embarked' are then each replaced by their LabelEncoder codes.

    Args:
        df: DataFrame containing 'Cabin', 'Sex' and 'Embarked'.

    Returns:
        The same DataFrame object with the three columns encoded.
    """
    # Keep only the first character of each cabin value.
    df['Cabin'] = df['Cabin'].str.slice(stop=1)
    for column in ('Cabin', 'Sex', 'Embarked'):
        # A fresh encoder is fit on this column of whichever frame is passed.
        # NOTE(review): codes from separate calls (e.g. train vs. test) may
        # therefore not agree when the category sets differ - confirm.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

# Run the data-preprocessing helpers defined above, in order.
def transform_features(df):
    """Apply null filling, column dropping and label encoding to ``df``.

    Args:
        df: Raw DataFrame to preprocess.

    Returns:
        The frame returned by the last preprocessing step.
    """
    # Order matters: fill nulls, then drop columns, then encode.
    return format_features(drop_features(fillna(df)))

# Run both raw frames through the shared preprocessing pipeline.
train_df = transform_features(train_df)
test_df = transform_features(test_df)

# Split the training frame into the label and the feature matrix; the test
# frame is used as the test feature matrix unchanged.
y_train = train_df['Survived']
x_train = train_df.drop(columns="Survived")
x_test = test_df

# Logistic regression: fit on the training data, then compute both the
# predicted classes and the per-class probabilities for the test features.
lr = LogisticRegression()
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
pred_proba = lr.predict_proba(x_test)





In [20]:
from sklearn.metrics import f1_score
import numpy as np

# F1 score of the predicted classes against the test labels.
f1 = f1_score(y_true=y_test, y_pred=pred)
print(f1)

0.9287925696594427


In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Extract the predicted probability of the positive class (label 1).
pred_proba_class1 = lr.predict_proba(x_test)[:, 1]

fprs, tprs, thresholds = roc_curve(y_test, pred_proba_class1)

# Sample the returned thresholds in steps of 5.
# thresholds[0] is an artificial entry that the scikit-learn docs describe as
# max(y_score) + 1, so the sampled indices start at 1 to leave it out.
# len(thresholds) is the number of thresholds (same as thresholds.shape[0]).
thr_index = np.arange(1, len(thresholds), 5)
print('샘플 index로 추출한 임계값:', thresholds[thr_index].round(2), "\n")

# FPR and TPR at the sampled thresholds.
print('fpr:', fprs[thr_index].round(2), "\n")
print('tpr:', tprs[thr_index].round(2))

