In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

### info, target 레이블 value 확인, head(5)

In [None]:
# Load the Pima Indians diabetes dataset from the Kaggle input directory.
diabetes = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')

# info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
diabetes.info()

# Class balance of the target column.
print(diabetes['Outcome'].value_counts())
diabetes.head(5)

### 데이터셋을 x(feature)과 y(label)로 분리

- `.iloc[:, :-1]`

### 분리한 데이터셋을 train과 test로 split

In [None]:
# Every column except the last is a feature; the last column is used as the
# label (presumably `Outcome` -- confirm against the CSV layout).
x_diabetes = diabetes.iloc[:, :-1]
y_diabetes = diabetes.iloc[:, -1]

# Stratified 80/20 split. The seed is fixed (same value as the later split in
# this notebook) so the baseline metrics and the threshold analysis written
# below stay reproducible under Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x_diabetes,
    y_diabetes,
    test_size=0.2,
    random_state=156,
    stratify=y_diabetes,
)

## 로지스틱 회귀를 사용

In [None]:
# Baseline model: logistic regression on the raw feature values.
# fit() returns the estimator itself, so construction and training are chained.
lr_clf = LogisticRegression(max_iter=5000).fit(x_train, y_train)

# Hard class predictions and the positive-class probability (column 1).
pred = lr_clf.predict(x_test)
pred_proba = lr_clf.predict_proba(x_test)[:, 1]

## 평가

- 오차 행렬, 정확도, 정밀도(FP를 줄임), 재현율(FN을 줄임), f1 : y_test(타겟값)와 pred(예측값)
- roc_auc : y_test(타겟값)와 pred_proba(예측 확률)

In [None]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for binary predictions.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required; the ``None`` default is kept
            only so the signature stays compatible with existing callers.
        pred_proba: Predicted score/probability of the positive class. When
            ``None``, ROC AUC is left out of the report.

    Raises:
        ValueError: If ``pred`` is not supplied.
    """
    # Fail early with a clear message instead of an opaque error from sklearn.
    if pred is None:
        raise ValueError('pred (predicted labels) must be provided')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC AUC needs scores rather than hard labels, so it is optional.
    # It is computed before anything is printed so that a failure here does
    # not leave a half-printed report.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('confusion : ')
    print(confusion)
    summary = 'accuracy : {0:.4f}, precision : {1:.4f}, recall : {2:.4f}, f1 : {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        summary += ', roc_auc : {0:.4f}'.format(roc_auc)
    print(summary)
    
# Evaluate the baseline model on the held-out split.
get_clf_eval(y_test, pred=pred, pred_proba=pred_proba)

### 평가 시각화

- precision_recall_curve() 메소드 사용 -> 알아서 threshold를 나누어 계산 후 일차원 ndarray 반환
- 그래프를 그림
- 그래프 부가 설정(legend, grid)

### 분석

- threshold가 약 0.4 정도일 때 precision과 recall이 균형을 이룸
- 둘 다 0.7을 넘지 못함 -> 성능이 그다지 좋지 않음

In [None]:
def pr_curve_plot(y_test, pred_proba):
    """Plot precision and recall against the decision threshold.

    Args:
        y_test: True binary labels.
        pred_proba: Predicted scores for the positive class.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba)
    # Only as many precision/recall values as there are thresholds are plotted.
    n_thresholds = len(thresholds)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(thresholds, precisions[:n_thresholds], linestyle='--', label='precision')
    ax.plot(thresholds, recalls[:n_thresholds], label='recall')

    # Put an x tick every 0.1 across the automatically chosen axis range.
    x_min, x_max = ax.get_xlim()
    ax.set_xticks(np.round(np.arange(x_min, x_max, 0.1), 2))
    ax.set_xlabel('threshold value')
    ax.set_ylabel('p/r values')
    ax.legend()  # show the legend
    ax.grid()  # show grid lines
    plt.show()
    
# Draw the precision/recall-vs-threshold curves for the baseline model.
pr_curve_plot(y_test=y_test, pred_proba=pred_proba)

## 분석

### Feature info

- Pregnancies : Number of times pregnant
- BloodPressure : Diastolic(이완기) blood pressure
- BMI : Body mass index (weight in kg/(height in m)^2)
- DiabetesPedigreeFunction (Pedigree: 혈통) : function that represents how likely they are to get the disease by extrapolating from their ancestors' history

### 0값

- Glucose, BloodPressure, SkinThickness, Insulin, BMI가 0인것은 불가능함 -> 정제 필요

In [None]:
# Per-column summary statistics; used here to look for implausible zero values
# (e.g. in the `min` row).
diabetes.describe()

In [None]:
# Show the rows whose BMI is recorded as 0.
cond = diabetes['BMI'].eq(0)
diabetes[cond]

### 0이 들어있는 값들을 정제

In [None]:
# Columns in which a value of 0 is treated as "missing".
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Compute each column's mean over the non-zero observations only. Including
# the placeholder zeros in the mean would bias the imputed value downward.
# (0 -> NaN first; mean() skips NaN by default.)
mean_features = diabetes[zero_features].replace(0, np.nan).mean()

# Replace every 0 with its column's mean. Re-running the cell is harmless:
# once no zeros remain, nothing is replaced.
diabetes[zero_features] = diabetes[zero_features].replace(0, mean_features)

# Sanity check: this should now display an empty frame.
cond = diabetes['BMI'] == 0
diabetes.loc[cond, :]

## feature scaling

- StandardScaler : 평균 0 stdev 1


In [None]:
x = diabetes.iloc[:, :-1]
y = diabetes.iloc[:, -1]

# Split BEFORE scaling so that the scaler never sees the test rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=156, stratify=y
)

# Fit the scaler on the training data only, then apply the same transform to
# the test data. The previous version computed a scaled matrix but trained on
# the unscaled frame, so standardisation was never actually applied.
# Note: any data scored later must go through `scaler.transform` too.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

lr_clf = LogisticRegression(max_iter=8000)
lr_clf.fit(x_train_scaled, y_train)
pred = lr_clf.predict(x_test_scaled)
pred_proba = lr_clf.predict_proba(x_test_scaled)[:, 1]

get_clf_eval(y_test, pred, pred_proba)  # confusion matrix, accuracy, precision/recall, f1, roc_auc

## threshold 조정

- threshold를 바꿔가며(모델 재학습 없이) 예측 확률을 0/1로 다시 변환해서 precision과 recall값을 조정
- Binarizer 객체 사용 -> 확률을 지정한 threshold에 따라 0/1로 바꿔줌

### 분석

- 병 예측은 FN을 줄이는 것이 중요하므로 recall값을 우선시
- threshold가 0.33일때 recall이 유의미하지만 두 값이 극단적임
- 0.48일때 accuracy, f1이 높음

In [None]:
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test, pred_proba, thresholds):
    """Print classification metrics at each of several decision thresholds.

    Args:
        y_test: True binary labels.
        pred_proba: Positive-class probabilities, either 1-D or as a single
            (n_samples, 1) column.
        thresholds: Iterable of cut-off values. Binarizer maps probabilities
            greater than the cut-off to 1 and all others to 0.
    """
    # Binarizer needs 2-D input; accept 1-D probabilities as a convenience.
    proba_column = np.asarray(pred_proba).reshape(-1, 1)
    for thr in thresholds:
        # Label each report so the output shows which threshold produced it.
        print('threshold : {0}'.format(thr))
        custom_predict = Binarizer(threshold=thr).fit_transform(proba_column)
        get_clf_eval(y_test, custom_predict, pred_proba)

# Candidate decision thresholds to compare, ending at the default 0.5.
thresholds = [0.3, 0.33, 0.36, 0.39, 0.42, 0.45, 0.48, 0.5]
get_eval_by_threshold(
    y_test,
    pred_proba.reshape(-1, 1),  # single-column 2-D array
    thresholds,
)

In [None]:
# Re-label the test predictions with a 0.48 cut-off and evaluate them.
proba_column = pred_proba.reshape(-1, 1)  # single-column 2-D array
binarizer = Binarizer(threshold=0.48)
pred_th_048 = binarizer.fit(proba_column).transform(proba_column)

get_clf_eval(y_test, pred=pred_th_048, pred_proba=pred_proba)

> *권철민, 『파이썬 머신러닝 완벽 가이드』, 위키북스(2019)*