# Configuration

In [None]:
# linear algebra
import numpy as np

# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

In [None]:
# For ML Model & Metrics & Model Selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Data Load & Preprocessing

In [None]:
# Read the diabetes CSV, print how many rows fall into each `Outcome` value,
# then preview the first three rows.
df = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')
outcome_counts = df['Outcome'].value_counts()
print(outcome_counts)
df.head(3)

In [None]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

In [None]:
# Features are every column except the last; the target is the last column.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=156,
)

# 평가 지표 Functions

1. `get_clf_eval`

In [None]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix and headline metrics for a set of predictions.

    Every metric is computed with scikit-learn's default settings.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels, in the same order as ``y_test``.

    Returns:
        None. The confusion matrix, accuracy, precision, recall and F1 score
        are printed to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('Confusion Matrix')
    print(confusion)
    # All four metrics use the same `.4f` spec so the line is evenly spaced.
    print('Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}, f1-score: {3:.4f}'.format(
        accuracy, precision, recall, f1))

2. `get_eval_by_threshold`

In [None]:
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Print the evaluation metrics obtained at each decision threshold.

    Args:
        y_test: Ground-truth labels.
        pred_proba_c1: Predicted scores for the positive class, given either
            as a 1-D vector of length n or as an (n, 1) column.
        thresholds: Iterable of cut-off values. A score greater than the
            cut-off is labelled 1, anything else 0.

    Returns:
        None. Results are printed via ``get_clf_eval``.
    """
    # Binarizer only accepts 2-D input, so coerce the scores to a single
    # column; callers may therefore pass `predict_proba(...)[:, 1]` directly.
    proba_column = np.asarray(pred_proba_c1).reshape(-1, 1)

    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold)
        custom_predict = binarizer.fit_transform(proba_column)
        print('Threshold: ', custom_threshold)
        # Flatten the (n, 1) result so the metrics receive plain 1-D labels.
        get_clf_eval(y_test, custom_predict.ravel())

3. `precision_recall_curve_plot`

In [None]:
def precision_recall_curve_plot(y_test, pred_proba_c1):
    """Plot precision and recall against the decision threshold.

    Args:
        y_test: Ground-truth labels.
        pred_proba_c1: Predicted scores for the positive class.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)

    # Trim precision/recall to the number of thresholds so x and y line up.
    n_thresholds = thresholds.shape[0]

    plt.figure(figsize=(8, 6))
    plt.plot(thresholds, precisions[:n_thresholds], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[:n_thresholds], label='recall')

    # Place an x tick every 0.1 across the automatically chosen axis range.
    x_min, x_max = plt.xlim()
    tick_positions = np.arange(x_min, x_max, 0.1)
    plt.xticks(np.round(tick_positions, 2))

    plt.xlabel('Threshold Value')
    plt.ylabel('Precision and Recall Curve')
    plt.legend()
    plt.grid()
    plt.show()

# 로지스틱 회귀로 학습 후 평가 지표 확인

In [None]:
# Baseline: fit logistic regression on the current train split and predict the test split.
lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)
pred = lr_clf.predict(x_test)
pred_proba = lr_clf.predict_proba(x_test)[:, 1]

# Ground truth goes first: swapping the arguments transposes the confusion
# matrix and exchanges precision with recall.
get_clf_eval(y_test, pred)

In [None]:
# Take the second column of predict_proba (class 1 when labels are 0/1)
# and plot precision/recall across thresholds.
class_proba = lr_clf.predict_proba(x_test)
pred_proba_c1 = class_proba[:, 1]
precision_recall_curve_plot(y_test, pred_proba_c1)

임계값을 **0.42** 정도로 설정하면 **Recall**과 **Precision**이 적당히 균형을 이루지만, 두 지표 모두 0.7 정도의 값이어서 그다지 만족스러운 결과는 아님. 따라서 데이터의 분포를 다시 확인해보고 재탐색을 진행함.

In [None]:
# Summary statistics per column; check the `min` row for implausible values.
df.describe()

**Glucose**의 최솟값은 0이 될 수 없음. 0인 값이 얼마나 있는지 시각화를 통해 확인해봄.

In [None]:
# Histogram of Glucose in 10 bins to see how many readings sit at zero.
plt.hist(df['Glucose'], bins=10)

**min** 값이 0인 칼럼별로 0인 값의 비율(%)을 확인하고, 이후 따로 처리를 해줌.

In [None]:
# Columns whose zero readings are examined below.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Denominator for the percentages: number of non-null Glucose entries.
total_count = df['Glucose'].count()

for feature in zero_features:
    # Summing the boolean mask counts the rows whose value is exactly 0.
    zero_count = (df[feature] == 0).sum()
    zero_ratio = 100 * zero_count / total_count
    print('{0} Zero Percentage : {1: .2f}%'.format(feature, zero_ratio))

In [None]:
# Zeros in these columns are treated as missing measurements, so they are
# excluded when computing each column mean; including them would pull the
# fill value down (mean() skips NaN by default).
mean_zero_features = df[zero_features].replace(0, np.nan).mean()

# Fill each column's zeros with that column's mean of the non-zero values.
df[zero_features] = df[zero_features].replace(0, mean_zero_features)

# Data Scaling & Retry

In [None]:
# Features are every column except the last; the target is the last column.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

# Split before scaling so the scaler never sees the held-out rows.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=156)

# Learn mean/std from the training rows only, then apply the same transform
# to the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Kept so any later code that reads the full scaled matrix still finds it.
x_scaled = scaler.transform(X)

In [None]:
# Refit logistic regression on the current training split and evaluate on the test split.
lr_clf = LogisticRegression().fit(x_train, y_train)
pred = lr_clf.predict(x_test)
pred_proba = lr_clf.predict_proba(x_test)[:, 1]

get_clf_eval(y_test, pred)

In [None]:
# Candidate cut-offs to compare.
thresholds = [0.3, 0.33, 0.36, 0.39, 0.42, 0.45, 0.48, 0.50]

# pred_proba keeps both class columns; column 1 is sliced as an (n, 1) array
# because the helper binarizes 2-D input.
pred_proba = lr_clf.predict_proba(x_test)
get_eval_by_threshold(y_test, pred_proba[:, [1]], thresholds)

In [None]:
# Set the decision threshold to 0.33 and evaluate the resulting labels.
positive_proba = pred_proba[:, [1]]
binarizer = Binarizer(threshold=0.33)
pred_th_033 = binarizer.fit_transform(positive_proba)
get_clf_eval(y_test, pred_th_033)