In [None]:
# ref : https://www.kaggle.com/uciml/pima-indians-diabetes-database?select=diabetes.csv

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# 1. 데이터 사전 탐색(EDA)

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

* Pregnancies : 임신횟수
* Glucose : 포도당수치
* BloodPressure : 혈압
* SkinThickness : 피하지방
* Insulin : 인슐린
* BMI : 체질량지수
* DiabetesPedigreeFunction : 유전적(가족력)
* Age : 나이
* Outcome : 당뇨여부 1(당뇨) / 0(정상)

## (결측치 없음, 모든 피처가 수치형)

In [None]:
def myfit(df_X, df_y, model, imp=0, tsize=0.2):
    """Fit ``model`` on a hold-out split and print its test-set metrics.

    The rows are split in their existing order (``shuffle=False``), so the
    leading rows train the model and the trailing ``tsize`` fraction is used
    for evaluation.

    Parameters
    ----------
    df_X : feature table, passed straight to ``train_test_split``.
    df_y : target labels aligned with ``df_X``.
    model : estimator exposing ``fit`` and ``predict``.
    imp : currently unused; the feature-importance report it was meant to
        toggle is commented out at the end of this function.
    tsize : fraction of rows held out for testing (default 0.2).

    Returns
    -------
    None. Accuracy, precision, recall and F1 are printed.
    """
    # random_state is kept from the original call; the split itself is ordered
    # because shuffle=False.
    X_train, X_test, y_train, y_test = train_test_split(
        df_X, df_y, test_size=tsize, random_state=36, shuffle=False
    )
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round leaves accuracy and F1 unchanged but swaps precision and recall.
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print("Accuracy:{:.6f}  precision:{:.6f}, recall:{:.6f}, f1:{:.6f}".format(accuracy, precision, recall, f1))

    # Disabled feature-importance report (what ``imp`` was meant to enable):
#     if imp==1:
#         imp_df = pd.DataFrame({"featuer" : X_train.columns, "importance" : model.feature_importances_})
#         imp_df = imp_df.sort_values('importance', ascending=False).T
#         print(imp_df)

In [None]:
# Separate the target column from the remaining feature columns.
df_y = df["Outcome"]
df_X = df.drop(columns="Outcome")
# Baseline classifier; the fixed seed keeps the forest reproducible.
rf = RandomForestClassifier(random_state=36)
print(df_X.shape, df_y.shape)

In [None]:
# Baseline: random forest on the raw features. imp=1 has no effect at present
# because the importance report inside myfit is commented out.
myfit(df_X, df_y, rf, imp=1)   # 0.720779 -- presumably the accuracy from the author's run

## 불균형 확인

In [None]:
# Class balance of the target: number of rows per Outcome value.
df["Outcome"].value_counts()

In [None]:
# Histogram of every column in the data as loaded.
df.hist()
plt.show()

### 0데이터 확인

In [None]:
# Count exact zeros per column and express them as a percentage of all rows.
zero_cnt = df.isin([0]).sum()
nan_dict = {"CNT": zero_cnt, "RATE": zero_cnt / df.shape[0] * 100}
nan_df = pd.DataFrame(nan_dict)
# Report only the columns that contain zeros, most affected first.
has_zero = nan_df["RATE"] > 0
print(nan_df[has_zero].sort_values("CNT", ascending=False))


* Insulin        374  48.697917
* SkinThickness  227  29.557292  --??
* BloodPressure   35   4.557292
* BMI             11   1.432292
* Glucose          5   0.651042

### 각 피쳐당 0값을 평균값으로 대체

In [None]:
zero_feature = ["Insulin","SkinThickness","BloodPressure","BMI","Glucose"]
# This notebook treats 0 in these columns as a missing measurement. Mask the
# zeros before averaging: leaving them in drags the replacement value down
# (by roughly half for a column where about half the rows are zero).
zero_masked = df[zero_feature].replace(0, np.nan)
zero_mean = zero_masked.mean()  # per-column mean of the non-zero values only
# fillna with a Series fills each column with the value under its own label.
# NOTE: the mean is taken over all rows, including rows later held out for testing.
df[zero_feature] = zero_masked.fillna(zero_mean)

In [None]:
# Histograms again after the zero replacement, to compare with the earlier plot.
df.hist()
plt.show()

In [None]:
# Preview the rows after the zero replacement.
df.head()  

In [None]:
def myscore(y_test,pred,proba, pr_curve=0, auc_curve=0):
    """Print classification metrics and the confusion matrix for one prediction set.

    Parameters
    ----------
    y_test : true labels.
    pred : predicted labels to score.
    proba : 2-D array of class scores; its last column feeds the AUC.
    pr_curve : pass 1 to also draw the precision/recall-vs-threshold plot.
    auc_curve : pass 1 to also draw the ROC plot.

    Returns
    -------
    None. Results are printed (and optionally plotted).
    """
    last_col_scores = proba[:, -1]
    metric_values = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred),
        roc_auc_score(y_test, last_col_scores),
    )
    print("Accuracy:{:.6f}  precision:{:.6f}, recall:{:.6f}, f1:{:.6f}, auc:{:.6f}".format(*metric_values))

    print(confusion_matrix(y_test, pred))

    # The plotting helpers receive the full probability array, not the slice.
    if pr_curve == 1:
        mycurve(y_test, proba)
    if auc_curve == 1:
        mycurve_auc(y_test, proba)
        

In [None]:
from sklearn.metrics import precision_recall_curve
def mycurve(y_test, proba):
    """Plot precision and recall against the decision threshold.

    Parameters
    ----------
    y_test : true labels.
    proba : 2-D array of class scores; the last column is used as the score.
    """
    prec_vals, rec_vals, cutoffs = precision_recall_curve(y_test, proba[:, -1])
    print(len(prec_vals), len(rec_vals), len(cutoffs))

    # Trim both curves to the number of thresholds so they line up on the x-axis.
    n_cutoffs = len(cutoffs)
    for curve_vals, curve_name in ((prec_vals, "precision"), (rec_vals, "recall")):
        plt.plot(cutoffs, curve_vals[:n_cutoffs], label=curve_name)

    plt.xlabel("thresholds")
    plt.ylabel("score")
    plt.grid()
    plt.legend()
    plt.show()
    
    

In [None]:
def mycurve_auc(y_test, proba):
    """Plot the ROC curve for the last column of ``proba`` and show its AUC.

    Parameters
    ----------
    y_test : true binary labels.
    proba : 2-D array of class scores; the last column is used as the score
        (presumably the positive-class column of ``predict_proba`` -- confirm
        against the caller).
    """
    scores = proba[:, -1]
    fpr, tpr, thresholds = roc_curve(y_test, scores)
    # roc_curve returns three arrays of equal length, one entry per threshold.
    print(len(fpr), len(tpr), len(thresholds))
    plt.plot(fpr, tpr, label="roc")
    # The diagonal is the chance-level reference (AUC = 0.5); it does not
    # correspond to a decision threshold of 0.5.
    plt.plot([0,1], [0,1], label="random (auc:0.5)")
    plt.xlabel("FPR (1-TNR(specificity))")
    plt.ylabel("TPR (recall,sensitivity)")
    plt.title(f"auc : {roc_auc_score(y_test, scores):.4f}")
    plt.grid()
    plt.legend()
    plt.show()

## 스케일링 : 정규화

### 방법1) df_X.split -->  train.fit_transform  --> test.transform

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# Separate the target column from the feature columns.
df_X = df.drop(columns="Outcome")
df_y = df["Outcome"]

# Method 1: split first, then fit the scaler on the training rows only and
# reuse the fitted scaler to transform the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.2, random_state=36, shuffle=False
)

std_scaler = StandardScaler()
X_train_scaler = std_scaler.fit_transform(X_train)
X_test_scaler = std_scaler.transform(X_test)

# Refit the shared random forest on the scaled training data, then score the
# scaled test rows.
rf.fit(X_train_scaler, y_train)
proba = rf.predict_proba(X_test_scaler)
pred = rf.predict(X_test_scaler)

myscore(y_test, pred, proba, pr_curve=1, auc_curve=1)
          
    

* ROC곡선 : FPR이 0에서 1까지 변할 때 TPR이 어떻게 변하는지를 나타낸 곡선
* FPR=0인 경우 -->  FP=0 ,  th=1 (all N : 임계값이 1이면 P로 예측하는 경우가 아예 없다)
* FPR=1인 경우 -->  TN=0,   th=0 (all P : 임계값이 0이면 전부 P로 예측한다. 임계값이 낮을수록 P로 예측할 확률이 높아진다)

In [None]:
from sklearn.preprocessing import Binarizer
# Candidate decision thresholds to compare; 0.5 is the usual default.
ths = [0.1, 0.35, 0.4 , 0.45, 0.5, 0.55, 0.99]
# Binarizer needs a 2-D input, so reshape the last probability column once.
score_column = proba[:, -1].reshape(-1, 1)
for th in ths:
    # Scores strictly above the threshold become 1, everything else 0.
    binarizer = Binarizer(threshold=th)
    pred = binarizer.fit_transform(score_column)
    print(f'N:P {th, 1-th}')
    myscore(y_test, pred, proba)
    

### 방법2)  df_X.fit_transform --> train_test_split 

In [None]:
# Disabled experiment (method 2): compare three scalers by accuracy.
# NOTE: each scaler is fitted on the full feature table *before* the
# train/test split, so statistics from the test rows leak into the scaling.
# std_scaler = StandardScaler()
# rbs_scaler = RobustScaler()
# mmx_scaler = MinMaxScaler()
# scaler_list = [("StandardScaler",std_scaler), ("RobustScaler",rbs_scaler), ("MinMaxScaler",mmx_scaler)]
# df_y = df["Outcome"]
# df_X = df.drop("Outcome", axis=1)
# for scaler in scaler_list:
#     df_X_scaler = scaler[1].fit_transform(df_X)   
#     X_train, X_test, y_train, y_test = train_test_split(df_X_scaler, df_y, test_size=0.2, random_state=36,  shuffle=False)
#     rf.fit(X_train, y_train)
#     pred = rf.predict(X_test)
#     df_score = accuracy_score(pred ,  y_test)
#     print(f'{scaler[0]:s}:  {df_score:.4f}')


In [None]:
# Disabled experiment (method 2, StandardScaler only).
# NOTE: the scaler is fitted on all rows before the split (test-set leakage).
# NOTE: the final myscore call omits the required `proba` argument, so this
# cell would raise a TypeError if uncommented as-is.
# df_y = df["Outcome"]
# df_X = df.drop("Outcome", axis=1)
# std_scaler = StandardScaler()
# df_X_scaler = std_scaler.fit_transform(df_X)   
# X_train, X_test, y_train, y_test = train_test_split(df_X_scaler, df_y, test_size=0.2, random_state=36,  shuffle=False)
# rf.fit(X_train, y_train)
# pred = rf.predict(X_test)
# myscore(y_test,pred)