## Pima Indians Diabetes Prediction
* Evaluation measures: Accuracy, Precision, Recall, F1 score, and ROC AUC.

## Feature Description
- Pregnancies : 임신 횟수
- Blood Pressure : 혈압
- Glucose : 포도당 부하 수치검사
- Skin Thickness : 팔 삼두근 뒷쪽의 피하지방 측정값(mm)
- Insulin : 혈청 인슐린(mu U/ml)
- BMI : 체질량 지수(체중(kg) / 키(m)^2)
- DiabetesPedigreeFunction : 당뇨 내력 가중치
- Age: 나이
- Outcome : 클래스 결정 값(0 또는 1)

## Package load

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# split dataset into train and test
from sklearn.model_selection import train_test_split

# eval(accuracy_score, precision_score, recall_score, roc_auc_score,f1_score, confusion_matrix, precision_recall_curve,roc_curve)
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve

# Preprocessing: standardize features (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler

# Logistic regression (used here as a binary classifier)
from sklearn.linear_model import LogisticRegression

## EDA (Exploratory Data Analysis)

In [None]:
# Load the Pima Indians diabetes CSV into a DataFrame and preview the first three rows.
diabets_data = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")
diabets_data.head(3)

In [None]:
# Column dtypes and non-null counts.
diabets_data.info()

### Descriptive statistics

In [None]:
# Zero values in some features should be replaced with a representative value
# such as the mean or the median; the median should be acceptable.
diabets_data.describe()

### distribution of Label class - feature `Outcome`

In [None]:
# Count how many rows fall into each value of the label column `Outcome`.
diabets_data['Outcome'].value_counts()

### Get Accuracy, Precision, recall 

In [None]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    '''
    Print and return accuracy, precision, recall, F1 and ROC AUC.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels. Must be provided: the confusion matrix and the
        label-based metrics are computed on it unconditionally.
    pred_proba : array-like, optional
        Scores handed to ``roc_auc_score`` as ``y_score``. When omitted,
        ROC AUC is reported as NaN instead of raising.

    Returns
    -------
    dict
        Metric values keyed by 'Accuracy', 'Precision', 'Recall', 'F1'
        and 'ROC AUC'.
    '''
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # ROC AUC needs scores; the signature allows them to be omitted, so guard
    # the call rather than passing None through to scikit-learn.
    if pred_proba is None:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_test, pred_proba)

    print('confusion matrix')
    print(confusion)
    print('accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f},F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'ROC AUC': roc_auc,
    }

### Visualization for Precision and Recall

In [None]:
def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
    """
    Plot precision and recall against the decision threshold.

    The thresholds and the matching precision/recall values come from
    ``precision_recall_curve(y_test, pred_proba_c1)``.
    """
    prec, rec, thr = precision_recall_curve(y_test, pred_proba_c1)

    # Trim precision/recall to the number of thresholds so the arrays line up.
    n_thr = len(thr)

    plt.figure(figsize=(8, 6))
    plt.plot(thr, prec[:n_thr], linestyle='--', label='Precision', color='red')
    plt.plot(thr, rec[:n_thr], label='Recall', color='blue')

    # Place x-axis ticks every 0.1 across the current x-axis limits.
    x_lo, x_hi = plt.xlim()
    tick_positions = np.arange(x_lo, x_hi, 0.1).round(2)
    plt.xticks(tick_positions)

    plt.xlabel("threshold")
    plt.ylabel("Precision and Recall")
    plt.legend()
    plt.show()

## Model build for Logistic Regression
- split the dataset into train and test sets
- fit
- predict
- predict_proba

In [None]:
# Features are every column except the last; the label is the last column.
X, y = diabets_data.iloc[:, :-1], diabets_data.iloc[:, -1]

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

# Train a logistic regression with default settings (fit returns the estimator).
lr_clf = LogisticRegression().fit(X_train, y_train)

# Predicted labels, and column 1 of the predicted class probabilities.
preds = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]

### 

### Accuracy, Precision, Recall

In [None]:
# Print and return the metrics for the model fitted above on the test split.
get_clf_eval(y_test, preds, pred_proba)

### Precision-Recall Curve

In [None]:
# Plot precision and recall against the threshold for the test-split scores.
precision_recall_curve_plot(y_test, pred_proba)

* We need to improve recall, so the following should be reconsidered:
* Find a way to replace the zero values with meaningful values.

### Distribution for Zero features

In [None]:
zero_features = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

def show_hist_zero_feature(zero_features, df):
    '''
    Draw one 10-bin histogram per listed feature.

    Parameters
    ----------
    zero_features : iterable of str
        Column names to plot.
    df : DataFrame
        Frame the columns are read from. The data is taken from this argument
        rather than from a module-level frame, so any frame can be inspected.
    '''
    for zero_feature in zero_features:
        plt.figure(figsize=(8, 6))
        plt.hist(df[zero_feature], bins=10)
        plt.title(zero_feature)
    # A single show() after the loop renders every figure created above.
    plt.show()

In [None]:
# Histogram of each feature listed in zero_features.
show_hist_zero_feature(zero_features, diabets_data)

In [None]:
zero_features = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
def get_zero_features_rate(zero_features, df=None):
    """
    Summarise how many entries of each listed feature are exactly zero.

    Parameters
    ----------
    zero_features : iterable of str
        Column names to inspect.
    df : DataFrame, optional
        Frame to inspect. Defaults to the module-level ``diabets_data`` so
        existing single-argument calls keep working.

    Returns
    -------
    DataFrame
        One row per feature with columns 'Feature Value_0_cnt' (number of
        zeros) and 'Feature Value_0_Ratio' (percentage of rows, rounded to
        two decimals).
    """
    if df is None:
        df = diabets_data

    # Use the frame's row count as the denominator instead of depending on a
    # specific column being present.
    total_count = len(df)

    result = {}
    for zero_feature in zero_features:
        zero_count = (df[zero_feature] == 0).sum()
        # An empty frame has no zeros; avoid dividing by zero.
        if total_count:
            percent = np.round(100 * zero_count / total_count, 2)
        else:
            percent = 0.0
        result[zero_feature] = [zero_count, percent]

    result_df = pd.DataFrame(
        data=list(result.values()),
        index=list(result.keys()),
        columns=['Feature Value_0_cnt', 'Feature Value_0_Ratio'],
    )
    return result_df

# Show the zero count and zero percentage for each feature in zero_features.
get_zero_features_rate(zero_features)

## Preprocessing: replacing zero-valued features

In [None]:
# Impute with the median rather than the mean shown above.
# The per-column medians are wrapped in a one-element list and displayed.
median_zero_features = [diabets_data[zero_features].median()]
median_zero_features

In [None]:
# Replace every 0 in the zero_features columns with that column's median.
# NOTE(review): the medians are computed on the columns as they are, i.e. with
# the zero entries still included — confirm this is the intended statistic.
diabets_data[zero_features]=diabets_data[zero_features].replace(0, diabets_data[zero_features].median())
# Show the first and last rows after the replacement.
display(diabets_data.head(), diabets_data.tail())

In [None]:
# Summary statistics again, after the zero replacement.
diabets_data.describe()

## feature Scaling
* StandardScaler -> fit, transform -> train_test_split -> Logistic Regression

In [None]:
# Features are every column except the last; the label is the last column.
X, y = diabets_data.iloc[:, :-1], diabets_data.iloc[:, -1]

# Standardize the features.
# NOTE(review): the scaler is fit on all rows, before the train/test split that
# follows, so test rows influence the scaling statistics — confirm intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Stratified 80/20 split of the scaled features, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.2, random_state=2021, stratify=y)
# Refit a default logistic regression on the scaled training data.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Predicted labels, and column 1 of the predicted class probabilities.
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
# Print the metrics; the returned dict is the cell's displayed value.
get_clf_eval(y_test, pred, pred_proba)

### Recall enhancement through threshold 

In [None]:
from sklearn.preprocessing import Binarizer

In [None]:
def get_eval_by_threshold(y_test, preds, pred_proba, thresholds):
    '''
    Evaluate the classifier at several decision thresholds.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    preds : array-like
        Unused; kept so existing calls keep working.
    pred_proba : array-like
        Scores to threshold. Accepted either as a 1-D array or as an
        (n, 1) column vector.
    thresholds : iterable of float
        Thresholds to evaluate.

    Returns
    -------
    dict
        Maps each threshold to the metrics dict returned by ``get_clf_eval``.
    '''
    # Binarizer works on 2-D input while the metric functions expect 1-D
    # arrays, so keep both views and do not rely on the caller to reshape.
    proba_1d = np.asarray(pred_proba).ravel()
    proba_col = proba_1d.reshape(-1, 1)

    result = {}
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold)
        # Flatten the thresholded column back to 1-D before scoring.
        custom_predict = binarizer.fit(proba_col).transform(proba_col).ravel()
        result[custom_threshold] = get_clf_eval(y_test, custom_predict, proba_1d)
    return result

In [None]:
# Candidate decision thresholds to compare.
thresholds = [0.3, 0.33, 0.36, 0.39, 0.42, 0.45, 0.48, 0.50]
# pred_proba is reshaped to a column vector before being passed in; the result
# is shown as a DataFrame with one column per threshold.
pd.DataFrame(get_eval_by_threshold(y_test, pred, pred_proba.reshape(-1, 1), thresholds))