<img src='https://storage.googleapis.com/kaggle-datasets-images/228/482/a520351269b547c89afe790820a1087e/dataset-cover.jpeg'>
* ref : kaggle  : https://www.kaggle.com/uciml/pima-indians-diabetes-database
    
    

## Pima Indians Diabetes Database
* Predict the onset of diabetes based on diagnostic measures
* UCI Machine Learning - updated 5 years ago (Version 1)

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, f1_score, precision_score , recall_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve
from sklearn.metrics import roc_curve, plot_roc_curve, roc_auc_score
from sklearn.preprocessing import Binarizer
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings(action='ignore')

sns.set()

In [8]:
def my_eval_chart(y_val, proba): 
    # --------------- 양성 평가를 위한 차트 : precision_recall_curv() -------------------    
    # plot_precision_recall_curve(model, 문제지2, 정답지2)
    precisions, recalls, ths = precision_recall_curve(y_val, proba[:, 1])
    plt.title("precision recall curv %0.4f"% f1)
    # plt.title(f"precision recall curv %0.4f" % f1)
    plt.xlabel("threadhold")
    plt.ylabel("value")
    plt.plot(ths, precisions[:ths.shape[0]], "b", label="precision")
    plt.plot(ths, recalls[:ths.shape[0]], "r", linestyle='--', label="recall")
    plt.grid()
    plt.legend()
    plt.show()

    # --------------- 양성 평가를 위한 차트 : roc_auc_score(), roc_curve() -------------------
    auc_score = roc_auc_score(y_val, proba[:, 1])

    fprs, tprs, ths = roc_curve(y_val, proba[:, 1])
    plt.title("roc curv %0.4f" % auc_score)
    # plt.title(f"precision recall curv %0.4f" % f1)
    plt.xlabel("FPR(1-specificity)")
    plt.ylabel("TPR")
    plt.grid()
    plt.plot(fprs, tprs, "b")
    plt.plot([0, 1], [0, 1], '--', color='black')
    plt.show()

            
def my_eval(y_val, pred, proba, chart=False):  #avg='binary'
    accuracy = accuracy_score(y_val, pred)
    precision = precision_score(y_val, pred)
    recall = recall_score(y_val, pred)
    f1 = f1_score(y_val, pred)
    matrix = confusion_matrix(y_val, pred)
    print(f'정확도{accuracy:.4f} 정밀도:{precision:.4f} 재현률:{recall:.4f} f1:{f1:.4f}')
    print("오차행렬\n", matrix)
    # cls_report = classification_report(y_val, pred)    
    if chart==True:
        my_eval_chart(y_val, proba)

def fit_score(X_train, X_test, y_train, y_test, model=None, test_size=0.2):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)                                        #[0  1]              th:0.5
    proba = model.predict_proba(X_test)                                 #[[.7 >.3] [.4  <.6]]  --> [0  1]
    my_eval(y_test, pred, proba) 


# 데이터 로드

In [9]:
df = pd.read_csv("./diabetes.csv")

# 우선 점수부터 보자
---------------------------------------
* 1. str X  
* 2. nan

In [10]:
y = df['Outcome']
X = df.drop('Outcome', axis=1)
#----------------------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=1414,shuffle=False)
#----------------------------------------------------------------------------------------------
model = RandomForestClassifier(n_estimators=500,random_state=1414)  #0.7597402597402597
#----------------------------------------------------------------------------------------------
model.fit(X_train, y_train)
pred = model.predict(X_test)
proba = model.predict(X_test)
log_loss = log_loss(y_test, proba)
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred))
print(log_loss)

0.7597402597402597
0.6262626262626263
8.298344944014856


# OPTUNA : 하이퍼 파라미터 최적화 API

In [14]:
# 1 objective function
# 2 hyperparameter values using trial object. 
import optuna
def my_objective(trial):
    myparam = {
    "min_samples_split" : trial.suggest_int('min_samples_split', 1,3),
    "max_depth" : trial.suggest_uniform('max_depth', 1, 20),
    }
    model = RandomForestClassifier(**myparam)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    proba = model.predict_proba(X_test)
    score = log_loss(y_test, proba)
    return score

# 3  Create a study object and invoke the optimize method over 100 trials
study = optuna.create_study()
study.optimize(my_objective, n_trials=30)
best_param = study.best_params
print(study.best_params)

# 4 best_param 적용
model = RandomForestClassifier(**best_param)
model.fit(X_train, y_train)
pred = model.predict(X_test)
proba = model.predict_proba(X_test)
score = log_loss(y_test, proba)
print(score)


[32m[I 2021-10-28 21:35:01,405][0m A new study created in memory with name: no-name-4e360a43-0caf-46c8-94e0-0f045f35d149[0m
[33m[W 2021-10-28 21:35:01,597][0m Trial 0 failed because of the following error: TypeError("'numpy.float64' object is not callable")[0m
Traceback (most recent call last):
  File "f:\workspace\venv\lib\site-packages\optuna\study\_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\709\AppData\Local\Temp/ipykernel_3776/927862403.py", line 13, in my_objective
    score = log_loss(y_test, proba)
TypeError: 'numpy.float64' object is not callable


TypeError: 'numpy.float64' object is not callable