# 데이터셋 출처

## 데이터 구성
- Pregnancies: 임신횟수
- Glucose: 2시간 동안의 경구 포도당 내성 검사에서 혈장 포도당 농도
- BloodPressure: 이완기 혈압
- SkinThickness: 삼두근 피부 주름 두께 (체지방 추정하는데 사용되는 값)
- Insulin: 2시간 혈청 인슐린
- BMI: 체질량 지수
- DiabetesPedigreeFunction: 당뇨병 혈통 기능
- Age: 나이
- Outcome: 768개 중에 268개의 결과 클래수 변수(0 또는 1) 당뇨 걸렸으면 1

# 필요한 라이브러리 로드

In [1]:
# Pandas: 데이터 분석
# Numpy: 수치계산
# Seaborn: 시각화
# matplotlib.pyplot: 시각화

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 구버전 주피터노트북에서 설정해줘야 그림이 나타난다.
%matplotlib inline

# 데이터셋 로드

In [2]:
df = pd.read_csv('diabetes_feature.csv')
df.shape

(768, 16)

In [3]:
# 데이터셋을 미리보기 합니다.

df.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,0,33.6,0.627,50,1,False,False,True,False,169.5,5.138735,False
1,1,85,66,29,0,26.6,0.351,31,0,False,False,True,False,102.5,4.639572,True
2,8,183,64,0,0,23.3,0.672,32,1,True,False,True,False,169.5,5.138735,False


# 학습과 예측에 사용할 데이터셋 만들기

In [4]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [5]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
   'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
   'Insulin_nan', 'low_glu_insulin']]
X.shape

(768, 9)

In [6]:
y = df['Outcome']
y.shape

(768,)

In [7]:
# 사이킷런에서 제공하는 model_selection 의 train_test_split 으로 만듭니다.

### train_test_split 파라미터 살펴보기 ###
# X: 독립변수들
# y: 종속변수
# test_size: 테스트샘플 비율
# random_state: 매번 샘플링할때마다 같은 데이터셋 가져오기

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# train 세트의 문제와 정답의 데이터 수를 확인해 주세요.
X_train.shape, y_train.shape

((614, 9), (614,))

In [9]:
# test 세트의 문제와 정답의 데이터 수를 확인해 주세요.

X_test.shape, y_test.shape

((154, 9), (154,))

# 여러 개의 알고리즘을 사용해서 비교하기
- 여러 개의 알고리즘과 다양한 하이퍼 파라미터들을 다 일일히 사람이 손보는것이 힘들기 때문에 이 방법을 사용한다.

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42),
             RandomForestClassifier(random_state=42),
             GradientBoostingClassifier(random_state=42)]
estimators

[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42),
 GradientBoostingClassifier(random_state=42)]

In [77]:
results = []
for estimator in estimators:
    result = []
    result.append(estimator.__class__.__name__)
    results.append(result)
results

[['DecisionTreeClassifier'],
 ['RandomForestClassifier'],
 ['GradientBoostingClassifier']]

In [89]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10) # 30~100% 까지 사용

param_distributions = {"max_depth": max_depth,
                       "max_features": max_features}

results = []
for estimator in estimators:
    result = []
    if estimator.__class__.__name__ != 'DecisionTreeClassifier':
        param_distributions['n_estimators'] = np.random.randint(100, 200, 10)
    clf = RandomizedSearchCV(estimator,
                             param_distributions,
                             n_iter=100,
                             scoring='accuracy',
                             n_jobs=-1,
                             cv=5,
                             verbose=2,)

    clf.fit(X_train, y_train)
    
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test, y_test))
    result.append(clf.cv_results_)
    
    results.append(result)

results

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   13.0s finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   27.5s finished


[['DecisionTreeClassifier',
  {'max_features': 0.7568792067069381, 'max_depth': 5},
  0.8664934026389444,
  0.8701298701298701,
  {'mean_fit_time': array([0.00502672, 0.00459504, 0.00419998, 0.0039875 , 0.00419502,
          0.00399013, 0.00346603, 0.00384555, 0.0033803 , 0.00358763,
          0.00313315, 0.00292468, 0.00280981, 0.00294285, 0.00336099,
          0.00347342, 0.00367494, 0.00400462, 0.0037437 , 0.00325122,
          0.0039259 , 0.00337996, 0.0031908 , 0.00320458, 0.00360203,
          0.00391636, 0.00427713, 0.00433526, 0.00383496, 0.00402179,
          0.00365338, 0.00327444, 0.0028604 , 0.00287437, 0.00305667,
          0.00292006, 0.00293798, 0.00296793, 0.00298262, 0.00293193,
          0.0036468 , 0.00325785, 0.00302505, 0.00283737, 0.00345702,
          0.00334554, 0.00336857, 0.00400515, 0.0037436 , 0.00445614,
          0.00351157, 0.00312791, 0.00297933, 0.00319901, 0.00377769,
          0.00330148, 0.00417743, 0.00438452, 0.00348434, 0.00351796,
          0.003

In [91]:
df = pd.DataFrame(results,
                  columns=['estimator', 'best_params', 'train_score', 'test_score', 'cv_results']) # results 는 지금 2차 리스트

In [95]:
pd.DataFrame(df.loc[1, 'cv_results']).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
49,0.339646,0.003841,0.022717,0.000757,125,0.56487,11,"{'n_estimators': 125, 'max_features': 0.564870...",0.886179,0.934959,0.853659,0.894309,0.950820,0.903985,0.034918,1
42,0.485826,0.002239,0.031423,0.001020,175,0.614533,17,"{'n_estimators': 175, 'max_features': 0.614532...",0.878049,0.934959,0.853659,0.902439,0.950820,0.903985,0.035667,1
22,0.339599,0.002508,0.024356,0.001899,125,0.614533,11,"{'n_estimators': 125, 'max_features': 0.614532...",0.886179,0.934959,0.853659,0.894309,0.950820,0.903985,0.034918,1
46,0.300809,0.004050,0.020039,0.000316,108,0.614533,14,"{'n_estimators': 108, 'max_features': 0.614532...",0.886179,0.934959,0.861789,0.894309,0.942623,0.903972,0.030475,4
87,0.324633,0.004020,0.022477,0.000570,109,0.56487,17,"{'n_estimators': 109, 'max_features': 0.564870...",0.886179,0.934959,0.861789,0.894309,0.942623,0.903972,0.030475,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.345274,0.003053,0.024739,0.000373,127,0.897048,2,"{'n_estimators': 127, 'max_features': 0.897048...",0.796748,0.910569,0.821138,0.821138,0.942623,0.858443,0.057259,96
25,0.313591,0.003642,0.022371,0.000602,127,0.56487,2,"{'n_estimators': 127, 'max_features': 0.564870...",0.796748,0.878049,0.845528,0.813008,0.934426,0.853552,0.049144,97
70,0.288575,0.010583,0.022047,0.000506,100,0.614533,2,"{'n_estimators': 100, 'max_features': 0.614532...",0.788618,0.878049,0.845528,0.813008,0.926230,0.850287,0.048484,98
35,0.418813,0.006777,0.027670,0.001280,156,0.861292,2,"{'n_estimators': 156, 'max_features': 0.861292...",0.788618,0.869919,0.837398,0.804878,0.942623,0.848687,0.054644,99
