### 피마 인디언 당뇨병 예측

In [1]:
import numpy as np
import pandas as pd

1) 데이터 전처리 

In [2]:
# 1. Number of times pregnant
# 2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
# 3. Diastolic blood pressure (mm Hg)
# 4. Triceps skin fold thickness (mm)
# 5. 2-Hour serum insulin (mu U/ml)
# 6. Body mass index (weight in kg/(height in m)^2)
# 7. Diabetes pedigree function
# 8. Age (years)
# 9. Class variable (0 or 1)

In [3]:
df = pd.read_csv('../data/pima-indians-diabetes.csv', skiprows=9, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.columns = ['P', 'G', 'BP', 'S', 'I', 'BMI', 'D', 'Age', 'Target']
df.head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age,Target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [31]:
df.Target.value_counts()

Target
0    500
1    268
Name: count, dtype: int64

In [5]:
df.isna().sum().sum()

0

In [6]:
df.iloc[:, :-1].head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [7]:
# X, y 데이터를 ndarray로 추출
# 일반적인 방법으로 대부분의 CSV에 적용 가능
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

#y = df.Target, y = df['Target'] # Series 형태의 데이터도 무방하나 인덱스의 값이 따라와서 문제가 있다.

In [8]:
X.shape, y.shape

((768, 8), (768,))

- Train / Test 데이터 분리

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2023,
    stratify=y,
    test_size=0.2
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [10]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([400, 214], dtype=int64))

In [11]:
np.unique(X_test, return_counts=True)

(array([0.000e+00, 1.000e-01, 1.010e-01, 1.070e-01, 1.230e-01, 1.340e-01,
        1.420e-01, 1.430e-01, 1.530e-01, 1.550e-01, 1.670e-01, 1.780e-01,
        1.790e-01, 1.830e-01, 1.860e-01, 1.870e-01, 1.890e-01, 1.900e-01,
        2.000e-01, 2.030e-01, 2.040e-01, 2.050e-01, 2.070e-01, 2.210e-01,
        2.250e-01, 2.330e-01, 2.350e-01, 2.370e-01, 2.400e-01, 2.450e-01,
        2.470e-01, 2.510e-01, 2.540e-01, 2.570e-01, 2.600e-01, 2.630e-01,
        2.670e-01, 2.680e-01, 2.690e-01, 2.700e-01, 2.830e-01, 2.840e-01,
        2.890e-01, 2.920e-01, 2.930e-01, 2.990e-01, 3.000e-01, 3.030e-01,
        3.060e-01, 3.150e-01, 3.170e-01, 3.240e-01, 3.360e-01, 3.380e-01,
        3.420e-01, 3.430e-01, 3.440e-01, 3.490e-01, 3.550e-01, 3.710e-01,
        3.750e-01, 3.820e-01, 3.890e-01, 3.910e-01, 3.950e-01, 4.000e-01,
        4.020e-01, 4.040e-01, 4.080e-01, 4.090e-01, 4.120e-01, 4.150e-01,
        4.190e-01, 4.300e-01, 4.350e-01, 4.390e-01, 4.410e-01, 4.440e-01,
        4.520e-01, 4.530e-01, 4.600e-0

In [12]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(random_state=2023)

- 한가지 분류기로

In [13]:

dtc.fit(X_train, y_train)

In [14]:
pred = dtc.predict(X_test)

In [15]:

res_df = pd.DataFrame({'y 실제값': y_test, 'y 예측값': pred})
res_df.head()

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,0,0
3,1,1
4,1,0


In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7402597402597403

In [17]:
dtc.score(X_test, y_test)

0.7402597402597403

- GridSearchCV 방법

In [18]:
params = {
    'max_depth' : [ 5, 6, 7],
    'min_samples_split' : [2, 3, 4]
}

In [19]:
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(
    dtc, 
    param_grid=params,
    scoring='accuracy',
    cv=5
)

In [20]:
# 학습
grid_dt.fit(X_train, y_train)

In [21]:
# 최적 분류기
grid_dt.best_params_

{'max_depth': 6, 'min_samples_split': 3}

In [22]:
grid_dt.best_score_

0.7491803278688525

- 예측 및 평가 - 학습 과정에서 최적의 파라메터로 훈련한 모델로 수행

In [23]:
best_dt = grid_dt.best_estimator_

In [24]:
# 예측해보기
pred2 = best_dt.predict(X_test)

In [25]:
res_df2 = pd.DataFrame({'y 실제값':y_test, 'y 예측값':pred2})
res_df2.head()

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,0,0
3,1,1
4,1,1


In [26]:
best_dt.score(X_test, y_test)

0.7207792207792207

- 실제 적용

In [27]:
X_test[10], y_test[10]

(array([  0.   , 101.   ,  62.   ,   0.   ,   0.   ,  21.9  ,   0.336,
         25.   ]),
 0)

In [28]:
test_data, test_target = X_test[10], y_test[10]

In [29]:
# predict() 메소드를 사용하기 위해서는 2차원 모양이 되어야 함
best_dt.predict(test_data.reshape(1, -1))[0]

0