### 피마 인디언 당뇨병 예측

In [1]:
import numpy as np
import pandas as pd

1) 데이터 전처리 

In [None]:
# 1. Number of times pregnant
# 2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
# 3. Diastolic blood pressure (mm Hg)
# 4. Triceps skin fold thickness (mm)
# 5. 2-Hour serum insulin (mu U/ml)
# 6. Body mass index (weight in kg/(height in m)^2)
# 7. Diabetes pedigree function
# 8. Age (years)
# 9. Class variable (0 or 1)

In [3]:
# Load the Pima Indians diabetes data set. header=None tells pandas the file
# has no column-name row, so columns get integer labels; the first 9 lines are
# skipped (presumably non-data description lines -- TODO confirm against the file).
df = pd.read_csv(
    '../data/pima-indians-diabetes.csv',
    header=None,
    skiprows=9,
)
# Preview the first rows to sanity-check the parse.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Give the nine positional columns short names, in file order.
df.columns = [
    'P',       # number of times pregnant
    'G',       # plasma glucose concentration
    'BP',      # diastolic blood pressure (mm Hg)
    'S',       # triceps skin fold thickness (mm)
    'I',       # 2-hour serum insulin (mu U/ml)
    'BMI',     # body mass index
    'D',       # diabetes pedigree function
    'Age',     # age (years)
    'Target',  # class variable (0 or 1)
]
df.head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age,Target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Total number of missing (NaN) cells in the whole frame.
# NOTE(review): isna() only counts NaN; the zeros visible in columns such as
# S and I may be placeholders for missing measurements -- confirm.
(
    df.isna()  # boolean mask of missing cells
    .sum()     # missing count per column
    .sum()     # grand total over all columns
)

0

In [12]:
# Preview the feature columns only: the first rows, with the last column
# (the label) sliced off.
df.head().iloc[:, :-1]

Unnamed: 0,P,G,BP,S,I,BMI,D,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [30]:
# Pull the features (X) and labels (y) out as NumPy ndarrays.
# Slicing by position (all columns but the last / the last column) is a
# general recipe that carries over to most CSV files.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [22]:
# Sanity check: shape of the feature matrix and of the label vector.
(X.shape, y.shape)

((768, 8), (768,))

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2023,
)
# Shapes of the four resulting arrays, in the order they were unpacked.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((614, 8), (154, 8), (614,), (154,))

In [32]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier with default hyperparameters; the fixed
# random_state keeps results reproducible between runs.
dtc = DecisionTreeClassifier(random_state=2023)

- 한 가지 분류기로

In [24]:

# Fit the decision tree on the training split.
dtc.fit(X=X_train, y=y_train)

In [25]:
# Predict class labels for the held-out test split.
pred = dtc.predict(X=X_test)

In [26]:

# Put the true and predicted labels side by side for a quick visual comparison.
res_df = pd.DataFrame(
    {
        'y 실제값': y_test,  # actual labels
        'y 예측값': pred,  # predicted labels
    }
)
res_df.head(n=5)

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,0,0
3,1,1
4,1,0


In [27]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.7402597402597403

In [28]:
# Accuracy on the test split, computed through the estimator's own score() method.
dtc.score(X=X_test, y=y_test)

0.7402597402597403

- GridSearchCV 방법

In [43]:
# Hyperparameter grid for the decision tree: two parameters with three
# candidate values each, i.e. 3 x 3 = 9 combinations.
params = {
    'max_depth': list(range(5, 8)),          # 5, 6, 7
    'min_samples_split': list(range(2, 5)),  # 2, 3, 4
}

In [44]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params`, scoring each candidate by accuracy with
# 5-fold cross-validation on whatever data is passed to fit().
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    cv=5,
    scoring='accuracy',
)

In [45]:
# Train: run the cross-validated grid search on the training split.
grid_dt.fit(X=X_train, y=y_train)

In [46]:
# Best hyperparameter combination found by the grid search
grid_dt.best_params_

{'max_depth': 6, 'min_samples_split': 3}

In [47]:
# Mean cross-validated score of the best hyperparameter combination
grid_dt.best_score_

0.7491803278688525

In [49]:
# Try predicting on the test split with the fitted grid search.
pred2 = grid_dt.predict(X=X_test)

In [50]:
# True labels next to the grid-search predictions, for a quick visual comparison.
res_df2 = pd.DataFrame(
    {
        'y 실제값': y_test,  # actual labels
        'y 예측값': pred2,  # labels predicted via the grid search
    }
)
res_df2.head(n=5)

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,0,0
3,1,1
4,1,1
