# 피마 인디언 당뇨병 (Pima Indians Diabetes Dataset)
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as my

### 데이터 준비

In [3]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/diabetes.csv

In [7]:
diabetes = pd.read_csv('./diabetes.csv')
diabetes.shape

(768, 9)

In [13]:
diabetes.columns = diabetes.columns.str.lower()

In [14]:
diabetes.head() #확인하면 insulin 이나 skinthickness 값들이 0으로 설정 돼있는데 결측치를 0으로 표기했다고 생각하면 된다. 추후 정확한 분석을 위해 수정을 하면 좋다.

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
diabetes['pregnancies'].value_counts()

1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: pregnancies, dtype: int64

### 이상치 확인

In [18]:
def iszero(x):
    return x == 0

In [21]:
diabetes.apply(iszero).sum(axis=0) # .apply(iszero) == 0인 값엔 True, 아닌 곳은 False로 바꾸는 효과, .sum(axis=0) 을 붙임으로써 각 열에 0인 값인 행의 개수를 보여준다
#보면 glucose~bmi까지의 0은 말이 안되는데 존재하는 모습

pregnancies                 111
glucose                       5
bloodpressure                35
skinthickness               227
insulin                     374
bmi                          11
diabetespedigreefunction      0
age                           0
outcome                     500
dtype: int64

In [47]:
#이상치를 NaN 결측치로 변경. 이렇게 하는 이유는 isna(), dropna(), fillna()로 쉽게 결측치를 확인, 드랍, 변경 가능하다
df = diabetes.copy()

In [48]:
df.loc[df['glucose']==0] #글루코스파트 0인 행 전부 출력

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
75,1,0,48,20,0,24.7,0.14,22,0
182,1,0,74,20,23,27.7,0.299,21,0
342,1,0,68,35,0,32.0,0.389,22,0
349,5,0,80,32,0,41.0,0.346,37,1
502,6,0,68,41,0,39.0,0.727,41,1


In [49]:
#for문을 통해 glucose부터 bmi까지 0인 항목을 모두 NaN으로 변경하기.
for col in df.columns[1:6]:
    df[col].replace(0,np.nan,inplace=True)
df.head()

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [50]:
#결측치를 모두 평균값으로 바꾸려면
for col in df.columns[1:6]:
    df[col].fillna(df[col].mean(),inplace=True)
df.head()

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [52]:
df.isna().sum(axis=0) #결측치 모두 없어짐을 확인

pregnancies                 0
glucose                     0
bloodpressure               0
skinthickness               0
insulin                     0
bmi                         0
diabetespedigreefunction    0
age                         0
outcome                     0
dtype: int64

In [53]:
X = df.drop(['outcome'],axis=1)
y = df['outcome']

In [90]:
#Train, test 데이터셋으로 나누기
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.2,
                                                    stratify=y)
print(X_train.shape, X_test.shape)

(614, 8) (154, 8)


In [91]:
#스케일링 진행
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
print(type(X_train), type(y_train))


<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>


In [92]:
y_train = y_train.values #Series 자료형을 ndarray로 바꿔준다.

In [93]:
print(type(X_train), type(y_train))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [94]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()
lg.fit(X_train,y_train)

LogisticRegression()

In [95]:
lg.score(X_train,y_train) # R2   (1에 가까울 수록 좋은 예측 결과를 말함)

0.7768729641693811

### 테스트

In [96]:
X_test = scaler.transform(X_test)
y_test = y_test.values

In [97]:
y_pred = lg.predict(X_test)
y_pred

array([0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],
      dtype=int64)

In [101]:
y_pred = lg.predict_proba(X_test)   #확률로 값을 예측하는 함수. Predict Probability
y_pred[:10]
# [0일 확률, 1일 확률] 의 값들이 여러개 있는 array가 만들어진다.
# 이 값들 기준으로 좀 더 높은 확률의 값을 y_pred에 넣는다. (위 y_pred와 대조하면 알 수 있다)

array([[0.82826712, 0.17173288],
       [0.337537  , 0.662463  ],
       [0.9868111 , 0.0131889 ],
       [0.61313799, 0.38686201],
       [0.70947985, 0.29052015],
       [0.21195645, 0.78804355],
       [0.39938416, 0.60061584],
       [0.91501392, 0.08498608],
       [0.36625507, 0.63374493],
       [0.84162945, 0.15837055]])

In [104]:
y_pred = np.argmax(y_pred,axis=1) #argmax는 둘중에 큰쪽이 어딘지 알려주는 함수. 이걸 통해 더 큰쪽의 값을 반환하여 위에 predict 결과값을 받아볼 수 있다.
y_pred

array([0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],
      dtype=int64)

In [105]:
my.print_score(y_test,y_pred)

AttributeError: module 'myutils' has no attribute 'print_score'

In [106]:
my.plot_confusion_matrix(y_test,y_pred)

AttributeError: module 'myutils' has no attribute 'plot_confusion_matrix'