# 피마 인디언 당뇨병(Pima Indians Diabetes Dataset)
---

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as my
from sklearn import datasets

### 데이터 준비

In [50]:
# !pip install gdown
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/diabetes.csv

In [51]:
# Load the dataset from a local 'diabetes.csv' (path is relative to the working directory).
diabetes = pd.read_csv('diabetes.csv')

In [52]:
# Preview the first rows to check the column names and the kind of values they hold.
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### 이상치 확인

In [53]:
def iszero(x):
    """Compare ``x`` with zero and return the result of that comparison.

    For a plain number this is a single bool. For objects that overload
    ``==`` (such as the columns handed over by ``DataFrame.apply``) the
    result is whatever their equality operator produces.
    """
    zero_mask = x == 0
    return zero_mask

In [54]:
# NOTE: the mask built here is neither assigned nor the last expression of the
# cell, so its result is discarded.
diabetes.apply(iszero)
# `df` is a second name for the same DataFrame object, not a copy: changes made
# through `df` are also visible through `diabetes`.
df = diabetes

In [55]:
# Count, for every column, how many entries are exactly zero.
diabetes.eq(0).sum(axis=0)

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [56]:
# Treat a Glucose value of 0 as missing (NaN).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the df['Glucose'] selection: that chained
# in-place form is deprecated and does not update df under pandas Copy-on-Write.
df['Glucose'] = df['Glucose'].replace(0, np.nan)

In [57]:
# List the column labels of the working DataFrame.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [58]:
# Mark zeros as missing (NaN) in the measurement columns listed below.
# Each column is reassigned rather than modified through a chained
# replace(..., inplace=True), which is deprecated and does not write back to df
# under pandas Copy-on-Write.
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    df[col] = df[col].replace(0, np.nan)

In [59]:
# Preview the first rows after the zero -> NaN replacement.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [60]:
# Handle the anomalous values: fill every NaN with the mean of its own column.
col_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Each column is reassigned rather than filled through a chained
# fillna(..., inplace=True), which is deprecated and does not write back to df
# under pandas Copy-on-Write.
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split performed later in this notebook, so test rows contribute
# to the imputed values.
for col in col_list:
    df[col] = df[col].fillna(df[col].mean())

In [61]:
# Display the DataFrame after imputation.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.00000,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.00000,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.00000,94.000000,28.1,0.167,21,0
4,0,137.0,40.0,35.00000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.00000,180.000000,32.9,0.171,63,0
764,2,122.0,70.0,27.00000,155.548223,36.8,0.340,27,0
765,5,121.0,72.0,23.00000,112.000000,26.2,0.245,30,0
766,1,126.0,60.0,29.15342,155.548223,30.1,0.349,47,1


In [62]:
# Count the missing values left in each column.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [75]:
# Separate the label column ('Outcome') from the feature columns.
y = df['Outcome']
X = df.drop(columns='Outcome')

In [76]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2022
)
(X_train.shape, X_test.shape, y_train.shape)

((614, 8), (154, 8), (614,))

In [77]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then standardize those same
# features with it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_train[:5]

array([[ 1.15126242,  0.10913333,  1.22790579,  0.76668616, -0.59332894,
         0.86411993, -0.34533996,  1.30371686],
       [-0.293699  , -0.86187652, -1.36191676,  0.53465576, -0.51023245,
        -1.11461017,  1.61231523,  0.45502925],
       [-0.58269128, -0.44110558,  1.55163361, -0.02762125, -0.02888748,
         1.46634214,  1.28658773,  1.72806066],
       [-0.00470671, -0.21453662, -0.06700548, -2.01767871, -0.84261839,
        -1.48741439,  0.0297402 ,  0.28529173],
       [ 1.72924699, -1.05607849,  0.98510993,  0.30262535, -0.02888748,
         0.34792947,  1.22078419,  1.89779819]])

In [81]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier (default settings) on the scaled
# training features and their labels.
lg = LogisticRegression()
lg.fit(X=X_train, y=y_train)


In [82]:
lg.score(X_train,y_train) # mean accuracy on the training data (not R^2: LogisticRegression is a classifier)

0.7703583061889251

### 테스트 예측

In [84]:
# Turn the test labels into a NumPy array, and scale the test features with the
# scaler that was fitted on the training data (transform only, no refit).
y_test = y_test.to_numpy()
X_test = scaler.transform(X_test)

In [85]:
# Predict a class label for every row of the scaled test set.
y_pred = lg.predict(X_test)

In [86]:
# Show the predicted labels.
y_pred

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1],
      dtype=int64)

In [89]:
# predict_proba reports the prediction as probabilities (one row per sample,
# one column per class).
# NOTE: this rebinds y_pred, which until now held the class labels returned by
# lg.predict.
y_pred = lg.predict_proba(X_test)
y_pred[:10]

array([[0.36639347, 0.63360653],
       [0.55531538, 0.44468462],
       [0.79307605, 0.20692395],
       [0.28080615, 0.71919385],
       [0.90088375, 0.09911625],
       [0.52731613, 0.47268387],
       [0.66132815, 0.33867185],
       [0.53158641, 0.46841359],
       [0.84102706, 0.15897294],
       [0.71592232, 0.28407768]])

In [90]:
# For each row take the column index holding the largest probability; with the
# two classes 0 and 1 this gives the same labels that lg.predict returned.
y_pred.argmax(axis=1)

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1],
      dtype=int64)