In [2]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

# 비만의 환경적 요인
1. 비만: 비만 상태가 계속되면 인슐린이 혈당을 잘 낮추지 못하고, 췌장의 인슐린 분비능도 떨어져 당뇨병이 생긴다


2. 노화: 당뇨병은 중년 이후에 많이 발생하며, 연령이 높아질 수록 발병률이 높아진다.
3. 식생활: 탄수화물과 지방을 과다 섭취하면 체중이 늘고 당뇨병이 생긴다.
4. 운동 부족: 비만을 초래하고, 근육을 약화시키며, 저항력이 떨어진다.
5. 스트레스: 부신피질호르몬 분비를 늘린다.
6. 임신성 당뇨병: 4kg 이상의 아이를 출산한 병력이 있으면 임신성 당뇨병이 생길 가능성이 있다.
7. 고혈압: 보통 당뇨병과 동반된다.
8. 이상지질혈증: 보통 당뇨병과 동반된다. 고콜레스테롤혈증, 고중성지방혈증

#### diabetes _ 012 _ health _ indicators _ BRFSS2015.csv 
-235,680개의 data.

-0은 당뇨 없음 or 임산부, 1은 당뇨 전단계, 2는 당뇨.

-Feature 21개, 데이터 불균형 있음

#### diabetes _ binary _ 5050split _ health _ indicators _ BRFSS2015.csv 
-70,692개의 data.

-0은 당뇨 없음, 1은 당뇨.

-Feature 21개, 데이터 불균형 없음

#### diabetes _ binary _ health _ indicators _ BRFSS2015.csv
-235,680개의 data. 

-0은 당뇨 없음, 1은 당뇨.

-Feature 21개, 데이터 불균형 있음

In [3]:
data = pd.read_csv('Data\diabetes_012_health_indicators_BRFSS2015.csv')

In [4]:
df = pd.DataFrame(data)
df['Age'] = df.apply(lambda x: 18 if x.Age == 1 else (x.Age*5+15), axis=1)

In [5]:
df.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

## 주목해야할 Column

1. HighPB: 당뇨와 함께 나타나는 대표적인 질병

2. HighChol: 이상지질혈증

3. BMI: 당뇨병의 대표적인 원인

4. Smoker: 흡연이 당뇨의 원인은 아니다. 따라서 당뇨 예측에는 고려하지 않는다.

5. Stroke: 당뇨병 대표 합병증. 뇌졸중을 겪고나서 당뇨병임을 아는 경우도 있다고 한다.

6. HeartDiseaseorAttack: Stroke와 마찬가지.

7. PhysActivity: 신체적 활동은 당뇨병 예방 및 치료에 가장 중요한 요인 중 하나.

8. Fruits: 탄수화물 섭취가 많으면 당뇨에 걸릴 가능성이 높아진다.

9. Age: 중년 이후 당뇨병에 걸릴 확률이 높아진다.

In [6]:
df1 = df[['Diabetes_012', 'Age', 'BMI', 'HighBP', 'HighChol', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits']]
df1

Unnamed: 0,Diabetes_012,Age,BMI,HighBP,HighChol,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits
0,0.0,60.0,40.0,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,50.0,25.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,60.0,28.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.0,70.0,27.0,1.0,0.0,0.0,0.0,1.0,1.0
4,0.0,70.0,24.0,1.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
253675,0.0,40.0,45.0,1.0,1.0,0.0,0.0,0.0,1.0
253676,2.0,70.0,18.0,1.0,1.0,0.0,0.0,0.0,0.0
253677,0.0,25.0,28.0,0.0,0.0,0.0,0.0,1.0,1.0
253678,0.0,50.0,23.0,1.0,0.0,0.0,0.0,0.0,1.0


In [7]:
train_data = df1.sample(frac=0.8, random_state=123)
temp_data = df1.drop(train_data.index)
test_data = temp_data.sample(frac=0.5, random_state=123)
valid_data = temp_data.drop(test_data.index)

In [8]:
train_data.shape, test_data.shape, valid_data.shape

((202944, 9), (25368, 9), (25368, 9))

In [9]:
x_train = train_data[['Age', 'BMI', 'HighBP', 'HighChol', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits']].to_numpy().astype(np.float32)
y_train = train_data['Diabetes_012'].to_numpy().astype(np.float32).reshape(-1,1)
x_train, y_train

(array([[50., 33.,  0., ...,  0.,  0.,  0.],
        [50., 24.,  0., ...,  0.,  1.,  1.],
        [60., 39.,  0., ...,  0.,  1.,  0.],
        ...,
        [70., 25.,  1., ...,  1.,  1.,  0.],
        [65., 35.,  1., ...,  0.,  1.,  1.],
        [60., 25.,  0., ...,  0.,  1.,  0.]], dtype=float32),
 array([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]], dtype=float32))

In [10]:
x_test = test_data[['Age', 'BMI', 'HighBP', 'HighChol', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits']].to_numpy().astype(np.float32)
y_test = test_data['Diabetes_012'].to_numpy().astype(np.float32).reshape(-1,1)
x_test.shape, y_test.shape

((25368, 8), (25368, 1))

In [11]:
x_valid = valid_data[['Age', 'BMI', 'HighBP', 'HighChol', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits']].to_numpy().astype(np.float32)
y_valid = valid_data['Diabetes_012'].to_numpy().astype(np.float32).reshape(-1,1)
x_valid.shape, y_valid.shape

((25368, 8), (25368, 1))

In [12]:
# W = tf.Variable(tf.random.normal((x_train.shape[1], y_train.shape[1])), dtype=tf.float32)
# b = tf.Variable(tf.random.normal((y_train.shape[1],)), dtype= tf.float32)

# hypothesis = tf.matmul(x_train, W) + b


# def Cost():
#     return tf.reduce_mean(tf.square(hypothesis - y_train))
# epochs = 300
# learning_rate = 0.5
# optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

# training_idx = np.arange(0, epochs+1, 1)
# cost_graph = np.zeros(epochs+1)

# for cnt in range(0, epochs+1):
#     cost_graph[cnt] = Cost()
#     if cnt % (epochs//20) == 0:
# #         print("[{:>6}] cost={:>6.4}, W = [[{:>7.4} {:>7.4}] [{:>7.4} {:>7.4}]], B = [{:>7.4}{:>7.4}]".format(cnt, cost_graph[cnt], W[0,0], W[0,1], W[1,0],W[1,1], B[0],B[1]))
#         print("[{:>6}] cost={:>6.4}, W = {:>7.4}, B = {:>7.4}]".format(cnt, cost_graph[cnt], W[0,0], b[0]))
    
#     optimizer.minimize(Cost,[W, b])  

In [14]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(2, activation="relu"))
model.add(tf.keras.layers.Dense(3, activation="relu"))
model.add(tf.keras.layers.Dense(4, activation='sotfmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='Adam',
              metrics=['accuracy'])

ValueError: Unknown activation function: sotfmax. Please ensure this object is passed to the `custom_objects` argument. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.