# Brain Stroke Prediction - 뇌경색 분포

**제작:** [Kang junmo]<br>
**생성일:** 2022/08/06<br>
**최근 수정일:** 2022/08/06<br>
**설명:** 다양한 독립변수를 이용해 이진분류 모델을 구현한다.

## 개요
* 여러 독립변수를 전처리한 뒤, 이를 입력으로 하는 이진분류(뇌경색 O, 뇌경색 X) 모델을 구현한다.
* 알고리즘: 이진분류모델

## 참고
* My Kaggle site
> https://www.kaggle.com/code/progrkj/brainstroke-prediction
* 데이터 출처
> Kaggle: https://www.kaggle.com/datasets/jillanisofttech/brain-stroke-dataset
* 자연어전처리
> https://han-py.tistory.com/281

### GPU 사용

In [None]:
# Query the assigned GPU's name, driver version and total memory as CSV
# (IPython shell escape; requires the nvidia-smi tool to be available).
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

## CODE

### 필요한 모듈 사용

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import datetime
import math

#### 중요 변수 지정

In [None]:
# Number of leading rows of the shuffled frame that are held out as the test set.
TEST_SIZE = 200

# Take one timestamp and format it twice so NOW_DATE and NOW_TIME always
# describe the same instant (two separate now() calls could straddle midnight).
_NOW = datetime.datetime.now()
NOW_DATE = _NOW.strftime('%Y%m%d')
NOW_TIME = _NOW.strftime('%Y%m%d_%H%M%S')

print(NOW_TIME)

#### 중요 함수 저장

In [None]:
def mkdir(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Args:
        directory: Path of the directory to create.

    Returns:
        True if the directory exists after the call, False if it could not
        be created (the error is printed rather than raised).
    """
    try:
        # exist_ok=True avoids the check-then-create race and still raises
        # when the path exists but is not a directory.
        os.makedirs(directory, exist_ok=True)
    except OSError as err:
        # Report the actual exception instance, not the OSError class.
        print(f'Error[utils.mkdir] {err}')
        return False
    return True

### 데이터 전처리

#### pd.read_csv 이용

In [None]:
# Load the brain-stroke dataset from the local archive folder into a DataFrame.
# Alternative kept for reference (different location, explicit 'ansi' encoding):
# df = pd.read_csv('./brain_stroke.csv', encoding='ansi')
df = pd.read_csv('./archive/brain_stroke.csv')

In [None]:
# Shuffle all rows (frac=1 samples the whole frame) and rebuild a 0..n-1 index
# so that later positional slices pick a random subset.
# NOTE(review): no random_state is given, so the order differs on every run —
# confirm whether reproducible splits are wanted.
df = df.sample(frac=1).reset_index(drop=True)

#### 문자열 처리

In [None]:
# Encode the text columns as numbers. "gender" is intentionally left as text
# here; it is one-hot encoded in a later cell.

# Two-valued columns become 0/1 flags (anything other than the named value -> 0).
df["ever_married"] = (df["ever_married"] == "Yes").astype(int)        # Yes: 1, otherwise 0
df["Residence_type"] = (df["Residence_type"] == "Urban").astype(int)  # Urban: 1, otherwise 0

# Multi-valued columns are mapped through lookup tables; labels missing from a
# table become NaN, and the result is always stored as float.
work_type_codes = {"Private": 1, "Govt_job": 2, "children": 3, "Self-employed": 4}
df["work_type"] = df["work_type"].map(work_type_codes).astype(float)

smoking_status_codes = {"Unknown": 0, "never smoked": 1, "formerly smoked": 2, "smokes": 3}
df["smoking_status"] = df["smoking_status"].map(smoking_status_codes).astype(float)

#### One-hot encoding (원핫인코딩)

In [None]:
# One-hot encode the target ("stroke") and "gender"; the generated columns are
# named "<prefix>_<value>" (e.g. stroke_0 / stroke_1, gender_Male / gender_Female).
# An explicit numeric dtype is requested because pandas >= 2.0 emits boolean
# dummy columns by default; bool columns mixed with float features turn into an
# object array that TensorFlow cannot convert to a tensor.
df = pd.get_dummies(
    df,
    prefix=['stroke', 'gender'],
    columns=['stroke', 'gender'],
    dtype='float32',
)

### 데이터 정규화 (Min-Max 스케일링)

In [None]:
# Min-max scale the continuous columns into [0, 1]: (x - min) / (max - min).
# The per-column extrema are kept in named variables so they stay available
# after this cell.
age_min, age_max = df['age'].min(), df['age'].max()
avg_glucose_level_min, avg_glucose_level_max = (
    df['avg_glucose_level'].min(),
    df['avg_glucose_level'].max(),
)
bmi_min, bmi_max = df['bmi'].min(), df['bmi'].max()

df['age'] = df['age'].sub(age_min).div(age_max - age_min)
df['avg_glucose_level'] = (
    df['avg_glucose_level']
    .sub(avg_glucose_level_min)
    .div(avg_glucose_level_max - avg_glucose_level_min)
)
df['bmi'] = df['bmi'].sub(bmi_min).div(bmi_max - bmi_min)

#### 데이터 결측값 확인

In [None]:
# Count missing (null/NaN) values per column; the result is shown as the cell output.
df.isnull().sum()

### data frame 확인

In [None]:
# Sanity-check the preprocessed frame: dimensions, per-column dtypes, first rows.
print(df.shape)
print(df.dtypes)
df.head()

### 불필요한 컬럼 제외하고 데이터 구성

In [None]:
# Feature columns fed to the model; ever_married and work_type are left out.
condition_format = ['gender_Male', 'gender_Female', 'age', 'hypertension', 'heart_disease', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
# One-hot target columns.
result_format = ['stroke_0', 'stroke_1']

features = df[condition_format]
labels = df[result_format]

# The first TEST_SIZE rows form the test set; the remaining rows are used for training.
X_test, X_train = features[:TEST_SIZE], features[TEST_SIZE:]
Y_test, Y_train = labels[:TEST_SIZE], labels[TEST_SIZE:]
print(X_train.shape, Y_train.shape)

### 모델 구성

In [None]:
# Show the number of input features and output columns used to size the model.
len(condition_format), len(result_format)

In [None]:
# Fully connected classifier: one input per feature column, four ReLU hidden
# layers of decreasing width, and one output unit per one-hot target column.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=[len(condition_format)]),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    # softmax (not sigmoid): the targets are mutually exclusive one-hot columns
    # trained with categorical cross-entropy, so the outputs must form a
    # probability distribution that sums to 1.
    tf.keras.layers.Dense(len(result_format), activation='softmax')
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Configure training: categorical cross-entropy loss (the targets are the
# one-hot stroke columns), the Adam optimizer, and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

### 모델 학습

In [None]:
# Mini-batch size used for training.
BATCH_SIZE = 8

# Train for up to 100 epochs, holding out the last 20% of the training rows for
# validation. steps_per_epoch is deliberately left unset: Keras then derives it
# from the actual number of training samples and batch_size, instead of relying
# on a hard-coded row count that ignores the validation split and the real
# batch size.
history = model.fit(
    X_train,
    Y_train,
    epochs=100,
    verbose=1,
    batch_size=BATCH_SIZE,
    # Stop early once val_loss has not improved for 50 consecutive epochs.
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=50, verbose=1)],
    validation_split=0.2
)

### 모델 저장하기

In [None]:
# Root output folder for this run, named after the run timestamp.
save_path = f'./results/{NOW_TIME}'

# Persist the training arrays next to the model so the run can be inspected later.
_save_path = f'{save_path}/datas'
mkdir(_save_path)
np.savez(f'{_save_path}/X_train.npz', np.array(X_train))
# Save the labels (Y_train) here, not a second copy of the features.
np.savez(f'{_save_path}/Y_train.npz', np.array(Y_train))

In [None]:
# Save the trained model under <save_path>/models twice: once to an explicit
# .h5 file and once by passing the directory path itself.
# NOTE(review): the format written for an extension-less path depends on the
# installed Keras/TensorFlow version — confirm it is still accepted.
_save_path = f'{save_path}/models'
model.save(f'{_save_path}/model.h5')
model.save(_save_path)

### 모델 검증

In [None]:
# Dump the model's weight arrays for manual inspection.
print(model.get_weights())

In [None]:
# Ground truth: class index per test row (argmax over the one-hot label columns).
_predictResult = np.array(Y_test)
predictResult = _predictResult.argmax(axis=1)

# Predictions: per-class scores from the model and the resulting class index.
_predictInfos = model.predict(X_test)
predictInfos = _predictInfos.argmax(axis=1)

# One record per test row: both class scores, the predicted class, the actual
# class, and whether the two match.
RESULT_INFOS = [
    {
        "rate_0": scores[0],
        "rate_1": scores[1],
        "predict": predicted,
        "res": actual,
        "accuracy": actual == predicted,
    }
    for scores, predicted, actual in zip(_predictInfos, predictInfos, predictResult)
]

# Keep only the correctly classified rows and report the hit count and rate.
FilteredResultInfos = [info for info in RESULT_INFOS if info['accuracy']]
print(len(FilteredResultInfos))
print(f'accuracy percent: {len(FilteredResultInfos)/len(RESULT_INFOS)*100}%')

pd.DataFrame(RESULT_INFOS)