# 라이브러리 불러오기

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# The bundled seaborn style sheet was renamed from 'seaborn' to 'seaborn-v0_8'
# in matplotlib 3.6 and the old name no longer resolves in newer releases,
# so use whichever name this installation actually provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# Enlarge every seaborn font for the wide figures used below.
sns.set(font_scale=2.5)

import missingno as msno

# warning 무시하기
import warnings
warnings.filterwarnings('ignore') 

%matplotlib inline 

# 데이터 확인하기

In [None]:
# Load the heart-disease CSV into a DataFrame.
# NOTE(review): the relative path looks like a Kaggle input directory;
# adjust it when running elsewhere.
df = pd.read_csv("../input/heart-disease-uci/heart.csv")
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Show the (rows, columns) dimensions of the frame.
df.shape

# Null data 확인하기

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: share of each target class; the second slice is pulled out
# slightly (explode) so it stands out.
df['target'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Heart Disease')
ax[0].set_ylabel('')
# Right panel: absolute counts per class. The column is passed as x=...
# because seaborn >= 0.12 no longer accepts it positionally (a positional
# first argument is taken as `data` and collides with data=df).
sns.countplot(x='target', data=df, ax=ax[1])
ax[1].set_title('Count plot - Heart Disease')
plt.show()

# EDA - CP(가슴 통증 유형)
0 - 전형적인 협심증, 1 - 비전형적인 협심증, 2 - 비협심증 통증, 3 - 증상 없음

In [None]:
# Number of records for each chest-pain type, largest group first.
(
    df.groupby('cp')[['target']]
    .count()
    .sort_values(by='target', ascending=False)
)

In [None]:
# Contingency table of chest-pain type (rows) against target (columns),
# including the row/column totals.
pd.crosstab(index=df['cp'], columns=df['target'], margins=True)

In [None]:
# Mean of target (i.e. the share of target == 1) for each chest-pain type,
# highest rate first.
(
    df.groupby('cp')[['target']]
    .mean()
    .sort_values(by='target', ascending=False)
)

In [None]:
f, ax = plt.subplots(1, 2, figsize=(30, 10))
# Left panel: number of patients per chest-pain type, one colour per bar.
df['cp'].value_counts().plot.bar(color=['#ff0000', '#ffff00', '#228b22', '#0000ff'], ax=ax[0])
ax[0].set_title('Number of Patients by cp')
ax[0].set_ylabel('Count')
# Right panel: the same counts split by target. The column is passed as
# x=... because seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='cp', hue='target', data=df, ax=ax[1])
ax[1].set_title('cp : Normal vs Heart Disease')
plt.show()

1(비전형적인 협심증), 2(비협심증 통증), 3(증상 없음), 0(전형적인 협심증) 순으로 심장병에 걸릴 확률이 높다.

# EDA - sex(성별)
1 - 남성, 0 - 여성

In [None]:
# Number of records per sex (count() tallies every non-null target value,
# not only the target == 1 cases).
df.groupby('sex')[['target']].count()

In [None]:
# Contingency table of sex (rows) against target (columns) with totals.
pd.crosstab(index=df['sex'], columns=df['target'], margins=True)

In [None]:
# Mean of target per sex, i.e. the share of target == 1 in each group.
df.groupby('sex')[['target']].mean()

In [None]:
f, ax = plt.subplots(1, 2, figsize=(30, 10))
# Left panel: mean of target per sex. This is a rate, not a head count, so
# the title says so (it previously read 'Number of Patients by sex').
df[['sex', 'target']].groupby(['sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Heart disease rate by sex')
# Right panel: record counts per sex split by target. The column is passed
# as x=... because seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='sex', hue='target', data=df, ax=ax[1])
ax[1].set_title('Sex : Heart Disease vs Normal')
plt.show()

남성보다 여성의 경우 심장병에 걸릴 확률이 높다.

# EDA - exang(운동 유발 협심증)
1 - yes, 0 - no

In [None]:
# Number of records for each exang value.
df.groupby('exang')[['target']].count()

In [None]:
# Contingency table of exang (rows) against target (columns) with totals.
pd.crosstab(index=df['exang'], columns=df['target'], margins=True)

In [None]:
# Mean of target per exang value, i.e. the share of target == 1.
df.groupby('exang')[['target']].mean()

In [None]:
f, ax = plt.subplots(1, 2, figsize=(30, 10))
# Left panel: mean of target per exang value. This is a rate, not a head
# count, so the title says so (it previously read 'Number of Patients by
# exang').
df[['exang', 'target']].groupby(['exang'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Heart disease rate by exang')
# Right panel: record counts per exang value split by target. The column is
# passed as x=... because seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='exang', hue='target', data=df, ax=ax[1])
ax[1].set_title('exang : Heart Disease vs Normal')
plt.show()

# EDA - age

In [None]:
# Summary statistics of the age column (kept as a one-column frame).
df.loc[:, ['age']].describe()

In [None]:
# Ages of the rows with target == 1, selected once and reused for the
# oldest / youngest / mean summary lines.
diseased_age = df[df["target"] == 1]["age"]
print("심장병 발병 환자 중 나이가 제일 많은 환자 : {:.1f} years".format(diseased_age.max()))
print("심장병 발병 환자 중 나이가 제일 어린 환자 : {:.1f} years".format(diseased_age.min()))
print("심장병 발병 환자 중 평균 나이 : {:.1f} years".format(diseased_age.mean()))

In [None]:
# Contingency table of target (rows) against age (columns) with totals.
pd.crosstab(index=df['target'], columns=df['age'], margins=True)

In [None]:
# Mean age within each target class.
df.groupby('target')[['age']].mean()

In [None]:
# Mean of target (share of target == 1) at each distinct age.
df.groupby('age')[['target']].mean()

In [None]:
# Kernel density estimate of age: all rows, then target == 1, then
# target == 0. The legend labels below rely on this drawing order.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for age_values in (df.age, df.age[df.target == 1], df.age[df.target == 0]):
    sns.kdeplot(age_values, ax=ax)
plt.legend(["All", "target == 1", "target == 0"])
plt.show()

In [None]:
# Ages of the rows with target == 1 in ascending order.
df.loc[df["target"] == 1, "age"].sort_values()

In [None]:
# Bar chart of how many records exist at each age.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 8))
df.groupby('age')[['target']].count().plot.bar(ax=ax)
ax.set_title('Number of Patients by age')
plt.show()

In [None]:
# Bar chart of the mean of target (share of target == 1) at each age.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 8))
df.groupby('age')[['target']].mean().plot.bar(ax=ax)
ax.set_title('Percentage of target 1 by age')
plt.show()

In [None]:
f, ax = plt.subplots(1, 1, figsize=(30, 8))
# Record counts at each age split by target. The column is passed as x=...
# because seaborn >= 0.12 no longer accepts it positionally (a positional
# first argument is taken as `data` and collides with data=df).
sns.countplot(x='age', hue='target', data=df, ax=ax)
ax.set_title('age : Normal vs Heart Disease', y=1.02)
plt.show()

# EDA - thalach(최대 심장박동수)

In [None]:
# Summary statistics of the thalach column (kept as a one-column frame).
df.loc[:, ['thalach']].describe()

In [None]:
# thalach values of the rows with target == 1, selected once and reused for
# the max / min / mean summary lines.
diseased_thalach = df[df["target"] == 1]["thalach"]
print("심장병 발병 환자 중 최대 심박동수 : {:.1f} bpm".format(diseased_thalach.max()))
print("심장병 발병 환자 중 최소 심박동수 : {:.1f} bpm".format(diseased_thalach.min()))
print("심장병 발병 환자 중 심박동수 평균 : {:.1f} bpm".format(diseased_thalach.mean()))

In [None]:
# Contingency table of thalach (rows) against target (columns) with totals.
pd.crosstab(index=df['thalach'], columns=df['target'], margins=True)

In [None]:
# Mean of target (share of target == 1) at each distinct thalach value.
df.groupby('thalach')[['target']].mean()

In [None]:
# Kernel density estimate of thalach: all rows, then target == 1, then
# target == 0. The legend labels below rely on this drawing order.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for thalach_values in (df.thalach, df.thalach[df.target == 1], df.thalach[df.target == 0]):
    sns.kdeplot(thalach_values, ax=ax)
plt.legend(["All", "target == 1", "target == 0"])
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations of df.
plt.figure(figsize=(15, 10))
plt.title("Data Correlation Info.", y=1.02)
heatmap_options = dict(annot=True, annot_kws={'size': 14}, fmt='.2f', cmap='GnBu')
sns.heatmap(data=df.corr(), **heatmap_options)
plt.show()

In [None]:
# Scatter of age against thalach, coloured by target. target == 1 is drawn
# first so that it matches the first legend label.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
for target_value, point_color in ((1, 'red'), (0, 'blue')):
    in_class = df.target == target_value
    plt.scatter(df.age[in_class], df.thalach[in_class], c=point_color, s=100)
plt.title("Correlation between 'age' and 'thalach'", y=1.02)
plt.legend(["target == 1", "target == 0"])
plt.xlabel("age")
plt.ylabel("thalach")
plt.show()

# 데이터 준비

1. 행렬 생성

In [None]:
# Feature matrix: every column except the label.
X_data = df.drop(["target"], axis=1)
# Min-max normalisation per feature column: (x - min) / (max - min).
# DataFrame.min()/max() are called directly because np.min/np.max on a
# DataFrame reduce over BOTH axes on pandas >= 2.0, which would scale every
# feature by one global min/max instead of its own column range.
# NOTE: a column whose values are all equal has max == min and yields NaN.
X = (X_data - X_data.min()) / (X_data.max() - X_data.min())
X

In [None]:
# Label vector as a plain NumPy array, one entry per row of df.
Y = df["target"].to_numpy()
Y

2. 데이터 분리하기

In [None]:
from sklearn.model_selection import train_test_split

# Two stratified splits with seed 0: first hold out 20% as the test set,
# then take 20% of the remainder as validation, giving
# train : validation : test = 64 : 16 : 20.
X_t, X_test, Y_t, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=0
)
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X_t, Y_t, test_size=0.2, stratify=Y_t, random_state=0
)

# Transpose so that every column is one sample and every row one feature,
# which is the layout the model functions below multiply against.
X_train, X_validation, X_test = X_train.T, X_validation.T, X_test.T
# n = number of features (rows), m = number of training samples (columns)
n, m = X_train.shape

# 모델 함수 생성

1. Initialization

In [None]:
# w = (0.01, 0.01, ... , 0.01), b = 0.0
def Initialization(feature):
    """Create the starting parameters for logistic regression.

    Args:
        feature: Number of input features, i.e. the number of rows of the
            weight column vector.

    Returns:
        A ``(w, b)`` tuple: ``w`` is a ``(feature, 1)`` float array with
        every entry set to 0.01 and ``b`` is the scalar bias ``0.0``.
    """
    weights = np.empty((feature, 1))
    weights.fill(0.01)
    bias = 0.0
    return weights, bias

2. Sigmoid

In [None]:
def Sigmoid(Z):
    """Apply the logistic function ``1 / (1 + exp(-Z))`` element-wise.

    Args:
        Z: Scalar or array of pre-activation values.

    Returns:
        Values in the range [0, 1] with the same shape as ``Z``.
    """
    denominator = np.exp(-Z) + 1
    return 1 / denominator

3. Forward & Backward Propagation

In [None]:
def Propagation(w, b, X_train, Y_train):
    """Run one forward and backward pass of logistic regression.

    Args:
        w: Weight column vector of shape (n_features, 1).
        b: Scalar bias.
        X_train: Feature matrix laid out as (n_features, n_samples).
        Y_train: Binary labels, one per sample (column of ``X_train``).

    Returns:
        A ``(cost, grads)`` tuple: ``cost`` is the mean binary cross-entropy
        over the samples and ``grads`` is a dict holding the gradients
        ``"dw"`` (same shape as ``w``) and ``"db"`` (scalar).
    """
    # Take the sample count from the data that was actually passed in rather
    # than from the notebook-level global `m`, so the averages stay correct
    # when the function is called with a matrix of a different size.
    sample_count = X_train.shape[1]

    # Forward
    Z = np.dot(w.T, X_train) + b
    A = Sigmoid(Z)
    loss = -(Y_train * np.log(A) + (1 - Y_train) * np.log(1 - A))
    cost = np.sum(loss) / sample_count

    # Backward
    # dL/da = -(y/a) + (1-y)/(1-a)
    # dL/dz = a - y
    dw = (1 / sample_count) * np.dot(X_train, (A - Y_train).T)
    db = (1 / sample_count) * np.sum(A - Y_train)

    # gradient dictionary
    grads = {"dw": dw, "db": db}

    return cost, grads

4. Update

In [None]:
def Update(w, b, X_train, Y_train, lr, iterations):
    """Fit ``w`` and ``b`` by batch gradient descent and plot the cost curve.

    Args:
        w: Initial weight column vector.
        b: Initial scalar bias.
        X_train: Training features, passed through to ``Propagation``.
        Y_train: Training labels, passed through to ``Propagation``.
        lr: Learning rate applied to both gradients.
        iterations: Number of gradient-descent steps.

    Returns:
        A ``(parameters, cost_list)`` tuple: ``parameters`` is a dict with
        the learned ``"w"`` and ``"b"``; ``cost_list`` holds the cost
        measured at the start of every iteration.
    """
    cost_list = []

    for _ in range(iterations):
        cost, grads = Propagation(w, b, X_train, Y_train)
        cost_list.append(cost)

        # Rebind instead of updating in place so the caller's arrays are
        # left untouched.
        w = w - lr * grads["dw"]
        b = b - lr * grads["db"]

    # Only every fifth iteration (0, 5, 10, ...) is drawn on the curve.
    sampled_steps = list(range(0, iterations, 5))
    sampled_costs = cost_list[::5]

    parameters = {"w": w, "b": b}
    plt.plot(sampled_steps, sampled_costs)
    plt.xlabel("iter")
    plt.ylabel("cost")
    plt.show()

    return parameters, cost_list

5. Prediction

In [None]:
# Validation set
def Predict_validation(w, b, validation):
    """Predict binary labels for the validation samples.

    Args:
        w: Learned weight column vector of shape (n_features, 1).
        b: Learned scalar bias.
        validation: Feature matrix laid out as (n_features, n_samples).

    Returns:
        Float array of shape (1, n_samples) holding 0.0 where the predicted
        probability is <= 0.5 and 1.0 otherwise.
    """
    probabilities = Sigmoid(np.dot(w.T, validation) + b)
    # Vectorised threshold instead of a Python loop over the samples. The
    # `<= 0.5 -> 0` form and the first-row slice keep the original results,
    # including the (1, n_samples) output shape.
    return np.where(probabilities[:1, :] <= 0.5, 0.0, 1.0)

In [None]:
# Test set
def Predict_test(w, b, test):
    """Predict binary labels for the test samples.

    Args:
        w: Learned weight column vector of shape (n_features, 1).
        b: Learned scalar bias.
        test: Feature matrix laid out as (n_features, n_samples).

    Returns:
        Float array of shape (1, n_samples) holding 0.0 where the predicted
        probability is <= 0.5 and 1.0 otherwise.
    """
    probabilities = Sigmoid(np.dot(w.T, test) + b)
    # Vectorised threshold instead of a Python loop over the samples. The
    # `<= 0.5 -> 0` form and the first-row slice keep the original results,
    # including the (1, n_samples) output shape.
    return np.where(probabilities[:1, :] <= 0.5, 0.0, 1.0)

6. Logistic Regression

In [None]:
# Dimensions of the transposed training matrix: (features, samples).
X_train.shape

In [None]:
def Logistic_regression(X_train, Y_train, X_validation, Y_validation, lr, iterations):
    """Train logistic regression and report the validation accuracy.

    Args:
        X_train: Training features laid out as (n_features, n_samples).
        Y_train: Training labels.
        X_validation: Validation features in the same layout.
        Y_validation: Validation labels.
        lr: Learning rate for gradient descent.
        iterations: Number of gradient-descent steps.

    Returns:
        Dict with the learned parameters ``"w"`` and ``"b"``.
    """
    initial_w, initial_b = Initialization(X_train.shape[0])

    # The per-iteration cost history is not needed here, only the parameters.
    parameters, _ = Update(initial_w, initial_b, X_train, Y_train, lr, iterations)

    validation_prediction = Predict_validation(parameters["w"], parameters["b"], X_validation)

    # Predictions and labels are both 0/1, so the mean absolute difference
    # is the fraction of misclassified samples.
    mismatch_rate = np.mean(np.abs(validation_prediction - Y_validation))
    print("Validation accuracy : {}%".format(100 - mismatch_rate*100))

    return parameters

# 모델 생성

In [None]:
# Train on the training split and report accuracy on the validation split.
predicted_parameters = Logistic_regression(
    X_train=X_train,
    Y_train=Y_train,
    X_validation=X_validation,
    Y_validation=Y_validation,
    lr=0.1,
    iterations=500,
)
predicted_parameters

In [None]:
# X_test was transposed after the split, so transposing it again shows one
# row per test sample together with its index label.
X_test.T

# 예측

In [None]:
# Arbitrary patient. The label must be one of the rows that ended up in the
# test split, otherwise the .loc lookup below raises KeyError.
patient_index = 234

# Predicted value: feed the patient's features as a single column.
# Predict_test returns a (1, 1) array, so take the element explicitly;
# calling int() on an array with ndim > 0 is deprecated in NumPy.
patient_features = X_test.T.loc[[patient_index]].T
predicted_target = int(
    Predict_test(predicted_parameters["w"], predicted_parameters["b"], patient_features)[0, 0]
)

# Actual value
real_target = int(df.loc[patient_index, 'target'])

print("predicted:", predicted_target, "\nreal:", real_target)

if predicted_target == real_target:
    print("Model is correct")
else:
    print("Model is wrong")