In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Apply seaborn's default plotting theme.
sb.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Description

### age - age in years

### sex - (1 = 남; 0 = 여)

### cp - 가슴통증 종류

### trestbps - 안정혈압 (in mm Hg on admission to the hospital)

### chol - 혈청 콜레스테롤 in mg/dl

### fbs - (공복혈당 > 120 mg/dl) (1 = true; 0 = false)

### restecg - 안정시 심전도

### thalach - 최대 심박수

### exang - 운동 유발성 협심증 (exercise-induced angina) (1 = yes; 0 = no)

### oldpeak - ST depression induced by exercise relative to rest

### slope - the slope of the peak exercise ST segment

### ca - number of major vessels (0-3) colored by fluoroscopy

### thal - 3 = normal; 6 = fixed defect; 7 = reversible defect

### target - have disease or not (1=yes, 0=no)

In [None]:
# Load the heart-disease table from the relative input directory into `heart_df`.
heart_df = pd.read_csv('../input/heart-disease-uci/heart.csv')
# Preview the first ten rows.
heart_df.head(10)

In [None]:
# Print the frame summary produced by DataFrame.info().
heart_df.info()

In [None]:
# Count missing values per column. A full row-by-row boolean frame cannot be
# checked by eye, whereas a per-column count of zero shows there is nothing to impute.
heart_df.isnull().sum()

In [None]:
# (rows, columns) of the loaded frame.
heart_df.shape

In [None]:
# Data type of each column.
heart_df.dtypes

In [None]:
# Draw one distribution plot per column of heart_df on a shared 4x4 subplot grid.
plt.figure(figsize=(20, 20))

for num, i in enumerate(heart_df, start=1):
    # Select the num-th cell of the grid and draw this column into it.
    ax = plt.subplot(4, 4, num)
    sb.distplot(heart_df[i], ax=ax)
    ax.set_xlabel(i, fontsize=20)

plt.show()

In [None]:
# Heatmap of the pairwise correlation coefficients of heart_df.
# Coefficients lie between -1 and 1, are computed for numeric columns only,
# and do not imply causation.
fig, ax = plt.subplots(figsize=(16, 8))

corr = heart_df.corr()
# Boolean mask covering the upper triangle (diagonal included), so only the
# lower triangle of the symmetric matrix is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

sb.heatmap(corr, mask=mask, annot=True, fmt='.2g', linewidths=1, ax=ax)
plt.show()

In [None]:
# Per-column variance, used to spot the columns with the largest spread.
heart_df.var()

### 값이 큰 변수 3가지(trestbps, chol, thalach)는 로그 변환(log transform)으로 스케일 조정

In [None]:
# Replace the three large-valued columns with their natural logarithm.
# NOTE: this overwrites the columns of heart_df in place, so running the cell a
# second time applies the logarithm again; reload heart_df before re-running.
log_columns = ['trestbps', 'chol', 'thalach']
for col in log_columns:
    heart_df[col] = np.log(heart_df[col])

# Population variance (ddof=0) of the transformed columns, one value per column.
# Calling DataFrame.var explicitly avoids np.var(DataFrame), which depends on how
# pandas interprets axis=None and may collapse to a single scalar in newer versions.
heart_df[log_columns].var(ddof=0)

In [None]:
# Split the frame into the label column `target` (y) and all remaining columns (x).
y = heart_df['target']
x = heart_df.drop(columns=['target'])

## 모델링

### 1. logistic regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


# Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=123,
)

In [None]:
# Fit a logistic regression without a penalty term and score it on the test set.
# Author's note: with the l2 penalty the accuracy was about 79%.
lr = LogisticRegression(penalty='none').fit(x_train, y_train)
y_pred = lr.predict(x_test)

acc = accuracy_score(y_test, y_pred)

# The shared results dict is (re)created here; later cells add one entry per model,
# so re-running this cell drops every entry except this one.
accuracies = {'Logistic Regression': acc * 100}
print("Accuracy score of the model is:", accuracies['Logistic Regression'], "%")

In [None]:
# Print the confusion matrix and the classification report
# for the logistic-regression predictions `y_pred`.
for title, report in (
    ("Confusion matrix of the model", confusion_matrix(y_test, y_pred)),
    ("Classification Report", classification_report(y_test, y_pred)),
):
    print(title, report)

### 2. DecisionTreeClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier                 # 부스팅 알고리즘 : 여러 개의 분류기를 순차적으로 돌린 후, 데이터의 가중치를 주어 다음 학습에 반영
from sklearn.ensemble import GradientBoostingClassifier   # GBM : AdaBoost에 gradient descent를 이용해 가중치 조절
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier                               # XGB : 병렬 학습으로 속도 빠름, 교차 검증, 가지치기 발전

In [None]:
# Baseline decision tree with default hyper-parameters.
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)

y_pred3 = dtc.predict(x_test)
# Score the tree's own predictions: `y_pred` holds the logistic-regression
# output and must not be used here.
acc3 = accuracy_score(y_test, y_pred3)
accuracies['DecisionTreeClassifier'] = acc3 * 100

# Training accuracy is printed next to the test accuracy so that the gap
# between the two (overfitting) is visible in the cell output.
train_acc3 = accuracy_score(y_train, dtc.predict(x_train))
print("Training accuracy of the model is:", train_acc3 * 100, "%")
print("Accuracy score of the model is:", acc3 * 100, "%")

In [None]:
# Hyper-parameter grid for the decision tree.
# A decision tree learns rules from the data and arranges them as a tree, mostly
# expressed as if/else tests. Too many rules lead to overfitting, so the model
# should predict with few rules. Information gain (1 - entropy) is entropy based;
# entropy rises the more different values are mixed together.
grid_params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, 10],
    min_samples_split=range(2, 10),
    min_samples_leaf=range(2, 10),
)

# Exhaustive 5-fold cross-validated search over the grid, using every available core.
grid_search = GridSearchCV(
    estimator=dtc,
    param_grid=grid_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train, y_train)

In [None]:
# Predict on the test set through the fitted grid-search object and record the accuracy.
y_grid = grid_search.predict(x_test)
acc_grid = accuracy_score(y_test, y_grid)

accuracies['GridSearchCV'] = 100 * acc_grid
print("Accuracy score of the model is:", accuracies['GridSearchCV'], "%")

In [None]:
# Score `dtc` on the test set and store the result under the decision-tree key.
# NOTE(review): `dtc` is not reassigned after the grid search, so this presumably
# repeats the untuned baseline score rather than the tuned tree. Confirm intent.
y_pred4 = dtc.predict(x_test)
acc4 = accuracy_score(y_test, y_pred4)

accuracies['DecisionTreeClassifier'] = acc4 * 100
print("Accuracy score of the model is:", accuracies['DecisionTreeClassifier'], "%")

### 3. RandomForestClassifier

In [None]:
# Random forest: built on decision trees (which are easy to visualise). Bagging lets
# each tree sample its own data and train independently; bootstrapping draws those
# samples with overlap.
# gini: the closer to 0, the more homogeneous the node.
# max_features: how many of the features each split may consider.
rfc = RandomForestClassifier(
    criterion='gini',
    max_depth=7,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=180,
)
rfc.fit(x_train, y_train)

y_pred5 = rfc.predict(x_test)
acc5 = accuracy_score(y_test, y_pred5)
accuracies['RandomForestClassifier'] = acc5 * 100

# Training accuracy is printed as well: as a bare mid-cell expression its value
# was computed but never shown.
train_acc5 = accuracy_score(y_train, rfc.predict(x_train))
print("Training accuracy of the model is:", train_acc5 * 100, "%")
print("Accuracy score of the model is:", acc5 * 100, "%")

In [None]:
# Print the confusion matrix and the classification report
# for the random-forest predictions `y_pred5`.
for title, report in (
    ("Confusion matrix of the model", confusion_matrix(y_test, y_pred5)),
    ("Classification Report", classification_report(y_test, y_pred5)),
):
    print(title, report)

## boosting
### 1.adaboosting

- Iteration 1을 학습한 뒤, 오류 내용(붉은 칸에 있는 +표시)에 대해 가중치를 부여

- 부여한 가중치를 바탕으로 Iteration 2를 학습. 오류 내용(푸른 칸에 있는 -표시)에 대해 가중치 부여

- 부여한 가중치를 바탕으로 Iteration 3를 학습

- 순차적으로 Iteration 1에 0.3, Iteration 2에 0.5, Iteration 3에 0.8의 가중치를 부여한 뒤 결합

- 결합한 결과가 Final Classifier


In [None]:
# AdaBoost with the decision tree `dtc` as base estimator, tuned by grid search.
# n_estimators is the number of weak learners; the author's rule of thumb is a
# low learning rate combined with many estimators.
parameters = dict(
    n_estimators=[50, 70, 90, 120, 180, 200],
    learning_rate=[0.001, 0.01, 0.1, 1, 10],
    algorithm=['SAMME', 'SAMME.R'],
)
ada = AdaBoostClassifier(base_estimator=dtc)

# NOTE: this rebinds `grid_search`, replacing the decision-tree search object
# created earlier in the notebook.
grid_search = GridSearchCV(
    estimator=ada,
    param_grid=parameters,
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(x_train, y_train)

In [None]:
# Gradient boosting tuned by a 5-fold grid search.
# The `loss` entries are the candidate loss functions: deviance and exponential.
parameters = dict(
    loss=['deviance', 'exponential'],
    learning_rate=[0.001, 0.1, 1, 10],
    n_estimators=[100, 150, 180, 200],
)

# `gbc` ends up bound to the fitted search object, not to the bare classifier.
gbc = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gbc.fit(x_train, y_train)

y_pred6 = gbc.predict(x_test)
acc6 = accuracy_score(y_test, y_pred6)

accuracies['GradientBoosting'] = acc6 * 100
print("Accuracy score of the model is:", accuracies['GradientBoosting'], "%")

In [None]:
# Print the confusion matrix and the classification report
# for the gradient-boosting predictions `y_pred6`.
for title, report in (
    ("Confusion matrix of the model", confusion_matrix(y_test, y_pred6)),
    ("Classification Report", classification_report(y_test, y_pred6)),
):
    print(title, report)

### 2.xgboosting

- GBM 기반이나, GBM의 단점인 느린 수행시간 및 과적합 규제 부재 문제를 해결

- 병렬 CPU환경에서 병렬 학습이 가능해 속도 측면에서 우위

- 교차 검증 자체 내장

- 결손값 자체 처리

- 뛰어난 예측 성능

- 핵심 라이브러리가 C/C++로 되어 있음

In [None]:
# XGBoost classifier with the linear booster; fit, score on the test set, and report.
# NOTE(review): `max_depth` is a tree parameter and is presumably ignored by
# booster='gblinear'. Confirm against the xgboost documentation.
xgb_params = dict(booster='gblinear', learning_rate=1, max_depth=6, n_estimators=10)
xgb = XGBClassifier(**xgb_params)
xgb.fit(x_train, y_train)

y_pred7 = xgb.predict(x_test)
acc7 = accuracy_score(y_test, y_pred7)

accuracies['XGBClassifier'] = acc7 * 100
print("Accuracy score of the model is:", accuracies['XGBClassifier'], "%")

for title, report in (
    ("Confusion matrix of the model", confusion_matrix(y_test, y_pred7)),
    ("Classification Report", classification_report(y_test, y_pred7)),
):
    print(title, report)

In [None]:
# Print acc7 unscaled (the `accuracies` dict stores acc7*100).
print(acc7)

## 정확도 비교

In [None]:
# Bar chart comparing the test accuracy (in percent) of every model recorded in `accuracies`.
colors = ["purple", "green", "orange", "blue", "red", "skyblue"]

sb.set_style("whitegrid")
plt.figure(figsize=(16, 8))
# The values are percentages, so place a tick every 10 points from 0 to 100;
# ticks running up to 1200 are far outside the range the bars can reach.
plt.yticks(np.arange(0, 101, 10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
sb.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette=colors)
plt.show()