# Classification_HeartFailure

## 1_1. Check_Dataset

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the heart-failure clinical records CSV into a pandas DataFrame.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Quick sanity check of the loaded data: row count, first rows, then the schema.
print('count : ',len(df))
print(df.head(),end='\n\n')
print('----------------------------------------------------------------',end='\n\n')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print('check null value using df.info()')

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

In [None]:
# Explore how individual features relate to DEATH_EVENT using seaborn plots
# (histograms and box plots in the cells below).
# Here: histogram of age, coloured by DEATH_EVENT, with a KDE curve overlaid.
sns.histplot(data=df,x='age',hue='DEATH_EVENT',kde=True)

In [None]:
# Histogram of platelets in 15 bins, coloured by DEATH_EVENT, with a KDE curve overlaid.
sns.histplot(data=df, x ='platelets', hue ="DEATH_EVENT",kde=True,bins=15)

In [None]:
# Box plot of ejection_fraction grouped by DEATH_EVENT.
sns.boxplot(data=df,x='DEATH_EVENT',y="ejection_fraction")

In [None]:
# Box plot of ejection_fraction grouped by smoking.
sns.boxplot(data=df,x='smoking',y="ejection_fraction")

In [None]:
# Box plot of ejection_fraction grouped by anaemia.
sns.boxplot(data=df,x='anaemia',y="ejection_fraction")

## 1_2. DataPreprocessing

#### The features come in two types, so each type needs its own preprocessing.
- Type one: numeric data; type two: categorical data

In [None]:
# Re-display the column labels before choosing which columns become features.
df.columns

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# The `time` column is deliberately left out of the features because it is
# tied to the target, DEATH_EVENT.
numeric_cols = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]
categorical_cols = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']

X_num = df[numeric_cols]      # numeric features (standardised in a later cell)
X_cat = df[categorical_cols]  # categorical features, used as-is
y = df['DEATH_EVENT']         # classification target

In [None]:
#Data preprocessing was done using sklearn
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the numeric columns. fit_transform returns a NumPy array, so the
# result is wrapped back into a DataFrame with the original index and columns.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in this notebook.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X_num),
    index=X_num.index,
    columns=X_num.columns,
)

# The model input X is the scaled numeric columns followed by the categorical ones.
X = pd.concat([X_scaled, X_cat], axis=1)

In [None]:
# Inspect the combined feature matrix X.
X

## 1_3 Split_Train_Test_Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
# Show the number of rows in each of the four resulting pieces.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

## 1_4 Classification_Model_Learning && 1_5 Evaluate_Model_Precision

In [None]:
#first model is Logistic Regression
from sklearn.linear_model import LogisticRegression

In [None]:
# Create a logistic-regression classifier with default settings and fit it on
# the training split.
model_lr = LogisticRegression()
model_lr.fit(X_train,y_train)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Predict on the held-out split and print the classification report for the
# logistic-regression model.
pred = model_lr.predict(X_test)
report_lr = classification_report(y_true=y_test, y_pred=pred)
print(report_lr)

In [None]:
## Next, try XGBoost, which is known for strong performance on classification tasks.
from xgboost import XGBClassifier

In [None]:
# Create an XGBoost classifier with default settings and fit it on the
# training split.
model_xgb = XGBClassifier()
model_xgb.fit(X_train,y_train)

In [None]:
# Predict on the held-out split and print the classification report for the
# XGBoost model.
pred = model_xgb.predict(X_test)
report_xgb = classification_report(y_true=y_test, y_pred=pred)
print(report_xgb)

## 1_6 Check_Result

In [None]:
# XGBClassifier exposes per-feature importance via feature_importances_;
# draw one bar per column of X.
importances = model_xgb.feature_importances_
plt.bar(X.columns, importances)
plt.xticks(rotation=90)  # rotate the column names so they stay readable
plt.show()

print('- in XGBClassifier serum_creatinine main Element related to Death')

## 1_7 Check_Precision_Recall curve

In [None]:
from sklearn.metrics import plot_precision_recall_curve

In [None]:
fig = plt.figure()
ax = fig.gca()
plot_precision_recall_curve(model_lr,X_test,y_test,ax=ax)
plot_precision_recall_curve(model_xgb,X_test,y_test,ax=ax)

In [None]:
#check the ROC curve
from sklearn.metrics import plot_roc_curve

In [None]:
# 두 모델의 ROC 커브를 한번에 그리기 (힌트: fig.gca()로 ax를 반환받아 사용)

fig = plt.figure()
ax = fig.gca()
plot_roc_curve(model_lr,X_test,y_test,ax=ax)
plot_roc_curve(model_xgb,X_test,y_test,ax=ax)

# Thank you for watching

#### If you need more information about AI, check my GitHub (https://github.com/SongHunHan)