In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the heart-failure CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/heart-failure-prediction/heart.csv'
data = pd.read_csv(csv_path)
data.head()

### First, we take a look at the data

In [None]:
# Print the frame summary (column dtypes and non-null counts) to look for missing values.
data.info()

#### OK, none of the columns contain nulls!

In [None]:
# Name of the label column; reused by the plotting cells below.
target = 'HeartDisease'

# Class balance: printed as counts and drawn as a bar chart.
class_counts = data[target].value_counts()
print(class_counts)
sns.countplot(data=data, x=target)

In [None]:
# Pairwise plot grid of the columns, coloured by the target class.
sns.pairplot(data=data, hue=target)

In [None]:
# Correlation matrix of the numeric columns, shown as an annotated heatmap.
# The frame still contains string columns at this point (e.g. 'Sex' holds 'M'/'F'),
# and DataFrame.corr() raises on those in pandas >= 2.0, while older versions
# silently dropped them. Selecting the numeric columns first works on every version.
corr = data.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)

No column is highly correlated with any other.  
So, we don't have to worry about multicollinearity for now.

In [None]:
# The object (string) columns have to be converted to numeric ones later on;
# start by checking how the values of 'Sex' are distributed.
var = 'Sex'
data[var].value_counts()

The data is heavily imbalanced by sex.
Let's look at each sex separately to see whether we can find any key patterns.

In [None]:
# Keep only the male patients and plot the class balance of the target for them.
is_male = data['Sex'] == 'M'
data_query = data[is_male]
sns.countplot(data=data_query, x=target)

In [None]:
# Pairwise plot grid for the current subset (data_query), coloured by the target class.
sns.pairplot(data=data_query, hue=target)

In [None]:
# Correlation matrix of the numeric columns for the current subset.
# Restrict to numeric columns explicitly: the subset still carries string columns
# (it was filtered on Sex == "M"), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently dropping them.
corr_M = data_query.select_dtypes(include='number').corr()
sns.heatmap(corr_M, annot=True)

OK..  
All we learn here is that males tend to have heart disease more often than females.

#### Let's move on to females (F)

In [None]:
# Keep only the female patients and plot the class balance of the target for them.
is_female = data['Sex'] == 'F'
data_query = data[is_female]
sns.countplot(data=data_query, x=target)

#### Compared to males, females tend not to have heart disease.

In [None]:
# Pairwise plot grid for the current subset (data_query), coloured by the target class.
sns.pairplot(data=data_query, hue=target)

In [None]:
# Correlation matrix of the numeric columns for the current subset.
# Restrict to numeric columns explicitly: the subset still carries string columns
# (it was filtered on Sex == "F"), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently dropping them.
corr_F = data_query.select_dtypes(include='number').corr()
sns.heatmap(corr_F, annot=True)

In [None]:
le = LabelEncoder()

# Encoding the obj columns:
# the two-valued string columns become 0/1 flags, ST_Slope gets integer labels.
binary_codes = {
    'Sex': {'M': 1, 'F': 0},
    'ExerciseAngina': {'Y': 1, 'N': 0},
}
for column, codes in binary_codes.items():
    data[column] = data[column].replace(codes)

data['ST_Slope'] = le.fit_transform(data['ST_Slope'])

In [None]:
# Preview the frame after Sex, ExerciseAngina and ST_Slope were encoded in place.
data.head()

#### For the remaining categorical columns, we use pandas' get_dummies

In [None]:
# One-hot encode whatever non-numeric columns are left in `data`;
# drop_first=True omits the first level of each encoded column.
data_encoded = pd.get_dummies(data, drop_first=True)
data_encoded.head()

In [None]:
# Print dtypes and non-null counts of the encoded frame.
data_encoded.info()

### OK, we've finished preparing the data.
#### Let's analyze the data

In [None]:
scaler = StandardScaler()

# Feature matrix and label vector as plain arrays.
X = data_encoded.drop('HeartDisease', axis=1).values
y = data_encoded['HeartDisease'].values

# Split BEFORE scaling so that the scaler never sees the test rows
# (fitting it on all rows leaks test-set statistics into training).
# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=123
)

# scaling X: learn mean/std on the training part only, then apply everywhere
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so the name stays defined: the full matrix scaled with the training statistics.
X_scaled = scaler.transform(X)

In [None]:
# Carve a validation split out of the training data (stratified on the label).
# random_state is fixed so re-running the notebook yields the same split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=123
)

In [None]:
# Baseline random forest: 200 trees, entropy criterion, fixed seed.
model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=200,
    random_state=123,
)

# Mean score over 10-fold cross-validation on the training split.
scores = cross_val_score(model, X_train, y_train, cv=10)
scores.mean()

In [None]:
# Manual 10-fold cross-validation on the training split: fit on nine folds,
# score on the held-out fold, and average the fold scores.
kfold = KFold(n_splits=10)

scores = []
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train[train], y_train[train])
    score = model.score(X_train[test], y_train[test])

    scores.append(score)

print(np.mean(scores))

# The loop leaves `model` fitted on only the last fold's nine-tenths subset.
# Refit on the whole training split so that anything using `model` afterwards
# works with a model trained on all of the training data.
model.fit(X_train, y_train)

In [None]:
# Accuracy of the most recently fitted `model` on the validation split.
model.score(X_valid, y_valid)

### I think this model isn't overfitting.

In [None]:
# Map each feature name to its importance in the fitted forest.
# The names must come from the feature columns only: data_encoded.columns also
# contains the target 'HeartDisease', which is not a model input, so zipping it
# directly shifts every name from that position onward onto the wrong importance.
feature_names = data_encoded.drop('HeartDisease', axis=1).columns
col_importance_dict = dict(zip(feature_names, model.feature_importances_))

col_importance_dict

#### The ChestPainType-related columns have low importances, so they don't look necessary?

In [None]:
# Build a reduced frame without the ChestPainType and RestingECG columns
# (drop returns a new frame, so `data` itself is left untouched), then encode it.
columns_to_drop = ['ChestPainType', 'RestingECG']
data_copy = data.drop(columns=columns_to_drop)
data_copy_encoded = pd.get_dummies(data_copy, drop_first=True)

In [None]:
scaler = StandardScaler()

# Feature matrix and label vector of the reduced frame as plain arrays.
X = data_copy_encoded.drop('HeartDisease', axis=1).values
y = data_copy_encoded['HeartDisease'].values

# Split BEFORE scaling so that the scaler never sees the test rows
# (fitting it on all rows leaks test-set statistics into training).
# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=123
)

# scaling X: learn mean/std on the training part only, then apply everywhere
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so the name stays defined: the full matrix scaled with the training statistics.
X_scaled = scaler.transform(X)

In [None]:
# Same baseline random forest as before: 200 trees, entropy criterion, fixed seed.
model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=200,
    random_state=123,
)

# Mean score over 10-fold cross-validation on the current training split.
scores = cross_val_score(model, X_train, y_train, cv=10)
print('Score is ', scores.mean())

In [None]:
# Carve a validation split out of the training data (stratified on the label).
# random_state is fixed so re-running the notebook yields the same split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=123
)

In [None]:
# Manual 10-fold cross-validation on the training split: fit on nine folds,
# score on the held-out fold, and average the fold scores.
kfold = KFold(n_splits=10)

scores = []
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train[train], y_train[train])
    score = model.score(X_train[test], y_train[test])

    scores.append(score)

# Note: this is the mean held-out-fold accuracy, not accuracy on the fitted rows.
print('Train score is ', np.mean(scores))

# The loop leaves `model` fitted on only the last fold's nine-tenths subset.
# Refit on the whole training split before scoring on the validation split.
model.fit(X_train, y_train)

print('Valid score is ', model.score(X_valid, y_valid))

In [None]:
# Look up how much importance each feature column has.
# The current model was fitted on features taken from data_copy_encoded, so the
# names must come from that frame (minus the target), not from data_encoded,
# whose column set differs and includes the target.
feature_names = data_copy_encoded.drop('HeartDisease', axis=1).columns
col_importance_dict = dict(zip(feature_names, model.feature_importances_))

col_importance_dict

In [None]:
# Wrap the training and validation splits in XGBoost's DMatrix container.
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_valid = xgb.DMatrix(X_valid, label=y_valid)

# Booster settings: binary logistic objective (predict 0 or 1),
# depth-1 trees, log-loss as the evaluation metric.
xbg_param = dict(
    objective='binary:logistic',
    max_depth=1,
    eval_metric='logloss',
)

# Train for 150 boosting rounds and score the validation split.
bst = xgb.train(xbg_param, xgb_train, num_boost_round=150)
preds = bst.predict(xgb_valid)

# Threshold the predicted scores at 0.5 to obtain hard 0/1 labels.
y_pred = (preds > 0.5).astype(int)

for metric in (accuracy_score, f1_score):
    print(metric(y_true=y_valid, y_pred=y_pred))

#### This time, random forest performs better than XGBoost.
#### So, we search for better random forest hyperparameters.

In [None]:
# Hyperparameter space sampled by the randomized search.
# 'auto' is intentionally not listed for max_features: for a random forest
# classifier it was an alias of 'sqrt', and scikit-learn 1.3 removed it, so
# candidates using it fail to fit on current versions.
param_dist = {
    'n_estimators': [100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 5, None],
    'max_features': ['sqrt', 'log2'],
}

model = RandomForestClassifier(random_state=123)

# random_state fixes which parameter combinations get sampled, so the search
# result is reproducible across runs.
clf = RandomizedSearchCV(model, param_dist, cv=10, random_state=123)
search = clf.fit(X_train, y_train)
search.score(X_train, y_train)

In [None]:
# Score of the fitted search (its best estimator) on the validation split.
search.score(X_valid, y_valid)

In [None]:
# Best estimator found by the search, and its hard-label predictions on the training split.
model_bst = search.best_estimator_
y_pred = model_bst.predict(X_train)

# ROC AUC ranks samples by score, so it needs the predicted probability of the
# positive class; feeding it 0/1 labels collapses the curve to a single threshold.
y_proba = model_bst.predict_proba(X_train)[:, 1]

roc_auc_score(y_true=y_train, y_score=y_proba)

In [None]:
# Hard-label predictions on the validation split.
y_pred_valid = search.predict(X_valid)

# ROC AUC ranks samples by score, so it needs the predicted probability of the
# positive class; feeding it 0/1 labels collapses the curve to a single threshold.
y_proba_valid = search.predict_proba(X_valid)[:, 1]
roc_auc_score(y_true=y_valid, y_score=y_proba_valid)

### I think this model isn't overfitting.

In [None]:
# Show the parameter combination selected by the randomized search.
search.best_params_

In [None]:
# Final check: score of the tuned model on the test split.
model_bst.score(X_test, y_test)

### OK, that's not enough accuracy for me, haha. Of course, we didn't try LightGBM, simple logistic regression, AdaBoost, or other models,
### and we didn't explore parameter tuning enough.
#### So, if you know a better way to get more accuracy, feel free to let me know!!