In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-failure clinical records CSV into a DataFrame.
heart_csv_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
heart_full = pd.read_csv(heart_csv_path)

# Bare expression so the notebook renders the loaded frame.
heart_full

<h1> Exploratory Data Analysis </h1>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

<h2> Imbalance </h2>

In [None]:
# Number of rows per DEATH_EVENT value, most frequent value first.
death_event = heart_full.loc[:, 'DEATH_EVENT']
counts = death_event.value_counts()

counts

In [None]:
# Ratio of DEATH_EVENT == 1 rows to DEATH_EVENT == 0 rows (label-based lookup).
counts.loc[1] / counts.loc[0]

Observation:

1. The data is heavily imbalanced. Oversample with SMOTE, since the dataset is already small.
2. Apply SMOTE only to the training data.

<h2> Distributions </h2>

In [None]:
# Split the records by outcome: DEATH_EVENT == 1 (died) and DEATH_EVENT == 0 (lived).
death_flag = heart_full['DEATH_EVENT']
heart_died = heart_full.loc[death_flag.eq(1)]
heart_lived = heart_full.loc[death_flag.eq(0)]

In [None]:
# Display the rows where DEATH_EVENT == 1.
heart_died

In [None]:
# Summary statistics for the rows where DEATH_EVENT == 1.
heart_died.describe()

<h3> What is the Distribution of Age? </h3>

In [None]:
# Overlay the age distribution of patients who died and of survivors.
# Survivors are down-sampled to the size of the died group (the notebook
# previously hard-coded 96) with a fixed seed so the figure is reproducible.
# histplot replaces the deprecated distplot; labels are attached to each
# artist so the legend cannot drift out of sync with the plotting order.
fig, ax = plt.subplots()
sns.histplot(heart_died['age'], kde=True, stat='density',
             color='C0', label='died', ax=ax)
sns.histplot(heart_lived['age'].sample(len(heart_died), random_state=0),
             kde=True, stat='density', color='C1', label='lived', ax=ax)
ax.set_title('age by outcome')
ax.legend();

Observations:

1. The distribution is roughly normal, with a slight skew to the right.
2. Because of the right skew, try a Box-Cox transformation.
3. Need Standardization and Scaling

<h3> Distribution of creatinine_phosphokinase </h3>

In [None]:
# Overlay the creatinine_phosphokinase distribution of patients who died and
# of survivors. Survivors are down-sampled to the size of the died group (the
# notebook previously hard-coded 96) with a fixed seed so the figure is
# reproducible. histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_died['creatinine_phosphokinase'], kde=True, stat='density',
             color='C0', label='died', ax=ax)
sns.histplot(heart_lived['creatinine_phosphokinase'].sample(len(heart_died), random_state=0),
             kde=True, stat='density', color='C1', label='lived', ax=ax)
ax.set_title('creatinine_phosphokinase by outcome')
ax.legend();

Observations:

1. Highly Skewed like a Chi-Square Distribution
2. Need a Box-Cox transformation. Judging by the shape of the curves, the variance differs considerably between the two groups.
3. Need Standardization and Scaling

<h3> Distribution of ejection fraction </h3>

In [None]:
# Overlay the ejection_fraction distribution of patients who died and of
# survivors. Survivors are down-sampled to the size of the died group (the
# notebook previously hard-coded 96) with a fixed seed so the figure is
# reproducible. histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_died['ejection_fraction'], kde=True, stat='density',
             color='C0', label='died', ax=ax)
sns.histplot(heart_lived['ejection_fraction'].sample(len(heart_died), random_state=0),
             kde=True, stat='density', color='C1', label='lived', ax=ax)
ax.set_title('ejection_fraction by outcome')
ax.legend();

Observations:

1. Distribution is Bimodal
2. Need QuantileTransformation to normal
3. Need Standardization and Scaling
4. Need more observation of the Split between death and new

<h3> Distribution of Platelets </h3>

In [None]:
# Overlay the platelets distribution of patients who died and of survivors.
# Survivors are down-sampled to the size of the died group (the notebook
# previously hard-coded 96) with a fixed seed so the figure is reproducible.
# histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_died['platelets'], kde=True, stat='density',
             color='C0', label='died', ax=ax)
sns.histplot(heart_lived['platelets'].sample(len(heart_died), random_state=0),
             kde=True, stat='density', color='C1', label='lived', ax=ax)
ax.set_title('platelets by outcome')
ax.legend();

Observations:

1. Fairly Normal, but there are outliers
2. No need for BoxCox, but trimming needed
3. Need Standardization and Scaling

<h3> Distribution of Serum Creatinine </h3>

In [None]:
# Overlay the serum_creatinine distribution of patients who died and of
# survivors. Survivors are down-sampled to the size of the died group (the
# notebook previously hard-coded 96) with a fixed seed so the figure is
# reproducible. histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_died['serum_creatinine'], kde=True, stat='density',
             color='C0', label='died', ax=ax)
sns.histplot(heart_lived['serum_creatinine'].sample(len(heart_died), random_state=0),
             kde=True, stat='density', color='C1', label='lived', ax=ax)
ax.set_title('serum_creatinine by outcome')
ax.legend();

Observations:

1. Looks like Chi-Square with a lot of outliers.
2. Need BoxCox and Trimming
3. Need Standardization and Scaling
4. People who died tend to have less serum_creatinine?

<h3> Distribution of Serum Sodium </h3>

In [None]:
# Overlay the serum_sodium distribution of patients who died and of survivors.
# Survivors are down-sampled to the size of the died group (the notebook
# previously hard-coded 96) with a fixed seed so the figure is reproducible.
# histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_died['serum_sodium'], kde=True, stat='density',
             color='C0', label='died', ax=ax)
sns.histplot(heart_lived['serum_sodium'].sample(len(heart_died), random_state=0),
             kde=True, stat='density', color='C1', label='lived', ax=ax)
ax.set_title('serum_sodium by outcome')
ax.legend();

Observations:

1. Outliers exist, or the distribution can be interpreted as skewed to the left
2. Need BoxCox
3. Need Standardization and Scaling

<h3> Distribution of time </h3>

In [None]:
# Overlay the time distribution of patients who died and of survivors.
# Survivors are down-sampled to the size of the died group (the notebook
# previously hard-coded 96) with a fixed seed so the figure is reproducible.
# histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_died['time'], kde=True, stat='density',
             color='C0', label='died', ax=ax)
sns.histplot(heart_lived['time'].sample(len(heart_died), random_state=0),
             kde=True, stat='density', color='C1', label='lived', ax=ax)
ax.set_title('time by outcome')
ax.legend();

Observations:

1. BiModal and skewed to the left
2. Need Quantile and/or boxcox?
3. Need Standardization and Scaling

<h3> Anaemia Counts </h3>

In [None]:
# Count of anaemia = 0 / 1 among survivors and among patients who died.
# FIX: this cell sits under the "Anaemia Counts" heading but previously plotted
# the 'diabetes' column, so the anaemia observations were based on the wrong data.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot,
# and discrete=True centres one bar on each of the two flag values.
fig, ax = plt.subplots()
sns.histplot(heart_lived['anaemia'].sample(len(heart_died), random_state=0),
             discrete=True, color='C0', label='lived', ax=ax)
sns.histplot(heart_died['anaemia'], discrete=True, color='C1', label='died', ax=ax)
ax.set_title('anaemia counts by outcome')
ax.legend();

Observations:

1. By Randomly Sampling, we could give a fair comparison
2. There seems to be little variation between the people with and without anaemia
3. Suggestion: Remove Anaemia

<h3> Diabetes Counts </h3>

In [None]:
# Count of diabetes = 0 / 1 among survivors and among patients who died.
# The abandoned countplot experiment that was kept in a string literal has been
# removed. Survivors are down-sampled to the size of the died group (previously
# a hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot,
# and discrete=True centres one bar on each of the two flag values.
fig, ax = plt.subplots()
sns.histplot(heart_lived['diabetes'].sample(len(heart_died), random_state=0),
             discrete=True, color='C0', label='lived', ax=ax)
sns.histplot(heart_died['diabetes'], discrete=True, color='C1', label='died', ax=ax)
ax.set_title('diabetes counts by outcome')
ax.legend();

Observations:

1. By Randomly Sampling the majority class, we can more or less get a fair comparison
2. There seems to be very little variation between those people who had diabetes and those without diabetes
3. Suggestion: Remove Diabetes.

<h3> High Blood Pressure Counts </h3>

In [None]:
# Count of high_blood_pressure = 0 / 1 among survivors and among patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot,
# and discrete=True centres one bar on each of the two flag values.
fig, ax = plt.subplots()
sns.histplot(heart_lived['high_blood_pressure'].sample(len(heart_died), random_state=0),
             discrete=True, color='C0', label='lived', ax=ax)
sns.histplot(heart_died['high_blood_pressure'], discrete=True, color='C1', label='died', ax=ax)
ax.set_title('high_blood_pressure counts by outcome')
ax.legend();

Observation:

1. It seems that more people with high blood pressure died. 
2. Keep High_Blood_pressure

<h3> Sex Counts </h3>

In [None]:
# Count of sex = 0 / 1 among survivors and among patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot,
# and discrete=True centres one bar on each of the two flag values.
fig, ax = plt.subplots()
sns.histplot(heart_lived['sex'].sample(len(heart_died), random_state=0),
             discrete=True, color='C0', label='lived', ax=ax)
sns.histplot(heart_died['sex'], discrete=True, color='C1', label='died', ax=ax)
ax.set_title('sex counts by outcome')
ax.legend();

Observations:

1. There is a difference between the two groups, but it is very small. Nonetheless, it could be valuable
2. More women(?) Died than Men
3. Keep unless needed to remove

<h3> Smoking Counts </h3>

In [None]:
# Count of smoking = 0 / 1 among survivors and among patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot,
# and discrete=True centres one bar on each of the two flag values.
fig, ax = plt.subplots()
sns.histplot(heart_lived['smoking'].sample(len(heart_died), random_state=0),
             discrete=True, color='C0', label='lived', ax=ax)
sns.histplot(heart_died['smoking'], discrete=True, color='C1', label='died', ax=ax)
ax.set_title('smoking counts by outcome')
ax.legend();

Observations:

1. People who smoked lived more
2. Keep 

<h2> Correlations </h2>

In [None]:
# Pairwise correlations of every column except anaemia and diabetes,
# rendered as an annotated heatmap on a 12x8 figure.
temp_heart = heart_full.drop(columns=['anaemia', 'diabetes'])
heart_corr = temp_heart.corr()

fig = plt.figure(figsize=(12, 8))
sns.heatmap(heart_corr, annot=True)

Observations:

1. Smoking is highly correlated with sex. Drop Sex.

<h2> Summary Statistics </h2>

In [None]:
# Summary statistics for temp_heart (heart_full without anaemia and diabetes).
temp_heart.describe()

<h2> Summary: </h2>

<h4> Imbalance: </h4>

1. There are more people that lived than died
2. Heavily Imbalanced
3. SMOTE on the Training Data

<h4> Distributions: </h4>

BoxCox:

1. age
2. creatinine_phosphokinase
3. serum_creatinine
4. serum_sodium

Quantile:

1. ejection_fraction
2. time

Trimming:

1. platelets

Drop:

1. anaemia
2. diabetes

<h4> Correlation: </h4>

Drop:

1. Sex


<h1> Preprocessing </h1>

In [None]:
from sklearn.preprocessing import (
    PowerTransformer,
    StandardScaler,
    RobustScaler,
    QuantileTransformer
)

In [None]:
# Working copy for preprocessing: the three columns rejected during EDA are removed.
dropped_columns = ['sex', 'anaemia', 'diabetes']
heart_preprocessed = heart_full.drop(columns=dropped_columns)

heart_preprocessed

<h2> Distributions </h2>

In [None]:
# Three transformation stages, applied to heart_preprocessed in this order.
# Every stage maps a column name to the transformer fitted on that column alone.
#
# NOTE(review): the stages are chained, so each column is standardised, then
# robust-scaled, then (except platelets) power/quantile transformed. That
# chaining is kept so the numeric results stay the same.
# NOTE(review): this cell overwrites heart_preprocessed in place, so running it
# twice stacks the transforms; re-run the cell that creates heart_preprocessed first.
# NOTE(review): every transformer is fitted on the full dataset, before any
# train/test split, so the cross-validated scores below see leaked statistics.

# Stage 1: zero mean / unit variance.
transformers_1 = {
    'age': StandardScaler(),
    'creatinine_phosphokinase': StandardScaler(),
    'ejection_fraction': StandardScaler(),
    'platelets': StandardScaler(),
    'serum_creatinine': StandardScaler(),
    'serum_sodium': StandardScaler(),
    'time': StandardScaler()
}

# Stage 2: median / IQR scaling.
transformers_2 = {
    'age': RobustScaler(unit_variance=True),
    'creatinine_phosphokinase': RobustScaler(unit_variance=True),
    'ejection_fraction': RobustScaler(unit_variance=True),
    'platelets': RobustScaler(unit_variance=True),
    'serum_creatinine': RobustScaler(unit_variance=True),
    'serum_sodium': RobustScaler(unit_variance=True),
    'time': RobustScaler(unit_variance=True)
}

# QuantileTransformer defaults to n_quantiles=1000 and emits a warning (then
# silently clamps) when the data has fewer rows than that. Cap it explicitly.
quantile_kwargs = {
    'output_distribution': 'normal',
    'n_quantiles': min(1000, len(heart_preprocessed)),
}

# Stage 3: reshape towards a normal distribution (platelets intentionally absent).
transformers_3 = {
    'age': PowerTransformer(method='yeo-johnson'),
    'creatinine_phosphokinase': QuantileTransformer(**quantile_kwargs),
    'ejection_fraction': QuantileTransformer(**quantile_kwargs),
    'serum_creatinine': PowerTransformer(method='yeo-johnson'),
    'serum_sodium': PowerTransformer(method='yeo-johnson'),
    'time': QuantileTransformer(**quantile_kwargs)
}

for stage in (transformers_1, transformers_2, transformers_3):
    for feature, transformer in stage.items():
        # sklearn transformers expect a 2-D (n_samples, 1) input ...
        column = heart_preprocessed[feature].to_numpy().reshape(-1, 1)
        # ... and return 2-D output, which is flattened before being stored as a column.
        heart_preprocessed[feature] = transformer.fit_transform(column).ravel()

heart_preprocessed

<h3> Testing Distributions </h3>

In [None]:
# Split the transformed records by outcome for the before/after comparison plots.
pp_death_flag = heart_preprocessed['DEATH_EVENT']
heart_pp_lived = heart_preprocessed.loc[pp_death_flag.eq(0)]
heart_pp_died = heart_preprocessed.loc[pp_death_flag.eq(1)]

In [None]:
# Transformed age: survivors vs. patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_pp_lived['age'].sample(len(heart_pp_died), random_state=0),
             kde=True, stat='density', color='C0', label='lived', ax=ax)
sns.histplot(heart_pp_died['age'], kde=True, stat='density',
             color='C1', label='died', ax=ax)
ax.set_title('age by outcome (transformed)')
ax.legend();

In [None]:
# Transformed creatinine_phosphokinase: survivors vs. patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_pp_lived['creatinine_phosphokinase'].sample(len(heart_pp_died), random_state=0),
             kde=True, stat='density', color='C0', label='lived', ax=ax)
sns.histplot(heart_pp_died['creatinine_phosphokinase'], kde=True, stat='density',
             color='C1', label='died', ax=ax)
ax.set_title('creatinine_phosphokinase by outcome (transformed)')
ax.legend();

In [None]:
# Transformed ejection_fraction: survivors vs. patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_pp_lived['ejection_fraction'].sample(len(heart_pp_died), random_state=0),
             kde=True, stat='density', color='C0', label='lived', ax=ax)
sns.histplot(heart_pp_died['ejection_fraction'], kde=True, stat='density',
             color='C1', label='died', ax=ax)
ax.set_title('ejection_fraction by outcome (transformed)')
ax.legend();

In [None]:
# Transformed platelets: survivors vs. patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_pp_lived['platelets'].sample(len(heart_pp_died), random_state=0),
             kde=True, stat='density', color='C0', label='lived', ax=ax)
sns.histplot(heart_pp_died['platelets'], kde=True, stat='density',
             color='C1', label='died', ax=ax)
ax.set_title('platelets by outcome (transformed)')
ax.legend();

In [None]:
# Transformed serum_creatinine: survivors vs. patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_pp_lived['serum_creatinine'].sample(len(heart_pp_died), random_state=0),
             kde=True, stat='density', color='C0', label='lived', ax=ax)
sns.histplot(heart_pp_died['serum_creatinine'], kde=True, stat='density',
             color='C1', label='died', ax=ax)
ax.set_title('serum_creatinine by outcome (transformed)')
ax.legend();

In [None]:
# Transformed serum_sodium: survivors vs. patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_pp_lived['serum_sodium'].sample(len(heart_pp_died), random_state=0),
             kde=True, stat='density', color='C0', label='lived', ax=ax)
sns.histplot(heart_pp_died['serum_sodium'], kde=True, stat='density',
             color='C1', label='died', ax=ax)
ax.set_title('serum_sodium by outcome (transformed)')
ax.legend();

In [None]:
# Transformed time: survivors vs. patients who died.
# Survivors are down-sampled to the size of the died group (previously a
# hard-coded 96) with a fixed seed; histplot replaces the deprecated distplot.
fig, ax = plt.subplots()
sns.histplot(heart_pp_lived['time'].sample(len(heart_pp_died), random_state=0),
             kde=True, stat='density', color='C0', label='lived', ax=ax)
sns.histplot(heart_pp_died['time'], kde=True, stat='density',
             color='C1', label='died', ax=ax)
ax.set_title('time by outcome (transformed)')
ax.legend();

<h2> Setting Outliers to mean</h2>

In [None]:
# Replace extreme transformed values with 0.
#
# FIX: the previous version wrapped each pandas Index in a list, so the loop
# handed a whole Index to `.at`, which is a scalar-only accessor. Row labels
# are now kept as a plain Index per feature and written with `.loc`.
#
# A value counts as an outlier when it lies more than OUTLIER_THRESHOLD units
# from zero on the transformed scale (same strict > 4 / < -4 rule as before).
# NOTE(review): the section heading says "mean"; 0 is only approximately the
# mean of these columns after the earlier transforms - confirm that is intended.
OUTLIER_THRESHOLD = 4
outlier_features = ['platelets', 'creatinine_phosphokinase', 'ejection_fraction', 'time']

# feature name -> Index of row labels whose value is out of range
outliers = {
    feature: heart_preprocessed.index[heart_preprocessed[feature].abs() > OUTLIER_THRESHOLD]
    for feature in outlier_features
}

for feature, indices in outliers.items():
    heart_preprocessed.loc[indices, feature] = 0

heart_preprocessed

In [None]:
# Renumber the rows 0..n-1, discarding the previous index.
heart_preprocessed = heart_preprocessed.reset_index(drop=True)

heart_preprocessed

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric feature columns to rescale.
num = [
    'age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
    'serum_creatinine', 'serum_sodium', 'time',
]

# Fit one MinMaxScaler across all numeric columns and write the result back.
mmsc = MinMaxScaler()
scaled_values = mmsc.fit_transform(heart_preprocessed[num])
heart_preprocessed[num] = scaled_values

heart_preprocessed

<h1> Modelling </h1>

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report

<h2> Training with CatBoost </h2>

In [None]:
# Target is the DEATH_EVENT flag; every other remaining column is a feature.
y = heart_preprocessed['DEATH_EVENT']
X = heart_preprocessed.drop(columns='DEATH_EVENT')

# Shuffled, seeded stratified folds (default number of splits).
skf = StratifiedKFold(shuffle=True, random_state=1)

# CatBoost hyper-parameters, collected in one place.
catboost_params = {
    'n_estimators': 2000,
    'early_stopping_rounds': 5,
    'random_state': 0,
    'eval_metric': 'Accuracy',
    'thread_count': -1,
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 3,
    'verbose': False,
}
model = CatBoostClassifier(**catboost_params)

# Mean cross-validated accuracy over the folds.
cvs_acc = cross_val_score(model, X, y, scoring='accuracy', cv=skf, n_jobs=-1)

print('cvs acc: ', cvs_acc.mean())

<h1> Version 1 Accuracy: 86% </h1>