In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings  
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

# DATA LOADING AND BASIC UNDERSTANDING OF DATA

In [None]:
# Load the heart-failure clinical records into the working frame `data`.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(DATA_PATH)

# Report the size explicitly and preview a few rows instead of echoing the whole frame.
print(f"{data.shape[0]} rows x {data.shape[1]} columns")
data.head()

In [None]:
# Number of missing values per column (isna is the canonical spelling of isnull).
data.isna().sum()

**No Null values in any of the columns**

**Basic Statistical description of dataset:**

In [None]:
# Summary statistics for the columns of `data`.
data.describe()

**Correlation Heatmap**

In [None]:
# Pairwise Pearson correlations, drawn on an explicitly created 16x10 axes.
corr_matrix = data.corr(method='pearson')
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)

As we can see, none of the features are highly correlated with each other (no pairwise correlation is greater than 0.5), so we cannot remove any columns on the grounds of redundancy.

# UNIVARIATE ANALYSIS OF CONTINUOUS VARIABLES

In [None]:
# Histogram of age with a KDE overlay on a density scale.
# `sns.distplot` is deprecated; `histplot` is its replacement.
sns.histplot(data=data, x='age', kde=True, stat='density')

In [None]:
# Density of each continuous feature, split by DEATH_EVENT.
# One loop replaces five copy-pasted cells so every plot shares the same styling;
# each feature gets its own figure, in the same order as before.
KDE_COLUMNS = [
    "creatinine_phosphokinase",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "time",
]

for column in KDE_COLUMNS:
    fig, ax = plt.subplots()
    sns.kdeplot(
        data=data, x=column, hue="DEATH_EVENT",
        fill=True, common_norm=False, palette="crest",
        alpha=.5, linewidth=0, ax=ax,
    )
    ax.set_title(f"{column} by DEATH_EVENT")
    plt.show()

**From the above plots on continuous variables, we see that most of them are having binomial distribution but we need to check skewness as well because presence of skewness can affect model's performance**

# UNIVARIATE ANALYSIS OF CATEGORICAL VARIABLES

In [None]:
# Category counts for each binary feature, one figure per feature.
# The column is passed by keyword (`data=`/`x=`): handing a Series positionally is
# deprecated and newer seaborn versions interpret the first positional argument as `data`.
CATEGORICAL_COLUMNS = ["anaemia", "diabetes", "high_blood_pressure", "sex", "smoking"]

for column in CATEGORICAL_COLUMNS:
    fig, ax = plt.subplots()
    sns.countplot(data=data, x=column, ax=ax)
    ax.set_title(f"Count of {column}")
    plt.show()

**Now let's check skewness of continuous variables:**

In [None]:
from scipy.stats import skew

# Print the sample skewness of each continuous column, one value per line,
# in the order listed here.
continuous_columns = (
    'age',
    'serum_sodium',
    'serum_creatinine',
    'platelets',
    'time',
    'creatinine_phosphokinase',
    'ejection_fraction',
)
for column in continuous_columns:
    print(skew(data[column]))

**IF A COLUMN IN THE DATASET IS SKEWED THEN IT AFFECTS THE MODEL'S PERFORMANCE AND WE MIGHT END UP WITH WRONG PREDICTIONS. SKEWNESS IS THE MEASURE OF ASYMMETRY OF THE PROBABILITY DISTRIBUTION OF A RANDOM VARIABLE ABOUT ITS MEAN. WE TAKE INTO ACCOUNT THE FOLLOWING POINTS:**

- **If skewness is 0, the data are perfectly symmetrical.**
- **If skewness is less than -1 or greater than 1, the distribution is highly skewed.**
- **If skewness is between -1 and -0.5 or between 0.5 and 1, the distribution is moderately skewed.**
- **If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.**

**As we can see, "serum_creatinine", "platelets" and "creatinine_phosphokinase" have values which indicate skewness, so we need to treat them.**

In [None]:
# Transform the three skewed columns (log, square root, log) in place on `data`.
# The untransformed values are snapshotted the first time this cell runs, so that
# re-running it recomputes from the originals instead of stacking transforms
# (e.g. taking the log of an already logged column).
SKEWED_COLUMNS = ["serum_creatinine", "platelets", "creatinine_phosphokinase"]
if "skewed_raw" not in globals():
    skewed_raw = data[SKEWED_COLUMNS].copy()

data["serum_creatinine"] = np.log(skewed_raw["serum_creatinine"])
data["platelets"] = np.sqrt(skewed_raw["platelets"])
data["creatinine_phosphokinase"] = np.log(skewed_raw["creatinine_phosphokinase"])

In [None]:
# Re-check skewness of the three transformed columns, one value per line.
for column in ('serum_creatinine', 'platelets', 'creatinine_phosphokinase'):
    print(skew(data[column]))

So we see that skewness is removed..

**Let's check the target variable**

In [None]:
# Class balance of the target. The column is passed by keyword because a positional
# Series is deprecated and newer seaborn versions interpret it as `data`.
sns.countplot(data=data, x='DEATH_EVENT')

**The target variable is imbalanced and we need to treat it before applying any algorithm**

# RELATIONSHIP BETWEEN VARIABLES

In [None]:
# Box plot of age per diabetes group, split by DEATH_EVENT.
sns.catplot(data=data, kind="box", x='diabetes', y="age", hue='DEATH_EVENT')

**In the provided dataset there are people with diabetes**

In [None]:
# Box plot of serum_sodium per high_blood_pressure group, split by DEATH_EVENT.
sns.catplot(data=data, kind="box", x='high_blood_pressure', y="serum_sodium", hue='DEATH_EVENT')

**Medical research has found a strong relationship between high blood pressure and the sodium level in blood. The normal range of serum sodium is 135 - 145 (mEq/L). Since the values provided in the dataset are within this range, in the plot above we see more people without high blood pressure, and death events are also comparatively fewer.
Thus the golden rule: control salt (sodium) intake and protect your heart.**

In [None]:
# Box plot of creatinine_phosphokinase per diabetes group, split by DEATH_EVENT.
sns.catplot(data=data, kind="box", x='diabetes', y="creatinine_phosphokinase", hue='DEATH_EVENT')

**Creatine phosphokinase (CPK) is an enzyme in the body. It is found mainly in the heart, brain, and skeletal muscle. In diabetic patients attending our clinic, elevated CK levels occur in one-fifth of the cases. Thus it does not have much effect on the target.**

In [None]:
# Box plot of serum_creatinine for each DEATH_EVENT class.
sns.catplot(data=data, kind="box", x='DEATH_EVENT', y="serum_creatinine")

**Elevated creatinine marks damage to kidney which in turn affects normal function of other organs. Thus Death event is more in the cases of elevated creatinine levels**

In [None]:
# Box plot of ejection_fraction for each DEATH_EVENT class.
sns.catplot(data=data, kind="box", x='DEATH_EVENT', y="ejection_fraction")

**Normal ejection fraction is between 50% to 75%, so low ejection fraction leads to more death**

In [None]:
# Box plot of ejection_fraction per high_blood_pressure group, split by DEATH_EVENT.
sns.catplot(data=data, kind="box", x='high_blood_pressure', y="ejection_fraction", hue="DEATH_EVENT")

**high BP though increases ejection fraction but long term effect of high BP increases chances of death as well**

In [None]:
# Box plot of serum_creatinine per high_blood_pressure group, split by DEATH_EVENT.
sns.catplot(data=data, kind="box", x='high_blood_pressure', y="serum_creatinine", hue='DEATH_EVENT')

**Presence of high Blood pressure affects other organs as well like kidney and increases creatinine**

In [None]:
# Box plot of age per anaemia group, split by DEATH_EVENT.
sns.catplot(data=data, kind="box", x='anaemia', y="age", hue='DEATH_EVENT')

**There is no clear relationship between age and anaemia leading to death**

In [None]:
# Box plot of ejection_fraction per anaemia group, split by DEATH_EVENT.
sns.catplot(data=data, kind="box", x='anaemia', y="ejection_fraction", hue='DEATH_EVENT')

**Anaemia can affect ejection fraction**

In [None]:
# Box plot of ejection_fraction per smoking group, split by DEATH_EVENT.
sns.catplot(data=data, kind="box", x='smoking', y="ejection_fraction", hue='DEATH_EVENT')

**Smoking causes decrease in ejection fraction**

In [None]:
# Line plot of platelets against serum_creatinine.
sns.lineplot(x='serum_creatinine', y='platelets', data=data, color='goldenrod')

**increase in creatinine can lead to decrease in platelets**

In [None]:
# Line plot of platelets against creatinine_phosphokinase.
sns.lineplot(x='creatinine_phosphokinase', y='platelets', data=data, color='goldenrod')

In [None]:
# Line plot of serum_creatinine against creatinine_phosphokinase.
sns.lineplot(x='creatinine_phosphokinase', y='serum_creatinine', data=data, color='goldenrod')

**creatinine phosphokinase increase does not alter creatinine value much as healthy kidney filters it out**

**From the above analysis the following insights can be drawn:**

1. High blood pressure does not show much relation with an increase in serum_sodium in the given dataset, but research shows that a high sodium level can increase BP.
2. Smoking must be stopped as it affects ejection fraction.
3. High levels of serum creatinine, which arise due to high BP, increase deaths.

# MODEL BUILDING

In [None]:
# Show the column labels of `data` before choosing the model inputs.
data.columns

In [None]:
# Model inputs: every column except the target.
features = ['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time']
label = ['DEATH_EVENT']
X = data[features]
# Select the target as a 1-D Series. Indexing with the list `label` would give a
# one-column DataFrame, which scikit-learn estimators only accept with a
# column-vector warning (hidden here by the blanket warnings filter).
y = data[label[0]]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. `stratify=y` keeps the DEATH_EVENT class
# ratio the same in both splits, which matters because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=42, stratify=y
)

**APPLYING STANDARD SCALER TO THE VARIABLES WHICH REQUIRE IT**

In [None]:
# Columns that are passed through the StandardScaler.
features1 = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_sodium',
    'time',
]

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Standardise only the columns in `features1`, writing them back into the frames.
# Previously X_train/X_test were *replaced* by the scaled subset, which silently
# dropped every other feature from all downstream models.
# Work on copies so the frames produced by train_test_split are not modified.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only; the test split reuses the training statistics.
X_train[features1] = sc.fit_transform(X_train[features1].astype(float))
X_test[features1] = sc.transform(X_test[features1].astype(float))

**LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training split.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out rows; echoed as the cell output.
y_pred_log_reg = log_reg.predict(X=X_test)
y_pred_log_reg

In [None]:
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, confusion_matrix,
    precision_recall_curve, auc, roc_curve, recall_score, classification_report,
)

# Store the text report under its own name. Assigning it to `classification_report`
# would shadow the imported function and break any later call to it.
log_reg_report = classification_report(y_test, y_pred_log_reg)
print(log_reg_report)

**F1 scores show that the model is performing quite well. Moreover, the precision of class 1, that is, the prediction of the positive class, is high, which is the most important thing in this dataset. So it's good.**

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
print(confusion_matrix(y_true=y_test, y_pred=y_pred_log_reg))

In [None]:
# ROC AUC must be computed from scores, not hard 0/1 predictions: with labels the
# ROC curve collapses to a single threshold. Use the predicted probability of class 1.
# The result gets its own name so it does not shadow the imported `auc` function.
log_reg_proba = log_reg.predict_proba(X_test)[:, 1]
log_reg_auc = roc_auc_score(y_test, log_reg_proba)
log_reg_auc

**AUC Score indicates that model is able to classify the classes well**

**AN AUC SCORE OF 0.786 WITH PRECISION OF AROUND 0.87 IS QUITE GOOD BUT CAN WE IMPROVE OUR MODEL? LET'S SEE**

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest, and therefore the comparison against the other
# models, is reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
# np.ravel gives the estimator a 1-D target whether y_train is a Series or a
# one-column DataFrame.
rfc.fit(X_train, np.ravel(y_train))

In [None]:
# Hard class predictions, used by the confusion matrix in the next cell.
rfc_predict = rfc.predict(X_test)
# ROC AUC is computed from the predicted probability of class 1 rather than from
# the hard labels, so that it reflects ranking quality across all thresholds.
roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])

In [None]:
# Confusion matrix of the random-forest predictions; echoed as the cell output.
cm = confusion_matrix(y_true=y_test, y_pred=rfc_predict)
cm

In [None]:
# Unpack the 2x2 confusion matrix in row-major order: [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()

# Derive the metrics by hand from the four cell counts.
accuracy = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1score = 2 * precision * recall / (precision + recall)
print(f1score)


**Both the AUC score and the F1 score are lower than those of Logistic Regression**

**XGB CLASSIFIER**

In [None]:
import xgboost as xgb

# XGBoost classifier with default settings.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
# Hard class predictions, reused by the accuracy and confusion-matrix cells below.
y_pred1 = model.predict(X_test)
# ROC AUC is computed from the predicted probability of class 1 rather than from
# the hard labels, so that it reflects ranking quality across all thresholds.
roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

In [None]:
# Share of test rows the XGBoost model labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred1)
print(accuracy)

In [None]:
# Confusion matrix of the XGBoost predictions; echoed as the cell output.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred1)
cm

In [None]:
# Unpack the 2x2 confusion matrix in row-major order: [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()

# Derive the metrics by hand from the four cell counts.
accuracy = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1score = 2 * precision * recall / (precision + recall)

# One value per line: F1, then precision, then recall.
print(f1score, precision, recall, sep='\n')


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Small entropy-based decision tree (at most 3 leaves), scored by F1 on the test split.
dt_clf = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=3, random_state=0)
dt_pred = dt_clf.fit(X_train, y_train).predict(X_test)
dt_f1 = f1_score(y_true=y_test, y_pred=dt_pred)
dt_f1

In [None]:
# Gradient boosting with depth-2 trees, scored by F1 on the test split.
gradientboost_clf = GradientBoostingClassifier(random_state=1, max_depth=2)
gradientboost_pred = gradientboost_clf.fit(X_train, y_train).predict(X_test)
gradientboost_f1 = f1_score(y_true=y_test, y_pred=gradientboost_pred)
gradientboost_f1

In [None]:
from sklearn.metrics import precision_score

# LightGBM limited to depth-2 trees; F1 and precision are computed on the test
# split and the F1 value is echoed as the cell output.
lgb_clf = lightgbm.LGBMClassifier(random_state=4, max_depth=2)
lgb_pred = lgb_clf.fit(X_train, y_train).predict(X_test)
lgb_f1 = f1_score(y_true=y_test, y_pred=lgb_pred)
lgb_precision = precision_score(y_true=y_test, y_pred=lgb_pred)
lgb_f1

In [None]:
# LightGBM with default hyper-parameters (seed 0); print F1, precision and recall.
model = lightgbm.LGBMClassifier(random_state=0)
preds = model.fit(X_train, y_train).predict(X_test)

lightgbm_metrics = (
    ('LightGBM f1-score', f1_score),
    ('LightGBM precision', precision_score),
    ('LightGBM recall', recall_score),
)
for caption, metric in lightgbm_metrics:
    print(caption, metric(y_test, preds))

In [None]:
# XGBoost with an explicit seed (666); print F1, precision and recall.
model = xgb.XGBClassifier(random_state=666)
preds = model.fit(X_train, y_train).predict(X_test)

xgb_metrics = (
    ('XGBClassifier f1-score', f1_score),
    ('XGBClassifier precision', precision_score),
    ('XGBClassifier recall', recall_score),
)
for caption, metric in xgb_metrics:
    print(caption, metric(y_test, preds))

In [None]:
# CatBoost with default hyper-parameters. `verbose=0` silences the per-iteration
# training log, which otherwise floods the cell output.
cat_clf = CatBoostClassifier(verbose=0)
cat_clf.fit(X_train, y_train)
cat_pred = cat_clf.predict(X_test)
cat_f1 = f1_score(y_test, cat_pred)
cat_f1

**From all the models used, we see that Logistic Regression is performing the best!!**

# If you found this notebook helpful please do not forget to UPVOTE...