
<h1><center>Heart Failure Prediction using LogisticRegression, RandomForestClassifier and KNeighborsClassifier</center></h1>

<div style="width:100%;text-align: center;">
<img src="https://www.healthy-heart.org/wp-content/uploads/2019/07/xheart-rx.jpg.pagespeed.ic.AfoNyycQXP.jpg" width="800">
 </div>


# **This notebook contains ideas from other notebooks that I checked out before building my model.**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#plotting libraries
import plotly.express as px 
import matplotlib.pyplot as plt 
import seaborn as sns
#sampling library
from imblearn.over_sampling import ADASYN
#normalising data library
from sklearn.preprocessing import StandardScaler
#splitting data library
from sklearn.model_selection import train_test_split
#classifiers libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score,accuracy_score,mean_squared_error
#skewed data libraries
from scipy.stats import norm, skew, boxcox

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the clinical-records CSV, kept in one named constant.
DATA_PATH = "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"

# Read the records into the working frame and display it as the cell output.
df = pd.read_csv(DATA_PATH)
df

In [None]:
df.info() #prints each column's dtype and non-null count

In [None]:
df.describe() #summary statistics (count, mean, std, min, quartiles, max) for every numeric column

In [None]:
df.duplicated().sum() #number of rows that exactly repeat an earlier row

In [None]:
df.isnull().sum() #number of missing values in each column

In [None]:
def plotting(col, data=None):
    """Draw a density-scaled histogram with a KDE overlay for one column.

    Parameters
    ----------
    col : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``
        so existing ``plotting(col)`` calls keep working.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the distribution was drawn on.
    """
    frame = df if data is None else data
    fig, ax = plt.subplots(figsize=(5, 3))
    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is the documented replacement and draws the same histogram + KDE curve.
    sns.histplot(frame[col], kde=True, stat="density", ax=ax)
    ax.set_title(col)
    return ax

# One small figure per column of the dataset.
for col in df:
    plotting(col)

In [None]:
# Skewness of every column, largest first, shown as a one-column table.
column_skewness = df.apply(skew).sort_values(ascending=False)
skewed_values = column_skewness.to_frame(name='Skewed Values')
skewed_values

<h3><center>Skewed data fixed with the Box-Cox transform</center></h3>

In [None]:
# Columns whose distributions are reshaped with a Box-Cox power transform.
skewed_cols = ['creatinine_phosphokinase', 'serum_creatinine']

# NOTE: each column of df is overwritten in place, so re-running this cell
# would transform values that have already been transformed.
for col in skewed_cols:
    transformed, _fitted_lambda = boxcox(df[col])  # boxcox returns (values, lambda)
    df[col] = transformed

# Re-plot the two columns to inspect the transformed distributions.
for col in skewed_cols:
    plotting(col)

In [None]:
# Feature matrix: every column except the target and "time", which cannot be used for prediction.
X = df.drop(columns=["time", "DEATH_EVENT"])
# Target as a 1-D Series. A one-column DataFrame (df[["DEATH_EVENT"]]) makes
# scikit-learn classifiers emit a column-vector DataConversionWarning on fit.
y = df["DEATH_EVENT"]

In [None]:
# Rank the candidate features by the importances of an extra-trees ensemble.
# The estimator is randomised, so the seed is fixed to make the ranking
# identical on every Restart & Run All.
model = ExtraTreesClassifier(random_state=42)
# np.ravel flattens the target: scikit-learn expects a 1-D y, and y may be a
# one-column DataFrame at this point.
model.fit(X, np.ravel(y))
print(model.feature_importances_)

# Horizontal bar chart of the (up to) 12 most important features.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
ax = feat_importances.nlargest(12).plot(kind='barh')
ax.set_title("Feature importance (ExtraTreesClassifier)")
ax.set_xlabel("importance")
plt.show()

In [None]:
# Look for highly collinear variables: pairwise correlations of every column,
# with the DEATH_EVENT row removed from the matrix before plotting.
correlations = df.corr().drop('DEATH_EVENT')
fig, ax = plt.subplots(figsize=(12, 4))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)

<h3><center>Data imbalance fixed with ADASYN</center></h3>

In [None]:
# Seed shared by the split and the oversampler so the cell is reproducible.
RANDOM_STATE = 42

# Keep only three features to limit overfitting; the target is a 1-D Series,
# which is the shape scikit-learn classifiers expect.
X = df[["ejection_fraction", "serum_creatinine", "age"]]
y = df["DEATH_EVENT"]

# Class balance before any resampling.
death_counts = df["DEATH_EVENT"].value_counts()
print(death_counts)

# value_counts() orders classes by frequency, not by value, so the slice labels
# are looked up from the class value instead of being assigned by position
# (DEATH_EVENT == 1 is a death, 0 is a survivor).
status_names = {0: 'alive', 1: 'dead'}
labels = [status_names[value] for value in death_counts.index]
sizes = death_counts
fig1, ax1 = plt.subplots(figsize=(10,5))
ax1.pie(sizes, explode=None, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title("Deaths distribution")
plt.show()

# Split BEFORE oversampling. Resampling the whole dataset first would put
# synthetic points interpolated from test rows into the training set and leave
# synthetic rows in the test set, inflating the reported accuracy.
# stratify=y keeps the original class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Balance only the training split; the test split stays untouched real data.
resample = ADASYN(sampling_strategy='all', random_state=RANDOM_STATE)
X_train, y_train = resample.fit_resample(X_train, y_train)
print(y_train.value_counts())

In [None]:
# Standardise the features: the scaler is fitted on the training split only,
# and the fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# RandomForestClassifier
# The forest is randomised (bootstrap samples, feature sub-sampling), so the
# seed is fixed to make the printed accuracy reproducible between runs.
randomforest_classifier = RandomForestClassifier(random_state=42)
# np.ravel flattens the target: scikit-learn expects a 1-D y, and y_train may
# be a one-column DataFrame at this point.
randomforest_classifier.fit(X_train, np.ravel(y_train))
pred = randomforest_classifier.predict(X_test)
# Fraction of test rows classified correctly.
print(accuracy_score(y_test, pred))

In [None]:
# Logistic regression: fit on the training split, then predict the test split.
logistic_regressor = LogisticRegression().fit(X_train, y_train)
y_pred = logistic_regressor.predict(X_test)

# Share of correct test predictions, reported as a percentage with two decimals.
accuracy = accuracy_score(y_test, y_pred)
accuracy_perc = round(accuracy * 100, 2)
print(f'The accuracy of the model is {accuracy_perc} %')

In [None]:
# k-nearest-neighbours classifier with default settings, fitted on the training split.
kneighbors_classifier = KNeighborsClassifier().fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on and on the held-out split.
train_accuracy = kneighbors_classifier.score(X_train, y_train)
test_accuracy = kneighbors_classifier.score(X_test, y_test)
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

# Keep the raw predictions for both splits.
y_pred_test = kneighbors_classifier.predict(X_test)
y_pred_train = kneighbors_classifier.predict(X_train)

<h1><center>Thanks for checking out my notebook! Please leave a comment if you found it helpful.</center></h1>