#  Data Analysis and Machine Learning Predictions

In [None]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px

In [None]:
# Read the heart-failure clinical records into `df` and preview the first rows.
df = pd.read_csv(
    '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
df.head()

In [None]:
# Rearranging the Dataset for better Plots
#
# The 0/1 indicator columns are turned into text labels and most columns get
# display names, so the Plotly figures below show readable legends and axes.
#
# Each relabelled column is assigned back to the frame. Calling
# `df[col].replace(..., inplace=True)` instead mutates a temporary Series:
# pandas reports that as chained assignment, and with Copy-on-Write enabled it
# leaves `df` itself unchanged.

# 0/1 code -> label, keyed by the column name as it appears in the CSV.
value_labels = {
    'sex': {1: 'Male', 0: 'Female'},
    'DEATH_EVENT': {1: 'Heart Attack', 0: 'Alive'},
    'anaemia': {1: 'Anemic', 0: 'Non-Anemic'},
    'diabetes': {1: 'Diabetic', 0: 'Non-Diabetic'},
    'smoking': {1: 'Smoker', 0: 'Non-Smoker'},
    'high_blood_pressure': {1: 'Hypertension', 0: 'Other'},
}
for column, labels in value_labels.items():
    df[column] = df[column].replace(labels)

# CSV column name -> display name. Columns not listed here (e.g. 'smoking')
# keep their original name; the plotting cells below refer to 'smoking' as-is.
column_labels = {
    'DEATH_EVENT': 'Patient Status',
    'high_blood_pressure': 'High Blood Pressure',
    'creatinine_phosphokinase': 'Creatinine Phosphokinase',
    'ejection_fraction': 'Ejection Fraction',
    'platelets': 'Platelets',
    'serum_sodium': 'Serum Sodium',
    'serum_creatinine': 'Serum Creatinine',
    'age': 'Age',
    'anaemia': 'Anaemia',
    'diabetes': 'Diabetes',
    'sex': 'Sex',
}
df = df.rename(columns=column_labels)

df.head()

# Target Variable: Patient Status

In [None]:
# Share of each outcome in the target column; update_layout returns the same
# figure, so the centred title can be applied in the same expression.
fig1 = px.pie(df, names='Patient Status', title='Patient Status').update_layout(title_x=0.5)
fig1.show()

As can be seen above, the positive class of the target variable (patients who suffered a heart attack) makes up only 32.1% of the dataset, which means the dataset is unbalanced. For the nature of our task, however, an unbalanced dataset is not a problem: in practice, the majority of patients would not be expected to suffer a heart attack.

# Sex and Heart Failure

In [None]:
# Proportion of each 'Sex' label in the cohort.
fig = px.pie(
    data_frame=df,
    names='Sex',
    title="Sex Distribution",
)
fig.show()

In [None]:
# Count patients for every (Sex, Patient Status) pair and plot the counts as
# bars coloured by outcome.
ds = df[['Sex', 'Patient Status']].copy()
dx = ds.value_counts().reset_index(name='Count')

fig = px.bar(
    dx,
    x="Sex",
    y="Count",
    color="Patient Status",
    title="Sex and Heart Attack",
)
fig.show()

1. 31% of male patients suffered a heart attack.
2. 32% of female patients suffered a heart attack.

# Age & Heart Failure Distribution

We begin with plotting our first feature variable “Age” with respect to “Patient Status” in a histogram. The range of “Age” is 40-95.


In [None]:
# Age distribution split by outcome, drawn as 'relative' bars in up to 50 bins,
# with the title centred.
fig = px.histogram(
    df,
    x='Age',
    nbins=50,
    color='Patient Status',
    barmode='relative',
    title='Age & Heart Attack Distribution',
).update_layout(title_x=0.5)
fig.show()

1. 49.35% of patients above 70 suffered a heart attack. (38 of 77)
2. 26.13% of patients below 70 suffered a heart attack.(58 of 222)

# Diabetes and Heart Failure

In [None]:
# Proportion of each 'Diabetes' label in the cohort.
fig = px.pie(
    data_frame=df,
    names='Diabetes',
    title="Diabetic Distribution",
)
fig.show()

In [None]:
# Age distribution per diabetes label and outcome: violins with an embedded box
# plot and every individual point shown; hovering lists all columns.
fig = px.violin(
    df,
    x='Diabetes',
    y='Age',
    color="Patient Status",
    box=True,
    points="all",
    hover_data=df.columns,
    title="Effect of Diabetes & Age on Heart Failure",
)
fig.show()

* The median age of diabetic patients who suffered a heart attack is 60, compared to 68.5 for non-diabetic patients.
* As can be seen from the above plot, there is a cluster of diabetic patients who suffered a heart attack between the ages of 59-60.
* On the whole non-diabetic patients tend to outlive diabetic patients.

# Smoking and Heart Failure

In [None]:
# Proportion of each 'smoking' label in the cohort.
fig = px.pie(data_frame=df, names='smoking')
fig.show()

In [None]:
# Age distribution per smoking label and outcome: violins with an embedded box
# plot and every individual point shown; hovering lists all columns.
fig = px.violin(
    df,
    x='smoking',
    y='Age',
    color="Patient Status",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()

* 50% of smokers suffer a heart attack between the ages of 60-72.
* While 25% of non-smokers suffer a heart attack between 60-75.

# Anaemia and Heart Failure

In [None]:
# Proportion of each 'Anaemia' label in the cohort, with the title centred.
# The figure is a pie chart, so the title names the distribution rather than
# calling it a bar chart (matching the Sex / Diabetic distribution pies).
fig1 = px.pie(df, names='Anaemia', title='Anaemia Distribution')
fig1.update_layout(title_x=0.5)
fig1.show()

In [None]:
# Age distribution per anaemia label and outcome: violins with an embedded box
# plot and every individual point shown; hovering lists all columns.
fig1 = px.violin(
    df,
    x='Anaemia',
    y='Age',
    color="Patient Status",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig1.show()

1. 50% of anemic patients suffered a heart attack between the ages 58-75.

# Platelets and Heart Failure

In [None]:
# Bucket platelet counts into Low / Normal / High at the 150000 and 350000
# cut-offs. 'Normal' is inclusive at both ends: if all three buckets use strict
# inequalities, a count of exactly 150000 or 350000 receives no label (NaN) and
# is then silently dropped by value_counts() below.
df.loc[df['Platelets'] < 150000, 'Platelet Level'] = 'Low'
df.loc[df['Platelets'] > 350000, 'Platelet Level'] = 'High'
df.loc[df['Platelets'].between(150000, 350000), 'Platelet Level'] = 'Normal'

# Number of patients per (platelet level, outcome) pair, in long form for Plotly.
ds = df['Platelet Level']
ds = ds.to_frame()
ds['Patient Status'] = df['Patient Status']
dx = ds.value_counts().reset_index()
dx.columns = ['Platelet Level','Patient Status', 'Count']

fig = px.bar(dx,x="Platelet Level",y='Count',color="Patient Status",barmode="group")
fig.show()

1. 30% of patients with normal platelet count suffered a heart attack.
2. 37% of patients with high platelet count suffered a heart attack.
3. 41% of patients with low platelet count suffered a heart attack.

# Serum Creatinine and Heart Failure

In [None]:
# Bucket serum creatinine into Low / Normal / High at the 0.84 and 1.21
# cut-offs. All three masks must test the 'Serum Creatinine' column: a 'Normal'
# mask that checks 'Platelets' > 0.84 instead is true for practically every row
# and relabels every 'Low' patient as 'Normal'. 'Normal' is inclusive at both
# ends so values of exactly 0.84 or 1.21 are not left unlabelled (NaN) and
# dropped by value_counts() below.
df.loc[df['Serum Creatinine'] < 0.84, 'Creatinine Level'] = 'Low'
df.loc[df['Serum Creatinine'] > 1.21, 'Creatinine Level'] = 'High'
df.loc[df['Serum Creatinine'].between(0.84, 1.21), 'Creatinine Level'] = 'Normal'

# Number of patients per (creatinine level, outcome) pair, in long form for Plotly.
ds = df['Creatinine Level']
ds = ds.to_frame()
ds['Patient Status'] = df['Patient Status']
dx = ds.value_counts().reset_index()
dx.columns = ['Creatinine Level','Patient Status', 'Count']

fig = px.bar(dx,x="Creatinine Level",y='Count',color="Patient Status",barmode="group")
fig.show()

* 25.7% of patients with normal creatinine levels suffered a heart attack.
* 52.8% of patients with high creatinine levels suffered a heart attack.

# Serum Sodium and Heart Failure

In [None]:
# Bucket serum sodium into Low / Normal / High at the 135 and 145 cut-offs.
# The 'Normal' mask must test 'Serum Sodium' on both sides: checking
# 'Platelets' >= 135 for the lower bound is true for practically every row, so
# every patient with sodium <= 145 -- including the 'Low' ones -- would end up
# labelled 'Normal'.
df.loc[df['Serum Sodium'] < 135, 'Sodium Level'] = 'Low'
df.loc[df['Serum Sodium'] > 145, 'Sodium Level'] = 'High'
df.loc[df['Serum Sodium'].between(135, 145), 'Sodium Level'] = 'Normal'

# Number of patients per (sodium level, outcome) pair, in long form for Plotly.
ds = df['Sodium Level']
ds = ds.to_frame()
ds['Patient Status'] = df['Patient Status']
dx = ds.value_counts().reset_index()
dx.columns = ['Sodium Level','Patient Status', 'Count']

fig = px.bar(dx,x="Sodium Level",y='Count',color="Patient Status",barmode="group")
fig.show()


* 32% of patients with normal sodium levels suffered a heart attack.
* As can be seen above, there are too few patients with high sodium levels to make an educated inference.

# Ejection Fraction and Heart Failure

In [None]:
# Label each patient's ejection fraction as Low (< 55), High (> 65) or
# Normal (55 to 65, both ends included).
df.loc[df['Ejection Fraction'].lt(55), 'Ejection Fraction Level'] = 'Low'
df.loc[df['Ejection Fraction'].gt(65), 'Ejection Fraction Level'] = 'High'
df.loc[df['Ejection Fraction'].between(55, 65), 'Ejection Fraction Level'] = 'Normal'

# Number of patients per (ejection fraction level, outcome) pair.
ds = df[['Ejection Fraction Level', 'Patient Status']].copy()
dx = ds.value_counts().reset_index(name='Count')

fig = px.bar(
    dx,
    x="Ejection Fraction Level",
    y='Count',
    color="Patient Status",
    barmode="group",
)
fig.show()

* 33.8% of patients with low ejection fraction suffered a heart attack.
* 19% of patients with normal ejection fraction suffered a heart attack.
* As can be seen, there are too few patients with high ejection fraction levels to make an educated inference.

# Blood Pressure and Heart Failure

In [None]:
# Age distribution per blood-pressure label and outcome: violins with an
# embedded box plot and every individual point shown; hovering lists all columns.
fig = px.violin(
    df,
    x='High Blood Pressure',
    y='Age',
    color="Patient Status",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()

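* 50% of smokers suffer a heart attack between the ages of 60-72.
* While 25% of non-smokers suffer a heart attack between 60-75.

*Note: the two bullets above repeat the observations from the Smoking section and describe smokers, not blood pressure; they should be replaced with findings read from the High Blood Pressure plot.*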

#  Creatinine Phosphokinase and Heart Failure

In [None]:
# Bucket creatinine phosphokinase (CPK) into Low / Normal / High at the 22 and
# 198 cut-offs used for 'Low' and 'High'. 'Normal' is exactly the range between
# them, tested on the CPK column itself. A 'Normal' mask of
# "CPK < 120 and Platelets > 10" would overwrite every 'Low' row and leave
# values from 120 to 198 without any label, so they would vanish from the chart.
# NOTE(review): 22-198 is taken from the Low/High thresholds in this cell;
# confirm the intended reference range.
df.loc[df['Creatinine Phosphokinase'] < 22, 'CPK Level'] = 'Low'
df.loc[df['Creatinine Phosphokinase'] > 198, 'CPK Level'] = 'High'
df.loc[df['Creatinine Phosphokinase'].between(22, 198), 'CPK Level'] = 'Normal'

# Number of patients per (CPK level, outcome) pair, in long form for Plotly.
ds = df['CPK Level']
ds = ds.to_frame()
ds['Patient Status'] = df['Patient Status']
dx = ds.value_counts().reset_index()
dx.columns = ['CPK Level','Patient Status', 'Count']

fig = px.bar(dx,x="CPK Level",y='Count',color="Patient Status",barmode="group")
fig.show()


* 32.7% of patients with high CPK value suffered a heart attack.
* 24.7% of patients with normal CPK value suffered a heart attack.

# Machine Learning Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import  RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,matthews_corrcoef,recall_score,precision_score

# K-means Clustering

In [None]:
# Reload the raw CSV: the exploratory cells above replaced the 0/1 codes with
# text labels and renamed columns, so modelling starts again from numeric data.
df = pd.read_csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
x = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking',]]

y = df[['DEATH_EVENT']]

# n_init is pinned because its default differs between scikit-learn releases.
# NOTE(review): the features are clustered unscaled, so large-magnitude columns
# (e.g. platelets) likely dominate the distances -- consider scaling first.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=666).fit(x)
train = x.copy()
train['cluster'] = kmeans.labels_
train['target'] = y['DEATH_EVENT']

# K-means numbers its clusters arbitrarily: cluster 1 need not be the
# DEATH_EVENT == 1 group. Keep whichever of the two possible numberings agrees
# with the target more often, so the scores below do not depend on that
# arbitrary numbering.
if (train['cluster'] == train['target']).mean() < 0.5:
    train['cluster'] = 1 - train['cluster']

print(train['cluster'].value_counts())

mcc = matthews_corrcoef(train['target'], train['cluster'])
cm = confusion_matrix(train['target'], train['cluster'])

print('\n')
print('Kmeans Accuracy: ', accuracy_score(train['target'], train['cluster']))
print('Kmeans F1 Score: ', f1_score(train['target'], train['cluster']))
print('Kmeans Recall score: ', recall_score(train['target'], train['cluster']))
print('Kmeans Precision score', precision_score(train['target'], train['cluster']))
print('Kmeans MCC: ',mcc)

ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('K-means Confusion Matrix')

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

x = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking',]]

y = df[['DEATH_EVENT']]

# Split first, then fit the scaler on the training rows only. Fitting it on the
# full data set would let statistics of the test rows leak into training.
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.2,random_state=0)

rs = RobustScaler()
X_train = rs.fit_transform(X_train)
X_test = rs.transform(X_test)

lr = LogisticRegression(random_state=111)
lr.fit(X_train,Y_train.values.ravel())
preds = lr.predict(X_test)

cm = confusion_matrix(Y_test, preds)
mcc = matthews_corrcoef(Y_test,preds)

# Report the same metric set as the other models (F1 once, plus precision).
print('\n')
print('Logistic Regression Accuracy: ', accuracy_score(Y_test,preds))
print('Logistic Regression f1-score:', f1_score(Y_test, preds))
print('Logistic Regression Recall score: ', recall_score(Y_test,preds))
print('Logistic Regression Precision: ', precision_score(Y_test,preds))
print('Logistic Regression MCC ',mcc)

# fmt='g' prints the cell counts as plain integers, like the other heatmaps.
ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt='g')

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Logistic Regression Confusion Matrix')


# Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

x = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking',]]

y = df[['DEATH_EVENT']]

# Split first, then fit the scaler on the training rows only. Fitting it on the
# full data set would let statistics of the test rows leak into training.
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.3,random_state=0)

rs = RobustScaler()
X_train = rs.fit_transform(X_train)
X_test = rs.transform(X_test)

# Cross-validated search over C and gamma. Each gamma value is listed once; a
# repeated entry only makes the search refit identical candidates.
param_grid = {'C':[0.1,1,10,100,1000],'gamma':[1,0.1,0.01,0.001,0.0001]}
grid = GridSearchCV(SVC(),param_grid,verbose=0)
grid.fit(X_train,Y_train.values.ravel())

predictions = grid.predict(X_test)
cm = confusion_matrix(Y_test, predictions)
mcc = matthews_corrcoef(Y_test,predictions)

# Report the same metric set as the other models (F1 once, plus precision),
# under the model's actual name: Support Vector Machine.
print('\n')
print('Support Vector Machine Accuracy: ', accuracy_score(Y_test,predictions))
print('Support Vector Machine f1-score:', f1_score(Y_test, predictions))
print('Support Vector Machine Recall score: ', recall_score(Y_test,predictions))
print('Support Vector Machine Precision: ', precision_score(Y_test,predictions))
print('Support Vector Machine MCC ',mcc)

ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Support Vector Machine Confusion Matrix')

# K-Nearest Neighbours 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

x = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking',]]

y = df[['DEATH_EVENT']]

# Split first, then fit the scaler on the training rows only. Fitting it on the
# full data set would let statistics of the test rows leak into training.
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.3,random_state=0)

rs = RobustScaler()
X_train = rs.fit_transform(X_train)
X_test = rs.transform(X_test)

# Test-set error rate for k = 1..39. Labels are flattened to 1-D so the
# comparison is element-wise for any test-set size and every entry of
# error_rate is a plain float (no hard-coded reshape to 90 rows).
# NOTE(review): k is chosen on the test set here, so the final KNN score is
# optimistic; a validation split or cross-validation would avoid that.
y_true = Y_test.values.ravel()
error_rate = []
for i in range(1,40):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(X_train,Y_train.values.ravel())
    predictions_i= model.predict(X_test)
    error_rate.append(np.mean(predictions_i != y_true))

plt.figure(figsize=(20,10))
plt.xlabel("K value")
plt.ylabel("Error Rate")
plt.title("Error Rate for K-Neightbors") 
figure = plt.plot(range(1,40),error_rate,color="blue",linestyle ="dashed",marker = "o",markerfacecolor = "red",markersize =20)


In [None]:

# Final KNN model with k = 6, using the train/test split from the cell above.
# The labels are flattened to 1-D, as in the other model cells; passing the
# single-column frame makes scikit-learn emit a DataConversionWarning.
model = KNeighborsClassifier(n_neighbors=6)
model.fit(X_train,Y_train.values.ravel())
predictions= model.predict(X_test)

cm = confusion_matrix(Y_test, predictions)
mcc = matthews_corrcoef(Y_test,predictions)

print('\n')
print('KNN Accuracy:', accuracy_score(Y_test,predictions))
print('KNN f1-score: ', f1_score(Y_test, predictions))
print('KNN Precision:', precision_score(Y_test,predictions))
print('KNN Recall score: ', recall_score(Y_test, predictions))
print('KNN Mathews Coefficient: ',mcc)
print('\n')

ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('KNN Confusion Matrix')

# Decision Trees and Random Forests

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 

x = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking',]]

y = df[['DEATH_EVENT']]

# Split first, then fit the scaler on the training rows only. Fitting it on the
# full data set would let statistics of the test rows leak into training.
# The resulting X_train / X_test / Y_train / Y_test are reused by the Random
# Forest cells below.
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.3, random_state=103)

rs = RobustScaler()
X_train = rs.fit_transform(X_train)
X_test = rs.transform(X_test)

# Seeded so the fitted tree -- and every score below -- is the same on each run.
dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(X_train,Y_train.values.ravel())

predictions = dtree.predict(X_test)

cm = confusion_matrix(Y_test, predictions)
mcc = matthews_corrcoef(Y_test,predictions)

print('\n')
print('Decision Tree Accuracy:', accuracy_score(Y_test,predictions))
print('Decision Tree f1-score: ', f1_score(Y_test, predictions))
print('Decision Tree Precision:', precision_score(Y_test,predictions))
print('Decision Tree Recall score: ', recall_score(Y_test, predictions))
print('Decision Tree Mathews Coefficient: ',mcc)
print('\n')

ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Decision Tree Confusion Matrix')


Random Forests

Choosing the Number of Decision Trees for Random Forests

In [None]:
# Test-set error rate of a random forest for n_estimators = 1, 51, ..., 951,
# using the train/test split from the Decision Tree cell.
# Labels are flattened to 1-D so the comparison is element-wise for any
# test-set size and each entry of error_rate is a plain float (no hard-coded
# reshape to 90 rows). Every forest is seeded so the curve is reproducible and
# differences between points reflect n_estimators rather than random noise.
y_true = Y_test.values.ravel()
error_rate = []
for i in range(1,1000,50):
    model = RandomForestClassifier(n_estimators=i, random_state=0)
    model.fit(X_train,Y_train.values.ravel())
    predictions_i= model.predict(X_test)
    error_rate.append(np.mean(predictions_i != y_true))

plt.figure(figsize=(20,10))
plt.xlabel("N- Estimators")
plt.ylabel("Error Rate")
plt.title("Error Rate for N-Estimators") 
figure = plt.plot(range(1,1000,50),error_rate,color="blue",linestyle ="dashed",marker = "o",markerfacecolor = "red",markersize =20)


In [None]:
# Final random forest with 400 trees, using the train/test split from the
# Decision Tree cell. Seeded so the reported scores are the same on every run.
rfc = RandomForestClassifier(n_estimators = 400, random_state=0)
rfc.fit(X_train,Y_train.values.ravel())
rfc_predictions = rfc.predict(X_test)

cm = confusion_matrix(Y_test, rfc_predictions)
mcc = matthews_corrcoef(Y_test,rfc_predictions)
print('\n')
# The first line is the accuracy; its label now says so.
print('Random Forest Classifier Accuracy: ', accuracy_score(Y_test,rfc_predictions))
print('Random Forest Classifier Precision:', precision_score(Y_test,rfc_predictions))
print('Random Forest Classifier Recall score: ', recall_score(Y_test, rfc_predictions))
print('Random Forest Classifier f1-score', f1_score(Y_test, rfc_predictions))
print('Random Forest Classifier MCC ',mcc)

ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Random Forest Confusion Matrix')

In [None]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

x = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking',]]

y = df[['DEATH_EVENT']]

# Split first, then fit the scaler on the training rows only. Fitting it on the
# full data set would let statistics of the test rows leak into training.
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.3, random_state=103)

rs = RobustScaler()
X_train = rs.fit_transform(X_train)
X_test = rs.transform(X_test)

# Class weights that up-weight the rarer positive class, derived from the
# training labels instead of hard-coded counts: class 0 is weighted by the
# share of positives and class 1 by the share of negatives.
# These belong in fit(class_weight=...); compile(loss_weights=...) weights the
# losses of different model outputs, not the classes of a single output.
positive_share = float(Y_train.values.mean())
class_weight = {0: positive_share, 1: 1.0 - positive_share}

model = Sequential()
model.add(Dense(11,activation="sigmoid"))
model.add(Dense(6,activation="sigmoid"))
# Sigmoid output: "binary_crossentropy" expects probabilities in [0, 1] by
# default, so a linear output layer would feed it unbounded values.
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer='rmsprop',loss = "binary_crossentropy",metrics=["BinaryAccuracy"])
# verbose=0 keeps 450 epochs of progress logs out of the notebook output; the
# per-epoch values remain available in history.history.
history = model.fit(x=X_train,y=Y_train.values,epochs=450,class_weight=class_weight,verbose=0)

# Sequential.predict_classes no longer exists in current Keras; threshold the
# predicted probabilities at 0.5 to obtain 0/1 class labels.
predictions = (model.predict(X_test) > 0.5).astype(int).ravel()

cm = confusion_matrix(Y_test, predictions)
mcc = matthews_corrcoef(Y_test,predictions)
# Report the same metric set as the other models (F1 once, plus precision).
print('\n')
print('Neural Network Accuracy: ', accuracy_score(Y_test,predictions))
print('Neural Network f1-score:', f1_score(Y_test, predictions))
print('Neural Network Recall score: ', recall_score(Y_test,predictions))
print('Neural Network Precision: ', precision_score(Y_test,predictions))
print('Neural Network MCC ',mcc)

ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Neural Network Confusion Matrix')