In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns

In [None]:
# Path of the heart-failure clinical records CSV that this notebook analyses.
csv_path = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
# Load the whole table; every later cell derives its inputs from `data`.
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five records to check the columns and value ranges.
data.head(n=5)

In [None]:
# Column labels of the raw table, in file order, as a plain Python list.
col_list = data.columns.tolist()

In [None]:
# Count the records in each outcome class. value_counts() orders its result by
# frequency, so sort by class label instead and take the bar labels from the index:
# each bar is then guaranteed to carry the label of the class it actually counts.
cnt = data['DEATH_EVENT'].value_counts().sort_index()
plt.bar(cnt.index.astype(str), cnt.values)
plt.xlabel('DEATH_EVENT')
plt.ylabel('Number of records')
plt.title('Class distribution of DEATH_EVENT');

The bar chart shows that the two outcome classes are imbalanced. This imbalance has to be taken into account when training machine learning models, so we will apply resampling.

In [None]:
# Separate predictors from the target: X keeps every column except the outcome,
# y is the DEATH_EVENT label column.
target_col = 'DEATH_EVENT'
X = data.drop(columns=[target_col])
y = data[target_col]

## Data Scaling

In [None]:
# Standardise every feature column (StandardScaler removes the mean and scales to
# unit variance by default).
scale = StandardScaler()
# Take the column labels and row index straight from X instead of slicing col_list:
# slicing assumed the target was the last column of the CSV and would silently
# mislabel the features if the column order ever changed.
# NOTE(review): this scaler is fit on every row, which is fine for whole-dataset
# exploration; a model scored on a hold-out set should fit its scaler on the
# training rows only.
scaled_data = pd.DataFrame(scale.fit_transform(X), columns=X.columns, index=X.index)

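We scale the data in order to avoid bias during modelling: without scaling, features measured on larger numeric ranges can dominate models that are sensitive to feature magnitude.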

## Finding Correlation

In [None]:
# Pairwise correlation matrix of the standardised features.
cor = scaled_data.corr()
# Draw it as an annotated heat map on a square 10x10 canvas; the colour scale is
# pinned to the full correlation range [-1, 1].
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cor, vmin=-1, vmax=1, cmap='RdYlGn', annot=True, ax=ax)

From the correlation heat map we can see that there is very little correlation between the features of the data.

## Splitting data into Training and Testing

In [None]:
# Hold out 20% of the records for testing.
# - random_state pins the shuffle so the metrics below are reproducible on re-run.
# - stratify=y keeps the DEATH_EVENT class ratio the same in both subsets, which
#   matters because the classes are imbalanced.
# The split is taken on the unscaled features so that the scaler can be fit on the
# training rows alone.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse those statistics for the test
# rows; fitting on all rows would leak test-set means/variances into training.
scale = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    scale.transform(X_train_raw), columns=X_train_raw.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scale.transform(X_test_raw), columns=X_test_raw.columns, index=X_test_raw.index
)

## Machine Learning Models

### Logistic Regression

In [None]:
# Train a logistic-regression classifier with scikit-learn's default settings
# (fit() returns the fitted estimator), then predict labels for the held-out rows.
logi = LogisticRegression().fit(X_train, y_train)
y_pred = logi.predict(X_test)

In [None]:
# Share of held-out records whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Counts of held-out records by (true class, predicted class): rows are the true
# labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score (harmonic mean of precision and recall) for the positive class,
# DEATH_EVENT == 1, which is f1_score's default for binary labels.
f1_score(y_true=y_test, y_pred=y_pred)

### Decision Tree

In [None]:
# Decision tree with default hyper-parameters. random_state is pinned because the
# tree's split search shuffles the candidate features, so an unseeded tree can give
# different predictions (and metrics) each time the notebook is re-run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
# Predicted labels for the held-out rows; scored by the metric cells that follow.
y_pred = decision_tree.predict(X_test)

In [None]:
# Accuracy of the decision tree's predictions on the held-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the decision tree for the positive class (DEATH_EVENT == 1).
f1_score(y_true=y_test, y_pred=y_pred)

### Random Forest

In [None]:
# Random forest with default hyper-parameters. random_state is pinned because the
# forest draws bootstrap samples and random feature subsets; without a seed the
# reported metrics change on every re-run.
r_forest = RandomForestClassifier(random_state=42)
r_forest.fit(X_train, y_train)
# Predicted labels for the held-out rows; scored by the metric cells that follow.
y_pred = r_forest.predict(X_test)

In [None]:
# Accuracy of the random forest's predictions on the held-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the random forest for the positive class (DEATH_EVENT == 1).
f1_score(y_true=y_test, y_pred=y_pred)

# Resampling Data 

In [None]:
# Number of records per outcome class. value_counts() orders by frequency, so look
# each count up by its class label instead of tuple-unpacking: unpacking would bind
# class_0 to whichever class happens to be the most frequent.
class_counts = data['DEATH_EVENT'].value_counts()
class_0 = int(class_counts.loc[0])
class_1 = int(class_counts.loc[1])

# Records of each class, kept separately so they can be resampled independently.
df_class_0 = data[data['DEATH_EVENT'] == 0]
df_class_1 = data[data['DEATH_EVENT'] == 1]

In [None]:
# Oversample class 1 (with replacement) up to the size of class 0.
# - The draw is taken from the original class-1 records in `data` rather than from
#   the current df_class_1, so re-running this cell does not resample an
#   already-resampled frame.
# - random_state pins the draw so the balanced dataset is reproducible.
df_class_1 = (
    data[data['DEATH_EVENT'] == 1]
    .sample(n=class_0, replace=True, random_state=42)
    .reset_index(drop=True)
)
# Stack both classes. reset_index() without drop=True keeps the previous row labels
# in an 'index' column; the feature-selection cell below removes that column.
resampled_data = pd.concat([df_class_0, df_class_1], axis=0).reset_index()

In [None]:
# Class counts after resampling, ordered by class label so each bar is labelled
# with the class it actually counts (value_counts() alone orders by frequency).
cnt = resampled_data['DEATH_EVENT'].value_counts().sort_index()
plt.bar(cnt.index.astype(str), cnt.values)
plt.xlabel('DEATH_EVENT')
plt.ylabel('Number of records')
plt.title('Class distribution of DEATH_EVENT after resampling');

The data is now balanced.

In [None]:
# Features of the resampled table: everything except the target and the leftover
# 'index' column produced when the frame's index was reset.
non_feature_cols = ['DEATH_EVENT', 'index']
X = resampled_data.drop(columns=non_feature_cols)
# Target labels of the resampled table.
y = resampled_data.DEATH_EVENT

In [None]:
# Standardise the resampled features (zero mean, unit variance per column).
scale = StandardScaler()
# Label the result with X's own columns and index rather than col_list[:-1], which
# only matches when the target is the last column of the original CSV.
# NOTE(review): this scaler is fit on every resampled row; a model scored on a
# hold-out set should fit its scaler on the training rows only.
scaled_data = pd.DataFrame(scale.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Split the ORIGINAL records first, then balance only the training fold.
# Splitting the already-oversampled table would put copies of the same patient in
# both train and test, so the test metrics would reward memorisation. The fully
# resampled frame built in the cells above still illustrates the balanced class
# counts, but is deliberately not used for modelling here.
# random_state makes the split reproducible; stratify keeps the class ratio equal
# in both folds.
train_df, test_df = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data['DEATH_EVENT']
)

# Oversample class 1 (with replacement) up to the size of class 0, training fold only.
train_class_0 = train_df[train_df['DEATH_EVENT'] == 0]
train_class_1 = train_df[train_df['DEATH_EVENT'] == 1]
train_balanced = pd.concat(
    [
        train_class_0,
        train_class_1.sample(n=len(train_class_0), replace=True, random_state=42),
    ],
    ignore_index=True,
)

# Fit the scaler on the training fold only and apply the same statistics to the
# untouched test fold, so no test-set information reaches the models.
feature_cols = [col for col in data.columns if col != 'DEATH_EVENT']
scale = StandardScaler().fit(train_balanced[feature_cols])
X_train = pd.DataFrame(
    scale.transform(train_balanced[feature_cols]),
    columns=feature_cols,
    index=train_balanced.index,
)
X_test = pd.DataFrame(
    scale.transform(test_df[feature_cols]),
    columns=feature_cols,
    index=test_df.index,
)
y_train = train_balanced['DEATH_EVENT']
y_test = test_df['DEATH_EVENT']

In [None]:
# Logistic regression (default settings) trained on the current training split;
# fit() returns the fitted estimator, which then labels the held-out rows.
logi = LogisticRegression().fit(X_train, y_train)
y_pred = logi.predict(X_test)

In [None]:
# Accuracy of the logistic-regression predictions on the held-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the logistic regression: rows are the true labels,
# columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the logistic regression for the positive class (DEATH_EVENT == 1).
f1_score(y_true=y_test, y_pred=y_pred)

### Decision Tree

In [None]:
# Decision tree with default hyper-parameters, trained on the current training
# split. random_state is pinned because the split search shuffles candidate
# features; unseeded, the metrics differ between runs.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
# Predicted labels for the held-out rows; scored by the metric cells that follow.
y_pred = decision_tree.predict(X_test)

In [None]:
# Accuracy of the decision tree's predictions on the held-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the decision tree for the positive class (DEATH_EVENT == 1).
f1_score(y_true=y_test, y_pred=y_pred)

### Random Forest

In [None]:
# Random forest with default hyper-parameters, trained on the current training
# split. random_state is pinned because bootstrap sampling and feature
# sub-sampling are random; unseeded, the metrics differ between runs.
r_forest = RandomForestClassifier(random_state=42)
r_forest.fit(X_train, y_train)
# Predicted labels for the held-out rows; scored by the metric cells that follow.
y_pred = r_forest.predict(X_test)

In [None]:
# Accuracy of the random forest's predictions on the held-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the random forest for the positive class (DEATH_EVENT == 1).
f1_score(y_true=y_test, y_pred=y_pred)