In [None]:
import pandas as pd
import numpy as np
# Location of the heart-failure clinical records CSV, relative to the notebook.
DATA_PATH = "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"

# Load the whole file into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

## Data preprocessing and analysing

In [None]:
# Preview the first rows of the loaded frame to sanity-check columns and values.
df.head()

In [None]:
# Summary statistics per column, used to spot scale differences and extreme values.
df.describe()

## Checking for any skewed data and normalizing it with log transformation

Histograms of each and every feature for checking skewness and outliers


In [None]:
import matplotlib.pyplot as plt

# One histogram per column. The column name is printed alongside each figure
# so the plot can be matched to its feature.
for column_name in df.columns:
    df[column_name].hist()
    print(column_name)
    plt.show()

In [None]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

## Pearson correlation matrix for feature selection: no two features are highly correlated

The correlation matrix is used for feature selection and feature removal. I found no strong correlations between the features, so I have not removed any.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise Pearson correlation between every pair of columns in df.
pearson_corr = df.corr(method='pearson')

# Large canvas so the per-cell annotations stay legible.
plt.figure(figsize=(16, 10))
sns.heatmap(pearson_corr, annot=True)


Log transformation (square root for `platelets`) for normalizing the skewed data

In [None]:
# Column -> element-wise transform applied to reduce skew.
skew_transforms = {
    "serum_creatinine": np.log,
    "platelets": np.sqrt,
    "creatinine_phosphokinase": np.log,
}

# NOTE: each column is overwritten in place, so running this cell a second time
# transforms the already-transformed values. Re-run from the data-loading cell instead.
for column_name, transform in skew_transforms.items():
    df[column_name] = transform(df[column_name])

In [None]:
from scipy.stats import skew

# Print the sample skewness of each transformed column, one value per line,
# in the same order the transforms were applied.
for column_name in ("serum_creatinine", "platelets", "creatinine_phosphokinase"):
    print(skew(df[column_name]))

## standard scaling for better accuracy

Feature scaling for better performance 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictors and binary target. These must exist before the split, and the split
# must exist before scaling, so all three steps live in this one cell in that order
# (the scaling step previously ran before X_train / X_test were defined).
X = df.drop("DEATH_EVENT", axis=1)
y = df["DEATH_EVENT"]

# Hold out 33% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=42
)

# Columns that are standardised and passed on to the model.
features = ['age', 'creatinine_phosphokinase',
            'ejection_fraction', 'platelets',
            'serum_sodium', 'time']

sc = StandardScaler()

# Learn mean / variance on the training rows only, then apply those same
# statistics to the test rows. Re-fitting on the test set would scale it with
# its own statistics and leak test information into the evaluation.
# NOTE(review): from here on X_train / X_test are arrays containing only the
# `features` columns; every other column of X is no longer part of the model
# input — confirm that dropping them is intended.
X_train = sc.fit_transform(X_train[features])
X_test = sc.transform(X_test[features])

## random forest classifier for better predictions

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with hand-picked hyperparameters.
rfc = RandomForestClassifier(
    criterion='gini',
    max_depth=14,
    max_features='log2',
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=130,
    # Fixed seed (same value as the train/test split) so the fitted forest, and
    # every score reported from it, is reproducible across kernel restarts.
    random_state=42,
)
rfc.fit(X_train, y_train)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the forest configuration.
# Note that this runs on the full X / y, not on X_train / y_train.
cv_scores = cross_val_score(estimator=rfc, X=X, y=y, cv=5)


In [None]:
# Show the score obtained on each of the cross-validation folds.
print(cv_scores)

## got an accuracy of 74%

In [None]:
# Score the fitted baseline forest on the held-out test split.
rfc.score(X_test,y_test)

## Hyperparameter tuning for the algorithm, which achieved a 2% increase in the accuracy score

In [None]:
# Hyperparameter search space for the random forest.
grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # raises an invalid-parameter error. For classifiers it meant 'sqrt', so
    # 'sqrt' keeps the same candidate while working on old and new versions.
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 6, 7, 8, 9],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [90, 100, 115, 120, 130],
}
     

In [None]:

from sklearn.model_selection import RandomizedSearchCV

# Randomised search over `grid` with 3-fold cross-validation on all CPU cores.
rf_ = RandomizedSearchCV(
    rfc,
    grid,
    cv=3,
    n_jobs=-1,
    verbose=3,
    # Fixed seed so the same hyperparameter candidates are sampled on every run;
    # without it best_params_ / best_score_ change between kernel restarts.
    random_state=42,
)


In [None]:
# Run the hyperparameter search using the training split only.
rf_.fit(X_train,y_train)

In [None]:
# Score the fitted search object on the held-out test split.
rf_.score(X_test,y_test)

In [None]:
# Best cross-validated score found during the search (computed on training folds).
rf_.best_score_

In [None]:
# Hyperparameter combination that produced the best cross-validated score.
rf_.best_params_