In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
## Load the heart-failure clinical records CSV from the Kaggle input directory
data=pd.read_csv('../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
## (rows, columns) of the loaded frame
data.shape

In [None]:
## Percentage of missing values per column, rounded to two decimals
null_fraction = data.isnull().sum() / len(data)
(100 * null_fraction).round(2)

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
## Number of rows per class of the target column
data['DEATH_EVENT'].value_counts()

### So we have a class imbalance problem here.
### Let's also do some visualization to better understand the data.

In [None]:
# Bar chart of the target classes.
# The series is passed as `x=` because seaborn >= 0.12 binds the first
# positional argument to `data`, not `x`.
sns.countplot(x=data['DEATH_EVENT'])
plt.show()

In [None]:
# List the column names available for plotting and modelling.
data.columns

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = data.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
sns.boxplot(data['DEATH_EVENT'],data['age'])
plt.show()

In [None]:
sns.boxplot(data['DEATH_EVENT'],data['serum_creatinine'])
plt.show()

In [None]:
# Distribution plot of raw serum creatinine.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True) once the target seaborn version is confirmed.
sns.distplot(data['serum_creatinine'])

In [None]:
# Horizontal box plot of serum creatinine.
# Passed as `x=` so newer seaborn does not interpret the series as `data`
# (which would change the plot's orientation).
sns.boxplot(x=data['serum_creatinine'])

In [None]:
# Summary statistics of serum creatinine as currently stored in the frame.
data['serum_creatinine'].describe()

In [None]:
# Distribution of the natural log of serum creatinine.
sns.distplot(np.log(data['serum_creatinine']))

In [None]:
# Overwrite the column with its natural log.
# NOTE(review): not idempotent — re-running this cell applies the log a second
# time; re-run the read_csv cell first if this step has to be repeated.
data['serum_creatinine']=np.log(data['serum_creatinine'])

In [None]:
# Shape check after the transform (the column is overwritten, not added).
data.shape

In [None]:
sns.boxplot(data['DEATH_EVENT'],data['serum_creatinine'],hue=data['sex'])

In [None]:
sns.boxplot(data['DEATH_EVENT'],data['serum_creatinine'],hue=data['smoking'])

In [None]:
# Pairwise relationship plots across all columns of the frame.
sns.pairplot(data)

In [None]:
# Scatter of creatinine phosphokinase against platelets.
# x/y are passed by keyword; seaborn >= 0.12 rejects positional x, y.
sns.scatterplot(x=data['creatinine_phosphokinase'],y=data['platelets'])

In [None]:
##X,y split
X=data.drop('DEATH_EVENT',axis=1)

In [None]:
X.shape

In [None]:
y=data['DEATH_EVENT']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 shuffled split with a fixed seed.
# NOTE(review): the split is not stratified; with an imbalanced target consider
# stratify=y — confirm first, since it changes every downstream score.
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.8,random_state=42,shuffle=True)

In [None]:
# Shapes of the four split outputs.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Class counts in the train and test targets.
y_train.value_counts() , y_test.value_counts()

## Let's handle the class imbalance

In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training split with seeded SMOTE; the test split is not resampled.
# NOTE(review): SMOTE runs here on unscaled features, before the StandardScaler
# step further down — confirm this ordering is intended.
smk=SMOTE(random_state=42)
X_train_res,y_train_res=smk.fit_resample(X_train,y_train)

In [None]:
# Class counts before resampling.
y_train.value_counts()

In [None]:
# Class counts after resampling.
y_train_res.value_counts()

In [None]:
##scaling
from sklearn.preprocessing import StandardScaler

In [None]:
scaler=StandardScaler()

In [None]:
# Fit the scaler on the resampled training features and scale them.
# NOTE(review): this rebinds X_train to the scaled, resampled data, so the SMOTE
# cell cannot be re-run afterwards without re-running the train/test split.
X_train=scaler.fit_transform(X_train_res)

In [None]:
# Apply the scaling learned on the training data to the test features.
# NOTE(review): X_test is overwritten, so re-running this cell scales it twice.
X_test=scaler.transform(X_test)

In [None]:
# Shapes of the scaled train/test features and the resampled training target.
X_train.shape,X_test.shape,y_train_res.shape

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings.
log1=LogisticRegression()

In [None]:
# Train on the scaled, SMOTE-resampled training data.
log1.fit(X_train,y_train_res)

In [None]:
# Hard class predictions for the scaled test set.
y_test_pred=log1.predict(X_test)

In [None]:
from sklearn import metrics

In [None]:
metrics.accuracy_score(y_test,y_test_pred)

In [None]:
metrics.confusion_matrix(y_test,y_test_pred)

In [None]:
metrics.roc_auc_score(y_test,y_test_pred)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# classification_report expects (y_true, y_pred); with the arguments reversed
# the reported precision and recall are swapped and support counts predictions.
print(classification_report(y_test,y_test_pred))

## DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtc=DecisionTreeClassifier(max_depth=5,min_samples_leaf=25)

In [None]:
dtc.fit(X_train,y_train_res)

In [None]:
y_test_pred=dtc.predict(X_test)

In [None]:
metrics.accuracy_score(y_test_pred,y_test)

In [None]:
metrics.confusion_matrix(y_test_pred,y_test)

In [None]:
print(classification_report(y_test_pred,y_test))

In [None]:
metrics.roc_auc_score(y_test_pred,y_test)

## Hyperparameter Tuning

In [None]:
## Hyper parameter tuning
from sklearn.model_selection import GridSearchCV

In [None]:
dt=DecisionTreeClassifier(random_state=42)

In [None]:
params={'max_depth': [3,5,7,9,12],'min_samples_leaf': [10,25,30,60,90],'criterion': ["gini", "entropy"]}

In [None]:
cv_model=GridSearchCV(estimator=dt,param_grid=params,cv=5,n_jobs=-1,verbose=1,scoring='accuracy')

In [None]:
%%time
cv_model.fit(X_train,y_train_res)

In [None]:
cv_model.best_estimator_

In [None]:
cv_model.best_params_

In [None]:
# Decision tree with fixed hyperparameters.
# NOTE(review): presumably copied from the grid-search output above — confirm
# they still match cv_model.best_params_ after a fresh run.
dtc2=DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=25,
                       random_state=42)

In [None]:
# Train on the scaled, SMOTE-resampled training data.
dtc2.fit(X_train,y_train_res)

In [None]:
# Overwrites y_test_pred with this tree's test predictions.
y_test_pred=dtc2.predict(X_test)

In [None]:
metrics.accuracy_score(y_test,y_test_pred)

In [None]:
metrics.confusion_matrix(y_test,y_test_pred)

In [None]:
metrics.roc_auc_score(y_test,y_test_pred)

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc=RandomForestClassifier(n_estimators=200,max_depth=3,min_samples_leaf=25)

In [None]:
rfc.fit(X_train,y_train_res)

In [None]:
y_test_pred=rfc.predict(X_test)

In [None]:
metrics.accuracy_score(y_test_pred,y_test)

## Hyperparameter Tuning

In [None]:
rf=RandomForestClassifier(random_state=42)

In [None]:
params={'max_depth':[3,6,9,10],'min_samples_leaf':[10,15,20,30],'max_features':[2,3,4,6,8,12],'n_estimators':[10,50,100,200,300]}

In [None]:
grid_search=GridSearchCV(estimator=rf,param_grid=params,cv=4,n_jobs=-1,verbose=1,scoring='accuracy')

In [None]:
%%time
grid_search.fit(X_train,y_train_res)

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
score=pd.DataFrame(grid_search.cv_results_)

In [None]:
score

In [None]:
# Random forest with fixed hyperparameters.
# NOTE(review): presumably copied from the grid-search output above — confirm
# they still match grid_search.best_params_ after a fresh run.
rfc2=RandomForestClassifier(max_depth=6, max_features=3, min_samples_leaf=15,
                       random_state=42,n_estimators=100)

In [None]:
# Train on the scaled, SMOTE-resampled training data.
rfc2.fit(X_train,y_train_res)

In [None]:
# Overwrites y_test_pred with this forest's test predictions.
y_test_pred=rfc2.predict(X_test)

In [None]:
metrics.accuracy_score(y_test_pred,y_test)

In [None]:
metrics.confusion_matrix(y_test_pred,y_test)

In [None]:
metrics.roc_auc_score(y_test_pred,y_test)

In [None]:
# Predict for one patient given in raw clinical units.
# rfc2 was trained on features where serum_creatinine is log-transformed and
# every column is standardized by `scaler`, so the raw values must go through
# the same two steps before prediction; feeding raw numbers gives a meaningless result.
sample = pd.DataFrame(
    [[75,0,582,0,20,1,265800,1.9,130,1,0,3]],
    columns=X.columns,  # same feature order as the training matrix
)
sample['serum_creatinine'] = np.log(sample['serum_creatinine'])
rfc2.predict(scaler.transform(sample))