In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the heart-failure clinical records into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

### Check Data for Null values

In [None]:
# Render the boolean missing-value mask as a heatmap; any missing cell shows up in a contrasting colour.
sns.heatmap(data=df.isna(), cbar=False, cmap='viridis')

### No Null values in our Data Frame

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Annotated pairwise-correlation heatmap drawn on an explicitly created 10x8 axes.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, ax=corr_ax)

### Since all our features are numeric, let's check how many of them are discrete and how many are continuous

In [None]:
# A column counts as "discrete" when it has fewer than 25 distinct values
# (a missing value, if present, counts as one value).
discrete_feature = [
    column
    for column in df.columns
    if df[column].nunique(dropna=False) < 25
]

In [None]:
# Show the discrete column names followed by how many there are.
print(discrete_feature)
discrete_total = len(discrete_feature)
print('Total Discrete feature :', discrete_total)

In [None]:
# Bar chart of the outcome rate for every level of each discrete feature.
for feature in discrete_feature:
    if feature == 'DEATH_EVENT':
        # The target is itself a low-cardinality column, so it lands in
        # discrete_feature; plotting it against itself carries no information.
        continue

    # Mean of the outcome = share of deaths per level (assumes DEATH_EVENT is
    # coded 0/1 - confirm). The median of a two-valued column can only be one
    # of the values or their midpoint, which hides the actual rate.
    # Reading from df directly: nothing is mutated, so no per-iteration copy is needed.
    df.groupby(feature)['DEATH_EVENT'].mean().plot.bar()
    plt.xlabel(feature)
    plt.ylabel('Mean DEATH_EVENT')
    plt.title(feature)
    plt.show()

In [None]:
# Outcome counts split by the anaemia column.
sns.countplot(data=df, x='anaemia', hue='DEATH_EVENT')

### People with low anaemia are more prone to death event

In [None]:
# Outcome counts split by the diabetes column.
sns.countplot(data=df, x='diabetes', hue='DEATH_EVENT')

In [None]:
# Outcome counts for each ejection_fraction value (trailing ';' hides the grid repr).
sns.catplot(data=df, kind='count', x='ejection_fraction', hue='DEATH_EVENT');

In [None]:
# Outcome counts split by the high_blood_pressure column (trailing ';' hides the grid repr).
sns.catplot(data=df, kind='count', x='high_blood_pressure', hue='DEATH_EVENT');

In [None]:
# Outcome counts split by the sex column.
sns.countplot(data=df, x='sex', hue='DEATH_EVENT')

In [None]:
# Outcome counts split by the smoking column.
sns.countplot(data=df, x='smoking', hue='DEATH_EVENT')

In [None]:
# Every column that was not classified as discrete is treated as continuous.
contineous_feature = [
    column
    for column in df.columns.tolist()
    if column not in discrete_feature
]

In [None]:
# Show the continuous column names followed by how many there are.
print(contineous_feature)
continuous_total = len(contineous_feature)
print('Total contineous feature count is  : ', continuous_total)

In [None]:
# Re-display the first rows for reference before plotting the continuous columns.
df.head()

In [None]:
# Scatter each continuous feature against the outcome, one figure per feature.
for feature in contineous_feature:
    # Plot straight from df: the loop never mutates the frame, so copying it
    # on every iteration only wasted memory.
    plt.scatter(df[feature], df['DEATH_EVENT'])
    plt.xlabel(feature)
    plt.ylabel('DEATH_EVENT')
    plt.title(feature)
    plt.show()

In [None]:
# Separate the target column from the predictors.
y_scale = df['DEATH_EVENT']
X_scale = df.drop(columns='DEATH_EVENT')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Scaler instance shared by the scaling cells below (fitted there, not here).
sc = StandardScaler()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scale,
    y_scale,
    random_state=0,
    test_size=0.3,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply them to both splits.
sc.fit(X_train)
X_train_scaled, X_test_scaled = sc.transform(X_train), sc.transform(X_test)

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

In [None]:
# Fit a Lasso-backed selector on the scaled training data.
feature_for_model = SelectFromModel(
    estimator=Lasso(alpha=0.05, random_state=0),
)
feature_for_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Boolean mask of the features retained by the fitted selector.
feature_for_model.get_support()

In [None]:
# Column names to read against the selector mask. The predictor frame is
# `X_scale`; `X` is not defined at this point and raised NameError on a
# fresh kernel.
X_scale.columns

In [None]:
#mask = feature_for_model.get_support()
#X_train = X_train.columns[mask]

In [None]:
X = X[['age','ejection_fraction','serum_creatinine','serum_sodium','time']]
y = df['DEATH_EVENT']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scale, y_scale, test_size=0.3, random_state=0)

In [None]:
# Refit the scaler on the current training split and standardise both splits with those statistics.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline classifier: logistic regression with library-default settings.
model = LogisticRegression()

In [None]:
# Train the baseline on the scaled training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out split.
y_pred = model.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Per-class precision / recall / F1 for the baseline.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix for the baseline: rows are true labels, columns are predictions.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
 #Randomized Search CV

# Candidate values for the random-forest randomized search.

# Number of trees in the forest: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split. 'auto' is no longer accepted
# by current scikit-learn forests (for classifiers it was an alias of 'sqrt',
# so it only duplicated that candidate); use two distinct valid strategies.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 5, 10]
# Method of selecting samples for training each tree
# bootstrap = [True, False]


In [None]:
# Bundle the candidate lists into the parameter space handed to the search.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
# Base estimator for the search. Seeded so that each candidate's forest (and
# therefore the search outcome) is reproducible across runs; the search's own
# random_state does not seed the estimator.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Randomized search over random_grid: 100 sampled parameter combinations,
# each evaluated with 5-fold cross-validation.
# Scored with accuracy rather than the regression metric
# 'neg_mean_squared_error'. Assuming the target is coded 0/1 (confirm), both
# rank candidates the same way, but accuracy makes best_score_ and
# cv_results_ directly interpretable for a classifier.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='accuracy',
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the search on the training split.
rf_random.fit(X=X_train, y=y_train)

In [None]:
# Parameter combination that scored best during the search.
rf_random.best_params_

In [None]:
# Predict the held-out split with the fitted search object.
y_pred = rf_random.predict(X=X_test)

In [None]:
# confusion_matrix expects (y_true, y_pred). With the arguments swapped the
# matrix was transposed (false positives and false negatives exchanged) and
# inconsistent with the logistic-regression cell.
print(confusion_matrix(y_test, y_pred))

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped
# precision and recall were exchanged and support counted predictions
# instead of true labels.
print(classification_report(y_test, y_pred))

In [None]:
# Best parameter combination again (same attribute as displayed right after fitting).
rf_random.best_params_

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted classifier with library-default settings.
# NOTE: rebinds `model`, replacing the logistic-regression estimator defined earlier.
model = XGBClassifier()

In [None]:
# Train the boosted model on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out split from the boosted model.
y_pred = model.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 for the boosted model.
print(classification_report(y_true=y_test, y_pred=y_pred))