In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the clinical-records CSV into `data` and preview the first rows.
DATA_PATH = "/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Per-column dtype and non-null count, plus memory usage.
data.info()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Mean of DEATH_EVENT within each `sex` group, highest first.
(
    data
    .groupby('sex', as_index=False)['DEATH_EVENT']
    .mean()
    .sort_values(by='DEATH_EVENT', ascending=False)
)


In [None]:
# Mean of DEATH_EVENT within each `anaemia` group, highest first.
(
    data
    .groupby('anaemia', as_index=False)['DEATH_EVENT']
    .mean()
    .sort_values(by='DEATH_EVENT', ascending=False)
)


In [None]:
# Mean of DEATH_EVENT within each `diabetes` group, highest first.
(
    data
    .groupby('diabetes', as_index=False)['DEATH_EVENT']
    .mean()
    .sort_values(by='DEATH_EVENT', ascending=False)
)


In [None]:
# Mean of DEATH_EVENT within each `smoking` group, highest first.
(
    data
    .groupby('smoking', as_index=False)['DEATH_EVENT']
    .mean()
    .sort_values(by='DEATH_EVENT', ascending=False)
)


In [None]:
# How many rows carry each distinct `age` value, most frequent first.
data['age'].value_counts()

In [None]:
# Replace `age` with an ordinal bucket code:
#   0: <=30, 1: (30, 45], 2: (45, 60], 3: (60, 75], 4: >75
# NOTE: this overwrites the column, so running the cell a second time would
# re-bin the bucket codes themselves.
age_edges = [-np.inf, 30, 45, 60, 75, np.inf]
data['age'] = pd.cut(
    data['age'],
    bins=age_edges,
    labels=False,
    include_lowest=True,
).astype(int)
data.head()

In [None]:
# Mean of DEATH_EVENT within each `age` bucket, highest first.
(
    data
    .groupby('age', as_index=False)['DEATH_EVENT']
    .mean()
    .sort_values(by='DEATH_EVENT', ascending=False)
)


In [None]:
# Pairwise scatter/density grid of every column, coloured by outcome.
# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only left an empty extra figure in the output and
# its figsize never applied to the grid; that call is therefore removed.
sns.pairplot(data, hue='DEATH_EVENT', palette='Set1', corner=True)


In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr = data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
# Percentage of rows per feature that fall outside the Tukey fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR), rounded to two decimals.
outlier_percentage = {}
numeric_features = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets',
                    'serum_creatinine', 'time', 'serum_sodium']
for feature in numeric_features:
    tempData = data.sort_values(by=feature)[feature]
    Q1 = tempData.quantile(0.25)
    Q3 = tempData.quantile(0.75)
    IQR = Q3 - Q1
    Lower_range = Q1 - (1.5 * IQR)
    Upper_range = Q3 + (1.5 * IQR)
    outside_fences = (tempData < Lower_range) | (tempData > Upper_range)
    outlier_percentage[feature] = round((outside_fences.sum() / tempData.shape[0]) * 100, 2)
outlier_percentage

In [None]:
# Rows whose `serum_sodium` value lies strictly inside the Tukey fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# The original expression read `feature`, `Lower_range` and `Upper_range`
# left over from the preceding loop, i.e. whatever the last iteration
# ('serum_sodium') happened to leave behind. The bounds are recomputed here so
# the cell no longer depends on that leaked state.
# NOTE(review): despite its name, `outlier` holds the rows that are NOT
# outliers; confirm whether filtering on a single feature is intended.
fence_feature = 'serum_sodium'
q1, q3 = data[fence_feature].quantile([0.25, 0.75])
iqr = q3 - q1
fence_low = q1 - (1.5 * iqr)
fence_high = q3 + (1.5 * iqr)
inside_fences = (data[fence_feature] > fence_low) & (data[fence_feature] < fence_high)
outlier = data[inside_fences].reset_index(drop=True)


In [None]:
# Separate the feature matrix from the binary target column.
target_col = 'DEATH_EVENT'
X = data.drop(columns=[target_col])
y = data[target_col]

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extra-trees ensemble on all features to obtain importance scores.
# A fixed random_state makes the importance ranking reproducible between
# runs; without it the ranking can change every time the cell is executed.
ex = ExtraTreesClassifier(random_state=18)
ex.fit(X, y)

In [None]:
# Importance score of each feature, in the column order of X.
ex.feature_importances_


In [None]:
# Horizontal bar chart of the (up to) 12 largest feature importances.
feat = pd.Series(ex.feature_importances_, index=X.columns)
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Feature importances')
feat.nlargest(12).plot(kind='barh', color="r", align="center", ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Remove the features that are not kept for modelling (presumably the
# lowest-ranked ones in the importance plot; confirm against its output).
# X was built from `data` before this cell, so dropping columns from `data`
# alone never changed the model inputs. X is rebuilt here so the selection
# actually takes effect. `inplace=True` is avoided so `data` is rebound
# rather than mutated.
low_importance_cols = ['sex', 'high_blood_pressure', 'anaemia', 'age', 'smoking',
                       'diabetes', 'platelets', 'creatinine_phosphokinase', 'serum_sodium']
data = data.drop(columns=low_importance_cols)
X = data.drop(columns=['DEATH_EVENT'])


In [None]:
from sklearn.model_selection import train_test_split ,cross_val_score,RandomizedSearchCV
# Hold out 20% of the rows for testing, preserving the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=18,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise features using statistics learned from the training split only.
# The test split must be transformed with the training mean/std;
# calling fit_transform on it would refit the scaler on test data, leaking
# test statistics and putting train and test on different scales.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from tensorflow import keras

In [None]:
# Early stopping on the 'loss' metric with a patience of 3 epochs.
callback = keras.callbacks.EarlyStopping(
    patience=3,
    monitor='loss',
)

In [None]:
# Single-hidden-layer network: 100 ReLU units feeding one sigmoid output.
hidden_layer = keras.layers.Dense(100, activation='relu')
output_layer = keras.layers.Dense(1, activation='sigmoid')
Model = keras.Sequential([hidden_layer, output_layer])

In [None]:
# Train with Adam on binary cross-entropy; 20% of the training rows are held
# out for validation and the early-stopping callback may end training early.
Model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
history = Model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=128,
    callbacks=[callback],
    verbose=1,
)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Parameters
    ----------
    history : object whose ``history`` attribute maps metric names to
        per-epoch value sequences (as returned by ``fit`` in this notebook).
    string : str
        Metric name; the series ``string`` and ``'val_' + string`` are drawn.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()
  
# Accuracy curve first, then loss curve, for the most recent training run.
for metric in ("acc", "loss"):
    plot_graphs(history, metric)

In [None]:
# Same architecture as the first network, trained with smaller batches
# (10) and at most 20 epochs.
Model1 = keras.Sequential()
Model1.add(keras.layers.Dense(100, activation='relu'))
Model1.add(keras.layers.Dense(1, activation='sigmoid'))
Model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
history = Model1.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=10,
    epochs=20,
    callbacks=[callback],
    verbose=1,
)

In [None]:
# Raw outputs of the second network on the test split; later cells call
# .round() on them to obtain class labels.
y_pred_deep1=Model1.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the network's rounded predictions on the test split.
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns
# are predicted classes. The arguments were previously swapped, which
# transposed the matrix relative to the random-forest plot further down.
dnn_confusion = pd.DataFrame(confusion_matrix(y_test, y_pred_deep1.round()))
sns.heatmap(dnn_confusion, annot=True)


In [None]:
# Precision/recall/F1 summary for the network's rounded test predictions.
deep_labels = y_pred_deep1.round()
print(classification_report(y_test, deep_labels))


In [None]:
from sklearn.linear_model import LogisticRegression ,SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by the
# label used when reporting their scores.
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting Classifier', GradientBoostingClassifier()),
    ('Support Vector Machine', SVC()),
    ('Stochastic Gradien Descent', SGDClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('xgb Classifier', XGBClassifier()),
])

In [None]:
def fit_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. Each estimator is fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels passed to ``score``.

    Returns
    -------
    dict
        ``{name: model.score(X_test, y_test)}``; for scikit-learn classifiers
        ``score`` is the mean accuracy. Empty if ``models`` is empty.
    """
    # Seed NumPy's global RNG so estimators without their own random_state
    # behave the same on every call.
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        # Evaluate the model that was just trained. The previous
        # cross_val_score(model, X_test, y_test) call cloned the estimator and
        # refitted it on folds of the test split, so the training above was
        # never reflected in the reported number.
        model_scores[name] = model.score(X_test, y_test)

    return model_scores

In [None]:
# Score every candidate classifier and display the resulting mapping.
model_scores = fit_score(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
model_scores

In [None]:
# Bar chart of the score obtained by each model.
# The table gets its own name: rebinding `models` to a DataFrame would
# clobber the dict of estimators, so re-running the scoring cell afterwards
# would fail.
model_scores_df = pd.DataFrame(model_scores, index=["accuracy"])
model_scores_df.T.plot.bar(color="blue");

In [None]:
# Hyper-parameter search for the random forest (3-fold CV on the training split).
np.random.seed(18)
# 'auto' is no longer an accepted max_features value in current scikit-learn
# (for classifiers it was an alias of 'sqrt'), so only 'sqrt' and 'log2' are searched.
params = {"n_estimators": [50, 100, 200, 300, 400],
          "max_depth": [3, 5, 7, 9, 10],
          "max_features": ['sqrt', 'log2']}

# The grid is finite: asking for more iterations than there are combinations
# only triggers a warning, so cap n_iter at the grid size.
n_candidates = int(np.prod([len(values) for values in params.values()]))

# Explicit random_state on both the forest and the search: with n_jobs=-1 the
# fits run in worker processes, which the global NumPy seed above may not reach.
rs = RandomizedSearchCV(RandomForestClassifier(random_state=18),
                        param_distributions=params,
                        cv=3,
                        n_iter=min(100, n_candidates),
                        verbose=0,
                        refit=True,
                        n_jobs=-1,
                        random_state=18)

rs.fit(X_train, y_train)

rs.best_params_

In [None]:
# Estimator refitted with the best parameter combination (the search was created with refit=True).
rs.best_estimator_


In [None]:
# Mean cross-validated score of the best parameter combination.
rs.best_score_

In [None]:
# Score of the refitted best estimator on the held-out test split.
rs.score(X_test, y_test)


In [None]:
# Final random forest with fixed hyper-parameters, fitted on the training
# split and used to predict the test split.
# random_state is pinned so the confusion matrix and report below are
# reproducible between runs.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from an
# earlier search result; confirm they still match rs.best_params_.
model = RandomForestClassifier(
    n_estimators=100,
    max_features='log2',
    max_depth=5,
    random_state=18,
)
model.fit(X_train, y_train)
y_pred1 = model.predict(X_test)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class) of the
# random-forest predictions on the test split.
rf_confusion = pd.DataFrame(confusion_matrix(y_test, y_pred1))
sns.heatmap(rf_confusion, annot=True)


In [None]:
# Precision/recall/F1 summary for the random-forest test predictions.
rf_report = classification_report(y_test, y_pred1)
print(rf_report)
