In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ANALYSING DATA

In [None]:
# Load the heart-failure clinical records CSV into a DataFrame.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the first rows of the loaded table (head() defaults to 5 rows).
df.head()

In [None]:
# Separate the target column from the predictor columns.
TARGET = "DEATH_EVENT"
x = df.drop(columns=[TARGET])
y = df[TARGET]

In [None]:
# Count the missing values in each column.
df.isna().sum()

NO NULL VALUES PRESENT

In [None]:
# Display summary statistics for the columns of the table.
df.describe()

In [None]:
# Collect the columns that take exactly two distinct values, then print their names.
binary_columns = [column for column in df.columns if df[column].nunique() == 2]
for column in binary_columns:
    print(column)

We can see that these 6 features have only 2 unique values.

In [None]:
import plotly.express as px

# Histogram of age, colored by DEATH_EVENT, with a box plot in the margin.
# Every column is exposed in the hover tooltip.
fig = px.histogram(
    data_frame=df,
    x="age",
    color="DEATH_EVENT",
    marginal="box",
    hover_data=df.columns,
)
fig.show()

**Here we can see that the largest number of deaths falls in the 58-62 age range.
As age increases, the chance of dying increases relative to the chance of surviving.**

In [None]:
import plotly.express as px

# Histogram of the `time` column, colored by DEATH_EVENT, with a box plot
# in the margin. Every column is exposed in the hover tooltip.
fig = px.histogram(
    data_frame=df,
    x="time",
    color="DEATH_EVENT",
    marginal="box",
    hover_data=df.columns,
)
fig.show()

**Here we can see that the largest number of patients died within 20-39 days of the follow-up period.
Another thing to notice is that if a patient survives more than 80 days, their chances of surviving are much higher than their chances of dying.**

In [None]:
import plotly.graph_objs as go

# Split the patients on the binary `sex` column; the labels below treat
# 1 as male and 0 as female.
male = df[df["sex"] == 1]
female = df[df["sex"] == 0]

labels = ['Male - Survived','Male - Not Survived', "Female -  Survived", "Female - Not Survived"]

# Count outcomes inside each subset with a mask built from that same subset.
# Masking a subset with df["DEATH_EVENT"] (indexed on the full frame) makes
# pandas reindex the mask and emit a "Boolean Series key will be reindexed"
# warning; the counts are the same either way.
values = [
    len(male[male["DEATH_EVENT"] == 0]),
    len(male[male["DEATH_EVENT"] == 1]),
    len(female[female["DEATH_EVENT"] == 0]),
    len(female[female["DEATH_EVENT"] == 1]),
]

# Donut chart (hole=.4) with one slice per sex/outcome combination.
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig.update_layout(title_text="Analysis on Survival - Gender")
fig.show()

In [None]:
import plotly.graph_objs as go

# Split the patients on the binary `diabetes` column; the labels below treat
# 1 as diabetic and 0 as non-diabetic.
diabetic = df[df["diabetes"] == 1]
non_diabetic = df[df["diabetes"] == 0]

labels = ['diabetic - Survived','diabetic - Not Survived', "non diabetic -  Survived", "non diabetic - Not Survived"]

# Count outcomes inside each subset with a mask built from that same subset.
# Masking a subset with df["DEATH_EVENT"] (indexed on the full frame) makes
# pandas reindex the mask and emit a "Boolean Series key will be reindexed"
# warning; the counts are the same either way.
values = [
    len(diabetic[diabetic["DEATH_EVENT"] == 0]),
    len(diabetic[diabetic["DEATH_EVENT"] == 1]),
    len(non_diabetic[non_diabetic["DEATH_EVENT"] == 0]),
    len(non_diabetic[non_diabetic["DEATH_EVENT"] == 1]),
]

# Donut chart (hole=.4) with one slice per diabetes/outcome combination.
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
# The title previously said "Gender" (copied from the gender chart); this
# chart is about diabetes.
fig.update_layout(title_text="Analysis on Survival - Diabetes")
fig.show()

**The chances of surviving without diabetes are greater than the chances of surviving with diabetes**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between all columns, drawn as an annotated heatmap
# on a 20x20 inch figure with the color scale centered at zero.
correlations = df.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(correlations, annot=True, center=0)

**Not much correlation is seen between the features**

# APPLYING VARIOUS MODELS

Splitting the data into test set and train set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(x, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

Applying Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

**Applying Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit logistic regression on the training split and predict the test split.
classifier1 = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = classifier1.predict(X_test)

# Print the confusion matrix; the accuracy is the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**Applying K-nearest Neighbours**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier; the Minkowski metric with p=2 is the
# Euclidean distance.
classifier2 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
y_pred = classifier2.predict(X_test)

# Print the confusion matrix; the accuracy is the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**Applying SVM**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
classifier3 = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
y_pred = classifier3.predict(X_test)

# Print the confusion matrix; the accuracy is the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**Applying Kernel SVM**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel.
classifier4 = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
y_pred = classifier4.predict(X_test)

# Print the confusion matrix; the accuracy is the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**Applying Naive Bayes**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
classifier5 = GaussianNB().fit(X_train, y_train)
y_pred = classifier5.predict(X_test)

# Print the confusion matrix; the accuracy is the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**Applying Decision Tree classification**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Single decision tree that splits on the entropy criterion.
classifier6 = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
y_pred = classifier6.predict(X_test)

# Print the confusion matrix; the accuracy is the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**Applying Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Random forest of 11 entropy-criterion trees with a fixed seed.
classifier7 = RandomForestClassifier(
    n_estimators=11,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
y_pred = classifier7.predict(X_test)

# Print the confusion matrix; the accuracy is the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**Applying XGBoost**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default hyperparameters.
classifier8 = XGBClassifier().fit(X_train, y_train)
y_pred = classifier8.predict(X_test)

# Print the confusion matrix; the accuracy is the cell's displayed value.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

**WE CAN SEE ACCURACY FROM VARIOUS MODELS IS:-**
* Logistic Regression - 77.3%
* K-nearest Neighbours - 73.3%
* SVM - 78.6%
* Kernel SVM - 81.3%
* Naive Bayes - 66.6%
* Decision Tree - 81.3%
* Random forest - 90.6%
* XGBoost - 84%

**HIGHEST ACCURACY IS FROM RANDOM FOREST CLASSIFICATION WHICH IS 90.6%**