In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

In [None]:
# Load the heart-failure clinical records and preview the first rows.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count missing values per column, keep only the columns that have any,
# and show how many such columns exist (0 means the data set is complete).
null_counts = df.isnull().sum()
is_null = null_counts[null_counts > 0]
len(is_null)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Class balance of the target column, shown as a one-column frame.
df['DEATH_EVENT'].value_counts().to_frame()

In [None]:
# One boxplot per continuous feature, split by DEATH_EVENT, laid out on a 2x3 grid.
boxplot_panels = [
    ('age', 'Distribution of Age'),
    ('creatinine_phosphokinase', 'Distribution of creatinine_phosphokinase'),
    ('ejection_fraction', 'Distribution of ejection_fraction'),
    ('platelets', 'Distribution of platelets'),
    ('serum_creatinine', 'Distribution of serum_creatinine'),
    ('serum_sodium', 'Distribution of serum_sodium'),
]

plt.figure(figsize=(15, 12))
for position, (feature, panel_title) in enumerate(boxplot_panels, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x='DEATH_EVENT', y=feature, data=df)
    plt.title(panel_title)

In [None]:
def _plot_split_distribution(position, value_col, split_col, dist_labels, mean_labels, ylim=None):
    """Draw one panel of the 2x2 grid: `value_col` distributions for split_col == 1 and == 0.

    Parameters
    ----------
    position : int
        1-based subplot index inside the 2x2 grid.
    value_col : str
        Column of ``df`` whose distribution is plotted.
    split_col : str
        Binary (0/1) column of ``df`` used to form the two groups.
    dist_labels : tuple of str
        Legend labels for the (== 1, == 0) distributions.
    mean_labels : tuple of str
        Legend labels for the (== 1, == 0) mean markers.
    ylim : tuple of float, optional
        ``(bottom, top)`` y-axis limits; left automatic when omitted.
    """
    plt.subplot(2, 2, position)
    group_one = df.loc[df[split_col] == 1, value_col]
    group_zero = df.loc[df[split_col] == 0, value_col]
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
    # here so the rendered figure does not change.
    sns.distplot(group_one, label=dist_labels[0])
    sns.distplot(group_zero, label=dist_labels[1])
    # Full-height vertical markers at each group's mean.
    plt.axvline(group_one.mean(), color='black', label=mean_labels[0])
    plt.axvline(group_zero.mean(), color='blue', label=mean_labels[1])
    if ylim is not None:
        plt.ylim(*ylim)
    plt.legend()


outcome_labels = ('DEATH_EVENT TRUE', 'DEATH_EVENT FALSE')
outcome_means = ('Mean for TRUE', 'Mean for FALSE')

plt.figure(figsize=(15, 7))
_plot_split_distribution(1, 'age', 'DEATH_EVENT', outcome_labels, outcome_means, ylim=(0, 0.04))
_plot_split_distribution(2, 'time', 'DEATH_EVENT', outcome_labels, outcome_means)
_plot_split_distribution(3, 'age', 'sex', ('Men', 'Women'), ('Mean for Men', 'Mean for Women'), ylim=(0, 0.04))
# This panel is split by DEATH_EVENT, so it carries the outcome labels
# (it was previously mislabelled 'SMOKER TRUE' / 'SMOKER FALSE').
_plot_split_distribution(4, 'ejection_fraction', 'DEATH_EVENT', outcome_labels, outcome_means);

In [None]:
from scipy import stats
def welch_test(a, b, alpha=0.05):
    """Run Welch's unequal-variance t-test on two samples and print the verdict.

    Parameters
    ----------
    a, b : array-like
        The two independent samples to compare.
    alpha : float, default 0.05
        Significance level; the null hypothesis of equal means is rejected
        when the p-value is less than or equal to ``alpha``.

    Returns
    -------
    tuple
        ``(t_value, p_value)`` as returned by ``scipy.stats.ttest_ind``.
        Both are NaN when the test cannot be evaluated (for example when a
        sample contains NaN); in that case an "inconclusive" message is
        printed instead of a verdict.
    """
    t_value, p_value = stats.ttest_ind(a, b, equal_var=False)
    if np.isnan(p_value):
        # Any comparison with NaN is False, so without this guard an undefined
        # p-value would fall through to the "significant difference" branch.
        print('Test is inconclusive - p-value is undefined (NaN)')
    elif p_value > alpha:
        print('Null-Hypotheses cannot be rejected - both groups are equal')
    else:
        print('Null-Hypotheses can be rejected - significant difference between the two groups')
    return (t_value, p_value)

In [None]:
# Welch's t-test: does mean age differ between the sex == 0 and sex == 1 groups?
age_sex0 = df.loc[df['sex'] == 0, 'age']
age_sex1 = df.loc[df['sex'] == 1, 'age']
t, p = welch_test(age_sex0, age_sex1)
print(f'T-Value: {t}, P-Value: {p}')

In [None]:
# Welch's t-test on ejection_fraction between DEATH_EVENT == 0 and DEATH_EVENT == 1.
# Both samples are split on the SAME column so the groups are disjoint and
# independent; the earlier version compared DEATH_EVENT == 0 with smoking == 1,
# two overlapping groups, which makes the test result meaningless.
ef_no_event = df.loc[df['DEATH_EVENT'] == 0, 'ejection_fraction']
ef_event = df.loc[df['DEATH_EVENT'] == 1, 'ejection_fraction']
t, p = welch_test(ef_no_event, ef_event)
print(f'T-Value: {t}, P-Value: {p}')

In [None]:
# Annotated pairwise correlation matrix of every column, on a fixed [-1, 1] colour scale.
correlations = df.corr()
fig, ax = plt.subplots(figsize=(16, 6))
heatmap = sns.heatmap(correlations, vmin=-1, vmax=1, annot=True, cmap='BrBG', ax=ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [None]:
# Contingency table of sex vs. smoking, shown both as a heatmap and as a table.
contigency = pd.crosstab(df['sex'], df['smoking'])
# fmt='d' prints the integer counts as-is; the default '.2g' format would
# render any count >= 100 in scientific notation (e.g. 1e+02).
sns.heatmap(contigency, annot=True, fmt='d', cmap="YlGnBu");
contigency

In [None]:
# Contingency table of DEATH_EVENT vs. smoking, shown both as a heatmap and as a table.
contigency = pd.crosstab(df['DEATH_EVENT'], df['smoking'])
# fmt='d' prints the integer counts as-is; the default '.2g' format would
# render any count >= 100 in scientific notation (e.g. 1e+02).
sns.heatmap(contigency, annot=True, fmt='d', cmap="YlGnBu");
contigency

# Logistic Regression

In [None]:
from sklearn import preprocessing

# Separate the feature matrix from the DEATH_EVENT target, then standardise
# every feature column to zero mean and unit variance.
feature_frame = df.drop(['DEATH_EVENT'], axis=1)
y_data = df['DEATH_EVENT'].values
X_data = preprocessing.StandardScaler().fit_transform(feature_frame.values)
# Peek at the first two standardised rows.
X_data[0:2]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the RAW features first and fit the scaler on the training rows only.
# Standardising with statistics computed over the whole data set lets the
# test rows influence the training features (data leakage) and makes the
# reported test scores optimistic.
X_raw = df.drop(columns=['DEATH_EVENT']).values
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_data, test_size=0.3, random_state=4)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print('Train set:', X_train.shape,  y_train.shape)
print('Test set:', X_test.shape,  y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Grid of inverse-regularisation strengths 0.1, 0.2, ..., 9.9.
# linspace fixes the number of points explicitly (np.arange with a float step
# can gain or lose the last element through rounding); rounding to one decimal
# keeps the reported best C free of float noise such as 0.30000000000000004.
C_values = np.round(np.linspace(0.1, 9.9, 99), 1)
accuracy = np.zeros(len(C_values))
classification_reports = []
confusion_matrixes = []

for i, c in enumerate(C_values):
    LR = LogisticRegression(C=c, solver='liblinear').fit(X_train, y_train)
    yhat = LR.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred); keep that order everywhere.
    accuracy[i] = metrics.accuracy_score(y_test, yhat)
    classification_reports.append(metrics.classification_report(y_test, yhat))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, yhat))

# NOTE(review): C is chosen by its score on the test set, so the reported
# maximum is an optimistic estimate; consider cross-validating on the
# training split instead.
best_idx = accuracy.argmax()
print('Maximum Accuracy achieved:', round(accuracy.max(), 4), 'with', C_values[best_idx])
print(f'\nClassification Report:\n{classification_reports[best_idx]}')
print(f'Accuracies: {accuracy[:3]}...')

In [None]:
# Confusion matrix of the most accurate logistic-regression model.
# sklearn's confusion_matrix puts true classes on rows and predictions on
# columns; fmt='d' keeps the counts as plain integers.
ax = sns.heatmap(confusion_matrixes[accuracy.argmax()], annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Test accuracy of the logistic-regression models across the C grid
# (the title and x-label previously described the KNN plot).
plt.plot(C_values, accuracy)
plt.title('Accuracy for each regularization strength C')
plt.xlabel('C (inverse of regularization strength)')
plt.ylabel('Accuracy');

# K-Nearest-Neighbor

In [None]:
# Fit a K-Nearest-Neighbour classifier for every k in 1..Ks-1 and record
# accuracy, classification report and confusion matrix on the test split.
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

Ks = 25
accuracy = np.zeros((Ks-1))
classification_reports = []
confusion_matrixes = []

for slot, n in enumerate(range(1, Ks)):
    clm = KNeighborsClassifier(n_neighbors=n)
    clm.fit(X_train, y_train)
    yhat = clm.predict(X_test)

    accuracy[slot] = metrics.accuracy_score(y_test, yhat)
    classification_reports.append(metrics.classification_report(y_test, yhat))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, yhat))

# Position 0 of `accuracy` belongs to k = 1, hence the +1.
best_K = accuracy.argmax()+1
top_accuracy = round(accuracy.max(), 4)
print('Maximum Accuracy achieved:', top_accuracy, 'with', best_K)
print(f'\nClassification Report:\n{classification_reports[best_K-1]}')
print(f'Accuracies: {accuracy}')

In [None]:
# Confusion matrix of the most accurate KNN model.
# sklearn's confusion_matrix puts true classes on rows and predictions on
# columns; fmt='d' keeps the counts as plain integers.
ax = sns.heatmap(confusion_matrixes[best_K-1], annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Test accuracy of the KNN models against the number of neighbours used.
neighbour_counts = range(1, Ks)
fig, ax = plt.subplots()
ax.plot(neighbour_counts, accuracy)
ax.set_title('Accuracy for each K-Nearest-Neighbour')
ax.set_xlabel('No. of nearest Neighbours')
ax.set_ylabel('Accuracy');