In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the heart-failure clinical records and split them into the feature
# matrix X (every column except the last) and the target vector y (the last
# column, expected to be DEATH_EVENT).
#
# Column encodings:
#   sex:         1 = male, 0 = female
#   DEATH_EVENT: 1 = death, 0 = survived
#   smoking:     1 = yes, 0 = no
dataset = pd.read_csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [None]:
# Splitting the dataset: 80% for training, 20% held out for testing.
# The fixed random_state keeps the split identical between runs.
split_parts = train_test_split(X, y, random_state=123, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a logistic-regression classifier on the training split, then plot its
# test-set predictions next to the recorded outcomes. The x-axis is column 0
# of the feature matrix, which the plot labels as age.

model = LogisticRegression()
trained_model = model.fit(X_train, y_train)

test_ages = X_test[:, 0]
test_predictions = trained_model.predict(X_test)

fig = plt.figure(1, figsize=(8, 5))
ax = fig.gca()
ax.scatter(test_ages, y_test, c='red', label="actual event")
ax.scatter(test_ages, test_predictions, c='blue', marker='^',
           label="model prediction")
ax.legend()
ax.set_xlabel('Age (years)')
ax.set_ylabel('Heart failure event (1=death, 0=survived cardiac event)')
ax.set_title("Actual outcomes vs prediction outcomes")

In [None]:
# Predicted survival probability of each test patient, plotted against age.
# Column 0 of predict_proba is used as the survival probability; this assumes
# the model's first class is DEATH_EVENT == 0 (survived) -- TODO confirm via
# trained_model.classes_.
survival_proba = trained_model.predict_proba(X_test)[:, 0]

fig = plt.figure(2, figsize=(8, 5))
ax = fig.gca()
ax.scatter(X_test[:, 0], survival_proba, c="green")
ax.set_xlabel("Age (years)")
ax.set_ylabel("Survival probability")
ax.set_title("Probability of Survival Based on Clinical features")

In [None]:
# Horizontal bar chart of the fitted model's coefficients, one bar per
# feature column (every dataset column except the DEATH_EVENT target).

featureNames = [column for column in dataset.columns if column != "DEATH_EVENT"]
modelCoefs = np.asarray(trained_model.coef_)
print(modelCoefs)

fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
# Row 0 of coef_ holds the coefficients that are plotted.
ax.barh(featureNames, modelCoefs[0], color="red")
ax.set_ylabel("Clinical Features")
ax.set_xlabel("Feature Coefficient Value")
ax.set_title("Visualization of feature coefficients")


In [None]:
# Plot of the confusion matrix: counts of true negatives, false positives,
# false negatives and true positives for the test-set predictions.
# Rows are the actual outcome and columns the predicted outcome, both ordered
# as 0 (survived) then 1 (death).

# labels=[0, 1] pins the class order and guarantees a 2x2 matrix even when one
# class is missing from the test split or from the predictions.
conf_mat = confusion_matrix(y_test, trained_model.predict(X_test),
                            labels=[0, 1])

df_cm = pd.DataFrame(conf_mat, index=range(2), columns=range(2))
sn.set(font_scale=1.5)
# Text property names are case-sensitive in current matplotlib ("size", not
# "Size"), and `linewidths` is heatmap's documented cell-border parameter.
sn.heatmap(df_cm, annot=True, annot_kws={"size": 18},
           yticklabels=["Actual Survived", "Actual Deaths"],
           xticklabels=["Predicted Survived", "Predicted Deaths"],
           cbar=False, linewidths=1)
plt.title("Model Confusion Matrix")
plt.xlabel("Model Based Predictions")
plt.ylabel("Actual Outcome")

In [None]:
# Visualizing the dataset: histogram (bin width 2) with a KDE overlay of
# column 0 of the feature matrix, labelled as age.
ages = X[:, 0]
age_axes = sn.histplot(ages, kde=True, binwidth=2)
age_axes.set_xlabel("Age (years)")
age_axes.set_title("Age distribution")
plt.show()

In [None]:
# Visualizing the dataset: survival and death counts by gender, as a pie chart.
# Encodings used below: sex 1 = male, 0 = female;
# DEATH_EVENT 1 = death, 0 = survived.

def _rows_where(frame, column, value):
    """Return the rows of `frame` whose `column` equals `value`."""
    return frame[frame[column] == value]


female = _rows_where(dataset, "sex", 0)
male = _rows_where(dataset, "sex", 1)

female_surv = _rows_where(female, "DEATH_EVENT", 0)
female_deaths = _rows_where(female, "DEATH_EVENT", 1)

male_surv = _rows_where(male, "DEATH_EVENT", 0)
male_deaths = _rows_where(male, "DEATH_EVENT", 1)

# Slice order must match the label and colour order below.
pie_data = [len(group) for group in
            (female_deaths, female_surv, male_deaths, male_surv)]
pie_labels = ["Female deaths", "Female Survived",
              "Male Deaths", "Male Survived"]
pie_colors = ["goldenrod", "khaki", "red", "lightcoral"]
plt.pie(pie_data, labels=pie_labels, colors=pie_colors, radius=1.5,
        autopct="%1.2f%%")

In [None]:
# Correlation coefficients between the feature variables, drawn as an
# annotated heatmap.

correlations = dataset.corr()
plt.figure(figsize=(20, 10))
sn.heatmap(correlations, annot=True)

In [None]:
# Plotting each non-binary feature against DEATH_EVENT, one scatter plot
# (and one figure) per feature.

binary_columns = {"anaemia", "diabetes", "high_blood_pressure", "smoking", "sex"}
# `features` keeps the remaining columns in dataset order, DEATH_EVENT included.
features = [column for column in dataset.columns
            if column not in binary_columns]

for feature in features:
    if feature == "DEATH_EVENT":
        continue  # the target is the y-axis, not a feature to plot
    # Open a fresh figure for every feature. Re-using a numbered figure
    # (plt.figure(1)) would draw onto the axes that already hold the age
    # plot and overlay two features in one chart.
    plt.figure()
    sn.scatterplot(data=dataset, x=feature, y="DEATH_EVENT", color="red")
plt.show()

