In [None]:
# Importing libraries
## for data
import numpy as np
import pandas as pd

## for visualizations
import seaborn as sns
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns 

## for statistical tests
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm

## for machine learning
from sklearn import model_selection, preprocessing, feature_selection, ensemble, metrics, decomposition
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
# Decision tree classifier created with scikit-learn's default hyperparameters.
# NOTE(review): no random_state is passed, so a fitted tree may differ between runs;
# nothing in the cells visible here fits or uses this object yet.
decision_tree_classifier = DecisionTreeClassifier()


In [None]:
# Load the heart-failure clinical records CSV into the raw frame `df`
# and preview its first five rows.
df = pd.read_csv(
    '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
df.head()

In [None]:
# Identify the data types: df.info() prints each column's name, non-null count
# and dtype, together with the frame's row count and memory usage.
df.info()

In [None]:
# Missing-value audit: number of null/NaN entries in each column of the raw frame.
df.isna().sum()

* There is no missing data in this dataset.
* This means that none of the rows have any null or NaN values; however, several columns hold 1/0 flags that could instead be stored as True/False.

In [None]:
# Display the row index of the raw frame.
df.index

In [None]:
# Clean up data: build the working frame `cd` from `df`, casting the 1/0 flag
# columns to bool. `df` itself is left untouched.
cd = df.astype(
    dict.fromkeys(['anaemia', 'diabetes', 'high_blood_pressure', 'smoking'], 'bool')
)
cd.head()

In [None]:
# Number of distinct values in each column (not unique rows): a quick way to
# tell binary/categorical fields apart from continuous measurements.
cd.nunique()

The following fields are boolean and changed from 1/0 to True/False.
* anaemia
* diabetes
* high_blood_pressure
* smoking


In [None]:
# Recode the sex field from 1/0 to Male/Female.
# The result is assigned back to the column rather than calling
# cd['sex'].replace(..., inplace=True): that chained in-place form operates on a
# temporary Series, which triggers a chained-assignment warning on recent pandas
# and leaves `cd` unchanged once copy-on-write is enabled.
# A single dict replace is also safe to re-run: on a second execution there are
# no 0/1 values left, so the column is returned as-is.
cd['sex'] = cd['sex'].replace({0: 'Female', 1: 'Male'})
cd.head(5)

In [None]:
# Box plots of each continuous clinical measurement, split by sex.
# One (column, title) pair per panel, listed in row-major order for the 2x3 grid.
# The original passed a seventh title ("Diabetes") that had no matching panel;
# it is dropped so the titles and the traces stay in one-to-one correspondence.
panels = [
    ('age', 'Age'),
    ('creatinine_phosphokinase', 'Creatinine Phosphokinase'),
    ('platelets', 'Platelets'),
    ('serum_creatinine', 'Serum Creatinine'),
    ('serum_sodium', 'Serum Sodium'),
    ('time', 'Time'),
]
n_rows, n_cols = 2, 3

fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    start_cell="top-left",
    subplot_titles=[title for _, title in panels],
)

for position, (column, title) in enumerate(panels):
    # divmod turns the flat panel position into 0-based (row, col); plotly is 1-based.
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(
        go.Box(y=cd[column], x=cd['sex'], boxpoints='all', name=title),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(height=1400, width=1000, showlegend=False)
fig.show()

# Analysis
* The subjects in this study are between the ages of 40 and 95 (with the majority between 51 and 70). People over the age of 65 are more likely to experience heart failure.
* Creatinine Phosphokinase: Elevated CPK most often means there has been injury or stress to muscle tissue, the heart, or the brain. Normal CPK values are usually 10 to 120 mcg/L. CPK tests are commonly used to detect heart attacks. The median is 249 for men and 250 for women; both are elevated.
* Platelets: Normal values are from 150k to 450k. Platelets for both men and women are within normal range.
* Serum Creatinine: The kidneys are responsible for keeping the level of creatinine in the blood within a normal range. Normal values for men are 0.7 to 1.2 mg/dL, while normal values for women are 0.5 to 1.0 mg/dL. The median in the chart is 1.1 for men and 1.0 for women. Both are within the normal range, though more men than women exceeded the upper limit.
* Serum Sodium: Normal values are between 135 and 145 milliequivalents per liter. The median in the dataset for both men and women is 137, which is within the normal range.

Key factors that stand out:
- age > 65
- elevated CPK (gender not playing a role)
- serum creatinine is within the normal range, but males skew toward the high end.

In [None]:
# Absolute correlation of every feature with the DEATH_EVENT target, keeping
# only the features whose |r| exceeds 0.1.
#
# Only numeric and boolean columns are correlated: once 'sex' has been recoded
# to 'Male'/'Female' strings, DataFrame.corr() on the full frame raises on
# pandas >= 2.0 instead of silently skipping the column.
corr = cd.select_dtypes(include=['number', 'bool']).corr()

# The original read `cor["DEATH_EVENT"]`, an undefined name (NameError).
corr_target = corr["DEATH_EVENT"].abs()
relevant_features = corr_target[corr_target > 0.1]
relevant_features

In [None]:
# Draw the correlation matrix `corr` as an annotated heatmap.
# The figure size and seaborn font scale are set globally here, so they also
# apply to matplotlib/seaborn figures drawn by later cells.
plt.rcParams.update({'figure.figsize': (20.0, 10.0)})
sns.set(font_scale=2)
ax = sns.heatmap(
    data=corr,
    annot=True,
    fmt='.4f',
    linewidths=0.5,
    annot_kws=dict(size=15),
)

In [None]:
# Tally patients with and without high blood pressure and show it as a bar chart.
# rename_axis + reset_index(name=...) yields the two columns
# ['high_blood_pressure', 'count'] regardless of how value_counts names its output.
ds = (
    cd['high_blood_pressure']
    .value_counts()
    .rename_axis('high_blood_pressure')
    .reset_index(name='count')
)
fig = px.bar(
    ds,
    x='high_blood_pressure',
    y="count",
    title='Count of people with High-blood pressure',
    width=400,
)
fig.show()


In [None]:
# Scatter plot of serum sodium against age, one small marker per row of `cd`.
plt.scatter(x=cd['age'], y=cd['serum_sodium'], s=10)
plt.title("Ages compared to Serum Sodium")
plt.xlabel("Age")
plt.ylabel("Serum Sodium")


In [None]:
# Histogram of patient age from the raw frame, using 2-year-wide bins
# spanning 20 to 95.
fig = go.Figure(
    data=[
        go.Histogram(
            x=df['age'],
            xbins={'start': 20, 'end': 95, 'size': 2},
            opacity=1,
        )
    ]
)

# Titles, bar spacing, grid lines and theme.
fig.update_layout(
    title_text='AGE DISTRIBUTION',
    xaxis_title_text='AGE',
    yaxis_title_text='COUNT',
    bargap=0.10,
    xaxis=dict(showgrid=True),
    yaxis=dict(showgrid=True),
    template='plotly',
)

fig.show()

In [None]:
# Correlation matrix of the numeric and boolean columns.
# Non-numeric columns are excluded explicitly: 'sex' holds 'Male'/'Female'
# strings after recoding, and DataFrame.corr() on the full frame raises on
# pandas >= 2.0 rather than silently dropping such columns.
corr = cd.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Box plot of serum creatinine grouped by outcome (DEATH_EVENT).
bp = cd.boxplot(
    column='serum_creatinine',
    by='DEATH_EVENT',
    figsize=(10, 6),
)
bp.set_ylabel('serum_creatinine')