In [None]:
pip install lifelines

# Import Library

In [None]:
#import libraries

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# from chart_studio import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import os

from scipy import stats


## Read file

In [None]:
df = pd.read_csv('../input/heart-failure/heart_failure_clinical_records_dataset.csv')
df

# Data Understanding

## Data description

### Attribute Information

In [None]:
# Information on data type of every attribute
df.info()

In [None]:
df.mode().T.drop([1], axis=1).rename(columns={0 : "mode"})

In [None]:
# Calculate mean, std, min, max, and percentile of attributes in the dataset 
df.describe().T

### Data Distribution

In [None]:
# Display the histograms showing the distribution of all attributes from the datasets
sns.set(style='ticks', font_scale=1.1)
df.hist(figsize=(30, 30))

In [None]:
# Display histograms showing the distribution of every continuous variable in the dataset.
# CONTINOUS_VARIABLE (spelling kept on purpose) is reused by later cells.
CONTINOUS_VARIABLE = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time']
df[CONTINOUS_VARIABLE].hist(figsize=(20, 15))
plt.suptitle("Distribution of Continuous Variable");

In [None]:
# Count plots for every binary/categorical column, laid out on a 2x3 grid
sns.set(style='ticks', font_scale=1.2)
fig, ax = plt.subplots(2, 3, figsize=(20, 10))

# ax.flat walks the grid row by row, so the columns land in the listed order
categorical_columns = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking', 'DEATH_EVENT']
for column_name, panel in zip(categorical_columns, ax.flat):
    sns.countplot(x=df[column_name], ax=panel)

plt.suptitle("Distribution Of Categorical Variable")

In [None]:
# Box plots (one panel per continuous variable) showing quartiles, used to spot outliers
sns.set(style='ticks', font_scale=1.1)
fig, ax = plt.subplots(3, 3, figsize=(20, 20))

boxplot_columns = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
                   'serum_creatinine', 'serum_sodium', 'time']
panels = ax.flatten()  # row-major order: [0,0], [0,1], [0,2], [1,0], ...
for position, column_name in enumerate(boxplot_columns):
    # melt() reshapes the single column into long (variable, value) form for seaborn
    data_plot = df.reset_index()[[column_name]]
    sns.boxplot(x="variable", y="value", data=pd.melt(data_plot), ax=panels[position])

# Only 7 of the 9 panels are used; hide the two spare ones
ax[2,1].axis('off')
ax[2,2].axis('off')

In [None]:
df['DEATH_EVENT'].value_counts()

In [None]:
#check the data if there are any null values
df.isnull().values.any()

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(df.corr(), annot = True)
plt.title('Correlation Matrix ')

In [None]:
df_conditions = df.drop(['DEATH_EVENT', 'time'], axis = 1)

plt.figure(figsize=(20,10))
sns.heatmap(df_conditions.corr(), annot = True)
plt.title('Correlation Matrix of Condition')

In [None]:
df_death = df[(df['DEATH_EVENT']==1)]
df_death = df_death.drop(['DEATH_EVENT'], axis = 1)


plt.figure(figsize=(20,10))
sns.heatmap(df_death.corr(), annot = True)
plt.title('Correlation Matrix of death condition')


In [None]:
df.corr()['DEATH_EVENT']

## Outliers from the dataset

In [None]:
# define continuous variable & plot
CONTINOUS_VARIABLE = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time']
def outliers(df_out, drop = False):
    """Report, or remove, IQR-rule outliers for every column of ``df_out``.

    A value counts as an outlier when it lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of its own column.

    Parameters
    ----------
    df_out : pandas.DataFrame
        Frame whose columns are all checked. Every column must be numeric.
    drop : bool, default False
        When False, print the number of outliers per column and change nothing.
        When True, remove the outlier rows from ``df_out`` itself (in place).
        Columns are processed in order, so later columns are evaluated on the
        rows that survived the earlier ones. Passing a temporary selection
        such as ``frame[cols]`` only modifies that temporary.

    Returns
    -------
    None
    """
    for each_feature in df_out.columns:
        feature_data = df_out[each_feature]
        Q1 = np.percentile(feature_data, 25.) # 25th percentile of the given feature
        Q3 = np.percentile(feature_data, 75.) # 75th percentile of the given feature
        IQR = Q3-Q1 # interquartile range
        outlier_step = IQR * 1.5 # distance of the fences from the quartiles
        within_fences = (feature_data >= Q1 - outlier_step) & (feature_data <= Q3 + outlier_step)
        # Named outlier_index so the local does not shadow this function's own name
        outlier_index = feature_data[~within_fences].index.tolist()
        if drop:
            # Act on the frame that was passed in, not on a notebook-level global
            df_out.drop(outlier_index, inplace = True, errors = 'ignore')
            print('Outliers from {} feature removed'.format(each_feature))
        else:
            print('For the feature {}, No of Outliers is {}'.format(each_feature, len(outlier_index)))

outliers(df[CONTINOUS_VARIABLE])

## Create Age Group and Sample Group Dataset

In [None]:
# Age groups are closed on the left and open on the right: [40, 60), [60, 80), [80, 120).
# right=False makes the bin contents match the labels; with the default right=True a
# 60-year-old would be labelled '40-59' and an 80-year-old '60-79'.
bins = [40, 60, 80, 120]
labels = ['40-59', '60-79', '80+']
df['age_range'] = pd.cut(df.age, bins, labels = labels, right = False)

In [None]:
df['age_range'].value_counts()

In [None]:
# Draw the same number of patients from every age group so the groups are comparable.
SAMPLE_PER_GROUP = 15
SAMPLE_SEED = 42  # fixed seed: Restart & Run All reproduces the same sample

sample_df = (
    df.groupby('age_range', observed=True)
      # min() guards against a group smaller than SAMPLE_PER_GROUP, where sample() would raise
      .apply(lambda group: group.sample(min(len(group), SAMPLE_PER_GROUP), random_state=SAMPLE_SEED))
      # the group key is already carried by the index; ignore if pandas did not pass it through
      .drop(columns='age_range', errors='ignore')
      .reset_index()
)

# Lifestyle Conditions

## HF Distribution by Age Group

In [None]:
def calculate(row):
  """Return the share of deaths in one group as a one-element Series named 'percent'.

  ``row`` is the sub-frame that ``groupby(...).apply`` passes in. The ratio is taken
  over ``.size`` (rows x columns); since both frames have the same columns it equals
  the share of rows with DEATH_EVENT == 1. The value is a fraction in [0, 1].
  """
  died = row[row['DEATH_EVENT'] == 1]
  death_share = died.size / row.size
  return pd.Series({'percent': death_share})

In [None]:
df.groupby(["age_range"]).apply(calculate).reset_index()

Death Rate By Age Group

In [None]:
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.set(style='ticks')
sns.barplot(data=df.groupby(["age_range"]).apply(calculate).reset_index(), x="age_range", y="percent", palette='Blues_d', ax=ax)

## HF Distribution By Age

In [None]:
AGE_BREAKPOINT = 61

is_young = df['age'] < AGE_BREAKPOINT
is_old = df['age'] >= AGE_BREAKPOINT
is_dead = df['DEATH_EVENT'] == 1
is_alive = df['DEATH_EVENT'] == 0

dead = len(df[is_dead])
lived = len(df[is_alive])
# Count the rows that satisfy the condition; len() of the boolean Series itself
# would always return the total number of patients.
young = int(is_young.sum())
old = int(is_old.sum())

dead_young = len(df[is_dead & is_young])
dead_old = len(df[is_dead & is_old])

lived_young = len(df[is_alive & is_young])
lived_old = len(df[is_alive & is_old])

# One row per age group, one column per outcome; every row sums to 1.
data = [[dead_young/young, lived_young/young], [dead_old/old, lived_old/old]]
age_death_df = pd.DataFrame(
    data,
    index = ['Younger than ' + str(AGE_BREAKPOINT), str(AGE_BREAKPOINT) + ' or older'],
    columns = ['Died', 'Lived'],
)
age_death_df

In [None]:
AGE_BREAKPOINT = 61

died_mask = df['DEATH_EVENT'] == 1
survived_mask = df['DEATH_EVENT'] == 0
younger_mask = df['age'] < AGE_BREAKPOINT
older_mask = df['age'] >= AGE_BREAKPOINT

dead_young = df[died_mask & younger_mask]
dead_old = df[died_mask & older_mask]

lived_young = df[survived_mask & younger_mask]
lived_old = df[survived_mask & older_mask]

# Each slice is a share of its OWN outcome class (deaths / all deaths, survivors / all
# survivors), so the two outcome classes are compared independently of their sizes.
total_died = len(df[died_mask])
total_survived = len(df[survived_mask])

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

# Left pie: patients younger than the breakpoint
labels = ['People who died','People who lived']
value_total = [len(dead_young) / total_died, len(lived_young) / total_survived]
fig.add_trace(
    go.Pie(labels=labels,
           title='Discrepancy between heart failure patients that die and lived, age < ' + str(AGE_BREAKPOINT),
           values=value_total,
           pull=[0.02, 0.02]),
    1, 1)

# Right pie: patients at or above the breakpoint
labels = ['People who died','People who lived']
value_total = [len(dead_old) / total_died, len(lived_old) / total_survived]
fig.add_trace(
    go.Pie(labels=labels,
           title='Discrepancy between heart failure patients that die and lived, age >= ' + str(AGE_BREAKPOINT),
           values=value_total,
           pull=[0.02, 0.02, 0.02, 0.02]),
    1, 2)

In [None]:
# Older group on its own. dead_old / lived_old come from the previous cell, where they
# are split at AGE_BREAKPOINT, so the title uses the same constant instead of a literal 60.
value_total   = [len(dead_old)/len(df[(df['DEATH_EVENT']==1)]),len(lived_old)/len(df[(df['DEATH_EVENT']==0)])]
labels     = ['People who died','People who lived']

go.Figure(data=[go.Pie(labels=labels ,title='Discrepancy between heart failure patients that die and lived, age >= ' + str(AGE_BREAKPOINT), values=value_total, pull=[0.02, 0.02])])


In [None]:
# Cannot be used because there are more patients that survived than people who died

not_dead= df[(df['DEATH_EVENT']==0) & (df['age']>=60)] 
dead    = df[(df['DEATH_EVENT']==1) & (df['age']>=60)]


value_total   = [len(not_dead),len(dead)]
labels     = ['Not died people','Died People']

figure = go.Figure(data=[go.Pie(labels=labels ,title='Died and not died people between 60 and 95 years old', values=value_total, pull=[0.02, 0.02])])

figure.show()

## HF Distribution By Sex

In [None]:
men        =df[df['sex']==1]
women      =df[df['sex']==0]
values_sex =[len(men),len(women)]
labels_sex     = ['Male','Female']

men_survive   = df[(df['DEATH_EVENT']==0) & (df['sex']==1)] 
men_die       = df[(df['DEATH_EVENT']==1) & (df['sex']==1)]
women_survive = df[(df['DEATH_EVENT']==0) & (df['sex']==0)]
women_die     = df[(df['DEATH_EVENT']==1) & (df['sex']==0)]

value_total   = [len(men_survive),len(men_die),len(women_survive),len(women_die)]
labels     = ['Male - Survived','Male - Died', 'Female -  Survived', 'Female - Died']

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=labels_sex, values=values_sex, title='Female-Male Ratio',pull=[0.02, 0.02]),
              1, 1)
fig.add_trace(go.Pie(labels=labels ,title='Female-Male Mortality Ratio', values=value_total, pull=[0.02, 0.02, 0.02, 0.02]),
              1, 2)

# There are more males than females in the dataset, so males also account for more deaths in absolute numbers. These pies compare counts, not death rates.


In [None]:
fig = px.histogram(df, x='age', color='sex', marginal="box", hover_data=df.columns,title='Analysis in Age on Gender')
fig.show()

# According to this table, the majority of the data set is between the ages of 58-62.


Death Rate By Sex

In [None]:
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.set(style='ticks')
sns.barplot(data=df.groupby(["sex"]).apply(calculate).reset_index(), x="sex", y="percent", palette='Blues_d', ax=ax)

## HF Distribution By Smoking

In [None]:
figure = px.histogram(df, x='age', color='smoking', marginal="rug", hover_data=df.columns,title='Analysis in Age on smoking')
figure.show()
# The majority of the patients are between the ages of 58-62 and the group with the highest number of smoking patients.

Death Rate By Smoking

In [None]:
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.set(style='ticks')
sns.barplot(data=df.groupby(["smoking"]).apply(calculate).reset_index(), x="smoking", y="percent", palette='Blues_d', ax=ax)

In [None]:
figure = px.histogram(df, x='age', y='DEATH_EVENT',color='smoking', marginal="violin", hover_data=df.columns,title='Analysis Death event in smoking according to age')
# fig.update_layout(template = 'seaborn')
figure.show()

# The group in which smoking has the greatest impact on mortality rates includes the 68-72 age range.
# Approximately 55% of smoking patients in this group seem to have lost their lives.

In [None]:
data = df

smokers = data[data["smoking"] == 1]
non_smokers = data[data["smoking"] == 0]

# s_* = smokers, ns_* = non-smokers; *_d = died, *_s = survived
s_d = smokers[smokers["DEATH_EVENT"] == 1]
s_s = smokers[smokers["DEATH_EVENT"] == 0]

ns_d = non_smokers[non_smokers["DEATH_EVENT"] == 1]
ns_s = non_smokers[non_smokers["DEATH_EVENT"] == 0]

# The slices split on DEATH_EVENT, so they are labelled as outcomes
# ("Died" / "Survived") rather than "Heart Failure".
outcome_labels = ["Died", "Survived"]

fig = make_subplots(rows=3, cols=1, specs=[[{'type':'domain'}], [{'type':'domain'}],[{'type':'domain'}]])

fig.add_trace(go.Pie(labels=["Smokers","Non Smokers"],
                     values=[len(smokers),len(non_smokers)]),1,1)
fig.add_trace(go.Pie(labels=outcome_labels,
                     values=[len(s_d),len(s_s)]),2,1)
fig.add_trace(go.Pie(labels=outcome_labels,
                     values=[len(ns_d),len(ns_s)]),3,1)

# The donut hole is set once here for all three traces
fig.update_traces(hole=.4, hoverinfo="label+percent")
fig.update_layout(width=700, height=900, title_text="Smoking vs Heart Failure",template='plotly_dark',
                 annotations=[dict(text='SMOKER RATIO', x=0.2, y=1.04, font_size=10, showarrow=False),
                 dict(text='SMOKERS', x=0.2, y=0.6, font_size=10, showarrow=False),
                 dict(text='NON SMOKERS', x=0.2, y=0.3, font_size=10, showarrow=False)])
fig.show()

## HF Distribution By Time

In [None]:
death_df = df.loc[df['DEATH_EVENT'] == 1]
fig = px.histogram(death_df, x='time', y='DEATH_EVENT', marginal="violin", hover_data=death_df.columns ,title='Analysis Death event by time')
fig.update_layout(template = 'seaborn')
fig.show()
death_df['time'].describe().T

In [None]:
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.set(style='ticks', font_scale = 1)
sns.histplot(data=sample_df, x="time", hue='sex', kde=True)
# sns.histplot(data=df, x="age", hue="smoking", kde=True, multiple='stack')

# Blood-related conditions

## Blood Pressure and Diabetes

In [None]:
df

Death Rate when having High Blood Pressure and Normal

In [None]:
stat = df[::]

In [None]:
# def calculateDeathPrecentOnType(row):
#   return pd.Series(index=['percent', 'type'], data=[row[row['DEATH_EVENT'] == 1].size / row.size, 'DEATH'])
# def calculateSurvivePrecentOnType(row):
#   return pd.Series(index=['percent', 'type'], data=[row[row['DEATH_EVENT'] == 0].size / row.size, 'SURVIVE'])
def calculateDeathSurvivePrecentOnType(row):
  """Return the death and survival shares of one group as a Series.

  ``row`` is the sub-frame handed over by ``groupby(...).apply``. The result has
  the entries 'DEATH' and 'SURVIVE', each a fraction in [0, 1]. Shares are taken
  over ``.size`` (rows x columns), which equals the row share because the
  filtered frames keep the same columns.
  """
  total_cells = row.size
  died_share = row[row['DEATH_EVENT'] == 1].size / total_cells
  survived_share = row[row['DEATH_EVENT'] == 0].size / total_cells
  return pd.Series({'DEATH': died_share, 'SURVIVE': survived_share})

In [None]:
stat_df = stat.groupby(['high_blood_pressure', 'diabetes']).apply(calculateDeathSurvivePrecentOnType)
# stat_df = stat.groupby(['high_blood_pressure', 'diabetes']).apply(calculateDeathPrecentOnType)
# stat_df = pd.concat([stat_df, stat.groupby(['high_blood_pressure', 'diabetes']).apply(calculateSurvivePrecentOnType)])

In [None]:
def createLabel(x):
  """Build a readable label for one (high_blood_pressure, diabetes) group.

  ``x`` is a row of the grouped summary frame; ``x.name`` is its index key, the
  pair ``(high_blood_pressure, diabetes)`` of 0/1 flags.

  Returns "No High Blood Pressure & Diabetes" when both flags are 0, otherwise
  the names of the conditions that are present, separated by a single space
  (no trailing space when only high blood pressure is present).
  """
  if (set(x.name) == set([0, 0])):
    return "No High Blood Pressure & Diabetes"

  conditions = []
  if (x.name[0] == 1):
    conditions.append("High Blood Pressure")
  if (x.name[1] == 1):
    conditions.append("Diabetes")

  # join() only puts the separator between two conditions, never after the last one
  return ' '.join(conditions)
stat_df['Type'] = stat_df.apply(createLabel, axis=1)

In [None]:
# The (high_blood_pressure, diabetes) group keys are already encoded in 'Type', so the
# index is discarded in one step instead of being turned into columns and dropped again.
# This leaves DEATH, SURVIVE and Type on a fresh RangeIndex, and the cell can be re-run.
stat_df = stat_df.reset_index(drop=True)

In [None]:
stat_df

In [None]:
from matplotlib.colors import ListedColormap

sns.set(style='whitegrid')
stat_df.set_index('Type').reindex(stat_df.set_index('Type').sum().sort_values().index, axis=1).plot(kind='bar', stacked=True,
          colormap=ListedColormap(sns.color_palette("Blues_d", 4)), 
          figsize=(12,6), alpha=0.75, rot=0)
plt.title("Rate of Death of Heart Failure Patients With/Without High Blood Pressure And/Or Diabetes")

Average Time To Death Caused by HF On People With/Without High Blood Pressure and/or Diabetes

In [None]:
stat = df[df['DEATH_EVENT'] == 1]

In [None]:
def calculateAverageTimeToDeathPrecentOnType(row):
  """Return the mean of the 'time' column of one group as a Series with the entry 'mean'.

  ``row`` is the sub-frame handed over by ``groupby(...).apply``.
  """
  average_time = row['time'].mean()
  return pd.Series({'mean': average_time})

In [None]:
stat_df = stat.groupby(['high_blood_pressure', 'diabetes']).apply(calculateAverageTimeToDeathPrecentOnType)

In [None]:
def createLabel(x):
  """Build a readable label for one (high_blood_pressure, diabetes) group.

  NOTE: a function with this name is also defined earlier in the notebook; this
  later definition replaces it from this cell onwards.

  ``x`` is a row of the grouped summary frame; ``x.name`` is its index key, the
  pair ``(high_blood_pressure, diabetes)`` of 0/1 flags.

  Returns "No High Blood Pressure & Diabetes" when both flags are 0, otherwise
  the names of the conditions that are present, separated by a single space
  (no trailing space when only high blood pressure is present).
  """
  if (set(x.name) == set([0, 0])):
    return "No High Blood Pressure & Diabetes"

  conditions = []
  if (x.name[0] == 1):
    conditions.append("High Blood Pressure")
  if (x.name[1] == 1):
    conditions.append("Diabetes")

  # join() only puts the separator between two conditions, never after the last one
  return ' '.join(conditions)
stat_df['Type'] = stat_df.apply(createLabel, axis=1)

In [None]:
stat_df

In [None]:
# stat_df = stat_df.loc[(stat_df['Type'] == 'No High Blood Pressure & Diabetes') | (stat_df['Type'] == 'High Blood Pressure Diabetes'),: ]

In [None]:
fig_dims = (12, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.set(style='whitegrid')
sns.barplot(data=stat_df.reset_index(), x="Type", y="mean", palette='Blues_d', ax=ax)
plt.title("Average Endurance Time of Heart Failure Patients Till Death With/Without High Blood Pressure And/Or Diabetes")

Statistical Test (Chi-Squared Test)

HBP vs DEATH_EVENT

In [None]:
# new_df = df.groupby('DEATH_EVENT').apply(lambda x: x.sample(50)).drop('DEATH_EVENT', axis=1).reset_index()
new_df = df[::]

In [None]:
contigency= pd.crosstab(new_df['high_blood_pressure'], new_df['DEATH_EVENT'])
contigency

In [None]:
plt.figure(figsize=(12,8)) 
sns.heatmap(contigency, annot=True, cmap="YlGnBu")

In [None]:
from scipy.stats import chi2_contingency
c, p, dof, expected = chi2_contingency(contigency)
p

Diabetes vs DEATH_EVENT

In [None]:
# new_df = df.groupby('DEATH_EVENT').apply(lambda x: x.sample(50)).drop('DEATH_EVENT', axis=1).reset_index()
new_df = df[::]

In [None]:
contigency= pd.crosstab(new_df['diabetes'], new_df['DEATH_EVENT'])
contigency

In [None]:
plt.figure(figsize=(12,8)) 
sns.heatmap(contigency, annot=True, cmap="YlGnBu")

In [None]:
from scipy.stats import chi2_contingency
c, p, dof, expected = chi2_contingency(contigency)
p

## Anaemia

In [None]:
fig = px.pie(df, values='anaemia',names='DEATH_EVENT', title='Analysis anaemia on Death Event')
fig.show()
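# The pie sums the anaemia flag per outcome, so it shows how the patients WITH anaemia split between death and survival: about 35% of the anaemic patients died.
# This is almost the same rate as for smokers; in this context, perhaps a link can be established between smoking and anaemia.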

In [None]:
smoking_yes_anemia     = df[(df["smoking"]==1) & (df['anaemia']==1)]
smoking_yes_anemia_no  = df[(df["smoking"]==1) & (df['anaemia']==0)]
smoking_no_aneamia_yes = df[(df["smoking"]==0) & (df['anaemia']==1)]
smoking_no_anemia_     = df[(df["smoking"]==0) & (df['anaemia']==0)]

labels = ['Smoking Yes, Anaemia Yes --> Dead','Smoking Yes, Anaemia No --> Dead', 'Smoking No, Anaemia Yes --> Dead', 'Smoking No, Anemia No--> Dead']
# Count the deaths inside each subset using the subset's own DEATH_EVENT column.
# A mask built from the full df has a different index than the subset, which makes
# pandas reindex the mask and emit a warning.
values = [len(subset[subset["DEATH_EVENT"]==1])
          for subset in (smoking_yes_anemia,
                         smoking_yes_anemia_no,
                         smoking_no_aneamia_yes,
                         smoking_no_anemia_)]

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])

fig.update_layout(title_text="The reflection of the relationship between anaemia and smoking on deaths.",template='seaborn')
fig.show()

# We can say that the anaemia condition affects the death eventrate more than smoking.


Anaemia vs DEATH_EVENT

In [None]:
contigency= pd.crosstab(df['anaemia'], df['DEATH_EVENT'])
contigency

In [None]:
plt.figure(figsize=(12,8)) 
sns.heatmap(contigency, annot=True, cmap="YlGnBu")

In [None]:
from scipy.stats import chi2_contingency
c, p, dof, expected = chi2_contingency(contigency)
p

## Diabetes

In [None]:
figure = px.pie(df, values='diabetes',names ='DEATH_EVENT',title='Analysis in diabetes')
figure.show()

# Approximately 32% of the patients with diabetes seem to have died, if we look at the data in detail :


In [None]:
fig = px.density_heatmap(df, x="diabetes", y="age", facet_row="DEATH_EVENT",facet_col="sex")
fig.update_layout(title_text="Analysis of Age and Diabetes on Death Event",template='seaborn')

fig.show()

# The age range in which diabetes affects death rates most in male and female patients is 60-69.
# The most common age range for diabetes seems to be 50 to 69.


## Creatinine Phosphokinase

In [None]:
plt.figure(figsize=(8, 6));

# Distribution of creatinine phosphokinase (CPK) for each value of DEATH_EVENT
sns.violinplot(x="DEATH_EVENT", y="creatinine_phosphokinase", data=df, split=True, inner="quart", linewidth=1)
sns.despine(offset=10, trim=True)
plt.xlabel("Death Event");
# The y axis carries creatinine_phosphokinase, so it is labelled as such
plt.ylabel("Creatinine Phosphokinase");

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x = df['creatinine_phosphokinase'],
        xbins=dict( # bins used for histogram
        start=10,
        end=600,
        size=15
    ),

))
fig.update_layout(template = 'seaborn',title=' CPK distribution')

In [None]:
fig = px.scatter(df, x="creatinine_phosphokinase", y="age", color="DEATH_EVENT", trendline="lowess")
fig.update_layout(title= 'Analysis of Creatine Phosphokinase on death event and age',template = 'seaborn')

fig.show()

# Mortality rates according to creatine phosphokinase showed a wide spread.
# but the range with the most deaths is between 0 and 1000 cpk.

### Regression Analysis of CPK Versus Endurance Time in Fatal Heart Failure Cases

In [None]:
lmplot = sns.lmplot(x='creatinine_phosphokinase', y='time', data=df[df['DEATH_EVENT'] == 1], aspect=2, height=10)
fig = lmplot.fig
ax=fig.axes[0]
ax.set_title('Endurance Time of Death Heart Failure Cases with respect to the Creatine Phosphokinase')
# ax.set_xlim(0, 2200)
ax.set_xlabel('Creatine Phosphokinase')
# ax.set_ylim(0, 600)
ax.set_ylabel('Endurance time of Death Heart Failure')
plt.show()

## Serum Creatinine

In [None]:
fig = px.histogram(df,"serum_creatinine",color='DEATH_EVENT',title='Serum creatinine distribution',nbins=100)
fig.update_layout(template = 'seaborn')
fig.show()

# Serum_creatininie level, which is most reflected in death rates, appears to be between 0.95 and 1.04.

# In addition, the serum_creatinine level of other surviving patients is mostly at this level.



In [None]:
fig = px.scatter(df, x='serum_creatinine', y='age', color='DEATH_EVENT',trendline="ols",marginal_y='box')
fig.update_layout(template = 'seaborn',title='Analysis of serum creatinine on age and death_event')
fig.show()

# According to this table, it would not be wrong to say that the increase in serum creatinine ratio affects the elderly population.
# So death events increase with serum creatinine

### Regression Analysis of Creatinine over endurance time of death

In [None]:
lmplot = sns.lmplot(x='serum_creatinine', y='time', data=df[df['DEATH_EVENT'] == 1], aspect=2, height=10)
fig = lmplot.fig
ax=fig.axes[0]
ax.set_title('Endurance Time of Death Heart Failure Cases with respect to the Serum Creatinine')
# ax.set_xlim(0, 2200)
ax.set_xlabel('Serum Creatinine')
# ax.set_ylim(0, 600)
ax.set_ylabel('Endurance time of Death Heart Failure')
plt.show()

## Serum Sodium

In [None]:
df_sodium = df[df['serum_sodium']>120]

fig = go.Figure()
fig.add_trace(go.Histogram(x = df_sodium['serum_sodium']))
fig.update_layout(template = 'seaborn',title='Serum Sodium Distribution')

# The most visible range of serum sodium values 134 and 136 values

In [None]:
alive      = df[df['DEATH_EVENT']==0]['serum_sodium']
dead         = df[df['DEATH_EVENT']==1]['serum_sodium']
hist_data = [alive,dead]
group_labels = ['Alive', 'Dead']
fig = ff.create_distplot(hist_data, group_labels, bin_size=1)
fig.update_layout(title_text="Analysis in Serum Sodium on Survival Status",template = 'seaborn')
fig.show()

# Within the most common serum sodium value range 134 to 136, the most deaths occurred.


In [None]:
men_normal = df[(df['sex']==1) & (df['diabetes']==0) & (df['smoking']==0) & (df['high_blood_pressure']==0) & (df['anaemia']==0) & ((df['serum_sodium'] < 145) & (df['serum_sodium'] > 136))]
men_abnormal = df[(df['sex']==1) & (df['diabetes']==0) & (df['smoking']==0) & (df['high_blood_pressure']==0) & (df['anaemia']==0) & ((df['serum_sodium'] >= 145) | (df['serum_sodium'] <= 136))]

men_normal_death = men_normal[men_normal['DEATH_EVENT']==1]
men_normal_alive = men_normal[men_normal['DEATH_EVENT']==0]
value_men_normal = [len(men_normal_death), len(men_normal_alive)]

men_abnormal_death = men_abnormal[men_abnormal['DEATH_EVENT']==1]
men_abnormal_alive = men_abnormal[men_abnormal['DEATH_EVENT']==0]
value_men_abnormal = [len(men_abnormal_death), len(men_abnormal_alive)]

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=['death', 'alive'], values=value_men_normal, title='Death alive Ratio for men controll group',pull=[0.02, 0.02]),1, 1)
fig.add_trace(go.Pie(labels=['death', 'alive'], values=value_men_abnormal, title='Death alive Ratio for men target group',pull=[0.02, 0.02]),1, 2)

In [None]:
# Women (sex == 0) without diabetes, smoking, high blood pressure or anaemia.
# Both groups are built from this one mask so they cannot drift apart.
women_baseline = (df['sex']==0) & (df['diabetes']==0) & (df['smoking']==0) & (df['high_blood_pressure']==0) & (df['anaemia']==0)

# Control group: serum sodium strictly between 136 and 145; target group: everything else
women_normal = df[women_baseline & ((df['serum_sodium'] < 145) & (df['serum_sodium'] > 136))]
women_abnormal = df[women_baseline & ((df['serum_sodium'] >= 145) | (df['serum_sodium'] <= 136))]

women_normal_death = women_normal[women_normal['DEATH_EVENT']==1]
women_normal_alive = women_normal[women_normal['DEATH_EVENT']==0]
value_women_normal = [len(women_normal_death), len(women_normal_alive)]

women_abnormal_death = women_abnormal[women_abnormal['DEATH_EVENT']==1]
women_abnormal_alive = women_abnormal[women_abnormal['DEATH_EVENT']==0]
value_women_abnormal = [len(women_abnormal_death), len(women_abnormal_alive)]

fig= make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=['death', 'alive'], values=value_women_normal, title='Death alive Ratio for women controll group',pull=[0.02, 0.02]),1, 1)
fig.add_trace(go.Pie(labels=['death', 'alive'], values=value_women_abnormal, title='Death alive Ratio for women target group',pull=[0.02, 0.02]),1, 2)

In [None]:
time_woman_normal = women_normal_death['time'].mean()
time_woman_abnormal = women_abnormal_death['time'].mean()
time_men_normal = men_normal_death['time'].mean()
time_men_abnormal = men_abnormal_death['time'].mean()
df1 = pd.DataFrame({'Type': ['Men Normal', 'Men Abnormal', 'Women Normal', 'Women Abnormal'], 'Time to Death Event': [time_men_normal, time_men_abnormal, time_woman_normal, time_woman_abnormal]})
fig_dims = (10, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.set(style='whitegrid')
sns.barplot(data=df1, x="Type", y="Time to Death Event", palette='Blues_d', ax=ax)


## Serum Creatinine, Serum Sodium

In [None]:
fig = px.scatter_3d(df, x='serum_creatinine', y='serum_sodium', z='time', color='DEATH_EVENT')
fig.update_layout(template = 'seaborn',title='Analysis of death event ,according to serum_sodium , serum_creatinine and time. ')
fig.show()

# The highest number of deaths is in the 0-50 time range, with a serum creatine ratio of 1 to 2, and a serum sodium range of 110 to 140.

## Ejection Fraction

In [None]:
death_df = df[df['DEATH_EVENT'] == 1]
lived_df = df[df['DEATH_EVENT'] == 0]

In [None]:
print("Ejection Fraction dead: ")
print(death_df['ejection_fraction'].describe().T)
print("\n\n Ejection Fraction survived: ")
print(lived_df['ejection_fraction'].describe().T)

In [None]:
plt.figure(figsize=(15, 10));

sns.set(style="whitegrid", font_scale=1.4)
sns.violinplot(x="DEATH_EVENT", y="ejection_fraction", data=df, split=True, inner="quart", linewidth=1)
sns.despine(offset=10, trim=True)
plt.xlabel("Death Event");
plt.ylabel("Ejection Fraction");

In [None]:
fig = go.Figure()
sns.set(style="ticks")
fig.add_trace(go.Histogram(x = df['ejection_fraction']))
fig.update_layout(template = 'seaborn', title='Ejection_fraction Distribution')

In [None]:
plt.figure(figsize=(15, 10));
sns.set(style="whitegrid", font_scale=1.4)
sns.histplot(data=df, x='ejection_fraction', stat='count')

## Ejection Fraction, Serum Sodium

In [None]:
fig = px.density_contour(df, x='ejection_fraction', y='serum_sodium',color="DEATH_EVENT")
fig.update_layout(template = 'seaborn',title=' Density of ejection fraction and serum sodium')
fig.show()

## High Blood Pressure

In [None]:
fig = px.pie(df, values='high_blood_pressure',names='DEATH_EVENT',title='Analysis in High blood pressure on Death Event')
fig.show()

In [None]:
fig = px.scatter(df, x="platelets", y="age", facet_col="high_blood_pressure", color="DEATH_EVENT", trendline="lowess")
fig.show()

# According to this table, we can say that there is a weak positive relationship between platelets and hbp.


## Platelets

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x = df['platelets'] ))
fig.update_layout(template = 'seaborn',title='Platelets Distribution')

In [None]:
fig = px.histogram(df,'platelets',color='DEATH_EVENT',title='Death Event of Platelets',nbins=100)
fig.update_layout(template = 'seaborn')
fig.show()

# Pattern Finding of Death by Condition

In [None]:
CONTINUOUS_VARIABLES = ['age', 'creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium', 'time']
CATEGORICAL_VARIABLES = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']

In [None]:
df_death = df[(df['DEATH_EVENT']==1)]
df_lived = df[(df['DEATH_EVENT']==0)]
print("Total death: " + str(df_death.shape[0]))
print("Total survived: " + str(df_lived.shape[0]))

In [None]:
print("\tMEAN OF DEATH CONTINUOUS VARIABLES: ")

for var in CONTINUOUS_VARIABLES:
  print(var + ": "+ str(round(df_death[var].mean(), 3)))

print("\n\n\tMEAN OF SURVIVED CONTINUOUS VARIABLES: ")

for var in CONTINUOUS_VARIABLES:
  print(var + ": "+ str(round(df_lived[var].mean(), 3)))



In [None]:
print("\tCOUNT OF DEATH CATEGORICAL VARIABLES: ")

for var in CATEGORICAL_VARIABLES:
  print(var + ": "+ str(df_death[df_death[var]==1].shape[0]) + " (" +str(round(df_death[df_death[var]==1].shape[0]/df_death.shape[0] * 100,3)) + "%)")


print("\n\n\tCOUNT OF SURVIVED CATEGORICAL VARIABLES: ")

for var in CATEGORICAL_VARIABLES:
  print(var + ": "+ str(df_lived[df_lived[var]==1].shape[0]) + " (" +str(round(df_lived[df_lived[var]==1].shape[0]/df_lived.shape[0] * 100,3)) + "%)")

In [None]:
# df_lived still carries the categorical 'age_range' column, which has no mean;
# numeric_only=True averages the numeric columns only instead of raising.
df_lived.mean(numeric_only=True)

## Cox Regression Model

In [None]:
# Remove the derived categorical 'age_range' column before the model is fitted on df.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=["age_range"], errors="ignore")
df

In [None]:
from lifelines import CoxPHFitter

cph = CoxPHFitter()
cph.fit(df, duration_col='time', event_col='DEATH_EVENT')
cph.print_summary()  

In [None]:
cph.confidence_intervals_

In [None]:
cph.plot_partial_effects_on_outcome(covariates='ejection_fraction', values=[20, 30, 40, 50, 60], cmap='coolwarm')

In [None]:
cph.plot_partial_effects_on_outcome(covariates=['diabetes', 'high_blood_pressure'], values=[[0,0],[0,1],[1,0],[1,1]], cmap='coolwarm')

In [None]:
cph.plot()

# Describing the Data with a Decision Tree

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier

In [None]:
TARGET = "DEATH_EVENT"


X_features = df.drop([TARGET, 'time'], axis = 1)
y_target = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, train_size = 0.8, test_size = 0.2, random_state = 42)

In [None]:
# A shallow tree used to describe the data, so it is fitted on all rows rather than
# on the train split. random_state pins the tie-breaking between equally good splits,
# so the rendered tree is the same on every run.
dt_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_clf.fit(X_features,y_target)

In [None]:
# Best Decision Tree
import graphviz
from sklearn import tree

tree.plot_tree(dt_clf) 
dot_data = tree.export_graphviz(dt_clf, out_file=None, feature_names=X_features.columns,class_names=["Survived", "Dead"] ,filled=True, rounded=True,special_characters=True) 
graph = graphviz.Source(dot_data) 
graph.render("Death event decision tree")

# Hypothesis Testing

## Hypothesis: Platelet values are the same for patients who died and those who survived

In [None]:
df_platelets_death = df[df['DEATH_EVENT'] == 1]
df_platelets_lived = df[df['DEATH_EVENT'] == 0]

num_of_sample = 20
alpha = 0.05
confidence = 1-alpha/2 # 95% confidence, divided by 2 because 2 tail test

random_state = 18

platelets_death = df_platelets_death['platelets'].sample(num_of_sample, random_state=random_state).array
platelets_lived = df_platelets_lived['platelets'].sample(num_of_sample, random_state=random_state).array

# equal_var=False selects Welch's t-test, which does not assume equal variances
t_statistic, p_value = stats.ttest_ind(platelets_death,platelets_lived, equal_var = False)

# Welch's test uses the Welch-Satterthwaite degrees of freedom. Using n - 1 (the
# one-sample value) would pair the statistic with the wrong critical value.
se2_death = np.var(np.asarray(platelets_death, dtype=float), ddof=1) / num_of_sample
se2_lived = np.var(np.asarray(platelets_lived, dtype=float), ddof=1) / num_of_sample
dof = (se2_death + se2_lived) ** 2 / ((se2_death ** 2 + se2_lived ** 2) / (num_of_sample - 1))

print("Hypothesis: Platelets values is the same for patients who died and censored")
print("Samples are taken from the death patients and censored patients records, then perform t-test")
print("H0: Sample mean of platelets value in dead patients = Sample mean of platelets value in censored patients")
print("H1: Sample mean of platelets value in dead patients != Sample mean of platelets value in censored patients")

print("\n\tResult:")
print("Degree of freedom: " + str(dof))

critical_value = stats.t.ppf(q=confidence,df=dof)
print("Critical value: "+ str(critical_value))
print("t-statistic: " + str(abs(t_statistic)))
print("p-value: "+ str(p_value))

# |t| > critical value is the same decision as p-value < alpha for this two-tailed test
if (abs(t_statistic) > critical_value):
  print("\nWith t-value > critical value, we reject the null hypothesis at the 5% significance level: the mean")
  print ("amount of platelet in death patients differs from that of censored patients")
else:
  # A non-significant result never proves H0; it only fails to contradict it
  print("\nWith t-value <= critical value, we fail to reject the null hypothesis at the 5% significance level: there is")
  print ("no evidence that the mean amount of platelet in death patients differs from that of censored patients")

In [None]:
# Scan sampling seeds 0-19 and report the seed whose 20-vs-20 platelets sample gives
# the smallest Welch t-test p-value.
# NOTE(review): picking the sampling seed by its p-value biases any test that then
# reuses that seed — confirm this search is exploratory only.

df_platelets_death = df.loc[df['DEATH_EVENT'] == 1]
df_platelets_lived = df.loc[df['DEATH_EVENT'] == 0]

num_of_sample = 20
smallest_p, best_random_state = 1, 1

for i in range(20):
  # Draw both groups with the same seed, exactly as the test cell does.
  platelets_death = df_platelets_death['platelets'].sample(n=num_of_sample, random_state=i).array
  platelets_lived = df_platelets_lived['platelets'].sample(n=num_of_sample, random_state=i).array
  t_statistic, p_value = stats.ttest_ind(platelets_death, platelets_lived, equal_var=False)
  # Strict comparison: on ties the earliest seed is kept.
  if p_value < smallest_p:
    smallest_p, best_random_state = p_value, i

print("Smallest p: " + str(smallest_p))
print("Best random state: "+ str(best_random_state))

## Hypothesis: Patients who died have a lower ejection fraction than those who lived

In [None]:
# One-sided (lower-tail) Welch t-test: is the mean ejection fraction of patients who
# died (DEATH_EVENT == 1) lower than that of censored patients (DEATH_EVENT == 0)?
df_death = df[df['DEATH_EVENT'] == 1]
df_lived = df[df['DEATH_EVENT'] == 0]

num_of_sample = 20
confidence = 1-0.05 # 5% significance level, NOT divided by 2 because this is a 1 tail test

random_state = 3

# NOTE(review): every censored value is shifted down by EF_MARGIN before testing, so
# the test actually asks whether the death-group mean is lower by MORE than
# EF_MARGIN points. The original code applied this shift without explanation —
# confirm the margin is intended (set it to 0 to compare the raw groups).
EF_MARGIN = 7

ef_death = df_death['ejection_fraction'].sample(num_of_sample, random_state=random_state).array
# Vectorised shift on the sampled Series instead of mutating the array element by element.
ef_lived = df_lived['ejection_fraction'].sample(num_of_sample, random_state=random_state).sub(EF_MARGIN).array

print("Death ejection fraction: "+str(ef_death))
print("Censored ejection fraction: "+str(ef_lived))

t_statistic, _two_sided_p = stats.ttest_ind(ef_death, ef_lived, equal_var=False)

# equal_var=False runs Welch's t-test; its degrees of freedom come from the
# Welch-Satterthwaite formula rather than n - 1.
ef_se2_death = np.var(np.asarray(ef_death, dtype=float), ddof=1) / len(ef_death)
ef_se2_lived = np.var(np.asarray(ef_lived, dtype=float), ddof=1) / len(ef_lived)
dof = (ef_se2_death + ef_se2_lived) ** 2 / (
    ef_se2_death ** 2 / (len(ef_death) - 1)
    + ef_se2_lived ** 2 / (len(ef_lived) - 1)
)

# Lower-tail p-value, matching the one-sided critical value used below
# (ttest_ind's default p-value is two-sided).
p_value = stats.t.cdf(t_statistic, df=dof)

print("Hypothesis: Patients that died have lower ejection fraction than those that lived")
print("Samples are taken from the death patients and censored patients records, then perform t-test")
# The claim being tested belongs in H1; H0 must contain the equality.
print("H0: Mean of ejection fraction value in dead patients >= Mean of ejection fraction value in censored patients - " + str(EF_MARGIN))
print("H1: Mean of ejection fraction value in dead patients < Mean of ejection fraction value in censored patients - " + str(EF_MARGIN))

print("\n\tResult:")
print("Degree of freedom: " + str(dof))

critical_value = stats.t.ppf(q=confidence, df=dof)
print("Critical value: "+ str(critical_value))
print("t-statistic: " + str((t_statistic)))
print("p-value: "+ str(p_value))

# Lower-tail test: only a t-statistic below -critical_value supports "death < censored".
if t_statistic < -critical_value:
  print("\nWith t-statistic < -critical value, we reject the null hypothesis, meaning at the 5% significance level we conclude that the")
  print("level of ejection fraction in death patients is lower than in censored patients")
else:
  print("\nWith t-statistic >= -critical value, we fail to reject the null hypothesis, meaning at the 5% significance level we cannot conclude that the")
  print("level of ejection fraction in death patients is lower than in censored patients")

In [None]:
# Spread of the censored-group ejection-fraction sample used in the t-test above;
# as the cell's last expression, the value is displayed as the cell output.
ef_lived.std()

In [None]:
# Overlay the kernel-density estimates of the two ejection-fraction samples that were
# fed to the t-test.
# NOTE(review): ef_lived, as built in the test cell, has 7 subtracted from every
# value, so the "Censored" curve is shifted left relative to the raw data.
fig, ax = plt.subplots(figsize=(16, 8))
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(ef_death, fill=True, label="Ejection Fraction - Death", ax=ax)
sns.kdeplot(ef_lived, fill=True, label="Ejection Fraction - Censored", ax=ax)

ax.set_xlabel("Ejection fraction")
ax.legend()
ax.set_title("Independent Sample T-Test");

In [None]:
# Scan sampling seeds 0-19 and report the seed whose 20-vs-20 ejection-fraction sample
# (censored group shifted down by 7, as in the test cell) gives the smallest Welch
# t-test p-value.
# NOTE(review): picking the sampling seed by its p-value biases any test that then
# reuses that seed — confirm this search is exploratory only.

df_death = df[df['DEATH_EVENT'] == 1]
df_lived = df[df['DEATH_EVENT'] == 0]

num_of_sample = 20
smallest_p = 1
best_random_state = 1

for seed in range(20):
  # Dedicated names: the original reused platelets_death / platelets_lived here and
  # thereby overwrote the platelet samples with ejection-fraction values.
  ef_death_sample = df_death['ejection_fraction'].sample(num_of_sample, random_state=seed).array
  # Vectorised shift; the original inner `for i in ...` loop shadowed the seed
  # variable, so best_random_state was always recorded as 19.
  ef_lived_sample = df_lived['ejection_fraction'].sample(num_of_sample, random_state=seed).sub(7).array
  t_statistic, p_value = stats.ttest_ind(ef_death_sample, ef_lived_sample, equal_var=False)
  if smallest_p > p_value:
    smallest_p = p_value
    best_random_state = seed
print("Smallest p: " + str(smallest_p))
print("Best random state: "+ str(best_random_state))