In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading our data. 

> ###  Displaying the first five rows.

In [None]:
# Read the heart-failure clinical records CSV into a DataFrame and preview it.
filepath = "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
data = pd.read_csv(filepath)
# Only the last expression of a cell is rendered, so the shape must be printed
# explicitly; as a bare mid-cell expression it was computed and then discarded.
print(data.shape)
data.head()

<br>

#### Checking whether there are any missing entries.

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
data.isna().sum()

#### There are none.
***

<font size="5">Regression Line of Age and Death</font>

In [None]:
def _standardize(values):
    """Return ``values`` centred on its mean and divided by its standard deviation.

    Both statistics come from ``np.mean`` / ``np.std`` with default arguments.
    """
    return (values - np.mean(values)) / np.std(values)


# Boolean row masks for DEATH_EVENT == 1 and DEATH_EVENT == 0.
idx_death = (data['DEATH_EVENT'] == 1)
idx_live = (data['DEATH_EVENT'] == 0)

# Standardized copies of the two columns; only age is plotted in this cell.
age_norm = _standardize(data['age'])
serum_creatinine = _standardize(data['serum_creatinine'])

plt.figure(figsize=(10, 6))
sns.regplot(x=age_norm, y=data['DEATH_EVENT'], label="age")
# The x values are z-scores, not raw ages, so label the axis accordingly.
plt.xlabel("age (standardized)")
plt.ylabel("death events")
plt.legend()



<font size=4>There is a positive correlation as expected.</font>

<font size=3> Let's see whether there are any correlations among the other variables. </font>

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = data.corr()
plt.figure(figsize=(12, 12))
sns.heatmap(data=corr_matrix, annot=True)
# Show the column names as the cell's output.
data.columns

In [None]:
# Bar plot of the 'smoking' column grouped by the 'sex' column.
plt.figure(figsize=(8, 8))
sns.barplot(data=data, x='sex', y='smoking')
plt.ylabel("Smoking")

<font size=4>There is quite a discrepancy between the proportions of female and male smokers.</font>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Features are every column except the target, DEATH_EVENT.
data2 = data.drop(columns=['DEATH_EVENT'])
target = data['DEATH_EVENT']

# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data2, target, stratify=target, random_state=0)

# Fit a depth-limited decision tree and report accuracy on both splits.
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print(f"Accuracy on training set: {train_accuracy:.3f}")
print(f"Accuracy on test set: {test_accuracy:.3f}")


In [None]:
# Plot training and testing accuracy for different values of max_depth.

n = 12
# The score arrays are indexed by depth. Index 0 is never filled because the
# sweep starts at depth 1, so it stays at its initial value of 0.
training_score = np.zeros(n)
testing_score = np.zeros(n)
diff_score = np.zeros(n)
depths = list(range(1, n))
for i in depths:
    # Use a separate name so the sweep does not overwrite the fitted `tree`.
    depth_tree = DecisionTreeClassifier(max_depth=i, random_state=0)
    depth_tree.fit(X_train, y_train)
    training_score[i] = depth_tree.score(X_train, y_train)
    testing_score[i] = depth_tree.score(X_test, y_test)
    diff_score[i] = np.abs(training_score[i] - testing_score[i])

plt.figure(figsize=(8, 5))
# Plot only the depths that were evaluated; including index 0 would draw a
# spurious score of 0 at depth 0 and distort the y axis.
sns.lineplot(x=depths, y=training_score[1:], label="training score")
sns.lineplot(x=depths, y=testing_score[1:], label="testing score")
plt.ylabel("Accuracy")
plt.xlabel("Maximum depth")
plt.legend()

In [None]:
# Rebuild the depth-4 tree so that `tree` refers to this model from here on.
tree = DecisionTreeClassifier(random_state=0, max_depth=4)
tree.fit(X=X_train, y=y_train)

At a maximum depth of 4, the difference between the training score and the testing score is the smallest; beyond that depth the two scores diverge.

In [None]:
# Labels passed to export_graphviz for the tree's input features.
# NOTE(review): presumably mirrors the column order of X_train — confirm
# against data2.columns.
feature_names = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]

## Visualizing the procedure of our decision tree

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to a Graphviz DOT file in the working directory.
export_graphviz(
    tree,
    out_file="tree.dot",
    class_names=["live", "deceased"],
    feature_names=feature_names,
    impurity=False,
    filled=True,
)

import graphviz

# Read the DOT text back and render it as the cell's output.
with open("tree.dot") as dot_file:
    dot_graph = dot_file.read()

graphviz.Source(dot_graph)

In [None]:
# Print the fitted tree's feature-importance array.
print(f"Feature importances:\n{tree.feature_importances_}")

### The features that exhibit some form of importance are creatinine phosphokinase, ejection fraction, serum sodium, and time.