In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# in the same top-down order that os.walk visits the folders.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)

# Print one path per line
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv


In this notebook I will compare the prediction error (measured as mean absolute error) of a tuned decision tree versus an untuned random forest when predicting the `DEATH_EVENT` outcome from a subset of the features in the heart failure clinical records data set.

In [2]:
# Read the heart-failure clinical records CSV into a DataFrame
heart_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)

# Preview the first five rows to check the columns and value types
heart_data.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


From the available features, I pick a handful that intuitively seem relevant (age, diabetes, high blood pressure, sex, and smoking) and use them as the model inputs.

In [3]:
# ML tools: two regressors, the error metric, and the train/test splitter
from sklearn.tree import DecisionTreeRegressor as dtree
from sklearn.ensemble import RandomForestRegressor as rforest
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import train_test_split

# Prediction target (y): the DEATH_EVENT column
# NOTE(review): DEATH_EVENT is presumably a 0/1 label, yet regressors and MAE
# are used here - confirm that regression (not classification) is intended.
y = heart_data["DEATH_EVENT"]

# Model inputs (X): the hand-picked feature columns
features = ["age", "diabetes", "high_blood_pressure", "sex", "smoking"]
X = heart_data[features]

# Split the data into training and testing sets; the fixed random_state makes
# the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [4]:
# Baseline: a decision tree with default hyperparameters.
# random_state is fixed so the fitted tree is reproducible.
tree = dtree(random_state = 1)

# Fit decision tree on the training split
tree.fit(X_train, y_train)

# Make predictions on the held-out test set
prediction = tree.predict(X_test)

# Calculate mean absolute error.
# The metric's documented argument order is (y_true, y_pred); MAE is symmetric,
# so the value is unchanged, but the call now matches the API contract.
test_mae = MAE(y_test, prediction)
print("MAE of untuned decision tree: {}".format(test_mae))

MAE of untuned decision tree: 0.38288888888888883


In [5]:
# Tune model: sweep the max_leaf_nodes cap and report the test MAE for each value.
# NOTE(review): every candidate is scored on the test split, so choosing the best
# value from this loop uses test data for model selection - consider a separate
# validation split or cross-validation.
# NOTE(review): the recorded run printed the same MAE for every cap from 100
# upward, which suggests the tree stops growing before those limits are reached
# - confirm.
leaf_node_candidates = [50, 100, 200, 500, 1000, 2000, 5000, 10000]

for nodes in leaf_node_candidates:
    # Fresh tree per candidate; fixed random_state keeps runs comparable
    tree = dtree(max_leaf_nodes = nodes, random_state = 1)
    tree.fit(X_train, y_train)
    # Arguments follow the documented (y_true, y_pred) order; the value is the
    # same as before because the metric is symmetric.
    mae = MAE(y_test, tree.predict(X_test))
    print("Nodes: {}, MAE: {}".format(nodes, mae))

Nodes: 50, MAE: 0.3985837742504409
Nodes: 100, MAE: 0.3784444444444444
Nodes: 200, MAE: 0.3784444444444444
Nodes: 500, MAE: 0.3784444444444444
Nodes: 1000, MAE: 0.3784444444444444
Nodes: 2000, MAE: 0.3784444444444444
Nodes: 5000, MAE: 0.3784444444444444
Nodes: 10000, MAE: 0.3784444444444444
