In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Some of these are not used below; I imported them in case I needed them later

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeClassifier # Our model
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier ,plot_tree
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
Purpose/Hope to predict: I hope to predict whether a patient with heart failure will die, given the symptoms recorded for them
Goal for quality of predictions: a mean absolute error (MAE) below 20%
Hypotheses: 
        - The most accurate model will be the random forest model, because it uses multiple decision trees to predict and I will most likely be using a lot of variables from the big dataset
        - Features like high blood pressure, diabetes, sex and smoking will have the largest effect on my MAE
        

# Accessing the data

Now that the notebook is set up, we can start by accessing the data and storing it in a variable.
The code cell above set up the notebook by importing the libraries and listing the available input files. The code cell below builds the path to the data file and reads it into a DataFrame. `.describe` is then used to display the data as a summary table of statistics such as count, mean, min and max.
# Why I used heart failure data

This dataset had many variables I could use to predict a y value which was given in the data set (DEATH_EVENT). This dataset also had a gold medal and at the time 1138 upvotes.


In [None]:
# Location of the heart-failure records CSV inside the Kaggle input directory
train_file_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'

# Read the CSV into a pandas DataFrame
heart_train_data = pd.read_csv(filepath_or_buffer=train_file_path)

# Summary statistics for every column (shown as this cell's output)
heart_train_data.describe(include='all')

# Prepare the Data
We now need to start preparing the data by describing our x and y. Our X will be a list of variables from the data. These variables are used to predict our y variable which in this case will be if they die or not.

Before we can get a reference to our prediction target 'y' we first need to prepare our data so that there aren't any rows with missing values as our machine learning model doesn't know how to handle them.

## Select Features and Drop Missing Values
We need to filter our data down to the columns that are relevant and the rows that don't have any missing values. Calling dropna(axis=0) drops every row that contains a missing value, so all of our columns end up with the same number of values.


In [None]:
# Columns kept for modelling: the chosen features plus the target 'DEATH_EVENT'.
# The target stays in the frame for now so it goes through the same row filtering.
# Experimenting with different columns lowered the mean absolute error from 46% to 18.54%.
selected_columns = [
    'age',
    'diabetes',
    'high_blood_pressure',
    'DEATH_EVENT',
    'sex',
    'smoking',
    'anaemia',
    'serum_creatinine',
    'time',
    'serum_sodium',
    'platelets',
    'ejection_fraction',
]

# Restrict the data to those columns, then discard every row with a missing value
prepared_data = heart_train_data[selected_columns].dropna(axis=0)

# The 'count' row shows how many rows are left for each column
prepared_data.describe()

## Separate Features From Target
We now have a set of data (aka a DataFrame) without any missing values. Next we need to take 'DEATH_EVENT' out of our X features and set y to 'DEATH_EVENT' so we can use X to predict y.




In [None]:
# Prediction target: the DEATH_EVENT column
y = prepared_data['DEATH_EVENT']

# Features: every column except the target
X = prepared_data.drop(columns=['DEATH_EVENT'])

# Preview the first rows of the features
# (replace with y.head() to preview the target instead)
X.head()

## One Hot Encode Categorical Data 
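One Hot Encoding is necessary for categorical non-numerical data. In this case, the categorical data would be 'Sex', which can be 'male' or 'female'. This is needed because Decision Tree models in scikit-learn cannot work with non-numerical data. (The encoding cell below is commented out, presumably because 'sex' is already stored as numbers in this dataset, so the models are trained on X directly.)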

One Hot Encoding separates each of the options for 'Sex' into a separate column, where a 1 means that the row contains this category value and a zero indicates it must be another categorical value. Watch this video (https://www.youtube.com/watch?v=v_4KWmkwmsU) for more information about how and why this works.


The Pandas get_dummies function is the easiest way to One Hot Encode categorical data. Here's how it's done

In [None]:
#one_hot_X = pd.get_dummies(X)

#one_hot_X.head()

# Train a Model and Make Predictions
Now that we have data our model can digest, let's train a model on our data and make some predictions.

In [None]:
# Decision tree regressor for the heart-failure (HF) data.
# A fixed numeric random_state makes the model reproducible between runs.
HF_model = DecisionTreeRegressor(random_state=1)

# Train on every prepared row (no train/validation split is used at this stage)
HF_model.fit(X=X, y=y)

In [None]:
# Predict for the same rows the model was fitted on
# (pass X.head() instead of X to predict for the first few rows only)
predictions = HF_model.predict(X=X)
print(predictions)

# Splitting the data
We need to split the data into training and testing so the data can be fit to further models

In [None]:
# Hold out part of the rows for validation. random_state fixes the shuffle so the
# split is the same on every run; no test_size is given, so the library default applies.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Second decision tree regressor (note the double underscore in the name),
# trained only on the training part of the split
HF__model = DecisionTreeRegressor(random_state=1)

# Fit on the training rows; the validation rows stay unseen
HF__model.fit(X=train_X, y=train_y)

In [None]:
# Predict the target for the held-out validation rows
val_predictions = HF__model.predict(X=val_X)
print(val_predictions)

In [None]:
# Mean absolute error of the validation predictions against the true values
# (the author recorded an error of about 18.6%, recurring, for this model)
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)
print(val_mae)

# Calculating the best amount of leaf nodes
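I came across a problem here with the Mean Absolute Error (MAE) being displayed as 0. I first thought this was caused by the training data and the testing data being the same, but the model is scored on the separate validation split. The actual cause is the print format: `%d` converts the MAE to a whole number, so any MAE below 1 is shown as 0. Printing the MAE with `%f` shows its real value.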

In [None]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a tree limited to `max_leaf_nodes` leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on train_X / train_y and
    its predictions for val_X are scored against val_y with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [None]:
# Baseline tree size and the alternative sizes to compare it with
max_leaf_nodes = 100
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation MAE for the baseline size.
# The MAE is a float, so it is printed with %f: the previous %d converted it to
# an integer, which made every MAE below 1 show up as 0.
my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %f" % (max_leaf_nodes, my_mae))

# Score every candidate size on the same train/validation split
candidate_maes = {}
for candidate in candidate_max_leaf_nodes:
    candidate_maes[candidate] = get_mae(candidate, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %f" % (candidate, candidate_maes[candidate]))

# The candidate with the lowest validation MAE (the first one wins a tie)
best_max_leaf_nodes = min(candidate_maes, key=candidate_maes.get)
print("Best max leaf nodes: %d  \t\t Mean Absolute Error:  %f"
      % (best_max_leaf_nodes, candidate_maes[best_max_leaf_nodes]))

# Random Forest model
This is a random forest model. It trains many decision trees, each on a random sample of my selected data, and combines their predictions in order to make a final prediction.

In [None]:
# Random forest regressor; the fixed random_state keeps the result repeatable
rf_model = RandomForestRegressor(random_state=1)

# Train the forest on the training part of the split
rf_model.fit(train_X, train_y)

# Predict the validation rows and measure the mean absolute error of those predictions
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print(f"Validation MAE for Random Forest Model: {rf_val_mae}")

# Decision tree classifier model
I used this model to help me visualise my data

In [None]:
# Classification tree used to visualise how the features split the patients.
# Change max_depth to change the size of the plotted tree. random_state is fixed
# so the same tree is built on every run, like the other models in this notebook.
death_predictor = DecisionTreeClassifier(max_depth=10, random_state=1)

# Fit the classifier on the training part of the split
death_predictor.fit(train_X, train_y)

# Plot the fitted tree
plt.figure(figsize=(20, 10))
plot_tree(
    death_predictor,
    # plot_tree documents feature_names as a list of strings, so the pandas
    # Index of column names is converted to a plain list
    feature_names=list(X.columns),
    # class_names must follow ascending class order. This assumes DEATH_EVENT
    # uses 0 = survived and 1 = died -- TODO confirm against the dataset description.
    class_names=['Survived', 'Died'],
    filled=True,
)
plt.show()

# MAE of the classifier itself on the validation rows. Previously this cell
# re-printed the decision tree regressor's MAE, because it reused val_predictions.
death_val_predictions = death_predictor.predict(val_X)
death_val_mae = mean_absolute_error(val_y, death_val_predictions)
print(death_val_mae)

# Conclusion
My predictions were fairly accurate, with a prediction success rate of 81.46%. I used different variables and models in order to get the lowest mean absolute error (MAE). I originally used fewer variables and got an MAE of 18.66% with the decision tree regressor, but after a little more experimenting I reached 18.54% with the random forest model. Using different models gave me a better chance of finding a lower MAE. During this investigation I have learned about different models that can be used to predict a patient's chance of death if they have had heart failure.

Changing the variables in X allowed me to get a smaller MAE and therefore more accurate predictions. Changing the number of variables decreased my MAE by small amounts, while changing which variables I used decreased it by larger amounts, from about 40% to 20%. This happened because those variables are what the model uses to make the predictions that the MAE evaluates.

I used a decision tree regressor, a decision tree classifier and a random forest model: the regressor and the random forest were compared with each other to find the lowest MAE, and the classifier was used to help visualise the data set. The random forest model had a lower MAE than the regressor in the end, but before I made slight changes to selected_columns the regressor had an MAE of 18.66%. I used more variables in order to decrease the MAE of my random forest model.

Hypotheses accuracy: My hypotheses were right. The random forest was the most accurate model at making predictions, and the variables I stated made the largest difference to my MAE.