In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualisation purposes
from sklearn.tree import DecisionTreeClassifier ,plot_tree
from sklearn.metrics import accuracy_score
import category_encoders as ce
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Gather Data
#### Find Data:
Find appropriate data in CSV format containing numeric values, to make the coding process easier.
I chose this data set because:
 - It included lots of numeric values.
 - It included a large selection of features.
 - The data set was also easy to comprehend and easy to use.
 
 
#### Add Data To Notebook:
Once the desired data set has been found add it to the notebook, 
to do this you have to:
1. Go into the data set and copy its title.
2. Then go into the notebook you want the data in.
3. On the top right hand side of screen there is an icon which reads 'Add data'.
4. Press on the add data icon and search with the title for the desired data set.


The data set should appear in the data section on the top right of the screen.

#### Why I chose the data set.
I chose this data set because it was mostly numerical data and it appeared to be simple and easy to use.

In [None]:
# Location of the CSV file that supplies the rows used to train and check the prediction model.
train_file_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Read the CSV file into a pandas data frame.
data = pd.read_csv(filepath_or_buffer=train_file_path)

# Summarise every column, numeric or not. This is the last expression in the
# cell, so the summary table appears below the cell once it runs.
data.describe(include='all')

# Prepare The Data
#### Choose what to find:
In prediction tree classifiers we are trying to find out if a feature has a specific value or not. 
In the data set I am using we are trying to find out whether someone has heart disease or not, so our predicted target will be 'heart disease'; however, there are other options to choose from, such as 'gender', which would also work.

#### Select Features And Eliminate Others:
Selecting features can improve mean absolute error by eliminating irrelevant features; for example, in this data set the feature 'work type' is irrelevant to what we are trying to find out. Other reasons not to use a feature include missing values and non-numerical values.

To do this, select the features which are going to help improve the model and make the coding easier; the rest will be discarded.


#### Why we do this.
We want to prepare data because it will improve accuracy by removing outliers and null values.

In [None]:
# Prepare data.
# The features to keep - the most important and useful ones. The prediction
# target 'heart_disease' is kept here too so it stays aligned with its row.
selected_columns = ['gender', 'age', 'hypertension', 'heart_disease']

# Build the working data set in one step: take only the chosen columns, then
# drop every row (axis=0) that has a missing value in any of them, because
# such rows would be unreliable.
prepared_data = data[selected_columns].dropna(axis=0)

# Summarise the new data set to check that everything looks as it should.
prepared_data.describe(include='all')

# Split data into training and testing data
For our data we want to split the data into training and testing data.

#### Training data
We use training data to train the predictor and fit the predictions to the data.

#### Testing data
We use testing data to see how good the predictions are; this is done by comparing the predictions with the actual values of the target feature.

#### Why we do this.
So we can expose the model to different data, which can improve accuracy, and so we have data to compare the predictions with.

In [None]:
# Separate out the prediction target: the heart_disease column.
y = prepared_data['heart_disease']

# Every remaining column is used as feature data; the target column is dropped
# from a new frame, so prepared_data itself is left untouched.
X = prepared_data.drop(columns='heart_disease')

# View data. A notebook cell only renders its final expression automatically,
# so the feature preview is printed explicitly; otherwise it would be silently
# discarded and only the target preview would be shown.
print(X.head())
y.head()

# One Hot Encode
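Machine learning most of the time only works with numerical data.
Our data has non-numeric values, such as the text categories in 'gender', which we don't want (rows with missing 'NaN' values were already dropped when the data was prepared).
To eliminate this and convert it to numerical data we one hot encode it. 
This turns every category into a binary column, where 0 means the row does not fit the category and 1 means it does.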

In [None]:
# One hot encode the features chosen above. pd.get_dummies only converts the
# non-numeric (object / string / category) columns into indicator columns;
# numeric columns are passed through unchanged.
# dtype=int makes the indicator columns hold 0/1. Without it, recent pandas
# versions produce True/False columns instead.
one_hot_X = pd.get_dummies(X, dtype=int)

# Preview the first rows of the encoded feature table.
one_hot_X.head()

# Choosing/Training A Model
Now that we have made the data set to be useable by the model we need to train and make predictions.
In this model we want to see if someone has heart disease or not. To do this we make it so disease is 1 and heart ok is 0. 

Hyperparameters: I set max depth to 3 so there wouldn't be overfitting.

In [None]:
# Make a decision tree classifier with a depth of 3 for simple viewing.
# Changing the max depth will change how big or small the tree is.
heart_disease = DecisionTreeClassifier(max_depth=3)

# Train the classifier on the one hot encoded features and the target.
heart_disease.fit(one_hot_X, y)

# Plot the tree.
# The column labels are converted to a plain list because plot_tree documents
# feature_names as a list of names, and a pandas Index may be rejected by
# scikit-learn versions that validate the argument type strictly.
plt.figure(figsize=(20, 10))
plot_tree(heart_disease,
          feature_names=list(one_hot_X.columns),
          class_names=['Heart ok', 'heart disease'],
          filled=True)
plt.show()
# The class values are 0 for heart ok and 1 for heart disease, which is the
# order in which class_names is given above.

As shown above all the predictions are heart ok. 
This is obviously not ideal since every prediction is the same.
This may be due to many factors but the most likely is that the data set did not include enough features and a more appropriate data set could have been used instead in order to avoid this.

# Evaluate and tune hyperparameters
Now that we have a functioning model, we can check how accurate its first predictions are.

#### Why we do this
We do this because it gives us a good indication of how accurate it is.

In [None]:
print("Making predictions for the first 5 people in the training set.")

# Predict for every row the model was trained on; only the first five rows
# are displayed at the end of the cell.
pred = heart_disease.predict(one_hot_X)

print("The predictions are:")

# Put the actual target values and the predictions next to the original
# features to see how well the predictions went.
# - The actual column holds y (1 = heart disease, 0 = heart ok), so it is
#   labelled 'Actual' rather than 'Heart ok', which stated the opposite meaning.
# - A copy of X is used so the feature table itself never gains target or
#   prediction columns; re-running the encoding cell would otherwise feed the
#   answer back into the model as a feature.
results = X.copy()
results['Actual'] = y
results['Predicted'] = pred

results.head()

# Find how accurate the model is
To find out how accurate the model is we find the accuracy score, which compares the predictions with the data. This helps us see how close our predictions were to the validation data. The higher the score, the more accurate the model; the lower the score, the higher the error. Ideally the score would be as high as possible.

We do this basically because it tells us whether our predictions were successful or not.

In [None]:
# Find out the accuracy score: the fraction of predictions that match the
# actual target values. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the true labels are passed first.
# NOTE: pred was produced from the same rows the model was fitted on, so this
# is accuracy on the training data, not on unseen data.
acc_svc = accuracy_score(y, pred)
print(acc_svc)

# Conclusion
As shown, the accuracy score was extremely high. At first this looked positive, but after review of the prediction model all the predictions were 'heart ok', which means that in fact the data set is possibly not offering enough information to make a more realistic prediction. To do this a better data set could have been used.