In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, file_name) for file_name in file_names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Hi! My name is Alex and this is my first notebook, written to show you my approach to this problem. As this is my first notebook, I will try to state my hypotheses clearly so we can discuss them in the comments. I hope you enjoy reading it as much as I enjoyed writing it! :D

# Import data


In [1]:
# Load the heart-failure dataset from the Kaggle input directory.
heart = pd.read_csv(
    filepath_or_buffer="/kaggle/input/heart-failure-prediction/heart.csv"
)

### See the first five rows of the dataset to see if the data has been loaded well

In [1]:
# Preview the first five rows as a quick sanity check of the load.
heart.head(n=5)

### At first sight we can see there are 5 categorical columns: Sex, ChestPainType, RestingECG, ExerciseAngina and ST_Slope.
### Let's see if the target variable is skewed. If so we have to use other techniques such as undersampling or oversampling.

In [1]:
# Count how many rows fall into each class of the target column.
target_counts = heart["HeartDisease"].value_counts()
target_counts

### Since the classes are balanced (nearly 50% each), we do not need to use those techniques :)

### Let's see info of the columns in order to get some insights of the dataset such as **null entries** or which columns have **categorical data**

In [1]:
# Print the frame summary (column dtypes and non-null counts) to look for
# missing entries and text-typed columns.
heart.info()

### At this point we can conclude there are no null entries in the dataset. This doesn't mean that there is no missing data, because sometimes people use other values to represent missing data, and Pandas will not recognize them :\ . But not everything is lost! We can use different methods to identify those points. Here I would like to start the EDA and plot different graphs to get a wider view of the dataset.

## *Exploratory Data Analysis*

### First let's import Seaborn to make the plots

In [1]:
import seaborn as sns
# Enlarge the default figure size used by every plot that follows.
figure_size_inches = (11.7, 8.27)
sns.set(rc={"figure.figsize": figure_size_inches})

# Correlation matrix

### With the correlation matrix we can see the relationship between the different variables. Each value ranges between -1 and 1 and measures the strength of the **linear** relationship between two variables. It is important to remember that it only captures **linear** relationships: a correlation of zero doesn't mean there is no relationship, it just means there is no **linear** relationship.

### It is important to remember that this matrix is symmetric, so the information in the upper triangle is the same as in the lower triangle.

In [1]:
# Pearson correlation is only defined for numeric columns. Recent pandas
# versions raise on the text columns instead of silently skipping them, so
# restrict the frame to numeric dtypes before building the matrix.
numeric_corr = heart.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, cmap="YlGnBu")

### In order to see the relation between the different variables I will use a pairplot.

In [1]:
# Pairwise scatter/density plots of the columns, coloured by the target class.
sns.pairplot(data=heart, hue="HeartDisease")

### With hue we can color the points by heart disease. With these plots we can see that Oldpeak and RestingBP have very long tails. This can sometimes be a symptom of data that has been loaded incorrectly, or of extreme values used to represent missing data.
### Let's see the box plots to see more clearly the outliers.

## Outliers

### Using boxplots it's easy and fast to see the outliers as it represents those points clearly on the plot.

In [1]:
# Wide-form call: the whole frame is passed, so the columns seaborn can plot
# are drawn side by side for a quick visual scan of outliers.
sns.boxplot(data=heart)

### With a quick look we can see some values with RestingBP near zero, which is not possible. There are also some values with
### Cholesterol near zero, which is not possible either. Let's analyze how many of them there are and how we can [impute](https://en.wikipedia.org/wiki/Imputation_(statistics)) them.
### Here I will replace the RestingBP values equal to zero with the median, as it is more resistant to outliers than the mean.
### Let's search for the values with RestingBP equal to zero.

In [1]:
# Box plot of the RestingBP column alone, to isolate its outliers.
sns.boxplot(data=heart, y="RestingBP")

### Let's search for the outlier

In [1]:
# Rows whose RestingBP is implausibly low are treated as missing measurements.
resting_bp_outliers = heart[heart["RestingBP"] < 50]
# Drop them, as the accompanying text states, so they cannot distort the
# later plots and models. Re-running the cell is harmless: a second run
# simply finds nothing left to drop.
heart = heart.drop(index=resting_bp_outliers.index)
# Show the rows that were removed.
resting_bp_outliers

### Here we see that the entry has no RestingBP, Cholesterol or FastingBS values. I will drop it because it is one entry in about one thousand. If
### we had more such entries we should use a method to impute the values.

### Now let's search the values with zero value of Cholesterol.

In [1]:
# Box plot of the Cholesterol column alone, to isolate its outliers.
sns.boxplot(data=heart, y="Cholesterol")

In [1]:
# Show every row whose Cholesterol value is the zero placeholder.
is_zero_cholesterol = heart["Cholesterol"] == 0
heart.loc[is_zero_cholesterol]

### Almost 10% of the dataset has this value missing. Before making assumptions I will see which is the distribution of HeartDisease within these points. Then I will justify a method to impute the values.

In [1]:
# Class balance of the target among the rows with zero Cholesterol.
heart.loc[heart["Cholesterol"] == 0, "HeartDisease"].value_counts()

### Here we can see that 88% of these entries have heart disease, so this subset is highly skewed. To impute this skewed data I will use the median Cholesterol of the patients with heart disease. I do this because nearly all of these entries have heart disease, and the median is resistant to outliers.

### This line of code gives me the non zero values of cholesterol with heart disease.

In [1]:
# Select patients with heart disease whose Cholesterol was actually measured
# (non-zero), then take the median of their Cholesterol values.
has_cholesterol = heart["Cholesterol"] != 0
has_heart_disease = heart["HeartDisease"] == 1
mask = has_cholesterol & has_heart_disease
heart.loc[mask, "Cholesterol"].median()

### I will impute with this value.

In [1]:
# Median Cholesterol of the heart-disease patients, ignoring the zero
# placeholders. The selection is rebuilt here rather than reusing a mask from
# another cell, so this cell does not depend on hidden kernel state.
known_cholesterol_hd = (heart["Cholesterol"] != 0) & (heart["HeartDisease"] == 1)
cholesterol_median = heart.loc[known_cholesterol_hd, "Cholesterol"].median()

# A median can be fractional. The column is presumably integer-typed as loaded
# (confirm with heart.info()), so cast to float first: writing a fractional
# value into an integer column is an incompatible-dtype assignment in recent
# pandas versions.
heart["Cholesterol"] = heart["Cholesterol"].astype(float)
heart.loc[heart["Cholesterol"] == 0, "Cholesterol"] = cholesterol_median


### Let's see how the distribution is modified and if the correlation between heart disease and cholesterol has changed

In [1]:
# Redraw the pairwise plots after imputation, coloured by the target class.
sns.pairplot(heart, hue="HeartDisease")

### Here we can see that the distribution of Cholesterol has changed, and so has the correlation: its correlation with heart disease has gone from negative to positive. This makes sense, as cholesterol is one of the first indicators a doctor uses to suggest further analysis.

In [1]:
# Correlate just the two columns of interest. Calling corr() on the whole
# frame would have to process the text columns too, which recent pandas
# versions refuse to do.
heart["Cholesterol"].corr(heart["HeartDisease"])

### Before finishing the EDA I will transform the categorical data into dummy variables. This means that a categorical column such as Sex, which has Male or Female values, will be transformed into Sex_M and Sex_F columns holding 0 or 1 for the corresponding entry. This is useful in distance-based algorithms, because those kinds of algorithms need the data to be numeric and normalized. The drawback is that we are increasing the dimension of the dataset. If we were dealing with heavy datasets, after this step we should use something like PCA to reduce the dimensionality.

### I will get the name of the columns with categorical type.

In [1]:
# Names of the columns stored with the generic object dtype (the text columns).
object_frame = heart.select_dtypes(include="object")
cat_cols = list(object_frame.columns)
cat_cols

In [1]:
# One-hot encode the categorical columns; the other columns pass through unchanged.
heart_dummy = pd.get_dummies(data=heart)

### Let's see how we increased the dimensions of the dataset.

In [1]:
# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly or it is silently discarded.
display(heart_dummy.head())
# Column counts after and before the one-hot encoding.
len(heart_dummy.columns), len(heart.columns)

# Tree based algorithm
### Let's use the models to compare the scores in our treated data!
### First import the libraries
#### Disclaimer: *Personally I don't like to import everything at the start of the notebook, because it is too much information for somebody who is just starting, like me. I prefer to import each library right before I use it, so it is clear where it is needed.*

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

### First let's separate the data into X and y.

In [1]:
# Features are every column except the target; the target is the label vector.
TARGET_COLUMN = "HeartDisease"
X = heart_dummy.drop(columns=[TARGET_COLUMN])
y = heart_dummy.loc[:, TARGET_COLUMN]

### Split the data into train and test.

In [1]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

### I will use gridsearch to optimize hyperparameters and prevent overfitting over the training set.

In [1]:
# Candidate tree depths 1 through 9 for the grid search.
grid = {"max_depth": list(range(1, 10))}
# Entropy-based decision tree with a fixed seed for repeatable results.
Stimator = DecisionTreeClassifier(random_state=101, criterion="entropy")

In [1]:
# 5-fold cross-validated search over the tree-depth grid.
gso = GridSearchCV(estimator=Stimator, param_grid=grid, cv=5)

### As we are dealing with tree based algorithms we do not need to normalize the data! So let's train.

In [1]:
# Run the cross-validated grid search on the training split.
gso.fit(x_train,y_train)

In [1]:
# Predict labels for the held-out test split with the fitted grid search.
y_pred = gso.predict(x_test)

### Let's see the score of the model and the confusion matrix

In [1]:
from sklearn.metrics import confusion_matrix,accuracy_score,plot_confusion_matrix

In [1]:
confusion_matrix(y_test,y_pred)

In [1]:
plot_confusion_matrix(gso,x_test,y_test)

In [1]:
# Fraction of test rows the tuned decision tree classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

### Let's see the depth of the tree.

In [1]:
# Hyperparameters picked by the grid search (here: the selected max_depth).
gso.best_params_

In [1]:
# Seed the forest so the cross-validation scores, the selected n_estimators
# and the final accuracy are the same on every Restart & Run All. The seed
# matches the one used for the decision tree.
Stimator_forest = RandomForestClassifier(criterion="entropy", random_state=101)
# The entropy criterion chooses splits that maximise information gain.
grid_forest = {"n_estimators": [1000, 2000, 3000, 4000, 5000]}
# 5-fold cross-validated search over the forest sizes.
gso_forest = GridSearchCV(Stimator_forest, grid_forest, cv=5)

In [1]:
# Tune the forest on the training split, then score it on the held-out split.
gso_forest.fit(x_train, y_train)
y_predforest = gso_forest.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_predforest)

In [1]:
# Forest size (n_estimators) picked by the grid search.
gso_forest.best_params_

### Let's try with a XGBClassifier

In [1]:
# Gradient-boosted trees with 2000 boosting rounds, trained on the training
# split and scored on the held-out split.
Stimator_XGBC = XGBClassifier(n_estimators=2000, use_label_encoder=False)
Stimator_XGBC.fit(x_train, y_train)
y_predXGBC = Stimator_XGBC.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_predXGBC)

# Distance Based Algorithm
### Let's import the libraries we are going to use

In [1]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


In [1]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in.
scaler = StandardScaler()
scaler.fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)

In [1]:
# RBF-kernel SVM tuned over kernel width (gamma) and regularisation strength (C)
# with 5-fold cross-validation.
Stimator_svc = SVC(kernel="rbf")
grid_svc = {
    "gamma": [0.001, 0.01, 0.1, 1, 10],
    "C": [0.1, 1, 10, 100, 1000],
}
gso_svc = GridSearchCV(estimator=Stimator_svc, param_grid=grid_svc, cv=5)

In [1]:
# Tune the SVM on the scaled training split, then score it on the scaled test split.
gso_svc.fit(scaled_x_train, y_train)
y_predsvc = gso_svc.predict(scaled_x_test)
accuracy_score(y_true=y_test, y_pred=y_predsvc)


In [1]:
# Logistic regression tuned over the inverse regularisation strength C
# with 5-fold cross-validation.
Stimator_logreg = LogisticRegression()
grid_logreg = {"C": [0.1, 1, 10, 100, 1000]}
gso_logreg = GridSearchCV(estimator=Stimator_logreg, param_grid=grid_logreg, cv=5)

In [1]:
# Tune the logistic regression on the scaled training split, then score it on
# the scaled test split.
gso_logreg.fit(scaled_x_train, y_train)
y_predlogreg = gso_logreg.predict(scaled_x_test)
accuracy_score(y_true=y_test, y_pred=y_predlogreg)

# Finished
### To sum up, the model with the best accuracy score was SVC. It can be optimized further, but I will stop here.
### Thanks for reading! Finally, I would like to know how you would have replaced the entries with a cholesterol value of 0 and, of course, whether you liked the post!
# Have a great day!