In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart dataset CSV into the DataFrame used by the rest of the notebook.
heart_csv_path = "../input/heart-attack-analysis-prediction-dataset/heart.csv"
heart = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first five rows of the frame.
heart.head(n=5)

About this dataset

Age : Age of the patient

Sex : Sex of the patient

exang: exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : chest pain type

Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic

trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results

Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalachh : maximum heart rate achieved

output : 0 = less chance of heart attack, 1 = more chance of heart attack

n

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
heart.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# (number of rows, number of columns) of the loaded frame.
heart.shape

In [None]:
# Per-column flag: True if the column contains at least one missing value.
heart.isnull().any(axis=0)

# Univariate Analysis

In [None]:
import matplotlib.pyplot as plt
import scipy
import scipy.stats  # `import scipy` alone does not guarantee the stats submodule is loaded
import seaborn as sns
from IPython.display import Image

*What we are looking for?*
1. mean, variance, standard deviation, mode, median
2. distribution (normal, skew, kurtosis)
3. "possible" outliers

In [None]:
# Age: density histogram with KDE (left) and box plot (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# histplot(kde=True, stat="density") is the supported replacement for the
# deprecated sns.distplot; cut=3 extends the KDE past the data like distplot did.
sns.histplot(x=heart.age, kde=True, stat="density", kde_kws={"cut": 3}, color="blue", ax=axs[0])
# Pass the series as x= explicitly: newer seaborn treats the first positional
# argument as `data`, which changes how the plot is drawn.
sns.boxplot(x=heart.age, color="red", ax=axs[1])
# Last expression of the cell: shows the reference standard-deviation diagram.
Image("https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Standard_deviation_diagram.svg/1920px-Standard_deviation_diagram.svg.png", width=400 )

Age is approximately normally distributed, with mean = 54.36 and std = 9.08. There are no significant outliers, so everything looks fine to proceed :)

In [None]:
# Category counts for sex (left) and chest pain type (right).
fig, axs = plt.subplots(1, 2, figsize=(20, 5))
# x= is required: seaborn >= 0.12 interprets a positional series as `data`.
sns.countplot(x=heart.sex, ax=axs[0])
sns.countplot(x=heart.cp, ax=axs[1])

We have almost twice as many patients of sex "1" as of sex "0". Chest pain type 1 ("typical angina") is the most common chest pain type, followed by "non-anginal pain".

In [None]:
# Class balance of the target column; x= keeps the call valid on seaborn >= 0.12.
sns.countplot(x=heart.output)

We have relatively equal number of output cases. Hence, we are dealing with a balanced binary classification problem. 

In [None]:
# Maximum heart rate: density histogram with KDE (left) and boxen plot (right).
fig, axs = plt.subplots(1, 2, figsize=(20, 5))
# histplot replaces the deprecated sns.distplot (same density scale, KDE overlay).
sns.histplot(x=heart.thalachh, kde=True, stat="density", kde_kws={"cut": 3}, color="blue", ax=axs[0])
# Explicit x= so the series is not mistaken for `data` on newer seaborn.
sns.boxenplot(x=heart.thalachh, ax=axs[1])

We have a slightly left-skewed distribution in "maximum heart rate achieved". The boxen plot is the best way to see how the data behave in the left tail: the data are more spread out on the left side than on the right side. We may also have some outliers at the far left and far right. Overall, the departure from normality in "maximum heart rate" is not large enough to require a transformation, although we can check the effect of one later.

We can go further and calculate the skewness and kurtosis. As a quick reminder, skewness and kurtosis describe the shape of a distribution; a negative skew means the distribution has a longer tail towards lower values.

In [None]:
# Sample skewness of the maximum-heart-rate column.
scipy.stats.skew(heart['thalachh'])

As can be seen, we have slightly negative skew, which means the data have tail toward negative values.

In [None]:
# fisher=False returns Pearson kurtosis (normal distribution == 3), which is the
# scale the accompanying text compares against. SciPy's default (fisher=True)
# returns excess kurtosis, where a normal distribution scores 0.
scipy.stats.kurtosis(heart.thalachh, fisher=False)

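The kurtosis of any univariate normal distribution is 3 (Pearson's definition). Kurtosis < 3 ==> lighter tails than a normal distribution, so no significant outliers are expected; kurtosis > 3 ==> heavier tails, so there is a good chance of outliers. Note that `scipy.stats.kurtosis` returns the *excess* kurtosis (kurtosis − 3, so a normal distribution gives 0) unless `fisher=False` is passed.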

Note: You can find much more useful information from skewness and kurtosis. #DYOR

# Bivariate Analysis

*What we are looking for?*
1. correlations (positive or negative relation, segmentation)
2. regression
3. constraints
4. outliers

In [None]:
# Column labels available for the pairwise analysis below.
heart.columns

Let's draw a general pairplot to see which interesting pairs we can find:

In [None]:
# Scatter-matrix of every column against every other column.
sns.pairplot(data=heart)

Most of the features are categorical variables that have been encoded as numbers. We will come back to those variables later. Let's see how the numerical variables relate to each other with a heatmap.

In [None]:
# Continuous columns only; the remaining columns are integer-coded categories.
numerical_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
corr_matrix = heart[numerical_features].corr()
plt.figure(figsize=(20, 5))
# Fix the colour scale to [-1, 1] so that zero correlation sits at the midpoint.
sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True)

There seems to be a strong negative correlation between age and maximum heart rate achieved, which is expected. We can see this correlation in the scatter plot (below). There is also a weak positive correlation between "resting blood pressure" and "age". Let's check these:

In [None]:
# Scatter-matrix restricted to the continuous columns.
sns.pairplot(data=heart[numerical_features])

In [None]:
# Age versus maximum heart rate, with a fitted regression line.
plt.figure(figsize=(7, 7))
sns.regplot(data=heart, x='age', y='thalachh')

In the chol data, it seems we have an outlier. Lets check it:

In [None]:
# Cholesterol before outlier removal: box plot (left) and density histogram (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(data=heart['chol'], ax=axs[0])
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(x=heart.chol, kde=True, stat="density", kde_kws={"cut": 3}, color='red', ax=axs[1])

We have a few outliers based on the box plot. As can be seen in the right figure, there are some points above the box-plot maximum (380) and one obvious outlier above 500. The distribution plot also shows that the data are approximately normally distributed if we ignore the outliers.

In [None]:
# Remove, in place, every row whose cholesterol exceeds the box-plot maximum of 380.
chol_outlier_rows = heart.index[heart['chol'] > 380]
heart.drop(index=chol_outlier_rows, inplace=True)

In [None]:
# Cholesterol after outlier removal: box plot (left) and density histogram (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(data=heart['chol'], ax=axs[0])
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(x=heart.chol, kde=True, stat="density", kde_kws={"cut": 3}, color='red', ax=axs[1])

Well, this looks much better :). 

P.S. A domain expert is required for advanced outlier detection. In some cases, chol > 400 may be a reasonable value rather than an outlier.

In [None]:
# Cholesterol against age (left, default blue) and against resting blood pressure (right, red).
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# x= / y= must be keywords: seaborn >= 0.12 accepts only `data` positionally,
# so regplot(series, series) raises TypeError there.
sns.regplot(x=heart.age, y=heart.chol, ax=axs[0])
sns.regplot(x=heart.trtbps, y=heart.chol, color='red', ax=axs[1])

As can be seen in the left figure (the blue one), chol tends to increase with age. You can find many more interesting "general weak" trends with scatterplots or regplots.

Lets move a bit to advance analysis and include the categorical parameters as well:

In [None]:
# Age versus cholesterol, coloured by the target class.
plt.figure(figsize=(7, 7))
# Keyword x= / y= keeps the call valid on seaborn >= 0.12 (positional x, y were removed).
sns.scatterplot(x=heart.age, y=heart.chol, hue=heart.output)

In this figure, we can see that at higher ages and higher chol values we generally expect more 0 outputs than 1 outputs :)

You can find more interesting info from categorical data using barplots: 

In [None]:
# Mean age (with confidence interval) for each chest pain type.
sns.barplot(data=heart, x='cp', y='age')

# Model Developing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [None]:
# Target vector and feature matrix (every column except the target).
y = heart['output']
X = heart.drop(columns=['output'])

We are going to separate train, test, and validation sets. We are not going to touch test set before making sure about the performance of our model. However, we are going to play with train and validation sets to tune the model.

In [None]:
# Hold out 30% of the rows as the final test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.30
)

In [None]:
# Carve a validation set (40% of the remaining rows) out of the training data.
# NOTE: X_train / y_train are overwritten with a subset of themselves, so running
# this cell twice without re-running the previous split shrinks them again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=42, test_size=0.40
)

In [None]:
# Shapes of the train / validation / test feature matrices.
tuple(part.shape for part in (X_train, X_val, X_test))

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: L2-regularised logistic regression, scored on the validation split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression(max_iter=1000, penalty='l2').fit(X_train, y_train)
predicted = model.predict(X_val)



In [None]:
from sklearn.metrics import accuracy_score

# Share of validation rows the logistic regression classified correctly.
accuracy_score(y_true=y_val, y_pred=predicted)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2. Fall back to
# ConfusionMatrixDisplay.from_estimator (available since 1.0) under the same
# name, so every later cell that calls plot_confusion_matrix(...) keeps working.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of a fitted `estimator` evaluated on (X, y_true).

        Extra keyword arguments are forwarded to
        ConfusionMatrixDisplay.from_estimator; the display object is returned.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Confusion matrix of the logistic regression on the validation split.
plot_confusion_matrix(model, X_val, y_val)

Well, we predicted 66 outputs correctly out of 85 and 19 incorrectly. Let's remove the regularization penalty to see what happens:

In [None]:
import sklearn

# LogisticRegression applies penalty='l2' by default, so omitting the argument
# does NOT remove regularization; it has to be disabled explicitly.
# scikit-learn >= 1.2 spells "no penalty" as None, older releases as 'none'.
_sk_version = tuple(int(part) for part in sklearn.__version__.split(".")[:2])
no_penalty = None if _sk_version >= (1, 2) else "none"

# Separate names are used so that `model` / `predicted` (the L2 model that the
# later test-set evaluation relies on) are left untouched.
model_unpenalized = LogisticRegression(penalty=no_penalty, max_iter=1000)
model_unpenalized.fit(X_train, y_train)
predicted_unpenalized = model_unpenalized.predict(X_val)
plot_confusion_matrix(model_unpenalized, X_val, y_val)

It does not change anything! We can try other ML algorithms:

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Unconstrained decision tree. random_state is fixed (same seed as the data
# splits) so the reported accuracies are reproducible on a re-run.
model_tree = DecisionTreeClassifier(random_state=42)
model_tree.fit(X_train, y_train)
predict_tree = model_tree.predict(X_val)
predict_tree_train = model_tree.predict(X_train)
# (validation accuracy, training accuracy): a large gap indicates overfitting.
accuracy_score(y_val, predict_tree), accuracy_score(y_train, predict_tree_train)

Well! We have a perfect match on the training set but a weak prediction on the validation set. What happened? OVERFITTING :)

max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf and min_impurity_decrease are hyperparameters that can be used to avoid overfitting. The first three are stopping parameters and the last two are pruning parameters.

In [None]:
# Manual grid over max_depth and min_samples_leaf (1..4 each). For every pair,
# print |validation accuracy - training accuracy| followed by the two settings.
for max_d in range(1, 5):
    for min_s in range(1, 5):
        # Seeded so the printed gaps are identical on every run.
        model_tree = DecisionTreeClassifier(max_depth=max_d, min_samples_leaf=min_s, random_state=42)
        model_tree.fit(X_train, y_train)
        predict_tree = model_tree.predict(X_val)
        predict_tree_train = model_tree.predict(X_train)
        acc_dif = abs(accuracy_score(y_val, predict_tree) - accuracy_score(y_train, predict_tree_train))
        print(acc_dif, max_d, min_s)
            

If we set max_depth to 2 and min_samples_leaf to any value from 1 to 4, we get the minimum accuracy difference between the training and validation sets.

In [None]:
# Decision tree with the settings chosen from the manual search above;
# seeded so the confusion matrix is reproducible.
model_tree = DecisionTreeClassifier(max_depth=2, min_samples_leaf=3, random_state=42)
model_tree.fit(X_train, y_train)
predict_tree = model_tree.predict(X_val)
predict_tree_train = model_tree.predict(X_train)
plot_confusion_matrix(model_tree, X_val, y_val)


In [None]:
# 3-fold cross-validation of the tuned tree on the training split.
# scoring='accuracy' matches the metric used everywhere else in this notebook;
# 'r2' is a regression score and is not meaningful for class labels.
cross_validate(model_tree, X_train, y_train, cv=3, scoring='accuracy', return_train_score=True)

Although we got almost 0.72 accuracy in binary classification using decision tree, cross validation shows our result is not repeatable for other cross sections. Hence, we are going to test Random Forest Classifier.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Default random forest. The seed is fixed because bootstrap sampling and
# feature sub-sampling otherwise change the scores on every run.
model_rand = RandomForestClassifier(random_state=42)
model_rand.fit(X_train, y_train)
predict_rand = model_rand.predict(X_val)
predict_rand_train = model_rand.predict(X_train)
# (validation accuracy, training accuracy)
accuracy_score(y_val, predict_rand), accuracy_score(y_train, predict_rand_train)

We increased the accuracy a bit compared to the decision tree, but we still have OVERFITTING!

The main parameters used by a Random Forest Classifier are:

criterion = the function used to evaluate the quality of a split.

max_depth = maximum number of levels allowed in each tree.

max_features = maximum number of features considered when splitting a node.

min_samples_leaf = minimum number of samples which can be stored in a tree leaf.

min_samples_split = minimum number of samples necessary in a node to cause node splitting.

n_estimators = number of trees in the ensemble.

In [None]:
# Effect of the number of trees on the train/validation accuracy gap.
for ns in [50, 100, 150, 200, 250, 300, 350, 400]:
    # Same seed for every forest so that only n_estimators differs between runs.
    model_rand = RandomForestClassifier(n_estimators=ns, random_state=42)
    model_rand.fit(X_train, y_train)
    predict_rand = model_rand.predict(X_val)
    predict_rand_train = model_rand.predict(X_train)
    acc_dif_rand = abs(accuracy_score(y_val, predict_rand) - accuracy_score(y_train, predict_rand_train))
    print(acc_dif_rand, ns)

I will choose 150 n_estimators and to avoid overfitting I will add max_depth for each tree in random forest. You can play with hyperparameters and check cross validation score to avoid overfitting as much as possible. 

Is there any better method than this manual searching for optimal hyperparameters?

1. Random Search 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised hyperparameter search for the random forest (4-fold CV on the
# training split). Both the forest and the sampler are seeded so that
# best_params_ is the same on every run.
model_r = RandomForestClassifier(random_state=42)
distribution = {'criterion': ['gini', 'entropy'],
                'max_depth': [2, 3, 4],
                # 'auto' was only an alias of 'sqrt' for classifiers and was
                # removed in scikit-learn 1.3, so only 'sqrt' is listed.
                'max_features': ['sqrt'],
                'min_samples_leaf': [4, 6, 8, 10],
                'min_samples_split': [5, 7, 10],
                'n_estimators': [50, 100, 150, 200]
               }
model_rand = RandomizedSearchCV(estimator=model_r, param_distributions=distribution, cv=4, random_state=42)
model_rand.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found and its mean cross-validated score.
(model_rand.best_params_, model_rand.best_score_)

In [None]:
# Random forest with hand-entered hyperparameters.
# NOTE(review): these values look copied from an earlier search run; confirm
# they still match the best_params_ shown above.
# random_state is fixed so the accuracies below are reproducible.
model_rand = RandomForestClassifier(n_estimators=50, max_depth=3, min_samples_leaf=8,
                                    min_samples_split=5, max_features='sqrt', random_state=42)
model_rand.fit(X_train, y_train)
predict_rand = model_rand.predict(X_val)
predict_rand_train = model_rand.predict(X_train)
# (validation accuracy, training accuracy)
accuracy_score(y_val, predict_rand), accuracy_score(y_train, predict_rand_train)

In [None]:
# Confusion matrix of the estimator currently bound to `model_rand`, evaluated on the validation split.
plot_confusion_matrix(model_rand, X_val, y_val)

2. Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the same hyperparameter grid (4-fold CV on the
# training split). The forest is seeded so the selected combination is reproducible.
model_r_g = RandomForestClassifier(random_state=42)
dist = {'criterion': ['gini', 'entropy'],
        'max_depth': [2, 3, 4],
        # 'auto' was only an alias of 'sqrt' for classifiers and was removed in
        # scikit-learn 1.3; dropping it also halves the number of fits.
        'max_features': ['sqrt'],
        'min_samples_leaf': [4, 6, 8, 10],
        'min_samples_split': [5, 7, 10],
        'n_estimators': [50, 100, 150, 200]}
# n_jobs=-1 runs the independent fits in parallel; results are unchanged.
model_rand_g = GridSearchCV(estimator=model_r_g, param_grid=dist, cv=4, n_jobs=-1)
model_rand_g.fit(X_train, y_train)

In [None]:
# Best grid point and its mean cross-validated score.
(model_rand_g.best_params_, model_rand_g.best_score_)

In [None]:
# Evaluate the estimator refitted by the grid search on validation and training data.
best_rf = model_rand_g.best_estimator_
predict_rand_g = best_rf.predict(X_val)
predict_rand_train_g = best_rf.predict(X_train)

# (validation accuracy, training accuracy)
accuracy_score(y_val, predict_rand_g), accuracy_score(y_train, predict_rand_train_g)

Now we have 2 model candidates: 1. Logistic Regression and 2. Random Forest Classifier

How they can perform in the test dataset?

In [None]:
# Final comparison of the two candidate models on the held-out test split.
predict_logestic_final = model.predict(X_test)
predict_rand_final = model_rand.predict(X_test)
for label, final_pred in (
    ('Logestic Regression ==>', predict_logestic_final),
    ('Random Forest Classifer ==>', predict_rand_final),
):
    print(label, accuracy_score(y_test, final_pred))

It seems RandomForest works better for our dataset :)

In [None]:
# Confusion matrix of the estimator currently bound to `model_rand`, evaluated on the held-out test split.
plot_confusion_matrix(model_rand, X_test, y_test)

Other Classification Methods?

XGboost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees: up to 1000 rounds at learning rate 0.05, stopping
# once the validation metric has not improved for 5 consecutive rounds.
# NOTE(review): the validation split drives early stopping AND is scored below,
# so the validation accuracy is optimistic.
model_xg = XGBClassifier(learning_rate=0.05, n_estimators=1000)
model_xg.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=5)
xg = model_xg.predict(X_val)
xg_train = model_xg.predict(X_train)
# (validation accuracy, training accuracy)
accuracy_score(y_val, xg), accuracy_score(y_train, xg_train)