In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## EDA - Exploratory Data Analysis and Feature engineering

In [None]:
## Read the heart-disease table into df and preview its first rows
HEART_CSV = '/kaggle/input/heart-disease-uci/heart.csv'
df = pd.read_csv(HEART_CSV)
df.head()

In [None]:
## Dimensions of the loaded table as a (rows, columns) tuple
df.shape

In [None]:
## Data type pandas inferred for each column
df.dtypes

In [None]:
## Per column: True if the column contains at least one missing value
df.isna().any()

In [None]:
## Class balance of the label: number of rows for each value of target
fig, ax = plt.subplots()
sns.countplot(data=df, x='target', ax=ax)
plt.show()

### Thalach and Exercise intensity 

In [None]:
## Correlation heatmap with the coefficients scaled to percent.
## The correlation matrix is symmetric, so the upper triangle (including the
## diagonal) is masked out and only the lower triangle is drawn.
corr_pct = df.corr() * 100
## Boolean mask: True marks the cells that seaborn should hide
hide_upper = np.triu(np.ones(corr_pct.shape, dtype=bool))
plt.figure(figsize=(10, 10))  ## size of the plot
sns.heatmap(corr_pct, mask=hide_upper, annot=True, fmt='.0f', cmap='RdBu_r')
plt.show()

1. As the heatmap shows, none of the pairwise correlations are very high. The strongest value is -58, between slope and oldpeak, and the feature most correlated with the target variable is cp (chest pain).

2. **Thalach is the only variable in the dataset that has comparatively high correlations** with many other variables, such as target, slope, age, oldpeak, exang and cp. It therefore looks like an important variable in the dataset. According to the data description, thalach is the maximum heart rate achieved.

A quick Google search reveals that the maximum heart rate is linked to exercise intensity and can be estimated with the formula 220 - age.

Reference : https://www.mayoclinic.org/healthy-lifestyle/fitness/in-depth/exercise-intensity/art-20046887#:~:text=You%20can%20calculate%20your%20maximum,beat%20per%20minute%20during%20exercise.

In [None]:
## New feature exercise_intensity: the difference between the estimated maximum
## heart rate (220 - age) and the maximum heart rate actually achieved (thalach)
estimated_max_hr = 220 - df['age']
df['exercise_intensity'] = estimated_max_hr - df['thalach']
## Plot the new variable per target class to see whether it helps predict heart
## disease: the swarm shows the individual points, the violin shows the density
fig, ax = plt.subplots()
sns.swarmplot(data=df, x='target', y='exercise_intensity', color='0.2', ax=ax)
sns.violinplot(data=df, x='target', y='exercise_intensity', ax=ax)
plt.show()

* **If exercise_intensity is less than 20, the chance of heart disease is high**, and there is also a high chance that the person does not have exercise-induced angina. Therefore we can binarise exercise_intensity so that it takes the value 1 when it is less than 20 and the value 0 otherwise.

This keeps most of the information that thalach carries about the target, at the cost of discarding the exact heart-rate values.

In [None]:
## Binarise exercise_intensity: 1 when the gap (220 - age - thalach) is below 20, else 0.
## The flag is derived from age and thalach in a single step instead of thresholding
## the exercise_intensity column in place: once that column holds 0/1 values, every
## value is < 20, so an in-place threshold would turn the whole column into 1s if
## this cell were run a second time.
## Assumes age and thalach contain no missing values (a missing gap is flagged 0).
heart_rate_gap = 220 - df['age'] - df['thalach']
df['exercise_intensity'] = (heart_rate_gap < 20).astype(int)

In [None]:
## exercise intensity barplot: each bar summarises target within one
## exercise_intensity group using seaborn's default estimator
fig, ax = plt.subplots()
sns.barplot(data=df, x='exercise_intensity', y='target', ax=ax)
plt.show()

### Chest Pain (cp) and Angina induced pain 

Cp (chest pain) is the variable with the highest correlation with the target. <br>
According to the data description, its values are:

* 0- typical angina<br>
* 1- atypical angina<br>
* 2- non-anginal pain <br>
* 3- asymptomatic<br>

 So cp is related to exang, which is exercise-induced angina. We can combine the two into one feature by putting the angina types that have a high risk of heart disease into one group and the types that have a lower risk into another group.

In [None]:
## One countplot per angina-related feature, with the bars split by target
for feature in ('exang', 'cp'):
    sns.countplot(data=df, x=feature, hue='target')
    plt.show()

* Exercise induced angina and typical angina can be grouped together as both have low risk of heart disease
* Atypical angina , asymptomatic and non anginal pain can be grouped together as they have high risk of heart disease

In [None]:
## Typical angina (cp == 0) and exercise-induced angina (exang == 1) form the
## low-risk group (0); every remaining row goes to the high-risk group (1).
## The high-risk group has to be the exact complement of the low-risk mask, i.e.
## (cp != 0) & (exang == 0). Using (cp != 0) | (exang == 0) instead overlaps the
## low-risk mask and would overwrite every row except cp == 0 AND exang == 1 with 1.
low_risk_angina = (df.cp == 0) | (df.exang == 1)
## Stored as float (0.0 / 1.0), the dtype a column gets when it is filled through df.loc masks
df['angina_combined'] = np.where(low_risk_angina, 0.0, 1.0)
sns.countplot(x='angina_combined', hue='target', data=df)
plt.show()

 We can safely remove both exang and cp from our feature set as we have combined the information provided by both of them into one feature 

### Oldpeak and Slope 

Oldpeak is another feature that has a decent amount of correlation with the target variable. The data description tells us it is the ST depression induced by exercise relative to rest. Slope has a high correlation with this variable, and we can combine the two into one variable by reading the following rules off the facet grid below:

* Slope = 0 and oldpeak <= 2  implies a higher risk of heart disease 
* Slope = 0 and oldpeak > 2 implies low risk for heart disease 
* Slope = 1 and oldpeak <= 2 implies low risk for heart disease
* Slope = 1 and oldpeak > 2 implies low risk for heart disease
* Slope = 2 and oldpeak <= 2 implies high risk for heart disease
* Slope = 2 and oldpeak > 2 implies low risk for heart disease

In [None]:
## One histogram of oldpeak (15 bins) per (slope, target) combination:
## grid rows are the slope values, grid columns are the target values
g = sns.FacetGrid(data=df, row='slope', col='target')
g.map_dataframe(sns.histplot, x='oldpeak', bins=15)
plt.show()

In [None]:
## ST_dep rule table: slope value -> (flag when oldpeak <= 2, flag when oldpeak > 2).
## A flag of 1 marks the combinations treated as high risk, 0 the low-risk ones.
st_dep_rules = {0: (1, 0), 1: (0, 0), 2: (1, 0)}
shallow_depression = df.oldpeak <= 2
deep_depression = df.oldpeak > 2
for slope_value, (flag_shallow, flag_deep) in st_dep_rules.items():
    on_slope = df.slope == slope_value
    df.loc[shallow_depression & on_slope, 'ST_dep'] = flag_shallow
    df.loc[deep_depression & on_slope, 'ST_dep'] = flag_deep

In [None]:
## Row counts per ST_dep value, with the bars split by target
fig, ax = plt.subplots()
sns.countplot(data=df, x='ST_dep', hue='target', ax=ax)
plt.show()

We can drop oldpeak and slope now from the dataset 

### Sex and Age 

In [None]:
## Row counts per sex value, with the bars split by target
fig, ax = plt.subplots()
sns.countplot(data=df, x='sex', hue='target', ax=ax)
plt.show()

From the countplot it can be seen that, for males, sex alone does not clearly indicate whether a person has heart disease or not. <br>

A little research shows that age, gender and heart disease are related: men can develop heart disease at younger ages, and after 65 the risk for men and women becomes the same. <br>
Reference : https://wa.kaiserpermanente.org/healthAndWellness/index.jhtml?item=%2Fcommon%2FhealthAndWellness%2Fconditions%2FheartDisease%2FageAndGender.html
<br>

We can combine age and sex to create a new feature age_sex which would be better predictor of heart disease for men using a facet grid <br>
* sex = 0 then quite clearly you have a high risk for heart disease 
* males under the age of 40 also have a high risk for heart disease in this dataset
* males between 40 and 50 also are at high risk  
* surprisingly , males above 50 have a low risk for heart disease 




In [None]:
## One histogram of age (40 bins) per (target, sex) combination:
## grid rows are the target values, grid columns are the sex values
g = sns.FacetGrid(df, row='target', col='sex')
g.map_dataframe(sns.histplot, x='age', bins=40)
plt.show()

In [None]:
## age_sex = 1 for every sex == 0 row and for sex == 1 rows aged 50 or younger;
## age_sex = 0 for sex == 1 rows older than 50
is_sex_one = df.sex == 1
flagged_high = (df.sex == 0) | (is_sex_one & (df.age <= 50))
df.loc[flagged_high, 'age_sex'] = 1
df.loc[is_sex_one & (df.age > 50), 'age_sex'] = 0

In [None]:
## Row counts per age_sex value, with the bars split by target
fig, ax = plt.subplots()
sns.countplot(data=df, x='age_sex', hue='target', ax=ax)
plt.show()

### Rest ecg results and blocked arteries(ca)

In [None]:
## Row counts per restecg value, with the bars split by target
fig, ax = plt.subplots()
sns.countplot(data=df, x='restecg', hue='target', ax=ax)
plt.show()

* Value 0: normal
* Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
* Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

For value 0, the risk of heart disease is about 50-50, and a little research tells us why: ECG results taken at rest cannot detect asymptomatic blockages in the arteries. But we know that ca (the number of blood vessels colored by fluoroscopy) tells us whether such a blockage exists. For low values of ca such as 0, it may be that the arteries are blocked while the ECG is still normal.

Ref : https://choosingwiselycanada.org/ecg-electrocardiogram/#:~:text=However%2C%20it%20does%20not%20show,ECG%20or%20cardiac%20imaging%20test.

We can confirm this with the plot below 

In [None]:
## One restecg countplot per ca value (one grid column each), bars split by target
f = sns.FacetGrid(df, col='ca')
f.map_dataframe(sns.countplot, x='restecg', hue='target')
f.add_legend()
plt.show()

The plot shows that when ca = 0, even people with a normal ECG have a high risk, which confirms the research. We can combine the rest ECG results and the ca variable to make a new feature, ca_new.

In [None]:
## ca_new combines ca with restecg:
##   restecg in {0, 1} and ca in {0, 4}    -> 1
##   restecg == 0      and ca in {1, 2, 3} -> 0
##   restecg == 1      and ca in {1, 2, 3} -> 0.5
##   restecg == 2      (any ca)            -> 0
ecg_zero_or_one = df.restecg.isin([0, 1])
ca_one_to_three = df.ca.isin([1, 2, 3])
df.loc[ecg_zero_or_one & df.ca.isin([0, 4]), 'ca_new'] = 1
df.loc[ca_one_to_three & (df.restecg == 0), 'ca_new'] = 0
df.loc[ca_one_to_three & (df.restecg == 1), 'ca_new'] = 0.5
df.loc[df.restecg == 2, 'ca_new'] = 0

In [None]:
## Display the frame to inspect the engineered columns added in the cells above
df

### Features to be dropped 

* Cholesterol (chol) and resting blood pressure (trestbps) have almost all of their values concentrated in the 220-270 range and above 120 respectively, which are already high values for cholesterol and blood pressure. Cholesterol values above 240 and blood pressure values above 130 are considered risky for heart disease, and almost all the values here are above these thresholds, so I decided to drop these two features.
* Age and sex have been combined into age_sex, and age has also been used in exercise_intensity, so we can drop those two.
* Exang and cp can also be dropped as we have that information in angina_combined , oldpeak and slope are also dropped for the same reason 
* Thalach has been used in exercise intensity so we will drop it 
* fbs(fasting blood sugar) is a highly imbalanced variable and almost all the data points have fbs = 0 .We will drop this feature 


In [None]:
## Label vector
y = df['target']
## Raw columns listed under "Features to be dropped", plus the label itself
dropped_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'exang',
    'oldpeak', 'slope', 'thalach', 'restecg', 'ca', 'target',
]
X = df.drop(columns=dropped_columns)
## One-hot encode the thal column
X_final = pd.get_dummies(X, columns=['thal'])
X_final

## Modelling and hyperparameter tuning 

#### 1. Train-Test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
## Hold out 20% of the rows as the test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=12,
)

### KNN

In [None]:
## Grid search over n_neighbors = 1..24, each candidate scored with 5-fold
## cross-validation on the training split
knn_2 = KNeighborsClassifier()
param_grid = dict(n_neighbors=np.arange(1, 25))
knn_gscv = GridSearchCV(estimator=knn_2, param_grid=param_grid, cv=5)
knn_gscv.fit(X_train, y_train)

In [None]:
## Parameter setting that achieved the best cross-validated score in the KNN grid search
knn_gscv.best_params_

In [None]:
## Take n_neighbors from the fitted grid search (knn_gscv) instead of copying the
## number in by hand, so this cell stays in sync with the search whenever the
## data or the engineered features change.
best_knn_params = knn_gscv.best_params_
knn_cv_scores = cross_val_score(
    KNeighborsClassifier(**best_knn_params), X_train, y_train, cv=5, scoring='accuracy'
)
print("cross_val_score", knn_cv_scores.mean())
## Refit on the whole training split and evaluate once on the held-out test split
final_knn = KNeighborsClassifier(**best_knn_params)
final_knn.fit(X_train, y_train)
## accuracy_score expects (y_true, y_pred) in that order
print("test_score", accuracy_score(y_test, final_knn.predict(X_test)))

### SVM 

In [None]:
## Search grid for the SVM: 4 values of C x 4 values of gamma x 3 kernels
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}
## refit=True retrains the best combination on all of X_train; verbose=2 logs each fit
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, verbose=2, refit=True)
grid.fit(X_train, y_train)

In [None]:
## Parameter setting that achieved the best cross-validated score in the SVM grid search
grid.best_params_

In [None]:
## Take C, gamma and kernel from the fitted grid search (grid) instead of copying
## them in by hand, so this cell stays in sync with the search whenever the data
## or the engineered features change.
best_svm_params = grid.best_params_
svm_cv_scores = cross_val_score(
    SVC(**best_svm_params), X_train, y_train, cv=5, scoring='accuracy'
)
print("cross_val_score", svm_cv_scores.mean())
## Refit on the whole training split and evaluate once on the held-out test split
SVM_final = SVC(**best_svm_params)
SVM_final.fit(X_train, y_train)
## accuracy_score expects (y_true, y_pred) in that order
print("test_score", accuracy_score(y_test, SVM_final.predict(X_test)))

### Logistic Regression 

In [None]:
## 5-fold cross-validated accuracy of a default LogisticRegression on the training split
scores = cross_val_score(
    estimator=LogisticRegression(), X=X_train, y=y_train, scoring='accuracy', cv=5
)
print("cross_val_score", scores.mean())
## Fit on the whole training split, then score the held-out test split
log_final = LogisticRegression().fit(X_train, y_train)
print("test_Score", accuracy_score(y_test, log_final.predict(X_test)))

### Random Forests 

In [None]:
## Helper used to compare different numbers of estimators
def get_score(param):
    """Return the mean 5-fold cross-validated accuracy of a random forest.

    param is passed to RandomForestClassifier as n_estimators (its first
    argument); random_state is fixed at 0. The forest is evaluated on the
    notebook-level X_train and y_train.
    """
    forest = RandomForestClassifier(n_estimators=param, random_state=0)
    fold_accuracies = cross_val_score(forest, X_train, y_train, scoring='accuracy', cv=5)
    return fold_accuracies.mean()

In [None]:
## Mean cross-validated accuracy for each candidate number of trees
dic = {x: get_score(x) for x in [100, 200, 300, 400, 500, 600, 700]}
## dict views are not sequences: hand matplotlib plain lists so the plot does not
## depend on how the installed version converts view objects
plt.plot(list(dic.keys()), list(dic.values()))
plt.xlabel('n_estimators')
plt.ylabel('mean cross-validated accuracy')
plt.show()

In [None]:
## Final random forest with 100 trees, fitted on the whole training split
rf_final = RandomForestClassifier(n_estimators=100, random_state=0)
rf_final.fit(X_train, y_train)
## Cross-validated accuracy for the same forest size, then accuracy on the held-out test split
print("cross_val_score", get_score(100))
print("test_score", accuracy_score(y_test, rf_final.predict(X_test)))