In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Description

In [None]:
# Load the heart-attack dataset CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv")

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

In [None]:
# Column labels; the cells below refer to the columns by these exact names.
df.columns

FEATURES:

1. AGE - AGE OF THE PATIENT
2. SEX - SEX OF THE PATIENT (1: MALE, 0: FEMALE)
3. EXANG - EXERCISE INDUCED ANGINA (1 = YES, 0 = NO)
4. CAA - NUMBER OF MAJOR VESSELS (0-3)
5. CP - CHEST PAIN TYPE (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic)
6. TRTBPS - RESTING BLOOD PRESSURE (IN MM HG)
7. CHOL - CHOLESTEROL IN MG/DL FETCHED VIA BMI SENSOR
8. FBS - FASTING BLOOD SUGAR > 120 MG/DL (1 = TRUE, 0 = FALSE)
9. RESTECG - RESTING ELECTROCARDIOGRAPHIC RESULTS (Value 0: normal, Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria)
10. THALACHH - MAXIMUM HEART RATE ACHIEVED
11. OLDPEAK - PREVIOUS PEAK
12. SLP - SLOPE
13. EXNG - EXERCISE INDUCED ANGINA (1 = YES, 0 = NO)
14. OUTPUT (TARGET) - 0 = LESS CHANCE OF HEART ATTACK, 1 = MORE CHANCE OF HEART ATTACK


# Data Visualization and Interpretation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Bar chart of how many rows fall under each value of the 'output' column.
class_counts = df['output'].value_counts()
fig, ax = plt.subplots()
ax.set_title('Finding the output ratio', size=20)
sns.barplot(x=class_counts.index, y=class_counts.values, ax=ax)

In [None]:
# One box plot per column on a 2x5 grid, used to spot outliers visually.
columns = ['age', 'cp', 'trtbps', 'chol', 'restecg',
           'thalachh', 'oldpeak', 'slp', 'caa', 'thall']
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()  # 2-D grid of axes -> flat sequence, one slot per column
for index, c in enumerate(columns):
    sns.boxplot(y=c, data=df, ax=axs[index], color='red')
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

From the above boxplots we can conclude that there are some outliers in columns such as trtbps, chol, oldpeak and caa.

In [None]:
import warnings
# Silence FutureWarning messages so they do not clutter the outputs of the cells below.
warnings.filterwarnings("ignore", category=FutureWarning)

Plotting the count values for categorical columns.

In [None]:
# For every categorical column, open a figure named after the column and
# draw how often each of its values occurs.
col = ['sex', 'fbs', 'exng', 'cp', 'restecg', 'slp', 'caa', 'thall']
for feature in col:
    plt.figure(feature)
    plt.title(feature, size=20)
    counts = df[feature].value_counts()
    sns.barplot(x=counts.index, y=counts.values)

Plotting the continuous column values.

In [None]:
# Plot each continuous measurement against age, with one line per 'output' value.
col = ['trtbps', 'chol', 'thalachh']
for feature in col:
    plt.figure(feature)
    sns.lineplot(data=df, x="age", y=feature, hue="output")
    plt.title("EFFECT OF HEART ATTACK WITH INCREASE IN AGE AND " + feature.upper())

1. Trtbps and chol do not strongly affect the chances of a heart attack.
2. Thalachh value decreases with age and hence increases the chances of a heart attack.

Plotting the correlation matrix

In [None]:
# Heatmap of absolute pairwise correlations, so strong negative and strong
# positive relationships are shaded alike.
abs_corr = df.corr().abs()
plt.figure(figsize=(20, 10))
sns.heatmap(abs_corr, annot=True)

In [None]:
# Signed correlation of every column with 'output', largest first; strongly
# negative correlations therefore appear at the bottom of the list.
df.corr()['output'].sort_values(ascending=False)


The correlation values above show that the feature fbs is the least correlated with the output, so it is not a deciding factor in determining whether a person will get a heart attack.
Features such as exng, oldpeak, cp, thalachh and caa, on the other hand, are the most strongly correlated with the output and are therefore the most important for estimating the probability of a heart attack.

Plotting the feature values for different output.

In [None]:
# For each feature, draw its distribution in two side-by-side panels, one per
# 'output' value.
# sns.distplot is deprecated in seaborn, so the equivalent histplot call is
# used: a density-normalised histogram with a KDE curve on top.
# The target is excluded by name rather than by position, so the loop stays
# correct even if 'output' is not the last column.
for i in df.columns.drop('output'):
    g = sns.FacetGrid(df, col='output', height=7)
    g.map(sns.histplot, i, color="green", kde=True, stat="density")

The above plots give a good overview of how the value of each feature relates to the chance of a heart attack.
1. People in the age range 50-60 have a higher chance of getting a heart attack.
2. People with sex = 1, i.e. male, have a higher risk of getting a heart attack than females, though the difference is not huge.
3. **If the chest pain type has value 2, i.e. atypical angina, instead of 0 (typical angina), then it is a clear signal that the person has a chance of heart attack.**
4. A cholesterol level of 200-300 can mean a higher risk of heart attack. Again, this is not one of the prominent features for decision making.
5. restecg = 1 (having ST-T wave abnormality) can increase the chances of getting a heart attack.
6. **thalachh (MAXIMUM HEART RATE ACHIEVED) in the range 150-200 can lead to a heart attack, compared to the normal range of 125-175.**
7. **A value of exng = 0 (EXERCISE INDUCED ANGINA) is a huge contributing factor in getting a heart attack.**
8. **A person with an oldpeak (previous peak) value of 0 can get a heart attack.**
9. If the slp value changes from 1 to 2, then the chances of a heart attack increase.
10. **caa = 0 (NUMBER OF MAJOR VESSELS reducing to 0) is a huge deciding factor in getting a heart attack.**
11. A thall (Thallium Stress Test result) value of 2 instead of 3 increases the risk of heart attack.

In [None]:
# Grid of pairwise plots between the columns of df, coloured by 'output'.
sns.pairplot(df, hue='output', height=5,palette='Dark2')

# Data Preprocessing

Searching for duplicates.

In [None]:
# Show every row that has an exact duplicate; keep=False marks all copies,
# not just the later occurrences.
df[df.duplicated(keep=False)]

Removing the duplicate values

In [None]:
# Keep only the first occurrence of each duplicated row, then re-check the size.
df = df.drop_duplicates()
df.shape

Separating the categorical and continuous columns

In [None]:
# Columns that hold category codes; these are one-hot encoded below.
cat_col = [
    'sex', 'cp', 'fbs', 'restecg',
    'exng', 'caa', 'slp', 'thall',
]
# Columns that hold continuous measurements.
con_col = [
    "age", "trtbps", "chol", "thalachh", "oldpeak",
]

Performing one hot encoding on the categorical columns

In [None]:
# Work on a copy so the encoding below leaves df itself unchanged.
df1 = df.copy()

In [None]:
# Replace every column listed in cat_col with one indicator column per category value.
df1 = pd.get_dummies(df1, columns = cat_col)


In [None]:
# Preview the encoded frame to check the new indicator columns.
df1.head()

Splitting the feature and target columns.

In [None]:
# x holds every predictor column; y holds the 'output' label.
x = df1.drop(columns='output')
y = df1['output']

Splitting the train and test data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Standardizing the training and testing data.

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

# Model Implementation

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# plot_confusion_matrix was removed in scikit-learn 1.2. Import it only where it
# still exists so this cell does not abort on newer versions.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

In [None]:
# Candidate classifiers keyed by the name used in reports and plots.
# The tree-based models draw random numbers while fitting, so they are seeded
# (same seed as the train/test split) to make the reported accuracies and the
# final "best model" repeatable from run to run.
models = {
    'Logistic Regression': LogisticRegression(),
    'SVC': SVC(),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42),
}

In [None]:
# Test accuracy per model, keyed by model name; filled in by the cells below.
acc_score = {}

In [None]:
# Fit each candidate model on the training split, record its test accuracy in
# acc_score, and show its confusion matrix and ROC curve on the test split.
# The plots go through the Display classes because plot_confusion_matrix and
# plot_roc_curve were removed in scikit-learn 1.2 (from_estimator needs >= 1.0).
for name, model in models.items():
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    acc = accuracy_score(y_test, predict)
    acc_score[name] = acc
    cm = confusion_matrix(y_test, predict)
    print('----------------------------------------------------\n')
    print('                    '+name.upper())
    print("\nThe test accuracy score is ", acc)
    metrics.ConfusionMatrixDisplay.from_estimator(model, x_test, y_test, cmap='rocket_r')
    print('\n')
    metrics.RocCurveDisplay.from_estimator(model, x_test, y_test)
    plt.show()
    print('-----------------------------------------------------------------------------------------------')

Hyper Parameter Tuning for SVC

In [None]:
# Grid-search the SVC hyperparameters on the training split, then evaluate the
# refitted best estimator on the test split and record it as 'SVC_tuned'.
svm = SVC(random_state=42)
parameters = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}
searcher = GridSearchCV(svm, parameters)
searcher.fit(x_train, y_train)
print("The best params are :", searcher.best_params_)
print("The best score is   :", searcher.best_score_)
predict = searcher.predict(x_test)
acc = accuracy_score(y_test, predict)
acc_score['SVC_tuned'] = acc
cm = confusion_matrix(y_test, predict)
print('----------------------------------------------------\n')
print('                    SVC_tuned')
print("\nThe test accuracy score is ", acc)
print('\n')
# Plot the tuned SVC itself. Passing `model` here would plot the leftover loop
# variable from the model-comparison cell (the last entry of `models`), i.e. a
# different classifier than the one whose accuracy is reported above.
# The Display classes replace plot_confusion_matrix / plot_roc_curve, which were
# removed in scikit-learn 1.2.
best_svc = searcher.best_estimator_
metrics.ConfusionMatrixDisplay.from_estimator(best_svc, x_test, y_test, cmap='rocket_r')
metrics.RocCurveDisplay.from_estimator(best_svc, x_test, y_test)
plt.show()
print('-----------------------------------------------------------------------------------------------')

In [None]:
# Display the collected test accuracies.
acc_score

In [None]:
# Bar chart comparing the models: accuracies on x, model names on y.
plt.figure(figsize=(12, 6))
model_names = list(acc_score)
model_accuracies = [acc_score[n] for n in model_names]
sns.barplot(x=model_accuracies, y=model_names, palette='YlOrBr');

In [None]:
# Pick the entry of acc_score with the largest accuracy (first one wins on ties)
# and report it.
maxKey = max(acc_score, key=acc_score.get)
best_accuracy = acc_score[maxKey]
print('The model with highest Accuracy score is {0} with an accuracy of  {1:.2f}'.format(
    maxKey, best_accuracy))