![](https://www.cdc.gov/heartdisease/images/HA-signs-symptoms-social2.png)
## Importing Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from warnings import filterwarnings
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate,KFold, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
# Use larger tick labels in Matplotlib for better visualization
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)

# Silence all warnings for the rest of the notebook
filterwarnings("ignore")

## Data Reading & Data PreProcessing

In [None]:
# Load the heart-attack dataset from the Kaggle input directory and display it
heart_df = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')
heart_df

We have 303 rows & 14 columns.

In [None]:
# Summary statistics for every column, to get to know the data
heart_df.describe()

In [None]:
# Count missing values per column (no missing values are present in this dataset)
heart_df.isna().sum()

In [None]:
# Column dtypes and non-null counts: all columns are either integer or floating point, with no missing values.
heart_df.info()

In [None]:
# Number of unique values in each column of the dataset
heart_df.nunique()

In [None]:
# Columns are split into three groups: discrete-coded features, continuous
# numerical features, and the target column.
oridinal_col = ['sex', 'fbs','exng', 'cp', 'restecg', 'slp', 'caa', 'thall']
numerical_col = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
output = ['output']

![](https://www.nhlbi.nih.gov/sites/default/files/styles/16x9_crop/public/2020-10/Learn%20What%20a%20Heart%20Attack%20Feels%20Like_October%202020_Final%20thumbnail.jpg?itok=-cu0N5Ea)
# Data Visualization

## Univariate Analysis

We plot two charts for each numerical column:
1. [Kernel Density Estimate](https://en.wikipedia.org/wiki/Kernel_density_estimation)
2. [Box-Plot](https://en.wikipedia.org/wiki/Box_plot)

We also add some statistics of each feature to its plots:

 Kdeplot
1. [Skewness](https://en.wikipedia.org/wiki/Skewness)
2. [Kurtosis](https://en.wikipedia.org/wiki/Kurtosis)
3. [Mean](https://en.wikipedia.org/wiki/Mean)
4. [Variance](https://en.wikipedia.org/wiki/Variance)    

Box-Plot
1. [Maximum](https://en.wikipedia.org/wiki/Maxima_and_minima)
2. [InterQuartile-Range](https://en.wikipedia.org/wiki/Interquartile_range)
3. [Median](https://en.wikipedia.org/wiki/Median)
4. [Minimum](https://en.wikipedia.org/wiki/Maxima_and_minima)

In [None]:
# One row per numerical feature: histogram + KDE on the left, box-style plot on
# the right. Each panel's legend carries summary statistics of that feature.
fig, ax = plt.subplots(nrows=5, ncols=2, figsize=(22, 40), gridspec_kw={
                       'width_ratios': [1.3, 1], 'wspace': 0.3, 'hspace': 0.3})
sns.set_style('whitegrid')

col = ['age', 'chol', 'trtbps', 'thalachh', 'oldpeak']
title = ['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Previous Peak']
colors = ["#5947ee", "#5f8de9", "#05548b", "#06a6b9", "#028f89"]
for i in range(5):
    feature = heart_df[col[i]]

    label_density_plot = 'Skewness   ' + str(np.around(feature.skew(), 3)) +\
        '\nKurtosis       ' + str(np.round(feature.kurtosis(), 3)) +\
        '\nMean           ' + str(np.round(feature.mean(), 3)) +\
        '\nVariance       ' + str(np.round(feature.var(), 3))

    # Both quartiles must come from the SAME feature. The previous version
    # subtracted the 25th percentile of 'age' from every feature's 75th
    # percentile, so the reported IQR was wrong for all columns except age.
    q1, q3 = np.percentile(feature, [25, 75])
    label_box_plot = 'Maximum  ' + str(feature.max()) +\
        '\nInter Quartile Range  ' + str(q3 - q1) +\
        '\nMedian   ' + str(feature.median()) +\
        '\nMinimum  ' + str(feature.min())

    sns.distplot(bins=60, kde=True,
                 a=feature, norm_hist=True, color=colors[i], ax=ax[i, 0])
    ax[i, 0].legend([label_density_plot],
                    loc='upper right', fontsize='x-large')
    ax[i, 0].set_xlabel(title[i], fontdict={'size': 16})
    ax[i, 0].set_title(title[i]+'[Density Plot]', fontdict={'size': 20})

    sns.boxenplot(x=feature, ax=ax[i, 1], orient='v', color=colors[i])
    ax[i, 1].legend([label_box_plot], loc='upper right', fontsize='x-large')
    ax[i, 1].set_title(title[i]+'[Box-Plot]', fontdict={'size': 20})
    ax[i, 1].set_xlabel(title[i], fontdict={'size': 16})
    ax[i, 1].set_ylabel('')

plt.show()

### Categorical Plot

In [None]:
# Count plot for each categorical feature, laid out row by row on a 3x3 grid.
# Every bar is annotated with "count(share of all rows)"; the unused ninth
# cell holds an explanatory note instead of a plot.
col = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']

title = ['Sex', 'Chest Pain', 'Fasting Blood Sugar', 'Resting Electrocardiographic',
         'Exercise induced angina', 'Slope', 'Number of Major Vessel', 'Thalium Stress Test']

legend = [['Female', 'Male'],
          ['Typical Angina', 'Atypical Angina', 'Non-Anginal Pain', 'Asymptomatic'],
          ['Less than 120 ', 'Greater than 120'],
          ['Normal', 'St-T wave abnormality', 'Hypertrophy'],
          ['No', 'Yes'],
          ['Downsloping', 'Flat', 'UpSloping'],
          [0, 1, 2, 3, 4],
          ['No Defect', 'Fixed Defect', 'Normal', 'Reversable Defect']]

sns.set_palette('Set2')

fig, ax = plt.subplots(3, 3, figsize=(30, 28), gridspec_kw={'hspace': 0.21})

# zip stops after the eight features, leaving the last grid cell untouched.
for panel, feature, heading, tick_names in zip(ax.ravel(), col, title, legend):
    sns.countplot(x=heart_df[feature], ax=panel)

    panel.set_title(heading, fontdict={'size': 30})
    panel.set_xticklabels(tick_names, fontdict={'size': 23})
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.legend('')

    for bar in panel.patches:
        height = bar.get_height()
        caption = str(height) + '(' + str(np.round(height/len(heart_df), 2)) + ')'
        panel.annotate(caption, (bar.get_x()+0.09, height+1), size=24)

# Panels with long category names get smaller tick labels.
ax[0, 1].set_xticklabels(legend[1], fontdict={'size': 15})
ax[1, 0].set_xticklabels(legend[3], fontdict={'size': 20})
ax[2, 1].set_xticklabels(legend[7], fontdict={'size': 20})

text = "Every Single Bar has a\nNumber Which Show\nCount of that paticular\nValue and\nCount / Total Count"
ax[2, 2].annotate(text, (0.01, 0.4), size=40,
                  family="MS Mincho", color="#007696")
ax[2, 2].axis('off')
plt.show()

<b>Conclusion</b>
#### 1. There are about twice as many men as women in our dataset.
#### 2. Most people have Typical Angina chest pain compared to all other chest pain types.
#### 3. Fasting blood sugar is generally below 120.
#### 4. Only about 1% of people (a fraction of 0.01 in the plot) have a Resting Electrocardiographic result of Hypertrophy.
#### 5. Around 67% of people do not have exercise-induced angina.
#### 6. About 58% of people have 0 major vessels.

### Output

In [None]:
# Class balance of the target column; each bar is annotated with its count and
# its share of all rows.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

sns.set_style('whitegrid')
sns.countplot(x=heart_df['output'], ax=ax, palette="RdBu")

ax.set_title('Chance of Heart Attack', fontdict={'size': 20})
# Tick label previously read "Hear Attack"; corrected to "Heart Attack".
ax.set_xticklabels(['Less Chance of\nHeart Attack',
                    'High Chance of\nHeart Attack'], fontdict={'size': 15})

for bar in ax.patches:
    text = "Count --> " + str(bar.get_height()) + '(' + \
        str(np.round(bar.get_height()/len(heart_df), 2)) + ')'
    ax.annotate(text, (bar.get_x()+0.1, bar.get_height()+1.5), size=14)

ax.set_xlabel('')
ax.set_ylabel('')

plt.show()

<b>Conclusion</b>
#### Around 46% of the records are labelled as a lower chance of heart attack and 54% as a high chance of heart attack.

![](https://www.gethealthystayhealthy.com/sites/default/files/inline-images/heart-health-and-women_2_0.png)
## Bivariate Analysis

In [None]:
# Line plots of three measurements against age, split by the target class.
sns.set_style('darkgrid')

col = ['chol', 'thalachh', 'trtbps']

title = ['Cholesterol', 'Heart Rate', 'Blood Pressure']

fig, ax = plt.subplots(3, 1, figsize=(30, 30), gridspec_kw={"hspace": 0.25})

class_names = ['Low Chance of Heart Attack', 'High Chance of Heart Attack']

for panel, measure, heading in zip(ax, col, title):
    g = sns.lineplot(data=heart_df, x="age", y=measure, hue='output', ax=panel)
    g.set_title("Age Variation with " + heading, fontdict={'size': 35})
    g.legend(class_names, fontsize=18)
    g.set_xlabel("Age", fontdict={'size': 25})
    g.set_ylabel(heading, fontdict={'size': 25})

plt.show()

<b>Conclusion</b>
#### 1. People with low cholesterol and lower age tend to be more vulnerable to heart attack.
#### 2. With an increase in age, the heart rate tends to decrease.
#### 3. There is no clear relation between blood pressure and age, but people with low blood pressure have a higher chance of heart attack.

![](https://www.heartfoundation.org.nz/images/heart-healthcare/public/other/cholesterol-graphic.png)
## Multivariate Analysis

### Risk of Heart Attack vs All columns 

In [None]:
# Overlay the distribution of each continuous feature for the two target
# classes: red = output 1 (high chance), green = output 0 (low chance).
fig, ax = plt.subplots(4, 1, figsize=(20, 28), gridspec_kw={'hspace': 0.35})

col = ['age', 'thalachh', 'chol', 'trtbps']

title = ['Age', "Heart Rate", "Cholesterol Level", "Blood Pressure"]

high_risk = heart_df[heart_df['output'] == 1]
low_risk = heart_df[heart_df['output'] == 0]

for i in range(4):
    # Drawing order (high first, then low) must match the legend label order.
    sns.distplot(a=high_risk[col[i]], bins=60, color='red', ax=ax[i])
    sns.distplot(a=low_risk[col[i]], bins=60, color='green', ax=ax[i])

    ax[i].legend(['High Chance of Heart Attack',
                  'Low Chance of Heart Attack'], fontsize=18)
    # Title previously read "Heat Attack"; corrected to "Heart Attack".
    ax[i].set_title(
        title[i] + " Distribution based on Heart Attack Chance", fontdict={'size': 20})

    ax[i].set_xlabel(title[i], fontdict={'size': 18})

plt.show()

<b>Conclusion:</b>
##### 1. There is no direct relationship between age and chance of heart attack, but most heart attacks were seen to occur in the age range [20-55].
##### 2. As cholesterol increases, the chance of heart attack also increases.
##### 3. People with a high heart rate tend to have a higher chance of heart attack.
##### 4. Blood pressure has no direct relationship with heart attack, as concluded from the graph.

In [None]:
# Histogram of `oldpeak` for each target class on a shared axis:
# red = output 1 (high chance), green = output 0 (low chance).
plt.figure(figsize=(30, 8))
ax = sns.distplot(a=heart_df[heart_df['output'] == 1]
                  ['oldpeak'], bins=6, color='red', kde=False)

ax = sns.distplot(a=heart_df[heart_df['output'] == 0]
                  ['oldpeak'], bins=6, color='green', kde=False)

ax.legend(['High Chance of Heart Attack',
           'Low Chance of Heart Attack'], fontsize=18)
# Title previously read "Heat Attack"; corrected to "Heart Attack".
ax.set_title("OldPeak Distribution based on Heart Attack Chance",
             fontdict={'size': 20})

ax.set_xlabel("Oldpeak", fontdict={'size': 18})

plt.show()

In [None]:
# Count plot of each categorical feature split by the target class, laid out
# row by row on a 3x3 grid. Every bar is annotated with
# "count(share of all rows)"; the unused ninth cell is removed.
col = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']

title = ['Sex', 'Chest Pain', 'Fasting Blood Sugar', 'Resting Electrocardiographic',
         'Exercise induced angina', 'Slope', 'Number of Major Vessel', 'Thalium Stress Test']

legend = [['Female', 'Male'],
          ['Typical Angina', 'Atypical Angina', 'Non-Anginal Pain', 'Asymptomatic'],
          ['Less than 120 ', 'Greater than 120'],
          ['Normal', 'St-T wave abnormality', 'Hypertrophy'],
          ['No', 'Yes'],
          ['Downsloping', 'Flat', 'UpSloping'],
          [0, 1, 2, 3, 4],
          ['No Defect', 'Fixed Defect', 'Normal', 'Reversable Defect']]

sns.set_palette('Set1')
fig, ax = plt.subplots(3, 3, figsize=(38, 38), gridspec_kw={'hspace': 0.12})
plt.suptitle("Red Mean Less Risk of Heart Attack\nBlue Mean High Risk Of Heart Attack", fontsize=30,
             fontweight="bold", )

# zip stops after the eight features, leaving the last grid cell untouched.
for panel, feature, heading, tick_names in zip(ax.ravel(), col, title, legend):
    sns.countplot(x=heart_df[feature], hue=heart_df['output'], ax=panel)

    panel.set_title(heading, fontdict={'size': 32})
    panel.set_xticklabels(tick_names, fontdict={'size': 23})
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.legend('')

    for bar in panel.patches:
        height = bar.get_height()
        caption = str(height) + '(' + str(np.round(height/len(heart_df), 2)) + ')'
        panel.annotate(caption, (bar.get_x()+0.01, height+1), size=22)

# Panels with long category names get smaller tick labels.
ax[0, 1].set_xticklabels(legend[1], fontdict={'size': 15})
ax[2, 1].set_xticklabels(legend[7], fontdict={'size': 20})
ax[2, 2].remove()

fig.tight_layout()
fig.subplots_adjust(top=0.94)

plt.show()

<b>Conclusion</b>
#####  1. Females have a higher chance of heart attack compared to males.
#####  2. People with Non-Anginal Pain should get checked, as they are more vulnerable to heart attack.
#####  3. People who do not have exercise-induced angina are more likely to have a heart attack than those who do.
#####  4. The more major vessels you have, the lower the chance of heart attack.

In [None]:
# Pearson correlation heatmap; the upper triangle (including the diagonal) is
# masked so each pair of columns is shown once.
plt.figure(figsize=(20, 14))
sns.set_style("whitegrid")

corr = heart_df.corr(method='pearson')
mask = np.triu(np.ones(corr.shape))

heatmap_options = {
    'mask': mask,
    'annot': True,
    'cmap': "YlGnBu",
    'square': True,
    'robust': True,
    'linewidths': 3,
}
sns.heatmap(data=corr, **heatmap_options)
plt.show()

<b>Discussion</b>
#### 1. You can also calculate the Spearman correlation if you think there is a non-linear relation between features.
#### 2. Our output has a high positive correlation with Chest Pain and Heart Rate.
#### 3. Our output has a high negative correlation with Exercise-Induced Angina, Old-Peak and Number of Major Vessels (consistent with the count plots above, where more major vessels meant a lower chance of heart attack).
#### 4. You can use a feature selection method to remove some features if you want. Some feature selection techniques are given below.
<ol>
    <li>Forward Selection</li>
    <li>Backward Selection</li>
    <li>Recursive Selection</li>
    <li>Select K Best</li>
</ol>

# Creating Model

In [None]:
# Rescale the continuous features to the [0, 1] range on a copy of the data,
# leaving heart_df itself untouched.
scaler = MinMaxScaler()
df_copy = heart_df.copy()
col = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
# Carry heart_df's index so the scaled rows align with df_copy on assignment.
scaler_dataframe = pd.DataFrame(scaler.fit_transform(heart_df[col]),
                                columns=col, index=heart_df.index)
# Replace the columns wholesale. Writing floats into the existing columns with
# `df_copy.loc[:, i] = ...` is an incompatible-dtype setitem when a column is
# integer-typed (presumably the case for some of these), which newer pandas
# deprecates.
df_copy[col] = scaler_dataframe
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set min/max values leak into training. Consider fitting on the training
# split only.
df_copy

### Logistic Regression

In [None]:
# Hold out 20% of the scaled data for testing, then fit a logistic regression
# baseline and report its accuracy on both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_copy.drop('output', axis=1),
    df_copy['output'],
    test_size=0.2,
    random_state=42,
)
linear_model = LogisticRegression().fit(X_train, y_train)
print("Train Accuracy of our Linear model is", linear_model.score(X_train, y_train))
print("Test Accuracy of our Linear model is ", linear_model.score(X_test, y_test))

### KNeighborsClassifier

In [None]:
# K-nearest-neighbours baseline with default hyper-parameters.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
knn_train_accuracy = knn_model.score(X_train, y_train)
knn_test_accuracy = knn_model.score(X_test, y_test)
print("Train Accuracy of our K-Neighbors model is", knn_train_accuracy)
print("Test Accuracy of our K-Neighbors model is ", knn_test_accuracy)

### Decision Tree Classifier

In [None]:
# Decision tree baseline. random_state is fixed (same seed as the train/test
# split) so the reported accuracies are reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
print("Train Accuracy of our Decision Tree model is", decision_tree.score(X_train, y_train))
print("Test Accuracy of our  Decision Tree model is ",decision_tree.score(X_test, y_test))

### Support Vector Classifier

In [None]:
# Support vector classifier baseline with default hyper-parameters.
svc = SVC().fit(X_train, y_train)
svc_train_accuracy = svc.score(X_train, y_train)
svc_test_accuracy = svc.score(X_test, y_test)
print("Train Accuracy of our Support Vector Classifier model is", svc_train_accuracy)
print("Test Accuracy of our Support Vector Classifier model is ", svc_test_accuracy)

### LGBMClassifier

In [None]:
# LightGBM baseline with default hyper-parameters.
lgbClassifier = LGBMClassifier().fit(X_train, y_train)
lgb_train_accuracy = lgbClassifier.score(X_train, y_train)
lgb_test_accuracy = lgbClassifier.score(X_test, y_test)
print("Train Accuracy of our LGBM-Classifier model is", lgb_train_accuracy)
print("Test Accuracy of our LGBM-Classifier model is ", lgb_test_accuracy)

### BernoulliNB 	

In [None]:
# Bernoulli naive Bayes baseline with default hyper-parameters.
bernouli_model = BernoulliNB().fit(X_train, y_train)
bernoulli_train_accuracy = bernouli_model.score(X_train, y_train)
bernoulli_test_accuracy = bernouli_model.score(X_test, y_test)
print("Train Accuracy of our Bernoulli-NB model is", bernoulli_train_accuracy)
print("Test Accuracy of our Bernoulli-NB model is ", bernoulli_test_accuracy)

### GaussianNB 	

In [None]:
# Gaussian naive Bayes baseline with default hyper-parameters.
gaussian_model = GaussianNB().fit(X_train, y_train)
gaussian_train_accuracy = gaussian_model.score(X_train, y_train)
gaussian_test_accuracy = gaussian_model.score(X_test, y_test)
print("Train Accuracy of our Gaussian-NB model is", gaussian_train_accuracy)
print("Test Accuracy of our Gaussian-NB model is ", gaussian_test_accuracy)

### NuSVC

In [None]:
# Nu-support vector classifier baseline with default hyper-parameters.
nusvm = NuSVC().fit(X_train, y_train)
nusvm_train_accuracy = nusvm.score(X_train, y_train)
nusvm_test_accuracy = nusvm.score(X_test, y_test)
print("Train Accuracy of our Nu-SVC model is", nusvm_train_accuracy)
print("Test Accuracy of our Nu-SVC model is ", nusvm_test_accuracy)

## <b>Conclusion</b>
1. Decision Tree & LGBM-Classifier are over-fitting our dataset, as shown by the large difference between their train and test accuracy.
2. K-Neighbors Classifier is working fine with our model and has a test accuracy of 83%, which is good, but we can achieve more with the help of cross validation and Grid Search.
3. Logistic Regression is the simplest classifier; it shows 83% test accuracy and a small difference between train and test accuracy.
4. Support Vector Machine, Nu-SVC, Gaussian-NB and Bernoulli-NB have approximately the same, and also the best, train & test accuracy.

## Grid Search and Cross Validation

### K-Neighbors 
To select the best K value we can also use the elbow method.

In [None]:
# Cross-validated search over n_neighbors = 1..24, scored by accuracy.
param_grid = {'n_neighbors': np.arange(1, 25)}
grid_seach_K_Neighbors = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

best_k = grid_seach_K_Neighbors.best_params_['n_neighbors']
print("Best K-Values for K-Neighbors is", best_k)
print("Train Accuracy of our K-Neighbors model after Grid Search is",
      grid_seach_K_Neighbors.score(X_train, y_train))
print("Test Accuracy of our K-Neighbors model after Grid Search is ",
      grid_seach_K_Neighbors.score(X_test, y_test))

### Support Vector Classifier

In [None]:
# Cross-validated search over the SVC regularisation strength C (0.1 .. 0.9)
# and kernel type, scored by accuracy.
param_grid = {'C': np.arange(0.1, 1, 0.1), 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
# n_jobs=-1 uses all cores, matching the K-Neighbors search. The previous
# value of -10 means "all cores but nine", which is likely a typo.
grid_seach_svc = GridSearchCV(SVC(), param_grid, scoring= 'accuracy', n_jobs=-1)
grid_seach_svc.fit(X_train, y_train)
# The messages previously said "Best K-Values" (an SVC has no K) and
# "Vectormodel" (missing space).
print("Best Parameters for Support Vector model are", grid_seach_svc.best_params_)
print("Train Accuracy of our Support Vector model after Grid Search is", grid_seach_svc.score(X_train, y_train))
print("Test Accuracy of our Support Vector model after Grid Search is ",grid_seach_svc.score(X_test, y_test))

# Result

In [None]:
def get_result(model):
    """Return a one-row table ``[[name, train_accuracy, test_accuracy]]``.

    ``name`` is the model's class name. Both accuracies come from
    ``model.score`` evaluated on the notebook-level ``X_train``/``y_train``
    and ``X_test``/``y_test`` splits.
    """
    name = model.__class__.__name__
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    return [[name, train_accuracy, test_accuracy]]


# `svc` is fitted earlier in the notebook but was missing from this summary;
# it is appended so every baseline model is reported.
models = [linear_model, decision_tree, bernouli_model, gaussian_model, nusvm,
          lgbClassifier, knn_model, svc]

# Build the table in one shot rather than concatenating a new one-row frame
# per model; the index comes out as 0..n-1 with no reset needed.
result = pd.DataFrame([get_result(model)[0] for model in models],
                      columns=['Name', 'Train Accuracy', 'Test Accuracy'])
result

# What can we add or try next?
1. Use different parameters with Grid Search and search for the optimal solution.
2. Find relations between other features, e.g. does Cholesterol increase with an increase in Heart Rate?
3. Try adding new columns, like Body Mass Index.
4. Feature adding and removing.

## If you like it, please upvote & comment.
## Also, if you would suggest any changes, please mention them.