In [None]:
# Importing necessary Library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Load the raw heart-attack dataset from the Kaggle input directory
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
train = pd.read_csv(DATA_PATH)
# Work on an explicit copy: later cells overwrite and drop columns of df_train,
# and `train` should keep the untouched data as it was read from disk.
df_train = train.copy()

# Overview of the dataset

In [None]:
# (rows, columns) of the loaded frame
df_train.shape

In [None]:
# Preview the first rows of the loaded frame
df_train.head()

**Dataset Description:**
* age : Age of the patient
* sex : Sex of the patient
* cp : Chest pain
* trtbps : Resting blood pressure (in mm Hg)
* chol : Cholesterol in mg/dl
* fbs : Fasting blood sugar
* restecg : Resting electrocardiographic results
* thalachh : Maximum heart rate
* exng : Exercise-induced angina
* oldpeak : Previous peak
* slp : Slope
* caa : Number of major vessels
* output : 0 = less chance of heart attack, 1 = more chance of heart attack

In [None]:
# Summary of the frame's columns (dtypes and non-null counts)
df_train.info()

In [None]:
# Print how many distinct (non-null) values each column contains
for column_name in df_train.columns:
    distinct_count = df_train[column_name].nunique()
    print(f'total unique values for {column_name} column is: {distinct_count}')

**Based on the unique-value counts above, we can divide the columns into categorical and numerical columns.**

In [None]:
# Columns treated as categorical features
cat = [
    'thall', 'caa', 'slp', 'exng',
    'restecg', 'fbs', 'cp', 'sex',
]
# Columns treated as numerical features
# NOTE(review): 'oldpeak' appears in neither list — confirm that is intentional
num = [
    'age', 'thalachh', 'chol', 'trtbps',
]

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
plt.figure(figsize=(12, 6))
correlations = df_train.corr()
sns.heatmap(correlations, annot=True, cmap='Blues')
plt.title('Correlation between Columns')

In [None]:
# Helper that draws one bar chart per feature column
def ploting_graph(cat):
    """Draw a bar plot of 'output' against each column named in ``cat``.

    Reads the module-level ``df_train`` frame and opens one new 8x5 figure
    per column, titled '<column> vs Output'.
    """
    for column in cat:
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.barplot(x=column, y='output', data=df_train, ax=ax)
        ax.set_title(f'{column} vs Output')

In [None]:
# One bar chart per categorical column, each plotted against 'output'
ploting_graph(cat)

**From the graphs above, the fbs column (fasting blood sugar) does not give a clear indication of heart-attack risk, so we will drop it from our training set.**

# Label Encoding: binning_column

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single LabelEncoder instance, bound to `le`
le=LabelEncoder()

In [None]:
# Discretise every numerical column into 4 equal-width bins and store the
# ordinal bin index (0-3) as the new feature value.
# labels=False makes pd.cut return the bin index itself, so an empty bin can
# no longer shift the codes of the bins after it (which happens when the
# Interval categories are re-encoded by their sorted unique values).
for column in num:
    df_train[column] = pd.cut(df_train[column].astype(int), bins=4, labels=False)

**We add the two binned columns 'chol' and 'trtbps' together to create a new column named 'bpchol'.**

In [None]:
# Combined feature: element-wise sum of the 'chol' and 'trtbps' columns
chol_values = df_train['chol']
trtbps_values = df_train['trtbps']
df_train['bpchol'] = chol_values + trtbps_values

**We are dropping four columns, including the 'output' column. The 'chol' and 'trtbps' columns are dropped because the new 'bpchol' column already carries their values. The reason for dropping 'fbs' has already been discussed.**

In [None]:
# Columns to remove from the feature frame
drop_column = [
    'chol',
    'trtbps',
    'fbs',
    'output',
]

In [None]:
# Plot the numerical columns against 'output' now that they hold binned integer codes
ploting_graph(num)

**Splitting 'output' column(which we will be predicting) from the dataset.**

In [None]:
# Keep the target aside, then remove it and the other unwanted columns
# from the feature frame
df_y = df_train['output']
df_train = df_train.drop(columns=drop_column)

In [None]:
# Preview the feature frame after the columns in drop_column were removed
df_train.head()

# Training Model
**Importing libraries for model creation, testing and prediction**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep the name
    # bound so existing call sites keep working on newer releases.
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

**Standard Scaling all the columns before splitting into test and train sets.**

In [None]:
# Standardise every feature column and rebuild a frame with the original column names.
# NOTE(review): the scaler is fitted on every row of df_train, i.e. before any
# train/test split, so rows later held out for testing contribute to the
# fitted mean/variance.
ss = StandardScaler()
ss.fit(df_train)
scale_train = ss.transform(df_train)
df_scaled_train = pd.DataFrame(scale_train, columns=df_train.columns)

**Splitting train and test sets. (80/20)**

In [None]:
# Split the *unscaled* features first (80/20, fixed seed), then fit the scaler
# on the training rows only, so statistics of the held-out test rows cannot
# leak into the features the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df_train, df_y, test_size=0.2, random_state=42
)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

In [None]:
# (rows, columns) of the training split
X_train.shape

In [None]:
# A dictionary of ML models and their hyper-parameter grids, used to
# cross-validate and compare different algorithms.
# This dictionary is adapted from the link: https://www.kaggle.com/rashikrahmanpritom/heart-attack-prediction?rvi=1&scriptVersionId=57435788&cellId=14
# Each entry maps a display name to {'model': estimator, 'params': grid}.
model_list = {
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [2, 3, 4, 5, 6, 7, 18, 19, 20],
            'algorithm': ['auto', 'ball_tree'],
            'weights': ['uniform', 'distance'],
            'leaf_size': [27, 28, 29, 30, 31],
        },
    },

    'DecisionTreeClassifier': {
        # Seeded so that the cross-validation scores are reproducible between runs
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 1, 2, 6, 5],
        },
    },

    'LOGISTIC_REGRESSION': {
        'model': LogisticRegression(),
        'params': {
            'C': [1, 2, 3, 4, 5, 6, 7],
            'solver': ['liblinear', 'lbfgs'],
            # 'multi_class' is intentionally not searched: the parameter is
            # deprecated in recent scikit-learn releases, and for a binary
            # target 'auto' and 'ovr' select the same scheme, so it only
            # doubled the grid.
        },
    },

    'SVM': {
        'model': SVC(),
        'params': {
            'C': [1, 2, 3, 5, 6, 7],
            'kernel': ['rbf', 'linear'],
            'gamma': ['auto', 'scale'],
        },
    },

    'RANDOM_FOREST': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 2, 3, 4, 5, 10, 15],
            'criterion': ['entropy'],
            'random_state': [12, 13],
            'max_depth': [5, 6],
        },
    },
}

In [None]:
# Run a 15-fold grid search for every candidate model and report the best
# cross-validated score together with the parameters that produced it
for model_name, mp in model_list.items():
    print(f'model_name: {model_name}')
    estimator = mp['model']
    param_grid = mp['params']
    clf = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=15)
    clf.fit(X_train, y_train)
    best_score = clf.best_score_
    best_params = clf.best_params_
    print(f'Best score:{best_score}')
    print(f'Best parameters:{best_params}\n\n')

**As SVM shows the most promising result, we will use it to predict the output.**

In [None]:
# Fit an RBF-kernel SVM on the training split and report precision/recall/F1
# on the test split
svm = SVC(kernel='rbf', C=1, gamma='auto')
svm.fit(X_train, y_train)
predict = svm.predict(X_test)
report = classification_report(y_test, predict)
print(report)

In [None]:
# Confusion matrix of the fitted SVM on the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
ConfusionMatrixDisplay.from_estimator(svm, X_test, y_test, cmap='viridis')
plt.show()

In [None]:
# Thank you all for your patience. I am new to ML. Hope, this will help some beginners like me.