In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns # Seaborn visualization library
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Hi there! To get started, let's import and visualize the data

**Import the data**

In [None]:
# Location of the heart-attack dataset inside the Kaggle input directory.
HEART_CSV_PATH = "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"

# Load the CSV into the frame used by every later cell.
heart_df = pd.read_csv(HEART_CSV_PATH)

**Let's see how much data we're working with**

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
heart_df.shape

**Small dataset but nothing to worry about**

**Lets look at the features**

In [None]:
# Show six randomly chosen rows. The fixed seed makes the displayed rows the
# same on every Restart & Run All, so any prose that refers to them stays valid.
heart_df.sample(6, random_state=42)

**Any values missing?**

In [None]:
# Missing-value count per column, largest first.
heart_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Per-column flag: True if the column contains at least one missing value.
heart_df.isna().any()

**Hm complete dataset, right on**

In [None]:
# Number of distinct values in each column; low counts hint at categorical features.
heart_df.nunique()

**A couple of features that I want to treat as categories since they are discretized (sex, cp, fbs, restecg, exng, slp, caa, thall)**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
heart_df.describe()

In [None]:
# Data type pandas assigned to each column when reading the CSV.
heart_df.dtypes

**Correlation time**

In [None]:
# Compute the pairwise Pearson correlation matrix of all columns
corr = heart_df.corr(method='pearson')

# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# that half only repeats the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap so positive and negative correlations get opposite hues
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Use the full [-1, 1] correlation range for the colour scale. The previous
# vmax=.3 saturated every coefficient above 0.3, so the strongest
# relationships were indistinguishable from each other. Annotating the cells
# lets the figure be read without the separate table.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 8},
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)
ax.set_title("Pearson correlation between columns")
plt.show()

In [None]:
# Same Pearson correlations as a plain table, for reading exact values.
pearson_table = heart_df.corr(method='pearson')
pearson_table

**Some features are displaying some correlation - gives us hope we may have good data to train the model on**

**Specifically, the features cp, thalachh, exng, oldpeak and caa may have significance. Plotting the categorical ones will display whether their counts are balanced**

In [None]:
# Plot counts vs. cat features

# The theme is global seaborn state, so it is applied once instead of on
# every loop iteration.
sns.set_theme(style="darkgrid")

sig_cat_feats = ["cp", "exng", "caa"]
for feat in sig_cat_feats:
    # One explicit figure per feature, with the plot bound to its own axes.
    fig, ax = plt.subplots()
    sns.countplot(data=heart_df, x=feat, ax=ax)
    ax.set_title(f"Row count per '{feat}' value")
    plt.show()

**Lets check out how balanced the labels are (labels = prone to a heart attack or not)**

In [None]:
# Bar chart of how many rows carry each value of the "output" label.
sns.set_theme(style="darkgrid")
label_ax = sns.countplot(x="output", data=heart_df)
plt.show()

**Pretty balanced, way better than the stroke dataset I worked on previously**

# Okay enough data drooling, lets split the data and preprocess

**Lets split up the data:
train = 75%  |  test = 25%**

In [None]:
# Separate the label column from the attribute columns.
y = heart_df['output']
X = heart_df.drop(columns='output')

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

**Set up pipelines:
Scale the numeric features,
One-hot encode the discrete (categorical) features**

In [None]:
# Single-step pipeline applied to the numeric columns: scale with RobustScaler.
numeric_steps = [('scaler', RobustScaler())]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# Single-step pipeline applied to the categorical columns: one-hot encode.
# handle_unknown='ignore' keeps transform from raising on a category that was
# not present when the encoder was fitted.
categorical_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Show the first rows again as a reminder of the column names before
# grouping them by type.
heart_df.head()

**Break the heart dataframe's columns down into categorical and numeric groups - this will prep the data for the pipeline**

In [None]:
# Columns routed to the numeric (scaling) transformer.
numerical_feats = [
    "age",
    "trtbps",
    "chol",
    "thalachh",
    "oldpeak",
]

# Columns routed to the categorical (one-hot) transformer.
categorical_feats = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exng",
    "slp",
    "caa",
    "thall",
]

**Combine the category and numerical transformers as a preprocessor**

In [None]:
# Data cleaning and transforming
# Each entry is (name, transformer, columns the transformer is applied to).
column_transformers = [
    ('num', numeric_transformer, numerical_feats),
    ('cat', categorical_transformer, categorical_feats),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Model and Run

**List the classifiers as a list (will be easy to go back and add classifiers)**

In [None]:
# Run multiple models and compare

# Seed passed to every estimator with a stochastic component, so re-running
# the notebook reproduces the same scores and the same "best" model.
RANDOM_STATE = 42

classifiers = [
    # probability=True is what makes SVC fitting involve randomness; the seed
    # pins it down.
    SVC(kernel="rbf", C=0.025, probability=True, random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    # KNeighborsClassifier takes no random_state parameter.
    KNeighborsClassifier(),
    LogisticRegression(),
    ]

**Loop through the classifiers: build a Pipeline for each one, fit it on the training set, then predict the test set with the fitted model**

In [None]:
print("** Following results reflect classifier models **")

# Parallel lists: position k holds the k-th classifier and its accuracy in percent.
classif_list = []
acc_list = []

for classifier in classifiers:
    # Chain the shared preprocessing with the current estimator and train it.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
    model.fit(X_train, y_train)

    # Predict the test labels and score them against the true labels.
    y_pred = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred)
    acc_score = accuracy_score(y_test, y_pred)
    acc_percent = acc_score*100

    # normalize=False yields the number of correct predictions instead of the fraction.
    n_labels_right = accuracy_score(y_test, y_pred, normalize=False)
    n_labels_wrong = y_test.size - n_labels_right

    # Report the classifier, its accuracy and its confusion matrix.
    print("-------------------------------------------")
    print(classifier)
    print("Accuracy: ", acc_percent)
    print("Predictions correct = ", n_labels_right)
    print("Predictions wrong   = ", n_labels_wrong)
    print("Confussion matrix = \n", conf_matrix)

    # Remember the classifier and its score so the best one can be picked afterwards.
    classif_list.append(classifier)
    acc_list.append(acc_percent)

**See which model predicted the best**

In [None]:
# Output the best model by accuracy
print("-------------------------------------------")
print("** The Best Model Goes to ... **")

# max() returns the first maximal pair, so a tie resolves to the classifier
# that appears earliest in the list.
best_position, best_acc = max(enumerate(acc_list), key=lambda pair: pair[1])
best_classif = classif_list[best_position]

print("The best accuracy model = ", best_classif)
print("With an accuracy = ", best_acc)
print("Complete.")