In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-disease CSV from the Kaggle input directory into a DataFrame.
csv_path="../input/heart-disease-prediction/Heart_Disease_Prediction.csv"
df=pd.read_csv(csv_path)

In [None]:
# Preview the first 15 rows of the raw data.
df.head(n=15)

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

In [None]:
# Detach the target column: `pop` returns it as a Series and removes it from `df`
# in place, leaving only the feature columns in `df`.
label=df.pop("Heart Disease")

In [None]:
# Bar chart of how many rows fall into each target class.
class_counts=label.value_counts()
class_counts.plot.bar()

In [None]:
# Inspect the dtype pandas inferred for each feature column.
df.dtypes

In [None]:
# Columns that hold discrete codes rather than measurements.
categorical_features=[
    "Sex",
    "Chest pain type",
    "FBS over 120",
    "EKG results",
    "Exercise angina",
    "Slope of ST",
    "Number of vessels fluro",
    "Thallium",
]

# Re-type those columns as pandas "category" so they are treated as categorical downstream.
df[categorical_features]=df[categorical_features].astype("category")

In [None]:
# Show the column dtypes again, after the categorical columns were cast to "category".
df.dtypes

In [None]:
# Continuous features are every column that was not declared categorical.
# Kept as an ordered list (not a set): the column order is then deterministic
# across runs, and the name can be used directly as a DataFrame indexer
# (pandas >= 2.0 raises TypeError when a set is passed as an indexer).
continuous_features=[col for col in df.columns if col not in categorical_features]

# Standardise the continuous columns; the categorical columns are carried over
# unchanged through the copy.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# done later in the notebook, so test-set statistics influence the scaling.
scaler=StandardScaler()
df_norm=df.copy()
df_norm[continuous_features]=scaler.fit_transform(df[continuous_features])

In [None]:
# One-hot encode the normalised frame with pandas' default settings.
df_norm_dummies=pd.get_dummies(data=df_norm)

In [None]:
# Display the encoded frame produced by pd.get_dummies.
df_norm_dummies

In [None]:
# Hold out 15% of the rows for testing; stratify on the label so both splits keep
# the same class proportions, and fix the seed so the split is repeatable.
X_train,X_test,y_train,y_test=train_test_split(
    df_norm_dummies,
    label,
    test_size=0.15,
    stratify=label,
    random_state=10,
)

In [None]:
# Candidate classifiers, all with default hyper-parameters.
# The decision tree is seeded because it permutes features randomly at each split;
# without a fixed random_state its scores can change from run to run.
models=[SVC(),LogisticRegression(),GaussianNB(),DecisionTreeClassifier(random_state=10)]

for model in models:
    model_name=type(model).__name__
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)

    # Pin the label order so the heat-map tick labels match the matrix rows/columns.
    class_names=model.classes_
    cm=confusion_matrix(y_test,y_pred,labels=class_names)

    # Title and axis labels make each figure identifiable when several are shown in a row.
    fig,ax=plt.subplots()
    sns.heatmap(cm,annot=True,fmt="d",cmap=plt.cm.Blues,
                xticklabels=class_names,yticklabels=class_names,ax=ax)
    ax.set(title=f"Confusion matrix: {model_name}",xlabel="Predicted label",ylabel="True label")
    plt.show()

    # Name the model above its report so the text output can be attributed.
    print(model_name)
    print(classification_report(y_test,y_pred))

###### We can see from the above **confusion matrices** and **classification reports** that the model which reached the best result is the **Support Vector Machine Classifier**, with an accuracy of **93%**.

Let us now try to explore and view the data from another angle and perspective.

In [None]:
# Annotated correlation heat-map of the normalised features on a 15x15-inch figure.
corr_matrix=df_norm.corr()
plt.figure(figsize=(15,15))
sns.heatmap(corr_matrix,annot=True,cmap=plt.cm.Blues)

In [None]:
# Pairwise scatter plots of the continuous features.
# A DataFrame cannot be indexed with a set on pandas >= 2.0 (TypeError), so build an
# ordered list of the selected columns; following df_norm's column order also keeps
# the plot layout stable between runs.
pairplot_columns=[col for col in df_norm.columns if col in continuous_features]
sns.pairplot(df_norm[pairplot_columns])

From the correlation matrix and the seaborn pair plot we can see that some features are somewhat correlated (even though the correlation is weak), such as **Age-Cholesterol** and **BP-ST Depression**.

We will try to perform some Dimensionality reduction on the dataset and evaluate the model for each dimension.

In [None]:
# Re-evaluate the classifier on PCA-reduced inputs for every component count from 2
# up to (number of feature columns - 1).
n_dimensions=list(range(2,len(df.columns)))

# Split once, before any PCA is fitted. The split depends only on the label and the
# seed, so it is the same for every component count. Separate names are used so the
# X_train/X_test built earlier for the full feature set are not overwritten.
X_train_raw,X_test_raw,y_train_pca,y_test_pca=train_test_split(
    df_norm,label,test_size=0.15,stratify=label,random_state=10)

for dim in n_dimensions:
    # Fit the projection on the training rows only and merely apply it to the test
    # rows, so no information from the held-out data leaks into the components.
    pca=PCA(n_components=dim)
    X_train_pca=pca.fit_transform(X_train_raw)
    X_test_pca=pca.transform(X_test_raw)

    # SVC is the model the notebook's narrative reports as best and says is evaluated
    # here; the PCA output is already numeric, so no dummy encoding is needed.
    model=SVC()
    model.fit(X_train_pca,y_train_pca)
    y_pred=model.predict(X_test_pca)

    # Pin the label order so the heat-map tick labels match the matrix rows/columns.
    class_names=model.classes_
    cm=confusion_matrix(y_test_pca,y_pred,labels=class_names)

    fig,ax=plt.subplots()
    sns.heatmap(cm,annot=True,fmt="d",cmap=plt.cm.Blues,
                xticklabels=class_names,yticklabels=class_names,ax=ax)
    ax.set(title=f"SVC on {dim} principal components",xlabel="Predicted label",ylabel="True label")
    plt.show()

    # Say which component count the report belongs to.
    print(f"PCA components: {dim}")
    print(classification_report(y_test_pca,y_pred))

After running Principal Component Analysis on the dataset, trying different numbers of dimensions, and training the SVC model (which performed best on the full set of features), we can observe that the accuracy score did not improve on, or even come close to, the best accuracy we have reached so far, which is **93%**.