In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, roc_curve, roc_auc_score

In [None]:
# Suppress every warning for the rest of the session so library notices do
# not clutter cell output.
# NOTE(review): a blanket "ignore" also hides warnings that may matter
# (deprecations, convergence problems) — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Read the heart-attack CSV into a DataFrame. The path is relative to the
# notebook's working directory (looks like the Kaggle input layout — TODO confirm).
data = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")
# Show the first rows as the cell's rich output.
data.head()

# DATA WRANGLING

In [None]:
# Plain-text overview of the dataset: each section is a heading, its content,
# and a 100-character rule.
rule = "-" * 100

print("Shape:", data.shape, sep="\n")
print(rule)

print("Columns Name:")
columns = data.columns
print(columns)
print(rule)

# DataFrame.info() writes its report directly to stdout, so it is not wrapped in print().
print("Data Information:")
data.info()
print(rule)

print("Data Description: ", data.describe(), sep="\n")
print(rule)

print("Counting Null Values:", data.isnull().sum(), sep="\n")
print(rule)

print("Data correlation: ", data.corr(), sep="\n")
print(rule)

# DATA VISUALIZATION
## Pearson Correlation

In [None]:
# Annotated heat map of the pairwise Pearson correlations between all columns,
# with the colour scale pinned to the full [-1, 1] range.
pearson_corr = data.corr(method="pearson")

figure, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pearson_corr, annot=True, cmap='YlGnBu_r', vmin=-1, vmax=+1, ax=ax)
ax.set_title("PEARSON")
ax.set_xlabel("COLUMNS")
ax.set_ylabel("COLUMNS")
plt.show()


## Histogram

In [None]:
# Draw one histogram per column of `data` on a single 20x20 inch figure.
data.hist(figsize=(20,20))
plt.show()

In [None]:
# Class balance of the target column.
# The column is passed by keyword: in recent seaborn releases the first
# positional argument of countplot is `data`, not `x`, so passing the Series
# positionally no longer plots the per-class counts.
sns.countplot(x="output", data=data)
# Per-class count of non-null values in every other column.
print(data.groupby(["output"]).count())

# 0 -> No Heart Disease, 1 -> Heart Disease

In [None]:
# Compare the cholesterol distribution of the two target classes.
# `fill=` replaces the deprecated `shade=` keyword of kdeplot.
chol_unwell = data.loc[data['output'] == 1, 'chol']
chol_healthy = data.loc[data['output'] == 0, 'chol']

sns.kdeplot(chol_unwell, fill=True, color="orange", label="Unwell", alpha=.7)
sns.kdeplot(chol_healthy, fill=True, color="dodgerblue", label="Healthy", alpha=.7)
plt.title('Cholesterol in mg/dl for both cases')
# Without an explicit legend the "Unwell"/"Healthy" labels are never drawn
# and the two curves cannot be told apart.
plt.legend()
plt.show()

## Heat Map

In [None]:
# Heat map of the raw DataFrame itself (one cell per row/column value), on a
# 15x8 inch figure.
# NOTE(review): values are unscaled here, so columns with large magnitudes
# presumably dominate the colour range — confirm this is the intended view.
plt.figure(figsize=(15,8))
sns.heatmap(data)

## Pair Plot

In [None]:
# Pairwise relationships between all columns, coloured by the "output" target.
sns.pairplot(data, hue="output")

## Feature Scaling

In [None]:
# Separate the target column from the predictors.
y = data["output"]                  # target labels
X = data.drop(columns="output")     # every remaining column is a feature

In [None]:
# Standardise every feature column with StandardScaler.
std_scl = StandardScaler()

# fit_transform returns a NumPy array, so X stops being a DataFrame here.
# NOTE(review): the scaler is fitted on ALL rows, before the train/test split
# performed later in this notebook, so the scaling statistics include the
# rows that end up in the test split (data leakage). Consider fitting on the
# training split only.
X = std_scl.fit_transform(X)
print(X)

In [None]:
# Heat map of the feature matrix X as it stands after the scaling cell, on a
# 15x8 inch figure.
plt.figure(figsize=(15,8))
sns.heatmap(X)

## Train Test Splitting

In [None]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle so
# the split is reproducible.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## Training Models
### Classification Models

In [None]:
# Build the seven candidate classifiers first ...
lr = LogisticRegression(solver="liblinear")
gnb = GaussianNB()
knnc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42, verbose=False)
xgbc = XGBClassifier()
catbc = CatBoostClassifier(verbose=False)

# ... then fit each one in place on the training split, in the same order.
for estimator in (lr, gnb, knnc, dtc, rfc, xgbc, catbc):
    estimator.fit(x_train, y_train)


In [None]:
# Fitted estimator objects (despite the variable name, these are the models
# themselves, not strings); the evaluation cells iterate over this list.
model_names = [lr,gnb,knnc,dtc,rfc,xgbc,catbc]

In [None]:
# Report hold-out and cross-validated metrics for every fitted model.
for model in model_names:
    name = model.__class__.__name__
    # Hold-out predictions from the already-fitted model.
    predict = model.predict(x_test)

    # cross_val_score clones and refits the estimator on each fold, so it must
    # run on the training split: cross-validating on the test split would
    # train on held-out data and score on very small folds.
    CV = cross_val_score(model, x_train, y_train, cv=10, verbose=False).mean()
    error = -cross_val_score(
        model, x_train, y_train, cv=10,
        scoring="neg_mean_squared_error", verbose=False,
    ).mean()

    print(name + ": ")
    print("-" * 50)
    print("Accuracy Score: ", accuracy_score(y_test, predict))
    print("Cross Validation Score: ", CV)
    # Square root of the cross-validated mean squared error (RMSE).
    print("Error: ", np.sqrt(error))
    print("R-square value: ", r2_score(y_test, predict))
    print("Confusion matrix: ")
    confu_matrix = confusion_matrix(y_test, predict)

    # confusion_matrix puts true labels on rows and predicted labels on columns.
    fig, ax = plt.subplots()
    sns.heatmap(confu_matrix, annot=True, ax=ax)
    ax.set_title("Confusion Matrix", fontsize=10)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    plt.show()
    print("-" * 100)

## Model Comparison
### Accuracy Comparison

In [None]:
# Collect one record per model and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
accuracy_rows = []
for model in model_names:
    name = model.__class__.__name__
    predict = model.predict(x_test)
    accuracy = accuracy_score(y_test, predict)
    # Stored as a percentage so it matches the 0-100 x-axis below.
    accuracy_rows.append({"MODELS": name, "Accuracy": accuracy * 100})

df = pd.DataFrame(accuracy_rows, columns=["MODELS", "Accuracy"])

# Horizontal bar chart of hold-out accuracy per model.
figure = plt.figure(figsize=(20,8))
sns.barplot(x="Accuracy", y="MODELS", data=df, color="k")
plt.xlabel("ACCURACY")
plt.ylabel("MODELS")
plt.xlim(0,100)
plt.title("MODEL ACCURACY COMPARISON")
plt.show()

### Cross Validation Comparison

In [None]:
# Collect one record per model and build the frame once: DataFrame.append was
# removed in pandas 2.0.
cv_rows = []
for model in model_names:
    name = model.__class__.__name__
    # Cross-validate on the training split. cross_val_score refits clones of
    # the model on each fold, so the held-out test rows must stay out of it.
    CV = cross_val_score(model, x_train, y_train, cv=10, verbose=False).mean()
    # Stored as a percentage so it matches the 0-100 x-axis below.
    cv_rows.append({"MODELS": name, "CV": CV * 100})

df = pd.DataFrame(cv_rows, columns=["MODELS", "CV"])

# Horizontal bar chart of the mean 10-fold cross-validation score per model.
figure = plt.figure(figsize=(20,8))
sns.barplot(x="CV", y="MODELS", data=df, color="k")
plt.xlabel("CV")
plt.ylabel("MODELS")
plt.xlim(0,100)
plt.title("MODEL CROSS VALIDATION COMPARISON")
plt.show()

## ROC CURVE and ROC_AUC_SCORE
### Prediction Probability

In [None]:
# Baseline "classifier": a constant score of 0 for every test sample, used as
# the reference in the ROC comparison.
r_prob = [0] * len(y_test)
r_auc = roc_auc_score(y_true=y_test, y_score=r_prob)

### AUROC_Score

In [None]:
# Print the area under the ROC curve of every model on the test split.
for model in model_names:
    name = type(model).__name__
    # Column 1 of predict_proba is taken as the score for the ROC computation.
    class_probabilities = model.predict_proba(x_test)
    predict = class_probabilities[:, 1]
    auroc_score = roc_auc_score(y_true=y_test, y_score=predict)
    print(f"{name} score: ", auroc_score)
    print(50 * "-")

### ROC_Curve

In [None]:
# ROC points of the constant-score baseline.
r_fpr, r_tpr, _ = roc_curve(y_true=y_test, y_score=r_prob)

# Maps each model's class name to its [false-positive rates, true-positive rates].
model_dict = {}

for model in model_names:
    name = type(model).__name__
    class_probabilities = model.predict_proba(x_test)
    predict = class_probabilities[:, 1]
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=predict)
    model_dict[name] = [fpr, tpr]
    


In [None]:
# Reference line: ROC of the constant-score baseline.
plt.plot(r_fpr, r_tpr, linestyle="--", label="Constant baseline")

# One dotted curve per model, in the order the models were added to the dict.
# Iterating over model_dict (instead of one hard-coded line per class name)
# keeps this cell working when the model list changes.
for model_label, (fpr_values, tpr_values) in model_dict.items():
    plt.plot(fpr_values, tpr_values, linestyle='dotted', label=model_label)

plt.title("ROC plot")
plt.xlabel("False positive rate.")
plt.ylabel("True positive rate.")
plt.legend()
plt.show()