In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Load the raw heart-disease table from the Kaggle input directory.
DATA_PATH = '/kaggle/input/heart-disease-uci/heart.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Work on an independent (deep) copy so the loaded frame `data` stays untouched.
df = data.copy(deep=True)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Preview the last five rows.
df.tail(n=5)

In [None]:
# (number of rows, number of columns) of the working copy.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so each column of `df` becomes one row.
df.describe().transpose()

**Checking for null values, if any**

In [None]:
# Number of missing values per column.
df.isnull().sum()

**Checking outliers**

In [None]:
import seaborn as sns

In [None]:
# Box plot of `age`. The column is passed as `x=` explicitly: a bare positional
# argument is deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 on.
sns.boxplot(x=df['age'])

In [None]:
# Box plot of `trestbps`. The column is passed as `x=` explicitly: a bare positional
# argument is deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 on.
sns.boxplot(x=df['trestbps'])

In [None]:
# Box plot of `chol`. The column is passed as `x=` explicitly: a bare positional
# argument is deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 on.
sns.boxplot(x=df['chol'])

In [None]:
# Box plot of `thalach`. The column is passed as `x=` explicitly: a bare positional
# argument is deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 on.
sns.boxplot(x=df['thalach'])

In [None]:
# Box plot of `oldpeak`. The column is passed as `x=` explicitly: a bare positional
# argument is deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 on.
sns.boxplot(x=df['oldpeak'])

**Removing outliers**

In [None]:
# Columns handled by the outlier-replacement step.
all_outliers = [
    'trestbps',
    'oldpeak',
    'thalach',
    'chol',
]

In [None]:
from numpy import quantile  # NOTE(review): not used in this cell; kept in case other cells rely on the name

# Replace outliers with the column median, using the 1.5 * IQR fence rule.
# For every column listed in `all_outliers`:
#   * the fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR (Q1/Q3 = 25th/75th percentile);
#   * values strictly outside the fences are overwritten with the median that
#     was computed before any replacement;
#   * values inside the fences (and NaN, which compares False) are left as-is.
for column in all_outliers:
    values = df[column]
    median = values.median()
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    lower, upper = q1 - fence, q3 + fence
    # One vectorized pass instead of two element-wise `apply` calls.
    df[column] = values.mask((values < lower) | (values > upper), median)

In [None]:
# Peek at the first two rows after outlier replacement.
df.head(n=2)

# Exploratory Data Analysis (EDA)

In [None]:
# First two rows, as a reminder of the available columns before plotting.
df.head(n=2)

In [None]:
# 0 --> not having heart disease
# 1 --> having heart disease

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One count plot per categorical column (including the target itself).
categ = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']
for column in categ:
    # Pass the column as `x=`: a bare positional argument is deprecated in
    # seaborn 0.11 and is interpreted as `data` from 0.12 on.
    sns.countplot(x=df[column])
    plt.show()

In [None]:
# Distribution of `age`: density-scaled histogram with a KDE overlay.
# `sns.distplot` is deprecated since seaborn 0.11; `histplot` with
# stat='density' and kde=True is its replacement (cut=3 keeps the KDE tails
# extending past the data range, as distplot drew them).
sns.histplot(x=df['age'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Count of patients at each age, split by target class.
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='age', hue='target', palette="Set3")

In [None]:
# Distribution of `age` within each target class.
sns.violinplot(data=df, x='target', y='age')

In [None]:
# Distribution of `chol` within each target class.
sns.violinplot(data=df, x='target', y='chol')

In [None]:
# Distribution of `thalach`: density-scaled histogram with a KDE overlay.
# `sns.distplot` is deprecated since seaborn 0.11; `histplot` with
# stat='density' and kde=True is its replacement (cut=3 keeps the KDE tails
# extending past the data range, as distplot drew them).
sns.histplot(x=df['thalach'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Distribution of `thalach` within each target class.
sns.violinplot(data=df, x='target', y='thalach')

In [None]:
# Horizontal categorical plot of every column in `df`.
# `catplot` is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind and does
# not size the plot. Size it through height/aspect instead
# (height = 15 in, width = height * aspect = 20 in).
sns.catplot(data=df, orient='h', height=15, aspect=20 / 15)

In [None]:
# First two rows, for reference while exploring the remaining columns.
df.head(n=2)

In [None]:
# Number of rows for each value of `sex`.
df['sex'].value_counts()

In [None]:
# Distribution of the target within each value of `sex`.
sns.violinplot(data=df, x='sex', y='target')

In [None]:
# Row counts per value of `sex`, split by target class.
sns.countplot(data=df, x='sex', hue='target')

In [None]:
# Distribution of `trestbps` within each target class.
sns.violinplot(data=df, x='target', y='trestbps')

In [None]:
# Distribution of `oldpeak` within each target class.
sns.violinplot(data=df, x='target', y='oldpeak')

In [None]:
# Number of rows for each value of `fbs`.
df['fbs'].value_counts()

In [None]:
# Distribution of `fbs` within each target class.
sns.violinplot(data=df, x='target', y='fbs')

In [None]:
# Row counts per value of `fbs`, split by target class.
sns.countplot(data=df, x='fbs', hue='target')

# Data Correlation

In [None]:
# One-hot encode the categorical columns; every other column of `df`
# (including `target`) is carried over unchanged.
col = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
df_new = pd.get_dummies(data=df, columns=col)
df_new.head(n=5)

In [None]:
# Pairwise Pearson correlations between all encoded columns.
df_new.corr(method='pearson')

In [None]:
# Annotated heatmap of the full correlation matrix of the encoded frame.
corr_matrix = df_new.corr()
plt.figure(figsize=(22, 22))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

In [None]:
# Bar chart of each feature's correlation with the target column.
plt.figure(figsize=(14, 6))
features_only = df_new.drop(columns='target')
target_corr = features_only.corrwith(df_new['target'])
target_corr.plot(kind='bar', grid=True, title="Correlation with target")

In [None]:
# Single row of the encoded frame, to read off the generated column names.
df_new.head(n=1)

**Taking a threshold on each feature's correlation with the target of greater than 0.4 or less than -0.4, we conclude that the features --> ['thalach','oldpeak','cp_0','exang_0','exang_1','ca_0','thal_2','thal_3'] are the most important.**

# TRAINING AND TESTING THE DATA USING ML MODELS

In [None]:
# Feature matrix: the columns picked by the correlation-threshold rule stated
# in the preceding markdown cell.
# NOTE(review): exang_0 and exang_1 are dummies of the same source column; if
# `exang` has only two levels they are perfectly collinear and one of them is
# redundant -- confirm before relying on per-feature coefficients.
selected_features = [
    'thalach', 'oldpeak', 'cp_0', 'exang_0',
    'exang_1', 'ca_0', 'thal_2', 'thal_3',
]
X = df_new[selected_features]
X.head(n=5)

In [None]:
# Label vector: the `target` column of the encoded frame.
y = df_new.loc[:, 'target']
y.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score

In [None]:
# Hold out 25% of the rows (scikit-learn's default when neither size is given)
# for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=40
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler with default settings; it is fitted later, on the training split only.
sc = StandardScaler()

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits so no test statistics leak into the fit.
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Show only the first rows of the scaled training matrix instead of dumping
# the whole array into the notebook output.
x_train[:5]

In [None]:
# Shapes of the training features and labels (row counts should match).
x_train.shape, y_train.shape

In [None]:
# Logistic regression
#
# Fit on the training split, then report accuracy (in percent), the confusion
# matrix, precision and F1 on the test split.

log = LogisticRegression()
log.fit(x_train, y_train)
pred = log.predict(x_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the order now matches the other metric calls below.
acc = round(accuracy_score(y_test, pred) * 100, 2)
print('Accuracy of LogisticRegression model is  --->  ',acc)
print('\n')
print('Confusion Matrix   ---->  \n',confusion_matrix(y_test,pred))
print('\n')
print('precision_score of LR model   --->   ',precision_score(y_test,pred))
print('\n')
print('f1score of LR model   --->   ',f1_score(y_test,pred))

In [None]:
# RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Tune n_estimators with 5-fold cross-validation on the TRAINING split only.
# Picking the value that maximises test accuracy (as done previously) leaks the
# test set into model selection and makes the reported score optimistic.
# A fixed seed is used so that the model refitted below is the same kind of
# model that was scored during the search, and the cell is reproducible.
RF_SEED = 40
tree_counts = list(range(1, 150))
cv_scores = [
    cross_val_score(
        RandomForestClassifier(n_estimators=n_trees, random_state=RF_SEED),
        x_train, y_train, cv=5, scoring='accuracy',
    ).mean()
    for n_trees in tree_counts
]
# argmax returns the first maximum, i.e. the smallest forest among ties.
ind = tree_counts[int(np.argmax(cv_scores))]

# Refit on the full training split and evaluate once on the held-out test split.
log = RandomForestClassifier(n_estimators=ind, random_state=RF_SEED)
log.fit(x_train, y_train)
pred = log.predict(x_test)
acc = round(accuracy_score(y_test, pred) * 100, 2)  # (y_true, y_pred) order
print('Accuracy of RandomForestClassifier model is  --->  ',acc)
print('\n')
print('Confusion Matrix   ---->  \n',confusion_matrix(y_test,pred))
print('\n')
print('precision_score of RandomForestClassifier model   --->   ',precision_score(y_test,pred))
print('\n')
print('f1score of RandomForestClassifier model   --->   ',f1_score(y_test,pred))

In [None]:
# SVC
#
# Sigmoid-kernel support vector classifier with C=0.1; fit on the training
# split and report accuracy (in percent), confusion matrix, precision and F1
# on the test split.

log = SVC(kernel='sigmoid', C=0.1)
log.fit(x_train, y_train)
pred = log.predict(x_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the order now matches the other metric calls below.
acc = round(accuracy_score(y_test, pred) * 100, 2)
print('Accuracy of SVC model is  --->  ',acc)
print('\n')
print('Confusion Matrix   ---->  \n',confusion_matrix(y_test,pred))
print('\n')
print('precision_score of SVC model   --->   ',precision_score(y_test,pred))
print('\n')
print('f1score of SVC model   --->   ',f1_score(y_test,pred))

In [None]:
# DecisionTreeClassifier
#
# Fit on the training split and report accuracy (in percent), confusion
# matrix, precision and F1 on the test split.

# A fixed seed makes the fitted tree (and hence the printed metrics)
# repeatable across runs; without it the result can change on every run.
log = DecisionTreeClassifier(random_state=40)
log.fit(x_train, y_train)
pred = log.predict(x_test)
acc = round(accuracy_score(y_test, pred) * 100, 2)  # (y_true, y_pred) order
print('Accuracy of DecisionTreeClassifier model is  --->  ',acc)
print('\n')
print('Confusion Matrix   ---->  \n',confusion_matrix(y_test,pred))
print('\n')
print('precision_score of DecisionTreeClassifier model   --->   ',precision_score(y_test,pred))
print('\n')
print('f1score of DecisionTreeClassifier model   --->   ',f1_score(y_test,pred))

In [None]:
# KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Tune n_neighbors with 5-fold cross-validation on the TRAINING split only.
# Picking the value that maximises test accuracy (as done previously) leaks the
# test set into model selection and makes the reported score optimistic.
#
# Each CV training fold holds floor(4/5) of x_train and k cannot exceed the
# number of fitted samples, so the search range (originally 1..99) is capped.
max_k = min(99, (len(x_train) * 4) // 5)
neighbor_counts = list(range(1, max_k + 1))
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        x_train, y_train, cv=5, scoring='accuracy',
    ).mean()
    for k in neighbor_counts
]
# argmax returns the first maximum, i.e. the smallest k among ties.
ind = neighbor_counts[int(np.argmax(cv_scores))]

# Refit on the full training split and evaluate once on the held-out test split.
log = KNeighborsClassifier(n_neighbors=ind)
log.fit(x_train, y_train)
pred = log.predict(x_test)
acc = round(accuracy_score(y_test, pred) * 100, 2)  # (y_true, y_pred) order
print('Accuracy of KNeighborsClassifier model is  --->  ',acc)
print('\n')
print('Confusion Matrix   ---->  \n',confusion_matrix(y_test,pred))
print('\n')
print('precision_score of KNeighborsClassifier model   --->   ',precision_score(y_test,pred))
print('\n')
print('f1score of KNeighborsClassifier model   --->   ',f1_score(y_test,pred))

In [None]:
# XGBClassifier
#
# Default-parameter gradient-boosted trees from xgboost; fit on the training
# split and report accuracy (in percent), confusion matrix, precision and F1
# on the test split.

log = XGBClassifier()
log.fit(x_train, y_train)
pred = log.predict(x_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the order now matches the other metric calls below.
acc = round(accuracy_score(y_test, pred) * 100, 2)
print('Accuracy of XGBClassifier model is  --->  ',acc)
print('\n')
print('Confusion Matrix   ---->  \n',confusion_matrix(y_test,pred))
print('\n')
print('precision_score of XGBClassifier model   --->   ',precision_score(y_test,pred))
print('\n')
print('f1score of XGBClassifier model   --->   ',f1_score(y_test,pred))
# sns.heatmap(confusion_matrix(y_test,pred),cmap='coolwarm',annot=True)

In [None]:
# GradientBoostingClassifier
#
# 700 boosting stages of depth-4 trees; fit on the training split and report
# accuracy (in percent), confusion matrix, precision and F1 on the test split.

# A fixed seed keeps the fitted ensemble repeatable across runs.
log = GradientBoostingClassifier(max_depth=4, n_estimators=700, random_state=40)
log.fit(x_train, y_train)
pred = log.predict(x_test)
acc = round(accuracy_score(y_test, pred) * 100, 2)  # (y_true, y_pred) order
# The labels below previously said "LogisticRegression" / "LR" (copy-paste
# slip), which mislabelled this model's results in the output.
print('Accuracy of GradientBoostingClassifier model is  --->  ',acc)
print('\n')
print('Confusion Matrix   ---->  \n',confusion_matrix(y_test,pred))
print('\n')
print('precision_score of GradientBoostingClassifier model   --->   ',precision_score(y_test,pred))
print('\n')
print('f1score of GradientBoostingClassifier model   --->   ',f1_score(y_test,pred))

# WE HAVE ACHIEVED THE HIGHEST ACCURACY OF 93.42% USING THE KNN MODEL