In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score , f1_score , classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

EDA

In [3]:
df = pd.read_csv('heart.csv')

In [4]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
df.shape

(918, 12)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [7]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [8]:
df.duplicated().sum()

0

In [9]:
df_encoded = pd.get_dummies(df , drop_first = True)

In [10]:
df_encoded.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,True,False,True,False,True,False,False,False,True


In [11]:
df_encoded = df_encoded.astype(int)

In [12]:
df_encoded.columns

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'HeartDisease', 'Sex_M', 'ChestPainType_ATA', 'ChestPainType_NAP',
       'ChestPainType_TA', 'RestingECG_Normal', 'RestingECG_ST',
       'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')

In [13]:
X= df_encoded.drop('HeartDisease' , axis=1)
y = df_encoded['HeartDisease']

In [14]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size = 0.20 , random_state =42)

In [15]:
#Feature Scaling 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.fit_transform(X_test)

In [16]:
models ={
    "Logistic Regression Model":LogisticRegression(),
    "KNN":KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree":DecisionTreeClassifier(),
    "SVM":SVC()
}

In [17]:
result = []

In [18]:
for name , model in models.items():
    model.fit(X_train_scaled , y_train)

    y_pred = model.predict(X_test_scaled)

    acc = accuracy_score(y_test , y_pred)

    f1 = f1_score(y_test , y_pred)

    result.append({
        'model':name,
        'Accuracy':round(acc , 4),
        "F1 score":round(f1,4)
    })



In [19]:
result

[{'model': 'Logistic Regression Model',
  'Accuracy': 0.8641,
  'F1 score': 0.8804},
 {'model': 'KNN', 'Accuracy': 0.8587, 'F1 score': 0.875},
 {'model': 'Naive Bayes', 'Accuracy': 0.8696, 'F1 score': 0.8857},
 {'model': 'Decision Tree', 'Accuracy': 0.7935, 'F1 score': 0.8224},
 {'model': 'SVM', 'Accuracy': 0.8804, 'F1 score': 0.8972}]

In [20]:
import joblib 

In [21]:
joblib.dump(models['SVM'], 'SVM_heart.pkl')


['SVM_heart.pkl']

In [22]:
joblib.dump(scaler,'scaler.pkl')

joblib.dump(X.columns.tolist() , 'columns.pkl')

['columns.pkl']