# IMPORTING THE REQUIRED LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# READING THE DATASET

In [3]:
# Load the dataset from heart.csv (relative path, resolved against the working directory).
df = pd.read_csv("heart.csv")

# CHECKING THE SIZE OF THE DATA

In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(303, 14)

# DESCRIBING THE DATASET

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


# CHECKING THE NULL VALUES

In [7]:
# Count the missing entries in each column.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

# PRINTING COLUMN NAMES

In [8]:
# Show the column labels available in the frame.
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

# CHECKING CORRELATION BETWEEN DEPENDENT AND INDEPENDENT VARIABLES

In [9]:
# Print the correlation (Series.corr, Pearson by default) between every
# feature column and the `target` column.
print("CORRELATIONS")
print("")
for i in df.columns:
  # Skip only the target itself. Using `break` here would silently drop any
  # feature column that happens to be positioned after `target`.
  if i == 'target':
    continue
  print(i,"------->",df[i].corr(df['target']))
  print("")

CORRELATIONS

age -------> -0.22543871587483727

sex -------> -0.2809365755017666

cp -------> 0.4337982615068934

trestbps -------> -0.14493112849775147

chol -------> -0.08523910513756904

fbs -------> -0.028045760272712827

restecg -------> 0.13722950287377336

thalach -------> 0.42174093381067435

exang -------> -0.4367570833533018

oldpeak -------> -0.4306960016873684

slope -------> 0.34587707824172526

ca -------> -0.3917239923512519

thal -------> -0.3440292680383098



# IMPORTING REQUIRED MODELS AND METRICS

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Train Test Split
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# RENAMING THE COLUMN NAMES

In [11]:
# Give the abbreviated dataset columns descriptive names.
#
# DataFrame.rename silently ignores mapping keys that are not present in the
# frame, so a misspelled key leaves the column untouched without any error.
# The mapping therefore lists both spellings of the abbreviated columns: the
# ones this dataset actually uses (trestbps, thalach, exang, ca) and the
# alternative spellings (trtbps, thalachh, exng, caa).
#
# NOTE(review): pipelines fitted on this frame record these column names, so
# anything that feeds a DataFrame to a saved model must use the renamed columns.
df.rename(columns={"cp": "chest_pain",
                   "trestbps":"blood_pressure",
                   "trtbps":"blood_pressure",
                   "chol":"cholestoral",
                   "fbs":"fasting_blood_sugar",
                   "thalach":"max_heart_rate",
                   "thalachh":"max_heart_rate",
                   "exang":"angina",
                   "exng":"angina",
                   "ca": "n_vessels",
                   "caa": "n_vessels",
                   "target":"heart_attack"},
          inplace=True)

# DIVIDING THE DEPENDENT AND INDEPENDENT VARIABLES

In [12]:
# Separate the predictors (every column except the label) from the label column.
X = df.drop(columns=['heart_attack'])
y = df['heart_attack']

# DIVIDING THE DATA INTO TRAIN AND TEST

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Print the dimensions of each split as a quick sanity check.
lst = [
  x_train,
  x_test,
  y_train,
  y_test,
]
for data in lst:
  print("shape", "---->", data.shape)

shape ----> (242, 13)
shape ----> (61, 13)
shape ----> (242,)
shape ----> (61,)


# MAKING PIPELINE FOR EVERY MODEL

In [15]:
# One pipeline per candidate classifier; each standardises the features with
# StandardScaler before handing them to the estimator.
#
# The decision tree and the random forest are randomised estimators. Without a
# fixed seed their accuracy (and therefore the ranking of the models) can
# change on every run, so they are seeded with the same value used for the
# train/test split.
mdl_1 = make_pipeline(StandardScaler(), KNeighborsClassifier())
mdl_2 = make_pipeline(StandardScaler(), LogisticRegression())
mdl_3 = make_pipeline(StandardScaler(), SVC())
mdl_4 = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))
mdl_5 = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

# FITTING THE MODELS AND EVALUATING THEIR ACCURACY

In [16]:
# Fit every candidate pipeline on the training split, print its confusion
# matrix and accuracy on the test split, and collect the accuracies in
# accuracy_lst (same order as mdl_lst).
mdl_lst = [mdl_1,mdl_2,mdl_3,mdl_4,mdl_5]
accuracy_lst = []
for mdl in mdl_lst:
  mdl.fit(x_train,y_train)
  y_pred = mdl.predict(x_test)
  # Score once and reuse the value for both the printout and the list.
  accuracy = accuracy_score(y_test,y_pred)
  print("")
  print("******CLASSIFICATION MODEL******")
  print("")
  print("CONFUSION MATRIX")
  print(confusion_matrix(y_test,y_pred))
  print("")
  print("ACCURACY SCORE")
  print(accuracy)
  print("----------------------------")
  accuracy_lst.append(accuracy)
 


******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[27  2]
 [ 4 28]]

ACCURACY SCORE
0.9016393442622951
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[25  4]
 [ 5 27]]

ACCURACY SCORE
0.8524590163934426
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[26  3]
 [ 5 27]]

ACCURACY SCORE
0.8688524590163934
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[27  2]
 [ 7 25]]

ACCURACY SCORE
0.8524590163934426
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[24  5]
 [ 4 28]]

ACCURACY SCORE
0.8524590163934426
----------------------------


# CHECKING FOR THE HIGHEST ACCURACY SCORE AMONG THE MODELS

In [17]:
# Position (in accuracy_lst) of the first model that reached the top accuracy.
top_score = max(accuracy_lst)
print(accuracy_lst.index(top_score))


0


# SAVING THE MODEL IN PICKLE FILE

In [18]:
import pickle

In [19]:
# Persist the KNN pipeline (mdl_1) to disk with pickle.
# NOTE(review): mdl_1 is hard-coded; if the accuracy ranking changes on a
# re-run, this will no longer be the best-scoring model.
filename = 'heart_model.pkl'
# The context manager closes the file handle even if pickling fails.
with open(filename, 'wb') as model_file:
  pickle.dump(mdl_1, model_file)
 