# HEART DISEASE PREDICTION

In [2]:
#load the required modules for the dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [3]:
# Read the heart-disease dataset into a DataFrame.
# The path is relative, so the CSV must be in the current working directory.
df = pd.read_csv('Heart_Disease_Prediction.csv')

In [4]:
# Display the dataset. No head()/tail() call is needed here: for a frame this
# long the rendered output is truncated to the first and last rows.
df

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,Absence
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,Absence
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence


In [5]:
# Dimensions of the dataset as (rows, columns)
df.shape

(270, 14)

In [39]:
# Summary of the dataset: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    object 
dtypes: float64(1), int64(12), object(1)

In [40]:
# Count the missing values in each column
df.isnull().sum()

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [42]:
# Class balance of the target: number of rows labelled 'Absence' vs 'Presence'
df['Heart Disease'].value_counts()

Absence     150
Presence    120
Name: Heart Disease, dtype: int64

In [43]:
# Separate the predictors from the target (this is NOT the train/test split,
# which happens in a later cell):
#   x - every column except 'Heart Disease' (the feature matrix)
#   y - the 'Heart Disease' label ('Presence' / 'Absence')
# `columns=` already identifies the axis, so no extra `axis=1` is needed.
x = df.drop(columns='Heart Disease')
y = df['Heart Disease']

In [44]:
# Inspect the target labels (a Series of 'Presence' / 'Absence' strings)
print(y)

0      Presence
1       Absence
2      Presence
3       Absence
4       Absence
         ...   
265     Absence
266     Absence
267     Absence
268     Absence
269    Presence
Name: Heart Disease, Length: 270, dtype: object


# SPLITTING DATA INTO TRAINING DATA AND TESTING DATA


In [45]:
# Hold out 30% of the rows for testing. Stratifying on the label keeps the
# Absence/Presence ratio the same in both subsets, and the fixed seed makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2,
)

In [46]:
# Sanity-check the split sizes: full feature matrix, training part, test part.
print(*(part.shape for part in (x, x_train, x_test)))

(270, 13) (189, 13) (81, 13)


# MODEL TRAINING: LOGISTIC REGRESSION

In [47]:
# Logistic-regression classifier using the L-BFGS solver.
# NOTE(review): max_iter is presumably raised so the solver can converge on
# the unscaled features - confirm, or scale the features instead.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
)

In [48]:
# Fit the model on the training split only; the test split is kept aside
# for evaluation.
model.fit(x_train,y_train)

# MODEL EVALUATION

In [49]:
# Accuracy on the training data.
# x_train_prediction holds the labels predicted for x_train. accuracy_score
# is documented as accuracy_score(y_true, y_pred), so the ground-truth labels
# are passed first.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [50]:
# Report the training accuracy (fraction of training rows predicted correctly)
print('Accuracy on training data :',training_data_accuracy)

Accuracy on training data : 0.8835978835978836


In [51]:
# Accuracy on the held-out test data.
# x_test_prediction holds the labels predicted for x_test. accuracy_score
# is documented as accuracy_score(y_true, y_pred), so the ground-truth labels
# are passed first.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [52]:
# Report the test accuracy; compare it with the training accuracy printed
# earlier to judge how well the model generalizes.
print('Accuracy on test data :',test_data_accuracy)

Accuracy on test data : 0.8148148148148148


# PREDICTIVE SYSTEM

In [53]:
# Predict labels for the held-out test set. This repeats the
# model.predict(x_test) call from the test-accuracy cell above and stores the
# result under a separate name for the metrics that follow.
y_prediction=model.predict(x_test)

In [55]:
# Test-set accuracy: true labels first, predicted labels second
accuracy=accuracy_score(y_test,y_prediction)

In [56]:
# Report the test-set accuracy
print('Accuracy :',accuracy)

Accuracy : 0.8148148148148148


In [59]:
# Confusion matrix for the test set: rows are the true labels and columns the
# predicted labels, both in sorted label order ('Absence', 'Presence').
confusion=confusion_matrix(y_test,y_prediction)

In [60]:
# Print the matrix; the array's text form spans two lines, so the second row
# appears on its own line below the label.
print('confusion matrix :',confusion)

confusion matrix : [[38  7]
 [ 8 28]]


In [63]:
# Per-class precision, recall, F1-score and support for the test predictions,
# returned as a formatted text table.
classification=classification_report(y_test,y_prediction)

In [64]:
# Print the report; the table header follows the label on the same line.
print('classification report :',classification)

classification report :               precision    recall  f1-score   support

     Absence       0.83      0.84      0.84        45
    Presence       0.80      0.78      0.79        36

    accuracy                           0.81        81
   macro avg       0.81      0.81      0.81        81
weighted avg       0.81      0.81      0.81        81



- **Precision** is the proportion of true positive predictions out of all positive predictions. In the report, each class is treated in turn as the "positive" class. The precision for the “Absence” class is 0.83, which means that 83% of the samples the model predicted as “Absence” really were “Absence”. Similarly, the precision for the “Presence” class is 0.80, which means that 80% of the samples the model predicted as “Presence” really were “Presence”.

- **Recall** is the proportion of true positive predictions out of all actual positive cases. The recall for the “Absence” class is 0.84, which means that the model correctly identified 84% of the actual “Absence” cases. Similarly, the recall for the “Presence” class is 0.78, which means that the model correctly identified 78% of the actual “Presence” cases.

- The **F1-score** is the harmonic mean of precision and recall, $F_1 = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$, so it is high only when both precision and recall are high. The F1-score for the “Absence” class is 0.84, and the F1-score for the “Presence” class is 0.79.

- The **support** is the number of test samples whose true label is the given class. The support for the “Absence” class is 45, and the support for the “Presence” class is 36.

- The **accuracy** of the model is the proportion of correct predictions out of all predictions made. Here the accuracy is 0.81, which means that 81% of the predictions on the test set were correct.