Importing Dependencies

In [55]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score


Data Collection and Preprocesiing

In [56]:
# Load dataset is a pandas dataframe
heart_data = pd.read_csv('C:/Users/USER/Desktop/Datasets/HeartDiseaseTrain-Test.csv')

In [57]:
heart_data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,Male,Typical angina,125,212,Lower than 120 mg/ml,ST-T wave abnormality,168,No,1.0,Downsloping,Two,Reversable Defect,0
1,53,Male,Typical angina,140,203,Greater than 120 mg/ml,Normal,155,Yes,3.1,Upsloping,Zero,Reversable Defect,0
2,70,Male,Typical angina,145,174,Lower than 120 mg/ml,ST-T wave abnormality,125,Yes,2.6,Upsloping,Zero,Reversable Defect,0
3,61,Male,Typical angina,148,203,Lower than 120 mg/ml,ST-T wave abnormality,161,No,0.0,Downsloping,One,Reversable Defect,0
4,62,Female,Typical angina,138,294,Greater than 120 mg/ml,ST-T wave abnormality,106,No,1.9,Flat,Three,Fixed Defect,0


In [58]:
# check number of rows and columns
heart_data.shape

(1025, 14)

In [59]:
# Check data basic information about the dataset 
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1025 non-null   int64  
 1   sex                            1025 non-null   object 
 2   chest_pain_type                1025 non-null   object 
 3   resting_blood_pressure         1025 non-null   int64  
 4   cholestoral                    1025 non-null   int64  
 5   fasting_blood_sugar            1025 non-null   object 
 6   rest_ecg                       1025 non-null   object 
 7   Max_heart_rate                 1025 non-null   int64  
 8   exercise_induced_angina        1025 non-null   object 
 9   oldpeak                        1025 non-null   float64
 10  slope                          1025 non-null   object 
 11  vessels_colored_by_flourosopy  1025 non-null   object 
 12  thalassemia                    1025 non-null   o

In [60]:
# Checking for missing values 
heart_data.isna().sum()

age                              0
sex                              0
chest_pain_type                  0
resting_blood_pressure           0
cholestoral                      0
fasting_blood_sugar              0
rest_ecg                         0
Max_heart_rate                   0
exercise_induced_angina          0
oldpeak                          0
slope                            0
vessels_colored_by_flourosopy    0
thalassemia                      0
target                           0
dtype: int64

In [61]:
# Statistical measures of the data 
heart_data.describe()

Unnamed: 0,age,resting_blood_pressure,cholestoral,Max_heart_rate,oldpeak,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,131.611707,246.0,149.114146,1.071512,0.513171
std,9.07229,17.516718,51.59251,23.005724,1.175053,0.50007
min,29.0,94.0,126.0,71.0,0.0,0.0
25%,48.0,120.0,211.0,132.0,0.0,0.0
50%,56.0,130.0,240.0,152.0,0.8,1.0
75%,61.0,140.0,275.0,166.0,1.8,1.0
max,77.0,200.0,564.0,202.0,6.2,1.0


In [62]:
# Checking the distribution of target variable 
heart_data['target'].value_counts()

target
1    526
0    499
Name: count, dtype: int64

In [63]:
# Checking the distribution of target variable 
heart_data['chest_pain_type'].value_counts()

chest_pain_type
Typical angina      497
Non-anginal pain    284
Atypical angina     167
Asymptomatic         77
Name: count, dtype: int64

In [64]:
# Checking the distribution of target variable 
heart_data['fasting_blood_sugar'].value_counts()

fasting_blood_sugar
Lower than 120 mg/ml      872
Greater than 120 mg/ml    153
Name: count, dtype: int64

In [65]:
# Checking the distribution of target variable 
heart_data['rest_ecg'].value_counts()

rest_ecg
ST-T wave abnormality           513
Normal                          497
Left ventricular hypertrophy     15
Name: count, dtype: int64

In [66]:
# Checking the distribution of target variable 
heart_data['exercise_induced_angina'].value_counts()

exercise_induced_angina
No     680
Yes    345
Name: count, dtype: int64

In [67]:
# Checking the distribution of target variable 
heart_data['vessels_colored_by_flourosopy'].value_counts()

vessels_colored_by_flourosopy
Zero     578
One      226
Two      134
Three     69
Four      18
Name: count, dtype: int64

In [68]:
# Checking the distribution of target variable 
heart_data['thalassemia'].value_counts()

thalassemia
Fixed Defect         544
Reversable Defect    410
Normal                64
No                     7
Name: count, dtype: int64

In [69]:
# Checking the distribution of target variable 
heart_data['slope'].value_counts()

slope
Flat           482
Downsloping    469
Upsloping       74
Name: count, dtype: int64


**Label Enconding**

In [70]:
pd.set_option('future.no_silent_downcasting', True)


# encoding categorical features with numerical values 
heart_data.replace({'chest_pain_type':{'Typical angina':0, 'Non-anginal pain':1, 'Atypical angina':2, 'Asymptomatic':3}}, inplace=True)
heart_data.replace({'fasting_blood_sugar':{'Lower than 120 mg/ml':0, 'Greater than 120 mg/ml':1}},  inplace=True)
heart_data.replace({'rest_ecg':{'ST-T wave abnormality':0, 'Normal':1, 'Left ventricular hypertrophy':2}},  inplace=True)
heart_data.replace({'exercise_induced_angina':{'No':0, 'Yes':1}},  inplace=True)
heart_data.replace({'vessels_colored_by_flourosopy':{'Zero':0, 'One':1, 'Two':2, 'Three':3, 'Four':4}},  inplace=True)
heart_data.replace({'thalassemia':{'Fixed Defect':0, 'Reversable Defect':1, 'Normal':2, 'No':3}},  inplace=True)
heart_data.replace({'slope':{'Flat':0, 'Downsloping':1, 'Upsloping':2}},  inplace=True)
heart_data.replace({'sex':{'Male':0, 'Female':1}},  inplace=True)


In [71]:
heart_data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,0,0,125,212,0,0,168,0,1.0,1,2,1,0
1,53,0,0,140,203,1,1,155,1,3.1,2,0,1,0
2,70,0,0,145,174,0,0,125,1,2.6,2,0,1,0
3,61,0,0,148,203,0,0,161,0,0.0,1,1,1,0
4,62,1,0,138,294,1,0,106,0,1.9,0,3,0,0



**Seperating the Features and Target**

In [72]:
X = heart_data.drop(columns='target', axis=1)
Y = heart_data['target']

In [73]:
print(X)

      age sex chest_pain_type  resting_blood_pressure  ...  oldpeak slope vessels_colored_by_flourosopy  thalassemia
0      52   0               0                     125  ...      1.0     1                             2            1
1      53   0               0                     140  ...      3.1     2                             0            1
2      70   0               0                     145  ...      2.6     2                             0            1
3      61   0               0                     148  ...      0.0     1                             1            1
4      62   1               0                     138  ...      1.9     0                             3            0
...   ...  ..             ...                     ...  ...      ...   ...                           ...          ...
1020   59   0               2                     140  ...      0.0     1                             0            0
1021   60   0               0                     125  ...      

In [74]:
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64



**Seperate data into Training data and Test data**

In [75]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, stratify=Y, random_state=13)

In [76]:
print(X.shape, X_train.shape, X_test.shape)

(1025, 13) (871, 13) (154, 13)


#### Model Training: LogisticRegression

In [77]:
model = LogisticRegression()

In [78]:
model.fit(X_train, Y_train)

**Model Evaluation**

In [79]:
# accuracy on training data 
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [80]:
print('Accuracy on training data :', training_data_accuracy)

Accuracy on training data : 0.8404133180252583


In [81]:
# accuracy on test data 
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [82]:
print('Accuracy on test data :', test_data_accuracy)

Accuracy on test data : 0.8441558441558441


#### Building Predictive System

In [None]:
input_data = ()

# change the input data to numpy array 
input_data = np.asarray(input_data)

# reshape the numpy array for prediction on one instance 
input_data = input_data.reshape(1, -1)

prediction = model.predict(input_data)

if (prediction[0] == 0):
    print('The Person does not have Heart Disease')
else:
    print('The Person has Heart Disease')
