<a href="https://colab.research.google.com/github/shawn160803/Heart-Disease-Predictor/blob/main/Heart_Disease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Processing

In [None]:
# Read the heart-disease CSV into a pandas DataFrame.
# NOTE(review): the path looks like a Colab working-directory location — adjust when running elsewhere.
data_path = '/content/data.csv'
heart_data = pd.read_csv(data_path)

In [None]:
# Preview the first five records to sanity-check the columns and value formats
heart_data.head(5)


Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,1.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,1.0
4,54.0,1.0,3.0,150.0,195.0,0.0,0.0,122.0,0.0,0.0,1.0,0.0


In [None]:
# Preview the final five records to confirm the file was read through to the end
heart_data.tail(5)

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
1185,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,1.0
1186,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,1.0
1187,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0
1188,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0
1189,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0


In [None]:
# Dataset dimensions, displayed as (rows, columns)
n_rows, n_cols = heart_data.shape
(n_rows, n_cols)

(1190, 12)

In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   float64
 1   sex                  1190 non-null   float64
 2   chest_pain_type      1190 non-null   float64
 3   resting_bp_s         1190 non-null   float64
 4   cholesterol          1190 non-null   float64
 5   fasting_blood_sugar  1190 non-null   float64
 6   resting_ecg          1190 non-null   float64
 7   max_heart_rate       1190 non-null   float64
 8   exercise_angina      1190 non-null   float64
 9   oldpeak              1190 non-null   float64
 10  st_slope             1190 non-null   float64
 11  target               1190 non-null   float64
dtypes: float64(12)
memory usage: 111.7 KB


In [None]:
# Count missing (NaN) entries per column
heart_data.isna().sum()

Unnamed: 0,0
age,0
sex,0
chest_pain_type,0
resting_bp_s,0
cholesterol,0
fasting_blood_sugar,0
resting_ecg,0
max_heart_rate,0
exercise_angina,0
oldpeak,0


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every column.
# NOTE(review): the recorded output shows a minimum of 0.0 for resting_bp_s and
# cholesterol — presumably placeholders for missing measurements; verify before
# trusting these columns as-is.
heart_data.describe()

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,53.720168,0.763866,3.232773,132.153782,210.363866,0.213445,0.698319,139.732773,0.387395,0.922773,1.62437,0.528571
std,9.358203,0.424884,0.93548,18.368823,101.420489,0.409912,0.870359,25.517636,0.48736,1.086337,0.610459,0.499393
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,3.0,120.0,188.0,0.0,0.0,121.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,4.0,130.0,229.0,0.0,0.0,140.5,0.0,0.6,2.0,1.0
75%,60.0,1.0,4.0,140.0,269.75,0.0,2.0,160.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [None]:
# How many samples fall into each class of the label column
target_counts = heart_data['target'].value_counts()
target_counts


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1.0,629
0.0,561


- `target` = 1 --> Defective Heart (heart disease present)
- `target` = 0 --> Healthy Heart

Splitting the Features and Target

In [None]:
# Feature matrix: every column except the label.
# `columns=` already names the axis, so the redundant `axis=1` is dropped.
X = heart_data.drop(columns='target')
# Label vector: the `target` column (1 = defective heart, 0 = healthy heart).
Y = heart_data['target']

In [None]:
# Display the feature matrix; a bare last expression uses the notebook's rich
# (table) rendering instead of the plain-text dump that print() produces.
X

       age  sex  chest_pain_type  resting_bp_s  ...  max_heart_rate  exercise_angina  oldpeak  st_slope
0     40.0  1.0              2.0         140.0  ...           172.0              0.0      0.0       1.0
1     49.0  0.0              3.0         160.0  ...           156.0              0.0      1.0       2.0
2     37.0  1.0              2.0         130.0  ...            98.0              0.0      0.0       1.0
3     48.0  0.0              4.0         138.0  ...           108.0              1.0      1.5       2.0
4     54.0  1.0              3.0         150.0  ...           122.0              0.0      0.0       1.0
...    ...  ...              ...           ...  ...             ...              ...      ...       ...
1185  45.0  1.0              1.0         110.0  ...           132.0              0.0      1.2       2.0
1186  68.0  1.0              4.0         144.0  ...           141.0              0.0      3.4       2.0
1187  57.0  1.0              4.0         130.0  ...           11

In [None]:
# Display the label vector; a bare last expression uses the notebook's rich
# rendering instead of the plain-text dump that print() produces.
Y

0       0.0
1       1.0
2       0.0
3       1.0
4       0.0
       ... 
1185    1.0
1186    1.0
1187    1.0
1188    1.0
1189    0.0
Name: target, Length: 1190, dtype: float64


Splitting the Data into Training Data and Test Data

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [None]:
# Shapes of the full, training and test feature matrices, in that order
print(*(frame.shape for frame in (X, X_train, X_test)))

(1190, 11) (952, 11) (238, 11)


Model Training

Logistic Regression

In [None]:
# The default cap of 100 solver iterations is not enough for these unscaled
# features (the fit emitted "TOTAL NO. of ITERATIONS REACHED LIMIT"), so raise
# the cap. The solver still stops as soon as it converges. If the warning
# persists, raise it further or standardize the features before fitting.
model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split only; the test split stays unseen
model.fit(X=X_train, y=Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [None]:
# Accuracy on the training data.
# accuracy_score expects (y_true, y_pred); the score is the same either way,
# but the documented order keeps this consistent with other sklearn metrics
# where argument order does matter.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the fraction of training samples classified correctly
train_label = 'Accuracy on Training data : '
print(train_label, training_data_accuracy)

Accuracy on Training data :  0.8266806722689075


In [None]:
# Accuracy on the held-out test data.
# accuracy_score expects (y_true, y_pred); the score is the same either way,
# but the documented order keeps this consistent with other sklearn metrics
# where argument order does matter.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the fraction of held-out test samples classified correctly
test_label = 'Accuracy on Test data : '
print(test_label, test_data_accuracy)

Accuracy on Test data :  0.8109243697478992


Building A Predictive System

In [None]:
# One patient's measurements, listed in the same order as the training feature
# columns (these values match the first row shown by heart_data.head()).
input_data = (40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,1.0)

# Wrap the single sample in a one-row DataFrame that carries the training
# column names. The model was fitted on a DataFrame, so passing a bare NumPy
# array would drop the feature names and trigger sklearn's feature-name warning.
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = model.predict(input_frame)
print(prediction)

# Class 0 is the healthy label; anything else is reported as heart disease
if prediction[0] == 0:
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

[0.]
The Person does not have a Heart Disease


