In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data Collection & Processing

In [69]:
df = pd.read_csv(r'C:\Users\tuf\Downloads\heart_disease_data.csv')

In [70]:
# Checking first 5 rows
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [71]:
# Checking last 5 rows
df.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [68]:
# Checking number of rows and coumns
df.shape

(124, 1022)

In [5]:
# Getting information about the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [6]:
#Checking for any null/missing values
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

As we can see there are no missing values so it is a preprocessed data

In [7]:
# Getting stastical information about the data
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


Above we can see the information regarding the dataset such as the total amount of dataset, minimum and maximun values of each dataset and we can check the standard deviation as well

In [29]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [8]:
# Checking the distribution of Target Variable
# Target data tells us whether the patient has heart dieseas or not
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

1 represents Unhealthy Heart

0 represents Healthy Heart

Here we see that out of 303 values 165 have heart dieseas and 138 do not

# Splitting Features & Target

In [42]:
X = df.drop('target',axis =1)
X = X.values # using values to avoid 'valid feature name' warning
Y = df['target']
Y = Y.values # using values to avoid 'valid feature name' warning

In [43]:
# Checking the training data
print(X)

[[63.  1.  3. ...  0.  0.  1.]
 [37.  1.  2. ...  0.  0.  2.]
 [41.  0.  1. ...  2.  0.  2.]
 ...
 [68.  1.  0. ...  1.  2.  3.]
 [57.  1.  0. ...  1.  1.  3.]
 [57.  0.  1. ...  1.  1.  2.]]


Target column has been removed from this data

In [57]:
# Checking target data
print(Y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]


In [45]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state = 2)

# Here we are using train_test_split function from sklearn library to split data into training and testing data (X & Y)
# test_size defines the percentage of data we would like to split,
# Here we have used 20% (test_size=0.2) of the data as a test data and 80% for training,
# stratify function divides the outcome of test data(Y) evenly in trainig data to get a better accuracy
# random_state function is used to split the data randomly, we can choose any number of our choice and it will be divided accordingly

# Model Training using Logistic Regression

In [46]:
model = LogisticRegression()

In [47]:
# training Logistic Regression model with Training data
model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#  Model Evaluation

Accuracy Score

In [48]:
#Accuracy on Training Data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

#Printing the value to check the accuracy

print('Accuracy on Trainig data: ', training_data_accuracy)

Accuracy on Trainig data:  0.8512396694214877


Here we have accuracy score as 82.23% which is a good prediction as we have less number of dataset

In [49]:
# Accuracy on Test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print('Test data accuracy', test_data_accuracy)

Test data accuracy 0.819672131147541


Here the accuracy is 81.9%. An important thing here is to see the difference between the accuracy score of trainig and test data, it's always better to have less difference between them other it may result in 'overfitting' of the data

# Building a Predictive System 

In [74]:
# From the raw dataset I have chosen values from a random row excluding the Target dataset to predict the outcome

input_data = (35,0,0,138,183,0,1,182,0,1.4,2,0,2)

# Using Numpy to change the data to a numpy array
data_as_numpy = np.asarray(input_data)

# Reshaping the data as we are predicting for a categorial data
reshaped_data = data_as_numpy.reshape(1,-1)

prediction = model.predict(reshaped_data)
print(prediction)

if prediction[0] ==0:
    print('The patient does not have a heart diesease 🟢')
else:
    print('The patient has heart diesease 🔴')
    
# These 🟢 & 🔴 are emojis I copied from web

[1]
The patient has heart diesease 🔴


In [75]:
# Using a differentr set of data

input_data = (54,1,0,124,266,0,0,109,1,2.2,1,1,3)

data_as_numpy = np.asarray(input_data)


reshaped_data = data_as_numpy.reshape(1,-1)

prediction = model.predict(reshaped_data)
print(prediction)

if prediction[0] ==0:
    print('The patient does not have a heart diesease 🟢')
else:
    print('The patient has heart diesease 🔴')

[0]
The patient does not have a heart diesease 🟢
