Importing the dependencies

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


Data collection and procesing

In [46]:
#loading the csv data as dataframe

data = pd.read_csv('/HeartDisease_encoded.csv')


In [47]:
#description of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        457 non-null    int64  
 1   Age       457 non-null    int64  
 2   Sex       457 non-null    int64  
 3   cp        457 non-null    int64  
 4   trestbps  424 non-null    float64
 5   chol      431 non-null    float64
 6   fbs       403 non-null    float64
 7   restecg   455 non-null    float64
 8   thalach   424 non-null    float64
 9   exang     424 non-null    float64
 10  oldpeak   421 non-null    float64
 11  num       457 non-null    int64  
dtypes: float64(7), int64(5)
memory usage: 43.0 KB


In [48]:
#checking null values
data.isnull().sum()

ID           0
Age          0
Sex          0
cp           0
trestbps    33
chol        26
fbs         54
restecg      2
thalach     33
exang       33
oldpeak     36
num          0
dtype: int64

In [49]:
data_1 = data.dropna()

In [50]:
data_1.isnull().sum()

ID          0
Age         0
Sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
num         0
dtype: int64

In [51]:
#statistical measures of the data
data_1.describe()

Unnamed: 0,ID,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
count,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0
mean,206.087719,49.961988,0.777778,3.157895,132.371345,223.824561,0.125731,0.327485,134.874269,0.371345,0.653216,0.435673
std,128.248171,8.976428,0.416349,0.940259,18.188959,94.576777,0.332032,0.571364,24.478586,0.483872,0.921989,0.496571
min,1.0,28.0,0.0,1.0,92.0,0.0,0.0,0.0,69.0,0.0,-0.5,0.0
25%,99.25,43.0,1.0,2.0,120.0,196.0,0.0,0.0,118.0,0.0,0.0,0.0
50%,196.5,51.0,1.0,4.0,130.0,230.0,0.0,0.0,135.5,0.0,0.0,0.0
75%,289.75,56.0,1.0,4.0,140.0,274.0,0.0,1.0,150.0,1.0,1.0,1.0
max,457.0,75.0,1.0,4.0,200.0,603.0,1.0,2.0,190.0,1.0,5.0,1.0


In [52]:
#checking the distribution of target variable
data_1['num'].value_counts()

0    193
1    149
Name: num, dtype: int64

Feature engineering

In [53]:
# m = {'Male' : 1, 'Female' : 0}
# data_1['sex'] = data_1['sex'].str[0].str.lower().map(m)

In [54]:
X = data_1.drop(columns = 'num',axis = 1)
Y = data_1['num']

In [55]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 456
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        342 non-null    int64  
 1   Age       342 non-null    int64  
 2   Sex       342 non-null    int64  
 3   cp        342 non-null    int64  
 4   trestbps  342 non-null    float64
 5   chol      342 non-null    float64
 6   fbs       342 non-null    float64
 7   restecg   342 non-null    float64
 8   thalach   342 non-null    float64
 9   exang     342 non-null    float64
 10  oldpeak   342 non-null    float64
dtypes: float64(7), int64(4)
memory usage: 32.1 KB


In [56]:
#printing target column separately
print(Y)

0      0
1      0
3      0
4      0
5      0
      ..
450    0
451    0
452    1
453    1
456    1
Name: num, Length: 342, dtype: int64


Spliting the data into test set and train set

In [57]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2, stratify = Y,random_state = 2)

In [58]:
print(X.shape,X_train.shape,X_test.shape)

(342, 11) (273, 11) (69, 11)


Model Training

In [59]:
model = LogisticRegression()

In [60]:
#model traing
model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Model evaluation

In [61]:
#accuracy score
X_train_prediction = model.predict(X_train)
training_data_score = accuracy_score(X_train_prediction,Y_train)
print(training_data_score)

0.8644688644688645


In [63]:
#accuracy score
X_test_prediction = model.predict(X_test)
testing_data_score = accuracy_score(X_test_prediction,Y_test)
print(testing_data_score)

0.8260869565217391


Building a Predictive System

In [65]:
input_data = (2,29,1,2,120,243,0,0,160,0,0)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

[0]
The Person does not have a Heart Disease


  "X does not have valid feature names, but"
