In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

import seaborn as sb
sb.set(style='white', color_codes=True)
sb.set(font_scale=1.5)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('C:\\Users\\hp\\Documents\\DATA SCIENCE\\data_sets\\Heart_Disease_Prediction.csv')

In [3]:
data.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [4]:
data.isna().sum()

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

In [5]:
data.dtypes

Age                          int64
Sex                          int64
Chest pain type              int64
BP                           int64
Cholesterol                  int64
FBS over 120                 int64
EKG results                  int64
Max HR                       int64
Exercise angina              int64
ST depression              float64
Slope of ST                  int64
Number of vessels fluro      int64
Thallium                     int64
Heart Disease               object
dtype: object

In [6]:
data['ST depression']=data['ST depression'].astype(int) 

In [7]:
data.dtypes

Age                         int64
Sex                         int64
Chest pain type             int64
BP                          int64
Cholesterol                 int64
FBS over 120                int64
EKG results                 int64
Max HR                      int64
Exercise angina             int64
ST depression               int32
Slope of ST                 int64
Number of vessels fluro     int64
Thallium                    int64
Heart Disease              object
dtype: object

In [8]:
data.drop(['Number of vessels fluro','Slope of ST','Max HR'], axis=1, inplace=True)

In [9]:
data.dtypes

Age                 int64
Sex                 int64
Chest pain type     int64
BP                  int64
Cholesterol         int64
FBS over 120        int64
EKG results         int64
Exercise angina     int64
ST depression       int32
Thallium            int64
Heart Disease      object
dtype: object

In [10]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [11]:
data['heart_disease']=le.fit_transform(data['Heart Disease'])

In [12]:
data.head(2)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Exercise angina,ST depression,Thallium,Heart Disease,heart_disease
0,70,1,4,130,322,0,2,0,2,3,Presence,1
1,67,0,3,115,564,0,2,0,1,7,Absence,0


In [13]:
data.drop(['Heart Disease'], axis=1, inplace=True)

In [14]:
data.dtypes

Age                int64
Sex                int64
Chest pain type    int64
BP                 int64
Cholesterol        int64
FBS over 120       int64
EKG results        int64
Exercise angina    int64
ST depression      int32
Thallium           int64
heart_disease      int32
dtype: object

In [15]:
#we split the data in to featyres and target variables
features=['Age','Sex','Chest pain type','BP','Cholesterol','FBS over 120','EKG results','Exercise angina','ST depression','Thallium']
x=data[features].values
y=data.heart_disease

In [16]:
#we create the train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=1)

In [17]:
#we instantiate the logistic regression
logreg=LogisticRegression()

In [18]:
logreg.fit(xtrain,ytrain)

LogisticRegression()

In [19]:
y_predict=logreg.predict(xtest)

In [20]:
y_predict

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0])

In [21]:
logreg.intercept_

array([-4.8487426])

In [22]:
logreg.coef_

array([[ 0.01344731,  0.62790083,  0.48080326, -0.01207283, -0.00465368,
        -0.9032346 ,  0.83417046,  0.9798036 ,  0.37562656,  0.65691388]])

In [23]:
#compute the confusion matrix to evaluate the accuracy of classification
metrics.confusion_matrix(ytest,y_predict)

array([[36,  6],
       [13, 26]], dtype=int64)

In [24]:
metrics.accuracy_score(ytest,y_predict)   #the prediction is accurate at 77%

0.7654320987654321

In [25]:
len(xtest)

81

In [26]:
print(classification_report(ytest,y_predict))    # Among 81 patients, 42 will be negative, 39 will be positive

              precision    recall  f1-score   support

           0       0.73      0.86      0.79        42
           1       0.81      0.67      0.73        39

    accuracy                           0.77        81
   macro avg       0.77      0.76      0.76        81
weighted avg       0.77      0.77      0.76        81



In [27]:
#we predict the probability of survival using the used features
logreg.predict_proba(xtest)

array([[0.79108516, 0.20891484],
       [0.05827332, 0.94172668],
       [0.98186829, 0.01813171],
       [0.01578347, 0.98421653],
       [0.12485499, 0.87514501],
       [0.87524578, 0.12475422],
       [0.05237414, 0.94762586],
       [0.91990315, 0.08009685],
       [0.62693205, 0.37306795],
       [0.07115765, 0.92884235],
       [0.04963948, 0.95036052],
       [0.15392446, 0.84607554],
       [0.03401289, 0.96598711],
       [0.72586205, 0.27413795],
       [0.5432297 , 0.4567703 ],
       [0.41696417, 0.58303583],
       [0.86168316, 0.13831684],
       [0.34898725, 0.65101275],
       [0.13676287, 0.86323713],
       [0.98674789, 0.01325211],
       [0.67507245, 0.32492755],
       [0.97700407, 0.02299593],
       [0.73240273, 0.26759727],
       [0.93401307, 0.06598693],
       [0.85945489, 0.14054511],
       [0.5181296 , 0.4818704 ],
       [0.1545294 , 0.8454706 ],
       [0.83507303, 0.16492697],
       [0.71256466, 0.28743534],
       [0.66296517, 0.33703483],
       [0.