In [1]:
import numpy as np
import pandas
import matplotlib.pyplot as plt
#Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
# Training and testing sets
from sklearn.model_selection import train_test_split
#Metrics
from sklearn.metrics import mean_absolute_error,accuracy_score, f1_score, confusion_matrix

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

### Read data (csv)

In [2]:
data = pandas.read_csv('heart_disease_data.csv')
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


### Stadistics

In [3]:
data['target'].describe()

count    303.000000
mean       0.544554
std        0.498835
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: target, dtype: float64

In [4]:
# target(1: have a heart disease, 0: no heart disease)
data['target'].value_counts() #165 do have a heart disease

1    165
0    138
Name: target, dtype: int64

### Correlation 

In [5]:
for col in data:
    print(col+": \t\t", data[col].corr(data['target']))

age: 		 -0.22543871587483727
sex: 		 -0.2809365755017666
cp: 		 0.4337982615068934
trestbps: 		 -0.14493112849775147
chol: 		 -0.08523910513756904
fbs: 		 -0.028045760272712827
restecg: 		 0.13722950287377336
thalach: 		 0.42174093381067435
exang: 		 -0.4367570833533018
oldpeak: 		 -0.4306960016873684
slope: 		 0.34587707824172526
ca: 		 -0.3917239923512519
thal: 		 -0.3440292680383098
target: 		 1.0


### Train (70% train, 30% test)

In [6]:
X = data.values[:,:-1] #other columns
Y = data.values[:, -1] #target
print("*X shape: ",X.shape," *Y shape: ",Y.shape)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.30, random_state=0)
print('*Xtrain: ',Xtrain.shape,'  *Xtest: ',Ytest.shape)

*X shape:  (303, 13)  *Y shape:  (303,)
*Xtrain:  (212, 13)   *Xtest:  (91,)


### Logistic Regression
#####  -Input: continuous
##### -Output: discrete

In [7]:
model = LogisticRegression(random_state=0, max_iter=1000)
model.fit(Xtrain,Ytrain)
Ypred = model.predict(Xtest)
print("test: ",Ytest)
print("\npred: ",Ypred)
print("-----------------------------------------------------------------------------------------------------------------------------")
print('--> Accuracy:', accuracy_score(Ytest,Ypred))
print('--> F1:', f1_score(Ytest,Ypred)) 
# Confusion Matrix
print('--> Confusion Matrix:')
m = confusion_matrix(Ytest,Ypred)
m = m.transpose()
m = np.round( (m/np.sum(m,axis=0))*100, 1).transpose() #para que salga en porcentaje
df = pandas.DataFrame(m)
df

test:  [0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0.
 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0.
 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1.
 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.]

pred:  [0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0.
 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1.
 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0.]
-----------------------------------------------------------------------------------------------------------------------------
--> Accuracy: 0.8131868131868132
--> F1: 0.8316831683168316
--> Confusion Matrix:


Unnamed: 0,0,1
0,72.7,27.3
1,10.6,89.4


### Predict either a pacient has or not a heart disease

##### INPUT VALUES:
    -age
    -sex (1 = male; 0 = female)
    -chest pain type (0,1,2,3)
    -resting blood pressure in mm Hg
    -serum cholestoral in mg/dl
    -fasting blood sugar > 120 mg/dl
    -resting electrocardiographic results (values 0,1,2)
    -maximum heart rate achieved
    -exercise induced angina (1 = yes; 0 = no)
    -oldpeak = ST depression induced by exercise relative to rest
    -the slope of the peak exercise ST segment
    -number of major vessels (0-3) colored by flourosopy
    -thal: 0 = normal; 1 = fixed defect; 2 = reversable defect
    
    # Examples:
        -patient who has a heart disease: 41 0 1 130 204 0 0 172 0 1.4 2 0 2
        -patient who hasn't a heart disease: 62 0 0 140 268 0 0 160 0 3.6 0 2 2

In [8]:
# Input data 
#lst = (41,0,1,130,204,0,0,172,0,1.4,2,0,2) #Disease
#lst = (62,0,0,140,268,0,0,160,0,3.6,0,2,2) #NO disease

# User input
lst = input('Input data: ').split()
arr = np.asarray(lst,dtype=float)
reshaped = arr.reshape(1,-1)

# Prediction
pred = model.predict(reshaped)
print("--> Logistic prediction: ",pred[0],"\n")

if(pred[0] == 0):
    print(bcolors.OKBLUE+ "\t ** The patient does not have any heart disease :D **" + bcolors.ENDC,"**")
else:
    print(bcolors.FAIL+ "\t ** The patient has a heart disease :( **" + bcolors.ENDC)


--> Logistic prediction:  0.0 

[94m	 ** The patient does not have any heart disease :D **[0m **
