In [3]:
import sys
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV

In [2]:
# Load the heart-disease records; the path is resolved relative to the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='heart.csv')

# A bare trailing expression lets the notebook render the frame as a table.
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### Looking at the Dataset

In [4]:
# Report the frame's (rows, columns) shape first.
print("Dimensions of the dataset : ", df.shape)

# Then print summary statistics; include='all' makes describe() cover the
# non-numeric columns as well as the numeric ones.
print("Features of the dataset :")
feature_summary = df.describe(include = 'all')
print(feature_summary)

Dimensions of the dataset :  (918, 12)
Features of the dataset :
               Age  Sex ChestPainType   RestingBP  Cholesterol   FastingBS   
count   918.000000  918           918  918.000000   918.000000  918.000000  \
unique         NaN    2             4         NaN          NaN         NaN   
top            NaN    M           ASY         NaN          NaN         NaN   
freq           NaN  725           496         NaN          NaN         NaN   
mean     53.510893  NaN           NaN  132.396514   198.799564    0.233115   
std       9.432617  NaN           NaN   18.514154   109.384145    0.423046   
min      28.000000  NaN           NaN    0.000000     0.000000    0.000000   
25%      47.000000  NaN           NaN  120.000000   173.250000    0.000000   
50%      54.000000  NaN           NaN  130.000000   223.000000    0.000000   
75%      60.000000  NaN           NaN  140.000000   267.000000    0.000000   
max      77.000000  NaN           NaN  200.000000   603.000000    1.000000   

### 1. Train and Test Linear Regression model

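Preprocessing: keep only the numeric feature columns, rescale them to the [0, 1] range, and one-hot encode the `HeartDisease` target.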

In [6]:
# Features: keep only the numeric columns. The categorical columns are dropped
# (not encoded) and the target column is removed from the inputs.
X = df.drop(columns=['HeartDisease', 'Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'])
y = df['HeartDisease']

# Min-max scale every feature column into [0, 1].
# NOTE(review): the scaler is fitted on all rows here; if a train/test split
# happens later, fit it on the training rows only to avoid leakage -- confirm.
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
# fit_transform returns a bare array, so re-attach the column names.
X = pd.DataFrame(data = X_rescaled, columns = X.columns)
print("Pre-processed data :")
print(X)

# One-hot encode the binary target: column 0 marks class 0, column 1 marks
# class 1. The categories are given as integers to match the label values
# instead of relying on an implicit string-to-number conversion.
categories = [[0, 1]]
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(categories=categories, sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(categories=categories, sparse=False)
# The encoder expects a 2-D input, hence the reshape to a single column.
y = encoder.fit_transform(y.values.reshape(-1,1))
print("Pre-processed class :")
print(y)

Pre-processed data :
          Age  RestingBP  Cholesterol  FastingBS     MaxHR   Oldpeak
0    0.244898       0.70     0.479270        0.0  0.788732  0.295455
1    0.428571       0.80     0.298507        0.0  0.676056  0.409091
2    0.183673       0.65     0.469320        0.0  0.267606  0.295455
3    0.408163       0.69     0.354892        0.0  0.338028  0.465909
4    0.530612       0.75     0.323383        0.0  0.436620  0.295455
..        ...        ...          ...        ...       ...       ...
913  0.346939       0.55     0.437811        0.0  0.507042  0.431818
914  0.816327       0.72     0.320066        1.0  0.570423  0.681818
915  0.591837       0.65     0.217247        0.0  0.387324  0.431818
916  0.591837       0.65     0.391376        0.0  0.802817  0.295455
917  0.204082       0.69     0.290216        0.0  0.795775  0.295455

[918 rows x 6 columns]
Pre-processed class :
[[1. 0.]
 [0. 1.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]


