### Imports

In [296]:
import pandas as pd
import numpy as np
from scipy import stats as spstats

from sklearn import model_selection as skms
from sklearn import preprocessing as skpp
from sklearn import metrics as skmet

from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# import plotly.graph_objects as go
import plotly.express as px
# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline


### Load csv data into pandas dataframe
#### We want our actual feature data to drop the output column

In [265]:
heart_data = pd.read_csv('heart.csv')
heart_feature = heart_data.drop('output', axis = 1)

### Retrieve correlation between columns

In [266]:
heart_data.corr(method = 'pearson')

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
age,1.0,-0.098447,-0.068653,0.279351,0.213678,0.121308,-0.116211,-0.398522,0.096801,0.210013,-0.168814,0.276326,0.068001,-0.225439
sex,-0.098447,1.0,-0.049353,-0.056769,-0.197912,0.045032,-0.058196,-0.04402,0.141664,0.096093,-0.030711,0.118261,0.210041,-0.280937
cp,-0.068653,-0.049353,1.0,0.047608,-0.076904,0.094444,0.044421,0.295762,-0.39428,-0.14923,0.119717,-0.181053,-0.161736,0.433798
trtbps,0.279351,-0.056769,0.047608,1.0,0.123174,0.177531,-0.114103,-0.046698,0.067616,0.193216,-0.121475,0.101389,0.06221,-0.144931
chol,0.213678,-0.197912,-0.076904,0.123174,1.0,0.013294,-0.15104,-0.00994,0.067023,0.053952,-0.004038,0.070511,0.098803,-0.085239
fbs,0.121308,0.045032,0.094444,0.177531,0.013294,1.0,-0.084189,-0.008567,0.025665,0.005747,-0.059894,0.137979,-0.032019,-0.028046
restecg,-0.116211,-0.058196,0.044421,-0.114103,-0.15104,-0.084189,1.0,0.044123,-0.070733,-0.05877,0.093045,-0.072042,-0.011981,0.13723
thalachh,-0.398522,-0.04402,0.295762,-0.046698,-0.00994,-0.008567,0.044123,1.0,-0.378812,-0.344187,0.386784,-0.213177,-0.096439,0.421741
exng,0.096801,0.141664,-0.39428,0.067616,0.067023,0.025665,-0.070733,-0.378812,1.0,0.288223,-0.257748,0.115739,0.206754,-0.436757
oldpeak,0.210013,0.096093,-0.14923,0.193216,0.053952,0.005747,-0.05877,-0.344187,0.288223,1.0,-0.577537,0.222682,0.210244,-0.430696


### Since it's hard to understand what some of these features are, we add descriptions to each feature
1. age - age in years

2. sex - sex (1 = male; 0 = female)

3. cp - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 0 = asymptomatic)

4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)

5. chol - serum cholestoral in mg/dl

6. fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)

7. restecg - resting electrocardiographic results (1 = normal; 2 = having ST-T wave abnormality; 0 = hypertrophy)

8. thalach - maximum heart rate achieved

9. exang - exercise induced angina (1 = yes; 0 = no)

10. oldpeak - ST depression induced by exercise relative to rest

11. slope - the slope of the peak exercise ST segment (2 = upsloping; 1 = flat; 0 = downsloping)

12. ca - number of major vessels (0-3) colored by flourosopy

13. thal - 2 = normal; 1 = fixed defect; 3 = reversable defect

14. num - the predicted attribute - diagnosis of heart disease (angiographic disease status) (Value 0 = < diameter narrowing; Value 1 = > 50% diameter narrowing) 

### Perform a description of the data

In [267]:
heart_data.describe()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [268]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


### The data looks clean (no nulls and no object as the dtype)
So we plot the outputs

In [269]:
px.histogram(heart_data, x = 'output', color = 'output', template = 'plotly_dark')

In [270]:
px.histogram(heart_data, x = 'age', color = 'output', template = 'plotly_dark')

In [271]:
px.histogram(heart_data, x = 'chol', color = 'output', template = 'plotly_dark')

### Heatmap of the correlation (We do not necessarily perform feature filtering based on correlation)

In [272]:
# FIGURE OUT HOW TO DO THIS IN PLOTLY BECAUSE PLOTLY IS BEST YESSIR!
px.imshow(abs(heart_data.corr()))


### Box plot the data

In [273]:
for key in heart_feature.columns:
    fig = px.box(heart_feature, y = key)
    fig.show()

### We can also specifically find which features have outliers
We show the IQR for each feature  
And find the outliers using IQR
  

Some of these are unreasonable, as the initial data is boolean-like, so naturally there really are no outliers

In [274]:
for key in heart_feature.columns:
    fq, tq = np.quantile(heart_feature[key], [0.25, 0.75])
    iqr = tq - fq
    outliers = heart_feature[(heart_feature[key] < fq - 1.5 * iqr) | (heart_feature[key] > tq + 1.5 * iqr)]
    if (not outliers.empty):
        print(f'{key}:')
        display(outliers)


trtbps:


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3
110,64,0,0,180,325,0,1,154,1,0.0,2,0,2
203,68,1,2,180,274,1,0,150,1,1.6,1,0,3
223,56,0,0,200,288,1,0,133,1,4.0,0,2,3
241,59,0,0,174,249,0,1,143,1,0.0,1,0,2
248,54,1,1,192,283,0,0,195,0,0.0,2,1,3
260,66,0,0,178,228,1,1,165,1,1.0,1,2,3
266,55,0,0,180,327,0,2,117,1,3.4,1,0,2


chol:


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
28,65,0,2,140,417,1,0,157,0,0.8,2,1,2
85,67,0,2,115,564,0,0,160,0,1.6,1,0,3
96,62,0,0,140,394,0,0,157,0,1.2,1,0,2
220,63,0,0,150,407,0,0,154,0,4.0,1,3,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3


fbs:


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3
14,58,0,3,150,283,1,0,162,0,1.0,2,0,2
23,61,1,2,150,243,1,1,137,1,1.0,1,0,2
26,59,1,2,150,212,1,1,157,0,1.6,2,0,2
28,65,0,2,140,417,1,0,157,0,0.8,2,1,2
29,53,1,2,130,197,1,0,152,0,1.2,0,0,2
36,54,0,2,135,304,1,1,170,0,0.0,2,0,2
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2
64,58,1,2,140,211,1,0,165,0,0.0,2,0,2


thalachh:


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2


oldpeak:


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3
204,62,0,0,160,164,0,0,145,0,6.2,0,3,3
221,55,1,0,140,217,0,1,111,1,5.6,0,0,3
250,51,1,0,140,298,0,1,122,1,4.2,1,3,3
291,58,1,0,114,318,0,2,140,0,4.4,0,3,1


caa:


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
52,62,1,2,130,231,0,1,146,0,1.8,1,3,3
92,52,1,2,138,223,0,1,169,0,0.0,2,4,2
97,52,1,0,108,233,1,1,147,0,0.1,2,3,3
99,53,1,2,130,246,1,0,173,0,0.0,2,3,2
158,58,1,1,125,220,0,1,144,0,0.4,1,4,3
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2
165,67,1,0,160,286,0,0,108,1,1.5,1,3,2
181,65,0,0,150,225,0,0,114,0,1.0,1,3,3
191,58,1,0,128,216,0,0,131,1,2.2,1,3,3


thall:


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
48,53,0,2,128,216,0,0,115,0,0.0,2,0,0
281,52,1,0,128,204,1,1,156,1,1.0,1,0,0


We follow the IQR findings by using the z-score method

In [275]:
for key in heart_feature.columns:
    outliers = heart_feature[spstats.zscore(heart_feature[key]) > 3]
    if (not outliers.empty):
        print(f'{key}:\n')
        display(outliers)

trtbps:



Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
223,56,0,0,200,288,1,0,133,1,4.0,0,2,3
248,54,1,1,192,283,0,0,195,0,0.0,2,1,3


chol:



Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
28,65,0,2,140,417,1,0,157,0,0.8,2,1,2
85,67,0,2,115,564,0,0,160,0,1.6,1,0,3
220,63,0,0,150,407,0,0,154,0,4.0,1,3,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3


oldpeak:



Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
204,62,0,0,160,164,0,0,145,0,6.2,0,3,3
221,55,1,0,140,217,0,1,111,1,5.6,0,0,3


caa:



Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
92,52,1,2,138,223,0,1,169,0,0.0,2,4,2
158,58,1,1,125,220,0,1,144,0,0.4,1,4,3
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2
251,43,1,0,132,247,1,0,143,1,0.1,1,4,3


### Since we don't really have a normal distribution, standard scaler does not seem to be the right choice.
Instead we will use the minmaxscaler, or the maxabsscaler

In [276]:
# Create training, validation, and testing datasets
split_ratios = {
    'train': 0.70,
    'validation': 0.2,
    'test': 0.1
}

train, test = skms.train_test_split(heart_data, train_size = split_ratios['train'])
train = train.reset_index(drop = True)
val, test = skms.train_test_split(test, train_size = split_ratios['validation'] / (1 - split_ratios['train']))
val = val.reset_index(drop = True)
test = test.reset_index(drop = True)

In [277]:
display(train)
display(val)
display(test)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,60,1,0,117,230,1,1,160,1,1.4,2,2,3,0
1,41,1,0,110,172,0,0,158,0,0.0,2,0,3,0
2,61,1,3,134,234,0,1,145,0,2.6,1,2,2,0
3,66,1,0,160,228,0,0,138,0,2.3,2,0,1,1
4,66,0,3,150,226,0,1,114,0,2.6,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,62,0,0,160,164,0,0,145,0,6.2,0,3,3,0
208,39,1,0,118,219,0,1,140,0,1.2,1,0,3,0
209,34,0,1,118,210,0,1,192,0,0.7,2,0,2,1
210,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,41,1,1,135,203,0,1,132,0,0.0,1,0,1,1
1,62,1,2,130,231,0,1,146,0,1.8,1,3,3,1
2,39,0,2,94,199,0,1,179,0,0.0,2,0,2,1
3,54,1,2,150,232,0,0,165,0,1.6,2,0,3,1
4,58,0,0,170,225,1,0,146,1,2.8,1,2,1,0
5,53,0,0,130,264,0,0,143,0,0.4,1,0,2,1
6,67,0,0,106,223,0,1,142,0,0.3,2,2,2,1
7,48,1,1,110,229,0,1,168,0,1.0,0,0,3,0
8,66,1,0,120,302,0,0,151,0,0.4,1,0,2,1
9,50,1,0,144,200,0,0,126,1,0.9,1,0,3,0


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,42,0,2,120,209,0,1,173,0,0.0,1,0,2,1
1,67,0,2,115,564,0,0,160,0,1.6,1,0,3,1
2,57,1,0,150,276,0,0,112,1,0.6,1,1,1,0
3,65,1,0,120,177,0,1,140,0,0.4,2,0,3,1
4,67,1,0,160,286,0,0,108,1,1.5,1,3,2,0
5,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
6,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
7,38,1,3,120,231,0,1,182,1,3.8,1,0,3,0
8,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
9,69,1,2,140,254,0,0,146,0,2.0,1,3,3,0


In [278]:
# Run minmaxscaler now
mmscaler = skpp.MinMaxScaler()
train= pd.DataFrame(mmscaler.fit_transform(train), columns = train.columns)
val = pd.DataFrame(mmscaler.transform(val), columns = val.columns)
test = pd.DataFrame(mmscaler.transform(test), columns = test.columns)

display(train)
display(val)
display(test)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,0.604651,1.0,0.000000,0.216981,0.367491,1.0,0.5,0.717742,1.0,0.225806,1.0,0.50,1.000000,0.0
1,0.162791,1.0,0.000000,0.150943,0.162544,0.0,0.0,0.701613,0.0,0.000000,1.0,0.00,1.000000,0.0
2,0.627907,1.0,1.000000,0.377358,0.381625,0.0,0.5,0.596774,0.0,0.419355,0.5,0.50,0.666667,0.0
3,0.744186,1.0,0.000000,0.622642,0.360424,0.0,0.0,0.540323,0.0,0.370968,1.0,0.00,0.333333,1.0
4,0.744186,0.0,1.000000,0.528302,0.353357,0.0,0.5,0.346774,0.0,0.419355,0.0,0.00,0.666667,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,0.651163,0.0,0.000000,0.622642,0.134276,0.0,0.0,0.596774,0.0,1.000000,0.0,0.75,1.000000,0.0
208,0.116279,1.0,0.000000,0.226415,0.328622,0.0,0.5,0.556452,0.0,0.193548,0.5,0.00,1.000000,0.0
209,0.000000,0.0,0.333333,0.226415,0.296820,0.0,0.5,0.975806,0.0,0.112903,1.0,0.00,0.666667,1.0
210,0.837209,1.0,0.000000,0.481132,0.169611,0.0,0.5,0.435484,1.0,0.419355,0.0,0.00,1.000000,0.0


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,0.162791,1.0,0.333333,0.386792,0.272085,0.0,0.5,0.491935,0.0,0.0,0.5,0.0,0.333333,1.0
1,0.651163,1.0,0.666667,0.339623,0.371025,0.0,0.5,0.604839,0.0,0.290323,0.5,0.75,1.0,1.0
2,0.116279,0.0,0.666667,0.0,0.257951,0.0,0.5,0.870968,0.0,0.0,1.0,0.0,0.666667,1.0
3,0.465116,1.0,0.666667,0.528302,0.374558,0.0,0.0,0.758065,0.0,0.258065,1.0,0.0,1.0,1.0
4,0.55814,0.0,0.0,0.716981,0.349823,1.0,0.0,0.604839,1.0,0.451613,0.5,0.5,0.333333,0.0
5,0.44186,0.0,0.0,0.339623,0.487633,0.0,0.0,0.580645,0.0,0.064516,0.5,0.0,0.666667,1.0
6,0.767442,0.0,0.0,0.113208,0.342756,0.0,0.5,0.572581,0.0,0.048387,1.0,0.5,0.666667,1.0
7,0.325581,1.0,0.333333,0.150943,0.363958,0.0,0.5,0.782258,0.0,0.16129,0.0,0.0,1.0,0.0
8,0.744186,1.0,0.0,0.245283,0.621908,0.0,0.0,0.645161,0.0,0.064516,0.5,0.0,0.666667,1.0
9,0.372093,1.0,0.0,0.471698,0.261484,0.0,0.0,0.443548,1.0,0.145161,0.5,0.0,1.0,0.0


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,0.186047,0.0,0.666667,0.245283,0.293286,0.0,0.5,0.822581,0.0,0.0,0.5,0.0,0.666667,1.0
1,0.767442,0.0,0.666667,0.198113,1.547703,0.0,0.0,0.717742,0.0,0.258065,0.5,0.0,1.0,1.0
2,0.534884,1.0,0.0,0.528302,0.530035,0.0,0.0,0.330645,1.0,0.096774,0.5,0.25,0.333333,0.0
3,0.72093,1.0,0.0,0.245283,0.180212,0.0,0.5,0.556452,0.0,0.064516,1.0,0.0,1.0,1.0
4,0.767442,1.0,0.0,0.622642,0.565371,0.0,0.0,0.298387,1.0,0.241935,0.5,0.75,0.666667,0.0
5,0.488372,1.0,0.0,0.622642,0.575972,0.0,0.0,0.596774,1.0,0.129032,0.5,0.25,1.0,0.0
6,0.162791,0.0,0.333333,0.339623,0.275618,0.0,0.0,0.814516,0.0,0.225806,1.0,0.0,0.666667,1.0
7,0.093023,1.0,1.0,0.245283,0.371025,0.0,0.5,0.895161,1.0,0.612903,0.5,0.0,1.0,0.0
8,0.534884,0.0,0.0,0.433962,0.40636,0.0,0.5,0.419355,1.0,0.032258,0.5,0.0,1.0,0.0
9,0.813953,1.0,0.666667,0.433962,0.452297,0.0,0.0,0.604839,0.0,0.322581,0.5,0.75,1.0,0.0


In [279]:
# Split data into x and y (input and output)
x_train = train.iloc[:, :-1]
y_train = train['output']
display(x_train)
display(y_train)

x_val = val.iloc[:, :-1]
y_val = val['output']

x_test = test.iloc[:, :-1]
y_test = test['output']

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,0.604651,1.0,0.000000,0.216981,0.367491,1.0,0.5,0.717742,1.0,0.225806,1.0,0.50,1.000000
1,0.162791,1.0,0.000000,0.150943,0.162544,0.0,0.0,0.701613,0.0,0.000000,1.0,0.00,1.000000
2,0.627907,1.0,1.000000,0.377358,0.381625,0.0,0.5,0.596774,0.0,0.419355,0.5,0.50,0.666667
3,0.744186,1.0,0.000000,0.622642,0.360424,0.0,0.0,0.540323,0.0,0.370968,1.0,0.00,0.333333
4,0.744186,0.0,1.000000,0.528302,0.353357,0.0,0.5,0.346774,0.0,0.419355,0.0,0.00,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,0.651163,0.0,0.000000,0.622642,0.134276,0.0,0.0,0.596774,0.0,1.000000,0.0,0.75,1.000000
208,0.116279,1.0,0.000000,0.226415,0.328622,0.0,0.5,0.556452,0.0,0.193548,0.5,0.00,1.000000
209,0.000000,0.0,0.333333,0.226415,0.296820,0.0,0.5,0.975806,0.0,0.112903,1.0,0.00,0.666667
210,0.837209,1.0,0.000000,0.481132,0.169611,0.0,0.5,0.435484,1.0,0.419355,0.0,0.00,1.000000


0      0.0
1      0.0
2      0.0
3      1.0
4      1.0
      ... 
207    0.0
208    0.0
209    1.0
210    0.0
211    0.0
Name: output, Length: 212, dtype: float64

### Using LinearSVC model

In [295]:
linsvc_model = LinearSVC()
linsvc_model.fit(x_train, y_train)
linsvc_predict = linsvc_model.predict(x_test)
print(skmet.accuracy_score(linsvc_predict, y_test))

0.7741935483870968


### Using KNeighborsClassifier model

In [314]:
knc_model = KNeighborsClassifier(n_neighbors = 6)
knc_model.fit(x_train, y_train)
knc_predict = knc_model.predict(x_test)
print(skmet.accuracy_score(knc_predict, y_test))


0.7419354838709677
