# SETUP

In [1]:
# import libraries

import pandas as pd
import numpy as np
from sklearn import metrics
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
# Import AutoGluon Libraries
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.impute import SimpleImputer

In [2]:
# Install AutoGluon
!pip install autogluon.tabular



In [3]:
# load Cleveland datafile
cleveland_csv_path = "processed.cleveland.data"
cleveland_data = pd.read_csv(cleveland_csv_path, header = None)
# assign column names
cleveland_data.set_axis(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
                  'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                  'ca', 'thal', 'num'], axis = 1, inplace = True)

In [4]:
# Define label column
label = 'num'
heart_label = 'num'

# SIMPLEST OF MODELS
zero data preprocessing performed

In [5]:
cleveland_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [6]:
cleveland_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


## BINARY MODELING
(Note: "Cleveland" label here would need to be revised because used in Slightly Better Model below when split the data, if care.)

### PREPARE DATA

In [7]:
# make copy for binary labeling
cleveland_binary = cleveland_data.copy()
cleveland_binary['num'][cleveland_binary['num'] > 0] = 1
cleveland_binary.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleveland_binary['num'][cleveland_binary['num'] > 0] = 1


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [8]:
# split the binary data
from sklearn.model_selection import train_test_split
cleveland_train, cleveland_test = train_test_split(cleveland_binary, test_size=0.2, random_state=42)
cleveland_test_labels = cleveland_test[heart_label]
cleveland_test_nolabel = cleveland_test.drop([heart_label], axis=1)
cleveland_train_labels = cleveland_train[heart_label]
cleveland_train_nolabel = cleveland_train.drop([heart_label], axis=1)
cleveland_train.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
100,45.0,1.0,4.0,115.0,260.0,0.0,2.0,185.0,0.0,0.0,1.0,0.0,3.0,0
68,59.0,1.0,4.0,170.0,326.0,0.0,2.0,140.0,1.0,3.4,3.0,0.0,7.0,1
108,61.0,1.0,4.0,120.0,260.0,0.0,0.0,140.0,1.0,3.6,2.0,1.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
229,66.0,1.0,4.0,112.0,212.0,0.0,2.0,132.0,1.0,0.1,1.0,1.0,3.0,1


### MODEL - TRAIN & EVALUATE

In [9]:
# Create an instance and then fit the train data
cle_TabPre = TabularPredictor(label=heart_label, path='.').fit(cleveland_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    242
Train Data Columns: 13
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    313.57 MB
	Train Data (Original)  Memory Usage: 0.05 MB (0.0% of available memory)
	Infe

In [10]:
# Check the performance using test data
cle_pred = cle_TabPre.predict(cleveland_test_nolabel)
cle_perf = cle_TabPre.evaluate_predictions(y_true=cleveland_test_labels, y_pred=cle_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.8852459016393442
Evaluations on test data:
{
    "accuracy": 0.8852459016393442,
    "balanced_accuracy": 0.8873922413793103,
    "mcc": 0.7747844827586207,
    "f1": 0.8852459016393444,
    "precision": 0.9310344827586207,
    "recall": 0.84375
}


In [11]:
cle_perf2 = cle_TabPre.evaluate(data=cleveland_test, auxiliary_metrics=True, model='WeightedEnsemble_L2')

Evaluation: accuracy on test data: 0.8852459016393442
Evaluations on test data:
{
    "accuracy": 0.8852459016393442,
    "balanced_accuracy": 0.8873922413793103,
    "mcc": 0.7747844827586207,
    "roc_auc": 0.9461206896551724,
    "f1": 0.8852459016393444,
    "precision": 0.9310344827586207,
    "recall": 0.84375
}


In [12]:
cle_perf3 = cle_TabPre.evaluate(data=cleveland_test, auxiliary_metrics=True, model='ExtraTreesGini')

Evaluation: accuracy on test data: 0.8852459016393442
Evaluations on test data:
{
    "accuracy": 0.8852459016393442,
    "balanced_accuracy": 0.8873922413793103,
    "mcc": 0.7747844827586207,
    "roc_auc": 0.9461206896551724,
    "f1": 0.8852459016393444,
    "precision": 0.9310344827586207,
    "recall": 0.84375
}


In [13]:
cle_perf4 = cle_TabPre.evaluate(data=cleveland_test, auxiliary_metrics=True, model='ExtraTreesEntr')

Evaluation: accuracy on test data: 0.8852459016393442
Evaluations on test data:
{
    "accuracy": 0.8852459016393442,
    "balanced_accuracy": 0.8873922413793103,
    "mcc": 0.7747844827586207,
    "roc_auc": 0.9558189655172414,
    "f1": 0.8852459016393444,
    "precision": 0.9310344827586207,
    "recall": 0.84375
}


In [14]:
#Show the leaderboard
cle_TabPre.leaderboard(cleveland_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesGini,0.885246,0.877551,0.027582,0.021327,0.25584,0.027582,0.021327,0.25584,1,True,5
1,ExtraTreesEntr,0.885246,0.836735,0.028082,0.021282,0.254199,0.028082,0.021282,0.254199,1,True,6
2,WeightedEnsemble_L2,0.885246,0.897959,0.061902,0.046186,0.642596,0.00153,0.001469,0.091696,2,True,8
3,RandomForestEntr,0.885246,0.857143,0.12538,0.020789,0.255968,0.12538,0.020789,0.255968,1,True,4
4,RandomForestGini,0.868852,0.857143,0.03279,0.02339,0.29506,0.03279,0.02339,0.29506,1,True,3
5,XGBoost,0.836066,0.836735,0.007129,0.003149,0.084084,0.007129,0.003149,0.084084,1,True,7
6,KNeighborsUnif,0.672131,0.673469,0.006293,0.002785,2.016339,0.006293,0.002785,2.016339,1,True,1
7,KNeighborsDist,0.655738,0.653061,0.004014,0.001086,0.004252,0.004014,0.001086,0.004252,1,True,2


In [15]:
#Show the best model
cle_TabPre.get_model_best()

'WeightedEnsemble_L2'

## MULTICLASS MODELING

In [16]:
cleveland_binary.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [17]:
cleveland_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [18]:
# split the multiclass data
from sklearn.model_selection import train_test_split
mccleveland_train, mccleveland_test = train_test_split(cleveland_data, test_size=0.2, random_state=42)
mccleveland_test_labels = mccleveland_test[heart_label]
mccleveland_test_nolabel = mccleveland_test.drop([heart_label], axis=1)
mccleveland_train_labels = mccleveland_train[heart_label]
mccleveland_train_nolabel = mccleveland_train.drop([heart_label], axis=1)
mccleveland_train.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
142,52.0,1.0,2.0,128.0,205.0,1.0,0.0,184.0,0.0,0.0,1.0,0.0,3.0,0
230,52.0,0.0,3.0,136.0,196.0,0.0,2.0,169.0,0.0,0.1,2.0,0.0,3.0,0
164,48.0,1.0,3.0,124.0,255.0,1.0,0.0,175.0,0.0,0.0,1.0,2.0,3.0,0
23,58.0,1.0,3.0,132.0,224.0,0.0,2.0,173.0,0.0,3.2,1.0,2.0,7.0,3
247,47.0,1.0,4.0,110.0,275.0,0.0,2.0,118.0,1.0,1.0,2.0,1.0,3.0,1


### MODEL - TRAIN & EVALUATE

In [19]:
# Create an instance and then fit the train data
mccle_TabPre = TabularPredictor(label=heart_label, path='.').fit(mccleveland_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    242
Train Data Columns: 13
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	5 unique label values:  [0, 3, 1, 2, 4]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 5
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    327.68 MB
	Train Data (Original)  Memory Usage: 0.05 MB (0.0% of available me

In [20]:
# Check the performance using test data
mccle_pred = mccle_TabPre.predict(mccleveland_test_nolabel)
mccle_perf = mccle_TabPre.evaluate_predictions(y_true=mccleveland_test_labels, y_pred=mccle_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.5081967213114754
Evaluations on test data:
{
    "accuracy": 0.5081967213114754,
    "balanced_accuracy": 0.24865900383141773,
    "mcc": 0.23660814133937144
}


In [21]:
#Show the leaderboard
mccle_TabPre.leaderboard(mccleveland_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesGini,0.508197,0.653061,0.0294,0.021166,0.260073,0.0294,0.021166,0.260073,1,True,5
1,WeightedEnsemble_L2,0.508197,0.653061,0.030734,0.021374,0.339908,0.001334,0.000208,0.079835,2,True,8
2,RandomForestGini,0.508197,0.612245,0.052223,0.021236,0.287482,0.052223,0.021236,0.287482,1,True,3
3,ExtraTreesEntr,0.508197,0.632653,0.052574,0.021403,0.265118,0.052574,0.021403,0.265118,1,True,6
4,XGBoost,0.491803,0.55102,0.007419,0.002953,0.118258,0.007419,0.002953,0.118258,1,True,7
5,RandomForestEntr,0.491803,0.612245,0.05553,0.021308,0.270204,0.05553,0.021308,0.270204,1,True,4
6,KNeighborsDist,0.47541,0.530612,0.003148,0.001197,0.005485,0.003148,0.001197,0.005485,1,True,2
7,KNeighborsUnif,0.47541,0.530612,0.007408,0.002081,0.006328,0.007408,0.002081,0.006328,1,True,1


In [22]:
#Show the best model
mccle_TabPre.get_model_best()

'WeightedEnsemble_L2'

# SLIGHTLY BETTER MODEL
no missing values & stratified sampling

## DATA PREPROCESSING

### MISSING VALUES

In [23]:
# make copy of original dataframe
cleveland = cleveland_data.copy()

In [24]:
# check for missing values
cleveland.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [25]:
# replace '?' values with NaN so can impute
cleveland['thal'].replace('?', np.NaN, inplace=True)
cleveland['thal'] = cleveland['thal'].astype(float)
cleveland['ca'].replace('?', np.NaN, inplace=True)
cleveland['ca'] = cleveland['ca'].astype(float)
cleveland.tail() # to verify didn't mess up dataframe IDs

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0


In [26]:
from sklearn.impute import SimpleImputer
# impute with mode as 'thal' and 'ca' (attributes w/ missing values) are discrete
imputeMode = SimpleImputer(strategy="most_frequent") # create mode imputer
imputeMode.fit(cleveland) # fit - learns the data
imputed = imputeMode.transform(cleveland) # transform - imputes with chosen strategy
cleveland = pd.DataFrame(imputed, columns=cleveland.columns, index=cleveland.index) # back to pandas DataFrame
cleveland.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    float64
 12  thal      303 non-null    float64
 13  num       303 non-null    float64
dtypes: float64(14)
memory usage: 33.3 KB


  mode = stats.mode(array)


In [27]:
cleveland.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1.0
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2.0
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3.0
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1.0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,3.0,0.0


### BINARY LABEL
(multiclass for "future research")

In [28]:
cleveland['num'][cleveland['num'] > 0] = 1
cleveland.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0


### STRATIFIED SAMPLING

(due to clear significance of sex)

In [29]:
cleveland_strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in cleveland_strat.split(cleveland, cleveland["num"]):
  cleveland_strat_train = cleveland.loc[train_index] # cleveland_train,
  cleveland_strat_test = cleveland.loc[test_index] # cleveland_test 

In [30]:
cleveland_strat_test["num"].value_counts()/len(cleveland_strat_test)

0.0    0.540984
1.0    0.459016
Name: num, dtype: float64

In [31]:
cleveland_strat_train["num"].value_counts()/len(cleveland_strat_train)

0.0    0.541322
1.0    0.458678
Name: num, dtype: float64

Less significant for Cleveland and Statlog datasets, which have less dramatic difference in sex representation in data, but different for other three datasets. 

In [32]:
# split stratified data
cleveland_strat_train_X = cleveland_strat_train.drop([heart_label], axis=1)#cleveland_train_nolabel
cleveland_strat_train_y = cleveland_strat_train[heart_label]#cleveland_train_labels 
cleveland_strat_test_X = cleveland_strat_test.drop([heart_label], axis=1)#cleveland_test_nolabel 
cleveland_strat_test_y = cleveland_strat_test[heart_label]#cleveland_test_labels
cleveland_strat_train_X.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
209,62.0,0.0,4.0,150.0,244.0,0.0,0.0,154.0,1.0,1.4,2.0,0.0,3.0
97,60.0,0.0,4.0,150.0,258.0,0.0,2.0,157.0,0.0,2.6,2.0,2.0,7.0
38,55.0,1.0,4.0,132.0,353.0,0.0,0.0,132.0,1.0,1.2,2.0,1.0,7.0
112,52.0,1.0,1.0,118.0,186.0,0.0,2.0,190.0,0.0,0.0,2.0,0.0,6.0
245,67.0,1.0,4.0,120.0,237.0,0.0,0.0,71.0,0.0,1.0,2.0,0.0,3.0


## BINARY MODEL

In [33]:
# Create an instance and then fit the train data
ccle_TabPre = TabularPredictor(label=heart_label, path='.').fit(cleveland_strat_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    242
Train Data Columns: 13
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    319.6 MB
	Train Data (Original)  Memory Usage: 0.03 MB (0.0% of available memory)
	I

In [34]:
# Check the performance using test data
ccle_pred = ccle_TabPre.predict(cleveland_strat_test_X)
ccle_perf = ccle_TabPre.evaluate_predictions(y_true=cleveland_strat_test_y, y_pred=ccle_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.8688524590163934
Evaluations on test data:
{
    "accuracy": 0.8688524590163934,
    "balanced_accuracy": 0.876082251082251,
    "mcc": 0.7546185527373646,
    "f1": 0.8709677419354839,
    "precision": 0.7941176470588235,
    "recall": 0.9642857142857143
}


In [35]:
ccle_perf2 = ccle_TabPre.evaluate(data=cleveland_strat_test, auxiliary_metrics=True, model='WeightedEnsemble_L2')

Evaluation: accuracy on test data: 0.8688524590163934
Evaluations on test data:
{
    "accuracy": 0.8688524590163934,
    "balanced_accuracy": 0.876082251082251,
    "mcc": 0.7546185527373646,
    "roc_auc": 0.9442640692640693,
    "f1": 0.8709677419354839,
    "precision": 0.7941176470588235,
    "recall": 0.9642857142857143
}


In [36]:
ccle_perf3 = ccle_TabPre.evaluate(data=cleveland_strat_test, auxiliary_metrics=True, model='RandomForestGini')

Evaluation: accuracy on test data: 0.8688524590163934
Evaluations on test data:
{
    "accuracy": 0.8688524590163934,
    "balanced_accuracy": 0.876082251082251,
    "mcc": 0.7546185527373646,
    "roc_auc": 0.9442640692640693,
    "f1": 0.8709677419354839,
    "precision": 0.7941176470588235,
    "recall": 0.9642857142857143
}


In [37]:
ccle_perf4 = ccle_TabPre.evaluate(data=cleveland_strat_test, auxiliary_metrics=True, model='RandomForestEntr')

Evaluation: accuracy on test data: 0.8524590163934426
Evaluations on test data:
{
    "accuracy": 0.8524590163934426,
    "balanced_accuracy": 0.8582251082251082,
    "mcc": 0.7164502164502164,
    "roc_auc": 0.9458874458874459,
    "f1": 0.8524590163934426,
    "precision": 0.7878787878787878,
    "recall": 0.9285714285714286
}


In [38]:
#Show the leaderboard
ccle_TabPre.leaderboard(cleveland_strat_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestGini,0.868852,0.897959,0.031337,0.019909,0.264666,0.031337,0.019909,0.264666,1,True,3
1,WeightedEnsemble_L2,0.868852,0.897959,0.033044,0.020168,0.353134,0.001707,0.000259,0.088468,2,True,8
2,RandomForestEntr,0.852459,0.897959,0.027681,0.020012,0.253528,0.027681,0.020012,0.253528,1,True,4
3,ExtraTreesGini,0.836066,0.877551,0.046257,0.020055,0.249304,0.046257,0.020055,0.249304,1,True,5
4,ExtraTreesEntr,0.819672,0.897959,0.047483,0.020231,0.250673,0.047483,0.020231,0.250673,1,True,6
5,XGBoost,0.786885,0.816327,0.008872,0.002272,0.013639,0.008872,0.002272,0.013639,1,True,7
6,KNeighborsUnif,0.655738,0.714286,0.006076,0.002227,0.007989,0.006076,0.002227,0.007989,1,True,1
7,KNeighborsDist,0.622951,0.714286,0.01536,0.002002,0.004158,0.01536,0.002002,0.004158,1,True,2


In [39]:
#Show the leaderboard
ccle_TabPre.leaderboard( silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestGini,0.897959,0.019909,0.264666,0.019909,0.264666,1,True,3
1,RandomForestEntr,0.897959,0.020012,0.253528,0.020012,0.253528,1,True,4
2,WeightedEnsemble_L2,0.897959,0.020168,0.353134,0.000259,0.088468,2,True,8
3,ExtraTreesEntr,0.897959,0.020231,0.250673,0.020231,0.250673,1,True,6
4,ExtraTreesGini,0.877551,0.020055,0.249304,0.020055,0.249304,1,True,5
5,XGBoost,0.816327,0.002272,0.013639,0.002272,0.013639,1,True,7
6,KNeighborsDist,0.714286,0.002002,0.004158,0.002002,0.004158,1,True,2
7,KNeighborsUnif,0.714286,0.002227,0.007989,0.002227,0.007989,1,True,1


In [40]:
#Show the best model
cle_TabPre.get_model_best()

'WeightedEnsemble_L2'

# MODEL USING ONLY CORRELATED ATTRIBUTES
(incl. no missing values & stratified sampling)

## DATA PREPROCESSING

### REMOVE UNCORRELATED ATTRIBUTES

In [41]:
# check correlation
cleveland_correlation = cleveland.corr()
cleveland_correlation['num'].sort_values()

thalach    -0.417167
fbs         0.025264
chol        0.085164
trestbps    0.150825
restecg     0.169202
age         0.223120
sex         0.276816
slope       0.339213
cp          0.414446
oldpeak     0.424510
exang       0.431894
ca          0.460033
thal        0.522057
num         1.000000
Name: num, dtype: float64

In [42]:
# eliminate columns with correlation less than abs(0.2)
drop_columns = ['fbs', 'chol', 'trestbps', 'restecg']
cleveland_downsized = cleveland.drop(columns = drop_columns)
cleveland_downsized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   age      303 non-null    float64
 1   sex      303 non-null    float64
 2   cp       303 non-null    float64
 3   thalach  303 non-null    float64
 4   exang    303 non-null    float64
 5   oldpeak  303 non-null    float64
 6   slope    303 non-null    float64
 7   ca       303 non-null    float64
 8   thal     303 non-null    float64
 9   num      303 non-null    float64
dtypes: float64(10)
memory usage: 23.8 KB


### STRATIFIED SPLIT OF DATA

In [43]:
cleveland_downsized_strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in cleveland_downsized_strat.split(cleveland_downsized, cleveland_downsized["num"]):
  cleveland_downsized_strat_train = cleveland_downsized.loc[train_index]
  cleveland_downsized_strat_test = cleveland_downsized.loc[test_index]

In [44]:
cleveland_downsized_strat_test["num"].value_counts()/len(cleveland_downsized_strat_test)

0.0    0.540984
1.0    0.459016
Name: num, dtype: float64

In [45]:
cleveland_downsized_strat_train["num"].value_counts()/len(cleveland_downsized_strat_train)

0.0    0.541322
1.0    0.458678
Name: num, dtype: float64

In [46]:
# split stratified data
cleveland_downsized_strat_train_X = cleveland_downsized_strat_train.drop([heart_label], axis=1)
cleveland_downsized_strat_train_y = cleveland_downsized_strat_train[heart_label]
cleveland_downsized_strat_test_X = cleveland_downsized_strat_test.drop([heart_label], axis=1)
cleveland_downsized_strat_test_y = cleveland_downsized_strat_test[heart_label]
cleveland_downsized_strat_train_X.sample(5)

Unnamed: 0,age,sex,cp,thalach,exang,oldpeak,slope,ca,thal
2,67.0,1.0,4.0,129.0,1.0,2.6,2.0,2.0,7.0
35,42.0,1.0,4.0,178.0,0.0,0.0,1.0,0.0,3.0
70,65.0,0.0,3.0,148.0,0.0,0.8,1.0,0.0,3.0
170,70.0,1.0,3.0,112.0,1.0,2.9,2.0,1.0,7.0
253,51.0,0.0,3.0,157.0,0.0,0.6,1.0,0.0,3.0


## BINARY MODELING


In [47]:
# Create an instance and then fit the train data
cccle_TabPre = TabularPredictor(label=heart_label, path='.').fit(cleveland_downsized_strat_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    242
Train Data Columns: 9
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    335.46 MB
	Train Data (Original)  Memory Usage: 0.02 MB (0.0% of available memory)
	I

In [49]:
# Check the performance using test data
cccle_pred = cccle_TabPre.predict(cleveland_downsized_strat_test_X)
cccle_perf = cccle_TabPre.evaluate_predictions(y_true=cleveland_downsized_strat_test_y, y_pred=cccle_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.7868852459016393
Evaluations on test data:
{
    "accuracy": 0.7868852459016393,
    "balanced_accuracy": 0.7895021645021645,
    "mcc": 0.5771335508376638,
    "f1": 0.7796610169491526,
    "precision": 0.7419354838709677,
    "recall": 0.8214285714285714
}


In [50]:
cccle_perf2 = cccle_TabPre.evaluate(data=cleveland_downsized_strat_test, auxiliary_metrics=True, model='WeightedEnsemble_L2')

Evaluation: accuracy on test data: 0.7868852459016393
Evaluations on test data:
{
    "accuracy": 0.7868852459016393,
    "balanced_accuracy": 0.7895021645021645,
    "mcc": 0.5771335508376638,
    "roc_auc": 0.8874458874458875,
    "f1": 0.7796610169491526,
    "precision": 0.7419354838709677,
    "recall": 0.8214285714285714
}


In [51]:
cccle_perf3 = cccle_TabPre.evaluate(data=cleveland_downsized_strat_test, auxiliary_metrics=True, model='RandomForestGini')

Evaluation: accuracy on test data: 0.8524590163934426
Evaluations on test data:
{
    "accuracy": 0.8524590163934426,
    "balanced_accuracy": 0.8555194805194806,
    "mcc": 0.7087415755146639,
    "roc_auc": 0.9377705627705628,
    "f1": 0.8474576271186439,
    "precision": 0.8064516129032258,
    "recall": 0.8928571428571429
}


In [52]:
cccle_perf4 = cccle_TabPre.evaluate(data=cleveland_downsized_strat_test, auxiliary_metrics=True, model='RandomForestEntr')

Evaluation: accuracy on test data: 0.8360655737704918
Evaluations on test data:
{
    "accuracy": 0.8360655737704918,
    "balanced_accuracy": 0.8403679653679654,
    "mcc": 0.6792672430902679,
    "roc_auc": 0.9388528138528138,
    "f1": 0.8333333333333334,
    "precision": 0.78125,
    "recall": 0.8928571428571429
}


In [53]:
#Show the leaderboard
cccle_TabPre.leaderboard(cleveland_downsized_strat_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesEntr,0.868852,0.857143,0.061676,0.020115,0.254714,0.061676,0.020115,0.254714,1,True,6
1,RandomForestGini,0.852459,0.897959,0.041253,0.019756,0.267854,0.041253,0.019756,0.267854,1,True,3
2,ExtraTreesGini,0.852459,0.857143,0.058943,0.020543,0.33505,0.058943,0.020543,0.33505,1,True,5
3,RandomForestEntr,0.836066,0.897959,0.040387,0.024979,0.288438,0.040387,0.024979,0.288438,1,True,4
4,WeightedEnsemble_L2,0.786885,0.938776,0.056059,0.023411,0.374652,0.002884,0.000264,0.087513,2,True,8
5,XGBoost,0.737705,0.836735,0.010364,0.002206,0.015928,0.010364,0.002206,0.015928,1,True,7
6,KNeighborsDist,0.672131,0.836735,0.004857,0.001393,0.004595,0.004857,0.001393,0.004595,1,True,2
7,KNeighborsUnif,0.655738,0.795918,0.007065,0.001998,0.01469,0.007065,0.001998,0.01469,1,True,1


In [54]:
cccle_TabPre.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.938776,0.023411,0.374652,0.000264,0.087513,2,True,8
1,RandomForestGini,0.897959,0.019756,0.267854,0.019756,0.267854,1,True,3
2,RandomForestEntr,0.897959,0.024979,0.288438,0.024979,0.288438,1,True,4
3,ExtraTreesEntr,0.857143,0.020115,0.254714,0.020115,0.254714,1,True,6
4,ExtraTreesGini,0.857143,0.020543,0.33505,0.020543,0.33505,1,True,5
5,KNeighborsDist,0.836735,0.001393,0.004595,0.001393,0.004595,1,True,2
6,XGBoost,0.836735,0.002206,0.015928,0.002206,0.015928,1,True,7
7,KNeighborsUnif,0.795918,0.001998,0.01469,0.001998,0.01469,1,True,1


In [55]:
#Show the best model
cccle_TabPre.get_model_best()

'WeightedEnsemble_L2'

# Hungarian dataset

In [56]:
# load Hungarian datafile
hungarian_csv_path = "reprocessed.hungarian.data"
hungarian_data = pd.read_csv(hungarian_csv_path, delimiter = ' ', header = None)
# assign column names
hungarian_data.set_axis(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
                  'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                  'ca', 'thal', 'num'], axis = 1, inplace = True)

In [57]:
# Define label column
label = 'num'
heart_label = 'num'

# SIMPLEST OF MODELS
zero data preprocessing performed

In [58]:
hungarian_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,-9.0,-9.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,-9.0,-9.0,3.0
4,54.0,1.0,3.0,150.0,-9.0,0.0,0.0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0.0


In [59]:
hungarian_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295 entries, 0 to 294
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    float64
 1   sex       294 non-null    float64
 2   cp        294 non-null    float64
 3   trestbps  294 non-null    float64
 4   chol      294 non-null    float64
 5   fbs       294 non-null    float64
 6   restecg   294 non-null    float64
 7   thalach   294 non-null    float64
 8   exang     294 non-null    float64
 9   oldpeak   294 non-null    float64
 10  slope     294 non-null    float64
 11  ca        294 non-null    float64
 12  thal      294 non-null    float64
 13  num       294 non-null    float64
dtypes: float64(14)
memory usage: 32.4 KB


## BINARY MODELING
(Note: "Cleveland" label here would need to be revised because used in Slightly Better Model below when split the data, if care.)

### PREPARE DATA

In [60]:
# make copy of original dataframe
hungarian_basic = hungarian_data.copy()
# convert unknowns (-9.0) to NaN
hungarian_basic.replace(-9.0, np.NaN, inplace=True)
# make copy for binary labeling
hungarian_basic['num'][hungarian_basic['num'] > 0] = 1
hungarian_basic.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,,,,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,,,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,,,,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,,,1.0
4,54.0,1.0,3.0,150.0,,0.0,0.0,122.0,0.0,0.0,,,,0.0


In [61]:
# try dropping slope, ca, and thal
hungarian_basic.drop(['slope', 'ca', 'thal', ], axis=1, inplace=True) # did not fix error

In [62]:
# split the binary data
from sklearn.model_selection import train_test_split
hungarian_basic_train, hungarian_basic_test = train_test_split(hungarian_basic, test_size=0.2, random_state=42)
hungarian_basic_test_labels = hungarian_basic_test[heart_label]
hungarian_basic_test_nolabel = hungarian_basic_test.drop([heart_label], axis=1)
hungarian_basic_train_labels = hungarian_basic_train[heart_label]
hungarian_basic_train_nolabel = hungarian_basic_train.drop([heart_label], axis=1)
hungarian_basic_train.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
194,45.0,1.0,3.0,135.0,,0.0,0.0,110.0,0.0,0.0,0.0
57,58.0,1.0,3.0,130.0,213.0,0.0,1.0,140.0,0.0,0.0,1.0
183,52.0,1.0,4.0,140.0,404.0,0.0,0.0,124.0,1.0,2.0,1.0
68,52.0,1.0,4.0,160.0,246.0,0.0,1.0,82.0,1.0,4.0,1.0
205,56.0,1.0,2.0,130.0,184.0,0.0,0.0,100.0,0.0,0.0,0.0


### MODEL - TRAIN & EVALUATE

In [63]:
#hungarian_basic_train.to_csv('hungarian.csv')

In [64]:
#print(hungarian_basic_train.loc[[294]])

In [65]:
hungarian_basic_train = hungarian_basic_train.drop([294]) 

In [66]:
# Create an instance and then fit the train data
hun_TabPre = TabularPredictor(label=heart_label, path='.').fit(hungarian_basic_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    235
Train Data Columns: 10
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0.0, 1.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    337.19 MB
	Train Data (Original)  Memory Usage: 0.02 MB (0.0% of available memory)
	

In [67]:
# Check the performance using test data
hun_pred = hun_TabPre.predict(hungarian_basic_test_nolabel)
hun_perf = hun_TabPre.evaluate_predictions(y_true=hungarian_basic_test_labels, y_pred=hun_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.7796610169491526
Evaluations on test data:
{
    "accuracy": 0.7796610169491526,
    "balanced_accuracy": 0.7622377622377623,
    "mcc": 0.5572653931830464,
    "f1": 0.7111111111111111,
    "precision": 0.8421052631578947,
    "recall": 0.6153846153846154
}


In [68]:
#Show the leaderboard
hun_TabPre.leaderboard(hungarian_basic_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestGini,0.79661,0.851064,0.046586,0.020034,0.263037,0.046586,0.020034,0.263037,1,True,3
1,RandomForestEntr,0.79661,0.851064,0.059125,0.020119,0.259386,0.059125,0.020119,0.259386,1,True,4
2,ExtraTreesGini,0.779661,0.87234,0.02852,0.019805,0.255893,0.02852,0.019805,0.255893,1,True,5
3,WeightedEnsemble_L2,0.779661,0.87234,0.03038,0.020058,0.344331,0.00186,0.000253,0.088438,2,True,8
4,ExtraTreesEntr,0.779661,0.851064,0.049789,0.019869,0.250142,0.049789,0.019869,0.250142,1,True,6
5,XGBoost,0.762712,0.851064,0.027594,0.002241,0.017864,0.027594,0.002241,0.017864,1,True,7
6,KNeighborsDist,0.677966,0.659574,0.002887,0.0011,0.00432,0.002887,0.0011,0.00432,1,True,2
7,KNeighborsUnif,0.677966,0.659574,0.007464,0.002212,0.00904,0.007464,0.002212,0.00904,1,True,1


In [69]:
#Show the best model
hun_TabPre.get_model_best()

'WeightedEnsemble_L2'

## SLIGHTLY BETTER MODEL
no missing values & stratified sampling

### DATA PREPROCESSING

#### MISSING VALUES

In [70]:
# make copy of original dataframe
hungarian = hungarian_data.copy()

In [71]:
# convert unknowns (-9.0) to NaN
hungarian.replace(-9.0, np.NaN, inplace=True)

In [72]:
# check for missing values
hungarian.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295 entries, 0 to 294
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    float64
 1   sex       294 non-null    float64
 2   cp        294 non-null    float64
 3   trestbps  293 non-null    float64
 4   chol      271 non-null    float64
 5   fbs       286 non-null    float64
 6   restecg   293 non-null    float64
 7   thalach   293 non-null    float64
 8   exang     293 non-null    float64
 9   oldpeak   294 non-null    float64
 10  slope     104 non-null    float64
 11  ca        4 non-null      float64
 12  thal      28 non-null     float64
 13  num       294 non-null    float64
dtypes: float64(14)
memory usage: 32.4 KB


In [73]:
print(hungarian.loc[[294]])

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
294  NaN  NaN NaN       NaN   NaN  NaN      NaN      NaN    NaN      NaN   

     slope  ca  thal  num  
294    NaN NaN   NaN  NaN  


In [74]:
hungarian = hungarian.drop([294]) 

In [75]:
# drop slope, ca, and thal due to >50% of values missing
hungarian.drop(['slope', 'ca', 'thal'], axis=1, inplace=True)

In [76]:
hungarian.tail() # check if next code block is needed

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
289,48.0,0.0,2.0,,308.0,0.0,1.0,,,2.0,0.0
290,36.0,1.0,2.0,120.0,166.0,0.0,0.0,180.0,0.0,0.0,0.0
291,48.0,1.0,3.0,110.0,211.0,0.0,0.0,138.0,0.0,0.0,0.0
292,47.0,0.0,2.0,140.0,257.0,0.0,0.0,135.0,0.0,1.0,0.0
293,53.0,1.0,4.0,130.0,182.0,0.0,0.0,148.0,0.0,0.0,0.0


In [77]:
from sklearn.impute import SimpleImputer
# impute discrete values using mode ()
imputeMode = SimpleImputer(strategy="most_frequent") # create mode imputer
hungarian_disc = pd.concat([hungarian.pop(x) for x in ['fbs', 'restecg', 'exang']], axis=1) # isolate discrete
imputeMode.fit(hungarian_disc) # fit - learns the data
imputed_disc = imputeMode.transform(hungarian_disc) # transform - imputes with chosen strategy
hungarian_disc_imp = pd.DataFrame(imputed_disc, columns=hungarian_disc.columns, index=hungarian.index) # back to pandas DataFrame
hungarian_disc_imp.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   fbs      294 non-null    float64
 1   restecg  294 non-null    float64
 2   exang    294 non-null    float64
dtypes: float64(3)
memory usage: 7.0 KB


  mode = stats.mode(array)


In [78]:
# NEED TO COMBINE cat, cont, and remaining
from sklearn.impute import SimpleImputer
# impute continuous values using mean
imputeMean = SimpleImputer(strategy="mean") # create mode imputer
hungarian_cont = pd.concat([hungarian.pop(x) for x in ['trestbps', 'chol', 'thalach']], axis=1) # isolate continuous
imputeMean.fit(hungarian_cont) # fit - learns the data
imputed_cont = imputeMean.transform(hungarian_cont) # transform - imputes with chosen strategy
hungarian_cont_imp = pd.DataFrame(imputed_cont, columns=hungarian_cont.columns, index=hungarian.index) # back to pandas DataFrame
hungarian_cont_imp.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   trestbps  294 non-null    float64
 1   chol      294 non-null    float64
 2   thalach   294 non-null    float64
dtypes: float64(3)
memory usage: 7.0 KB


In [79]:
hungarian_imp = pd.concat((hungarian_cont_imp, hungarian_disc_imp, hungarian), axis=1)

In [80]:
hungarian_imp.head() # check for anything obviously wonky

Unnamed: 0,trestbps,chol,thalach,fbs,restecg,exang,age,sex,cp,oldpeak,num
0,140.0,289.0,172.0,0.0,0.0,0.0,40.0,1.0,2.0,0.0,0.0
1,160.0,180.0,156.0,0.0,0.0,0.0,49.0,0.0,3.0,1.0,1.0
2,130.0,283.0,98.0,0.0,1.0,0.0,37.0,1.0,2.0,0.0,0.0
3,138.0,214.0,108.0,0.0,0.0,1.0,48.0,0.0,4.0,1.5,3.0
4,150.0,250.848708,122.0,0.0,0.0,0.0,54.0,1.0,3.0,0.0,0.0


In [81]:
hungarian_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   trestbps  294 non-null    float64
 1   chol      294 non-null    float64
 2   thalach   294 non-null    float64
 3   fbs       294 non-null    float64
 4   restecg   294 non-null    float64
 5   exang     294 non-null    float64
 6   age       294 non-null    float64
 7   sex       294 non-null    float64
 8   cp        294 non-null    float64
 9   oldpeak   294 non-null    float64
 10  num       294 non-null    float64
dtypes: float64(11)
memory usage: 25.4 KB


#### BINARY LABEL
(multiclass for "future research")

In [82]:
hungarian_imp['num'][hungarian_imp['num'] > 0] = 1
hungarian_imp.head()

Unnamed: 0,trestbps,chol,thalach,fbs,restecg,exang,age,sex,cp,oldpeak,num
0,140.0,289.0,172.0,0.0,0.0,0.0,40.0,1.0,2.0,0.0,0.0
1,160.0,180.0,156.0,0.0,0.0,0.0,49.0,0.0,3.0,1.0,1.0
2,130.0,283.0,98.0,0.0,1.0,0.0,37.0,1.0,2.0,0.0,0.0
3,138.0,214.0,108.0,0.0,0.0,1.0,48.0,0.0,4.0,1.5,1.0
4,150.0,250.848708,122.0,0.0,0.0,0.0,54.0,1.0,3.0,0.0,0.0


#### STRATIFIED SAMPLING
(due to difference in sex representation)

In [83]:
hungarian_imp_strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in hungarian_imp_strat.split(hungarian_imp, hungarian_imp["num"]):
  hungarian_imp_strat_train = hungarian_imp.loc[train_index]
  hungarian_imp_strat_test = hungarian_imp.loc[test_index]

In [84]:
hungarian_imp_strat_test["num"].value_counts()/len(hungarian_imp_strat_test)

0.0    0.644068
1.0    0.355932
Name: num, dtype: float64

In [85]:
hungarian_imp_strat_train["num"].value_counts()/len(hungarian_imp_strat_train)

0.0    0.638298
1.0    0.361702
Name: num, dtype: float64

Less significant for Cleveland and Statlog datasets, which have less dramatic difference in sex representation in data, but different for other three datasets.

In [86]:
# split stratified data
hungarian_imp_strat_train_X = hungarian_imp_strat_train.drop([heart_label], axis=1)
hungarian_imp_strat_train_y = hungarian_imp_strat_train[heart_label]
hungarian_imp_strat_test_X = hungarian_imp_strat_test.drop([heart_label], axis=1)
hungarian_imp_strat_test_y = hungarian_imp_strat_test[heart_label]
hungarian_imp_strat_train_X.sample(5)

Unnamed: 0,trestbps,chol,thalach,fbs,restecg,exang,age,sex,cp,oldpeak
251,135.0,491.0,135.0,0.0,0.0,0.0,44.0,1.0,4.0,0.0
36,140.0,306.0,87.0,1.0,0.0,1.0,65.0,1.0,4.0,1.5
241,145.0,326.0,155.0,0.0,0.0,0.0,55.0,1.0,2.0,0.0
106,120.0,254.0,110.0,0.0,1.0,0.0,48.0,0.0,4.0,0.0
191,180.0,280.0,120.0,0.0,1.0,0.0,46.0,1.0,4.0,0.0


### BINARY MODEL

In [87]:
# Create an instance and then fit the train data
hhun_TabPre = TabularPredictor(label=heart_label, path='.').fit(hungarian_imp_strat_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    235
Train Data Columns: 10
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    328.04 MB
	Train Data (Original)  Memory Usage: 0.02 MB (0.0% of available memory)
	

In [88]:
hhun_pred = hhun_TabPre.predict(hungarian_imp_strat_test_X)
hhun_perf = hhun_TabPre.evaluate_predictions(y_true=hungarian_imp_strat_test_y, y_pred=hhun_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.847457627118644
Evaluations on test data:
{
    "accuracy": 0.847457627118644,
    "balanced_accuracy": 0.8283208020050126,
    "mcc": 0.6641750237558575,
    "f1": 0.7804878048780488,
    "precision": 0.8,
    "recall": 0.7619047619047619
}


In [89]:
hhun_perf2 =  hhun_TabPre.evaluate(data=hungarian_imp_strat_test, auxiliary_metrics=True, model='WeightedEnsemble_L2')

Evaluation: accuracy on test data: 0.847457627118644
Evaluations on test data:
{
    "accuracy": 0.847457627118644,
    "balanced_accuracy": 0.8283208020050126,
    "mcc": 0.6641750237558575,
    "roc_auc": 0.8671679197994987,
    "f1": 0.7804878048780488,
    "precision": 0.8,
    "recall": 0.7619047619047619
}


In [90]:
hhun_perf3 =  hhun_TabPre.evaluate(data=hungarian_imp_strat_test, auxiliary_metrics=True, model='ExtraTreesEntr')

Evaluation: accuracy on test data: 0.8305084745762712
Evaluations on test data:
{
    "accuracy": 0.8305084745762712,
    "balanced_accuracy": 0.8151629072681704,
    "mcc": 0.6303258145363408,
    "roc_auc": 0.869047619047619,
    "f1": 0.7619047619047619,
    "precision": 0.7619047619047619,
    "recall": 0.7619047619047619
}


In [96]:
hhun_perf4 =  hhun_TabPre.evaluate(data=hungarian_imp_strat_test, auxiliary_metrics=True, model='RandomForestEntr')

Evaluation: accuracy on test data: 0.847457627118644
Evaluations on test data:
{
    "accuracy": 0.847457627118644,
    "balanced_accuracy": 0.8283208020050126,
    "mcc": 0.6641750237558575,
    "roc_auc": 0.8671679197994987,
    "f1": 0.7804878048780488,
    "precision": 0.8,
    "recall": 0.7619047619047619
}


In [97]:
#Show the leaderboard
hhun_TabPre.leaderboard(hungarian_imp_strat_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr,0.847458,0.851064,0.026202,0.019715,0.250962,0.026202,0.019715,0.250962,1,True,4
1,WeightedEnsemble_L2,0.847458,0.851064,0.031569,0.019977,0.337367,0.005367,0.000262,0.086405,2,True,8
2,RandomForestGini,0.847458,0.829787,0.034598,0.019714,0.279578,0.034598,0.019714,0.279578,1,True,3
3,ExtraTreesEntr,0.830508,0.851064,0.03086,0.021339,0.260072,0.03086,0.021339,0.260072,1,True,6
4,XGBoost,0.813559,0.744681,0.023582,0.00204,0.021139,0.023582,0.00204,0.021139,1,True,7
5,ExtraTreesGini,0.813559,0.829787,0.028561,0.020302,0.249695,0.028561,0.020302,0.249695,1,True,5
6,KNeighborsDist,0.694915,0.595745,0.00278,0.001166,0.003935,0.00278,0.001166,0.003935,1,True,2
7,KNeighborsUnif,0.644068,0.595745,0.027813,0.001669,0.006533,0.027813,0.001669,0.006533,1,True,1


In [98]:
hhun_TabPre.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr,0.851064,0.019715,0.250962,0.019715,0.250962,1,True,4
1,WeightedEnsemble_L2,0.851064,0.019977,0.337367,0.000262,0.086405,2,True,8
2,ExtraTreesEntr,0.851064,0.021339,0.260072,0.021339,0.260072,1,True,6
3,RandomForestGini,0.829787,0.019714,0.279578,0.019714,0.279578,1,True,3
4,ExtraTreesGini,0.829787,0.020302,0.249695,0.020302,0.249695,1,True,5
5,XGBoost,0.744681,0.00204,0.021139,0.00204,0.021139,1,True,7
6,KNeighborsDist,0.595745,0.001166,0.003935,0.001166,0.003935,1,True,2
7,KNeighborsUnif,0.595745,0.001669,0.006533,0.001669,0.006533,1,True,1


In [99]:
#Show the best model
hhun_TabPre.get_model_best()

'WeightedEnsemble_L2'

## REDUCED FOR CORRELATION
(incl. no missing values & stratified sampling)

### DATA PREPROCESSING

#### REMOVE UNCORRELATED ATTRIBUTES

In [100]:
# check correlation
hungarian_imp_correlation = hungarian_imp.corr()
hungarian_imp_correlation['num'].sort_values()

thalach    -0.331074
restecg    -0.031988
trestbps    0.139582
age         0.159315
fbs         0.162869
chol        0.202372
sex         0.272781
cp          0.505864
oldpeak     0.545700
exang       0.584541
num         1.000000
Name: num, dtype: float64

In [101]:
# eliminate columns with correlation less than abs(0.2)
drop_columns = ['age', 'fbs', 'trestbps', 'restecg']
hungarian_imp_downsized = hungarian_imp.drop(columns = drop_columns)
hungarian_imp_downsized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   chol     294 non-null    float64
 1   thalach  294 non-null    float64
 2   exang    294 non-null    float64
 3   sex      294 non-null    float64
 4   cp       294 non-null    float64
 5   oldpeak  294 non-null    float64
 6   num      294 non-null    float64
dtypes: float64(7)
memory usage: 16.2 KB


#### STRATIFIED SPLIT OF DATA

In [102]:
hungarian_imp_downsized_strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in hungarian_imp_downsized_strat.split(hungarian_imp_downsized, hungarian_imp_downsized["num"]):
  hungarian_imp_downsized_strat_train = hungarian_imp_downsized.loc[train_index]
  hungarian_imp_downsized_strat_test = hungarian_imp_downsized.loc[test_index]

In [103]:
hungarian_imp_downsized_strat_test["num"].value_counts()/len(hungarian_imp_downsized_strat_test)

0.0    0.644068
1.0    0.355932
Name: num, dtype: float64

In [104]:
# split stratified data
hungarian_imp_downsized_strat_train_X = hungarian_imp_downsized_strat_train.drop([heart_label], axis=1)
hungarian_imp_downsized_strat_train_y = hungarian_imp_downsized_strat_train[heart_label]
hungarian_imp_downsized_strat_test_X = hungarian_imp_downsized_strat_test.drop([heart_label], axis=1)
hungarian_imp_downsized_strat_test_y = hungarian_imp_downsized_strat_test[heart_label]
hungarian_imp_downsized_strat_train_X.sample(5)

Unnamed: 0,chol,thalach,exang,sex,cp,oldpeak
241,326.0,155.0,0.0,1.0,2.0,0.0
154,291.0,160.0,0.0,1.0,2.0,0.0
107,214.0,168.0,0.0,1.0,2.0,0.0
121,272.0,139.0,0.0,0.0,3.0,0.0
231,315.0,158.0,0.0,1.0,4.0,0.0


### BINARY MODELING

In [105]:
# Create an instance and then fit the train data
hhhun_TabPre = TabularPredictor(label=heart_label, path='.').fit(hungarian_imp_downsized_strat_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    235
Train Data Columns: 6
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    341.52 MB
	Train Data (Original)  Memory Usage: 0.01 MB (0.0% of available memory)
	I

In [106]:
hhhun_pred = hhhun_TabPre.predict(hungarian_imp_downsized_strat_test_X)
hhhun_perf = hhhun_TabPre.evaluate_predictions(y_true=hungarian_imp_downsized_strat_test_y, y_pred=hhhun_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.8983050847457628
Evaluations on test data:
{
    "accuracy": 0.8983050847457628,
    "balanced_accuracy": 0.8784461152882206,
    "mcc": 0.7755837421854133,
    "f1": 0.8500000000000001,
    "precision": 0.8947368421052632,
    "recall": 0.8095238095238095
}


In [107]:
hhhun_perf2 =  hhhun_TabPre.evaluate(data=hungarian_imp_downsized_strat_test, auxiliary_metrics=True, model='WeightedEnsemble_L2')

Evaluation: accuracy on test data: 0.8983050847457628
Evaluations on test data:
{
    "accuracy": 0.8983050847457628,
    "balanced_accuracy": 0.8784461152882206,
    "mcc": 0.7755837421854133,
    "roc_auc": 0.8784461152882207,
    "f1": 0.8500000000000001,
    "precision": 0.8947368421052632,
    "recall": 0.8095238095238095
}


In [108]:
hhhun_perf2 =  hhhun_TabPre.evaluate(data=hungarian_imp_downsized_strat_test, auxiliary_metrics=True, model='RandomForestEntr')

Evaluation: accuracy on test data: 0.8983050847457628
Evaluations on test data:
{
    "accuracy": 0.8983050847457628,
    "balanced_accuracy": 0.8784461152882206,
    "mcc": 0.7755837421854133,
    "roc_auc": 0.8784461152882207,
    "f1": 0.8500000000000001,
    "precision": 0.8947368421052632,
    "recall": 0.8095238095238095
}


In [109]:
hhhun_perf2 =  hhhun_TabPre.evaluate(data=hungarian_imp_downsized_strat_test, auxiliary_metrics=True, model='ExtraTreesGini')

Evaluation: accuracy on test data: 0.864406779661017
Evaluations on test data:
{
    "accuracy": 0.864406779661017,
    "balanced_accuracy": 0.8521303258145363,
    "mcc": 0.7042606516290727,
    "roc_auc": 0.880325814536341,
    "f1": 0.8095238095238095,
    "precision": 0.8095238095238095,
    "recall": 0.8095238095238095
}


In [110]:
#Show the leaderboard
hhhun_TabPre.leaderboard(hungarian_imp_downsized_strat_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr,0.898305,0.87234,0.027316,0.019844,0.247765,0.027316,0.019844,0.247765,1,True,4
1,WeightedEnsemble_L2,0.898305,0.87234,0.028725,0.020872,0.337257,0.001409,0.001028,0.089492,2,True,8
2,RandomForestGini,0.898305,0.851064,0.05383,0.019957,0.264865,0.05383,0.019957,0.264865,1,True,3
3,ExtraTreesGini,0.864407,0.87234,0.027277,0.019821,0.247021,0.027277,0.019821,0.247021,1,True,5
4,ExtraTreesEntr,0.864407,0.87234,0.04144,0.019889,0.249101,0.04144,0.019889,0.249101,1,True,6
5,XGBoost,0.830508,0.851064,0.006706,0.002896,0.023464,0.006706,0.002896,0.023464,1,True,7
6,KNeighborsUnif,0.644068,0.617021,0.008847,0.001958,0.040769,0.008847,0.001958,0.040769,1,True,1
7,KNeighborsDist,0.627119,0.659574,0.00427,0.001859,0.005329,0.00427,0.001859,0.005329,1,True,2


In [111]:
hhhun_TabPre.leaderboard( silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesGini,0.87234,0.019821,0.247021,0.019821,0.247021,1,True,5
1,RandomForestEntr,0.87234,0.019844,0.247765,0.019844,0.247765,1,True,4
2,ExtraTreesEntr,0.87234,0.019889,0.249101,0.019889,0.249101,1,True,6
3,WeightedEnsemble_L2,0.87234,0.020872,0.337257,0.001028,0.089492,2,True,8
4,XGBoost,0.851064,0.002896,0.023464,0.002896,0.023464,1,True,7
5,RandomForestGini,0.851064,0.019957,0.264865,0.019957,0.264865,1,True,3
6,KNeighborsDist,0.659574,0.001859,0.005329,0.001859,0.005329,1,True,2
7,KNeighborsUnif,0.617021,0.001958,0.040769,0.001958,0.040769,1,True,1


In [112]:
#Show the best model
hhhun_TabPre.get_model_best()

'WeightedEnsemble_L2'

# COMBINED

## TAKE A PEEK

In [113]:
# load Switzerland datafile
switzerland_csv_path = "processed.switzerland.data"
switzerland_data = pd.read_csv(switzerland_csv_path, header = None)
# assign column names
switzerland_data.set_axis(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
                  'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                  'ca', 'thal', 'num'], axis = 1, inplace = True)

In [114]:
# load VA datafile
va_csv_path = "processed.va.data"
va_data = pd.read_csv(va_csv_path, header = None)
# assign column names
va_data.set_axis(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
                  'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                  'ca', 'thal', 'num'], axis = 1, inplace = True)

In [115]:
switzerland_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,32,1,1,95,0,?,0,127,0,.7,1,?,?,1
1,34,1,4,115,0,?,?,154,0,.2,1,?,?,1
2,35,1,4,?,0,?,0,130,1,?,?,?,7,3
3,36,1,4,110,0,?,0,125,1,1,2,?,6,1
4,38,0,4,105,0,?,0,166,0,2.8,1,?,?,2


In [116]:
va_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,4,140,260,0,1,112,1,3.0,2,?,?,2
1,44,1,4,130,209,0,1,127,0,0.0,?,?,?,0
2,60,1,4,132,218,0,1,140,1,1.5,3,?,?,2
3,55,1,4,142,228,0,1,149,1,2.5,1,?,?,1
4,66,1,3,110,213,1,2,99,1,1.3,2,?,?,0


In [117]:
switzerland_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       123 non-null    int64 
 1   sex       123 non-null    int64 
 2   cp        123 non-null    int64 
 3   trestbps  123 non-null    object
 4   chol      123 non-null    int64 
 5   fbs       123 non-null    object
 6   restecg   123 non-null    object
 7   thalach   123 non-null    object
 8   exang     123 non-null    object
 9   oldpeak   123 non-null    object
 10  slope     123 non-null    object
 11  ca        123 non-null    object
 12  thal      123 non-null    object
 13  num       123 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 13.6+ KB


In [118]:
va_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       200 non-null    int64 
 1   sex       200 non-null    int64 
 2   cp        200 non-null    int64 
 3   trestbps  200 non-null    object
 4   chol      200 non-null    object
 5   fbs       200 non-null    object
 6   restecg   200 non-null    int64 
 7   thalach   200 non-null    object
 8   exang     200 non-null    object
 9   oldpeak   200 non-null    object
 10  slope     200 non-null    object
 11  ca        200 non-null    object
 12  thal      200 non-null    object
 13  num       200 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 22.0+ KB


## DATA PREPROCESSING

### COPY DATASETS

In [119]:
switzerland = switzerland_data.copy()
va = va_data.copy()

### MISSING VALUES

#### CONVERT ? TO NAN

In [120]:
switzerland = switzerland.replace('?', np.NaN) # replace '?' values with NaN so can impute
switzerland[['age', 'trestbps', 'chol', 'thalach', 'oldpeak']].astype(float) # convert dtype
switzerland.tail() # to verify didn't mess up dataframe IDs

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
118,70,1,4,115,0,0.0,1,92,1,0.0,2,,7.0,1
119,70,1,4,140,0,1.0,0,157,1,2.0,2,,7.0,3
120,72,1,3,160,0,,2,114,0,1.6,2,2.0,,0
121,73,0,3,160,0,0.0,1,121,0,0.0,1,,3.0,1
122,74,1,2,145,0,,1,123,0,1.3,1,,,1


In [121]:
# make dataset version for plotting purposes
switzerland_for_plot = switzerland.copy()

In [122]:
va = va.replace('?', np.NaN) # replace '?' values with NaN so can impute
va[['age', 'trestbps', 'chol', 'thalach', 'oldpeak']].astype(float) # convert dtype
va.tail() # to verify didn't mess up dataframe IDs

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
195,54,0,4,127.0,333,1,1,154.0,0.0,0.0,,,,1
196,62,1,1,,139,0,1,,,,,,,0
197,55,1,4,122.0,223,1,1,100.0,0.0,0.0,,,6.0,2
198,58,1,4,,385,1,2,,,,,,,0
199,62,1,2,120.0,254,0,2,93.0,1.0,0.0,,,,1


In [123]:
# make dataset version for plotting purposes
va_for_plot = va.copy()

#### DROP IF MISSING >50%

In [124]:
switzerland.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       123 non-null    int64 
 1   sex       123 non-null    int64 
 2   cp        123 non-null    int64 
 3   trestbps  121 non-null    object
 4   chol      123 non-null    int64 
 5   fbs       48 non-null     object
 6   restecg   122 non-null    object
 7   thalach   122 non-null    object
 8   exang     122 non-null    object
 9   oldpeak   117 non-null    object
 10  slope     106 non-null    object
 11  ca        5 non-null      object
 12  thal      71 non-null     object
 13  num       123 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 13.6+ KB


In [125]:
va.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       200 non-null    int64 
 1   sex       200 non-null    int64 
 2   cp        200 non-null    int64 
 3   trestbps  144 non-null    object
 4   chol      193 non-null    object
 5   fbs       193 non-null    object
 6   restecg   200 non-null    int64 
 7   thalach   147 non-null    object
 8   exang     147 non-null    object
 9   oldpeak   144 non-null    object
 10  slope     98 non-null     object
 11  ca        2 non-null      object
 12  thal      34 non-null     object
 13  num       200 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 22.0+ KB


In [126]:
# Switzerland - drop fbs, ca, and thal due to >50% of values missing
# VA - drop slope, ca, and thal due to >50% of values missing
# as combined dataset - drop fbs, slope, ca, and thal
switzerland_mod = switzerland.copy()
switzerland_mod.drop(['fbs', 'slope', 'ca', 'thal'], axis=1, inplace=True)
va_mod = va.copy()
va_mod.drop(['fbs', 'slope', 'ca', 'thal'], axis=1, inplace=True)

#### IMPUTE TO MODE OR MEAN

##### SWITZERLAND

In [127]:
switzerland_mod.info()
# to impute -
#    to mode: restecg, exang
#    to mean: trestbps, thalach, oldpeak

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       123 non-null    int64 
 1   sex       123 non-null    int64 
 2   cp        123 non-null    int64 
 3   trestbps  121 non-null    object
 4   chol      123 non-null    int64 
 5   restecg   122 non-null    object
 6   thalach   122 non-null    object
 7   exang     122 non-null    object
 8   oldpeak   117 non-null    object
 9   num       123 non-null    int64 
dtypes: int64(5), object(5)
memory usage: 9.7+ KB


In [128]:
# impute discrete values using mode ()
imputeMode = SimpleImputer(strategy="most_frequent") # create mode imputer
switzerland_mod_disc = pd.concat([switzerland_mod.pop(x) for x in ['restecg', 'exang']], axis=1) # isolate discrete
imputeMode.fit(switzerland_mod_disc) # fit - learns the data
imputed_disc = imputeMode.transform(switzerland_mod_disc) # transform - imputes with chosen strategy
switzerland_mod_disc_imp = pd.DataFrame(imputed_disc, columns=switzerland_mod_disc.columns, index=switzerland_mod.index) # back to pandas DataFrame
switzerland_mod_disc_imp.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   restecg  123 non-null    object
 1   exang    123 non-null    object
dtypes: object(2)
memory usage: 2.0+ KB


In [129]:
# impute continuous values using mean
imputeMean = SimpleImputer(strategy="mean") # create mode imputer
switzerland_mod_cont = pd.concat([switzerland_mod.pop(x) for x in ['trestbps', 'oldpeak', 'thalach']], axis=1) # isolate continuous
imputeMean.fit(switzerland_mod_cont) # fit - learns the data
imputed_cont = imputeMean.transform(switzerland_mod_cont) # transform - imputes with chosen strategy
switzerland_mod_cont_imp = pd.DataFrame(imputed_cont, columns=switzerland_mod_cont.columns, index=switzerland_mod.index) # back to pandas DataFrame
switzerland_mod_cont_imp.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   trestbps  123 non-null    float64
 1   oldpeak   123 non-null    float64
 2   thalach   123 non-null    float64
dtypes: float64(3)
memory usage: 3.0 KB


In [130]:
# Combine cat, cont, and remaining
switzerland_mod_imp = pd.concat((switzerland_mod_cont_imp, switzerland_mod_disc_imp, switzerland_mod), axis=1)

In [131]:
switzerland_mod_imp.head() # check for anything obviously wonky

Unnamed: 0,trestbps,oldpeak,thalach,restecg,exang,age,sex,cp,chol,num
0,95.0,0.7,127.0,0,0,32,1,1,0,1
1,115.0,0.2,154.0,0,0,34,1,4,0,1
2,130.206612,0.653846,130.0,0,1,35,1,4,0,3
3,110.0,1.0,125.0,0,1,36,1,4,0,1
4,105.0,2.8,166.0,0,0,38,0,4,0,2


In [132]:
switzerland_mod_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   trestbps  123 non-null    float64
 1   oldpeak   123 non-null    float64
 2   thalach   123 non-null    float64
 3   restecg   123 non-null    object 
 4   exang     123 non-null    object 
 5   age       123 non-null    int64  
 6   sex       123 non-null    int64  
 7   cp        123 non-null    int64  
 8   chol      123 non-null    int64  
 9   num       123 non-null    int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 9.7+ KB


##### VA

In [133]:
 va_mod.info()
# to impute -
#    to mode: exang
#    to mean: trestbps, thalach, oldpeak, chol

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       200 non-null    int64 
 1   sex       200 non-null    int64 
 2   cp        200 non-null    int64 
 3   trestbps  144 non-null    object
 4   chol      193 non-null    object
 5   restecg   200 non-null    int64 
 6   thalach   147 non-null    object
 7   exang     147 non-null    object
 8   oldpeak   144 non-null    object
 9   num       200 non-null    int64 
dtypes: int64(5), object(5)
memory usage: 15.8+ KB


In [134]:
# impute discrete values using mode ()
imputeMode = SimpleImputer(strategy="most_frequent") # create mode imputer
va_mod_disc = pd.concat([va_mod.pop(x) for x in ['exang']], axis=1) # isolate discrete
imputeMode.fit(va_mod_disc) # fit - learns the data
imputed_disc = imputeMode.transform(va_mod_disc) # transform - imputes with chosen strategy
va_mod_disc_imp = pd.DataFrame(imputed_disc, columns=va_mod_disc.columns, index=va_mod.index) # back to pandas DataFrame
va_mod_disc_imp.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   exang   200 non-null    object
dtypes: object(1)
memory usage: 1.7+ KB


In [135]:
# impute continuous values using mean
imputeMean = SimpleImputer(strategy="mean") # create mode imputer
va_mod_cont = pd.concat([va_mod.pop(x) for x in ['trestbps', 'chol', 'oldpeak', 'thalach']], axis=1) # isolate continuous
imputeMean.fit(va_mod_cont) # fit - learns the data
imputed_cont = imputeMean.transform(va_mod_cont) # transform - imputes with chosen strategy
va_mod_cont_imp = pd.DataFrame(imputed_cont, columns=va_mod_cont.columns, index=va_mod.index) # back to pandas DataFrame
va_mod_cont_imp.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   trestbps  200 non-null    float64
 1   chol      200 non-null    float64
 2   oldpeak   200 non-null    float64
 3   thalach   200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


In [136]:
# Combine cat, cont, and remaining
va_mod_imp = pd.concat((va_mod_cont_imp, va_mod_disc_imp, va_mod), axis=1)

In [137]:
va_mod_imp.head() # check for anything obviously wonky

Unnamed: 0,trestbps,chol,oldpeak,thalach,exang,age,sex,cp,restecg,num
0,140.0,260.0,3.0,112.0,1,63,1,4,1,2
1,130.0,209.0,0.0,127.0,0,44,1,4,1,0
2,132.0,218.0,1.5,140.0,1,60,1,4,1,2
3,142.0,228.0,2.5,149.0,1,55,1,4,1,1
4,110.0,213.0,1.3,99.0,1,66,1,3,2,0


In [138]:
va_mod_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   trestbps  200 non-null    float64
 1   chol      200 non-null    float64
 2   oldpeak   200 non-null    float64
 3   thalach   200 non-null    float64
 4   exang     200 non-null    object 
 5   age       200 non-null    int64  
 6   sex       200 non-null    int64  
 7   cp        200 non-null    int64  
 8   restecg   200 non-null    int64  
 9   num       200 non-null    int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 15.8+ KB


### BINARY LABEL

In [139]:
switzerland_mod_imp['num'][switzerland_mod_imp['num'] > 0] = 1
switzerland_mod_imp.value_counts('num')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  switzerland_mod_imp['num'][switzerland_mod_imp['num'] > 0] = 1


num
1    115
0      8
dtype: int64

In [140]:
va_mod_imp['num'][va_mod_imp['num'] > 0] = 1
va_mod_imp.value_counts('num')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  va_mod_imp['num'][va_mod_imp['num'] > 0] = 1


num
1    149
0     51
dtype: int64

### CREATE COMBINED DATASET

In [141]:
#cleveland.info()
# for combined, drop - slope, ca, thal, fbs

In [142]:
cleveland_comb = cleveland.copy()

In [143]:
cleveland_comb.drop(['fbs', 'slope', 'ca', 'thal'], axis=1, inplace=True)
#cleveland_comb.info()

In [144]:
#hungarian_imp.info()
# for combined, drop - fbs

In [145]:
hungarian_comb = hungarian_imp.copy()

In [146]:
hungarian_comb.drop(['fbs'], axis=1, inplace=True)
#hungarian_comb.info()

In [147]:
#switzerland_mod_imp.info()

In [148]:
#va_mod_imp.info()

In [149]:
combined = pd.concat((cleveland_comb, hungarian_comb, switzerland_mod_imp, va_mod_imp), axis=0, ignore_index=True)
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    float64
 1   sex       920 non-null    float64
 2   cp        920 non-null    float64
 3   trestbps  920 non-null    float64
 4   chol      920 non-null    float64
 5   restecg   920 non-null    object 
 6   thalach   920 non-null    float64
 7   exang     920 non-null    object 
 8   oldpeak   920 non-null    float64
 9   num       920 non-null    float64
dtypes: float64(8), object(2)
memory usage: 72.0+ KB


### STRATIFIED SAMPLING

In [150]:
combined_strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in combined_strat.split(combined, combined["num"]):
  combined_strat_train = combined.loc[train_index]
  combined_strat_test = combined.loc[test_index]

In [151]:
combined_strat_test["num"].value_counts()/len(combined_strat_test)

1.0    0.554348
0.0    0.445652
Name: num, dtype: float64

In [152]:
combined_strat_train["num"].value_counts()/len(combined_strat_train)

1.0    0.552989
0.0    0.447011
Name: num, dtype: float64

In [153]:
# split stratified data
combined_strat_train_X = combined_strat_train.drop([heart_label], axis=1)
combined_strat_train_y = combined_strat_train[heart_label]
combined_strat_test_X = combined_strat_test.drop([heart_label], axis=1)
combined_strat_test_y = combined_strat_test[heart_label]
combined_strat_train_X.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalach,exang,oldpeak
817,62.0,1.0,1.0,112.0,258.0,1.0,150.0,1.0,1.320833
580,51.0,1.0,3.0,135.0,160.0,0.0,150.0,0.0,2.0
173,62.0,0.0,4.0,140.0,394.0,2.0,157.0,0.0,1.2
766,59.0,1.0,4.0,122.0,233.0,0.0,117.0,1.0,1.3
544,55.0,1.0,2.0,145.0,326.0,0.0,155.0,0.0,0.0


## BINARY MODEL

In [154]:
# Create an instance and then fit the train data
comb_TabPre = TabularPredictor(label=heart_label, path='.').fit(combined_strat_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    736
Train Data Columns: 9
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    337.85 MB
	Train Data (Original)  Memory Usage: 0.1 MB (0.0% of available memory)
	In

In [155]:
comb_pred = comb_TabPre.predict(combined_strat_test_X)
comb_perf = comb_TabPre.evaluate_predictions(y_true=combined_strat_test_y, y_pred=comb_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.8478260869565217
Evaluations on test data:
{
    "accuracy": 0.8478260869565217,
    "balanced_accuracy": 0.8400286944045912,
    "mcc": 0.6925917533073389,
    "f1": 0.8691588785046729,
    "precision": 0.8303571428571429,
    "recall": 0.9117647058823529
}


In [156]:
comb_perf2 =  comb_TabPre.evaluate(data=combined_strat_test, auxiliary_metrics=True, model='WeightedEnsemble_L2')

Evaluation: accuracy on test data: 0.8478260869565217
Evaluations on test data:
{
    "accuracy": 0.8478260869565217,
    "balanced_accuracy": 0.8400286944045912,
    "mcc": 0.6925917533073389,
    "roc_auc": 0.9184600669536107,
    "f1": 0.8691588785046729,
    "precision": 0.8303571428571429,
    "recall": 0.9117647058823529
}


In [157]:
comb_perf3 =  comb_TabPre.evaluate(data=combined_strat_test, auxiliary_metrics=True, model='RandomForestEntr')

Evaluation: accuracy on test data: 0.8478260869565217
Evaluations on test data:
{
    "accuracy": 0.8478260869565217,
    "balanced_accuracy": 0.8400286944045912,
    "mcc": 0.6925917533073389,
    "roc_auc": 0.9149330463892874,
    "f1": 0.8691588785046729,
    "precision": 0.8303571428571429,
    "recall": 0.9117647058823529
}


In [158]:
comb_perf4 =  comb_TabPre.evaluate(data=combined_strat_test, auxiliary_metrics=True, model='ExtraTreesGini')

Evaluation: accuracy on test data: 0.842391304347826
Evaluations on test data:
{
    "accuracy": 0.842391304347826,
    "balanced_accuracy": 0.8339311334289814,
    "mcc": 0.6819075880955596,
    "roc_auc": 0.919296987087518,
    "f1": 0.8651162790697673,
    "precision": 0.8230088495575221,
    "recall": 0.9117647058823529
}


In [159]:
#Show the leaderboard
comb_TabPre.leaderboard(combined_strat_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr,0.847826,0.797297,0.036896,0.024798,0.2945,0.036896,0.024798,0.2945,1,True,4
1,WeightedEnsemble_L2,0.847826,0.810811,0.123012,0.074949,0.979291,0.001744,0.000401,0.10507,2,True,8
2,RandomForestGini,0.842391,0.77027,0.037669,0.025544,0.297175,0.037669,0.025544,0.297175,1,True,3
3,ExtraTreesGini,0.842391,0.783784,0.041508,0.024531,0.28753,0.041508,0.024531,0.28753,1,True,5
4,ExtraTreesEntr,0.826087,0.77027,0.046702,0.024206,0.282546,0.046702,0.024206,0.282546,1,True,6
5,XGBoost,0.809783,0.722973,0.005913,0.003386,0.023183,0.005913,0.003386,0.023183,1,True,7
6,KNeighborsDist,0.690217,0.668919,0.00367,0.001662,0.004442,0.00367,0.001662,0.004442,1,True,2
7,KNeighborsUnif,0.690217,0.662162,0.014753,0.002047,0.012023,0.014753,0.002047,0.012023,1,True,1


In [160]:
#Show the leaderboard
comb_TabPre.leaderboard( silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.810811,0.074949,0.979291,0.000401,0.10507,2,True,8
1,RandomForestEntr,0.797297,0.024798,0.2945,0.024798,0.2945,1,True,4
2,ExtraTreesGini,0.783784,0.024531,0.28753,0.024531,0.28753,1,True,5
3,ExtraTreesEntr,0.77027,0.024206,0.282546,0.024206,0.282546,1,True,6
4,RandomForestGini,0.77027,0.025544,0.297175,0.025544,0.297175,1,True,3
5,XGBoost,0.722973,0.003386,0.023183,0.003386,0.023183,1,True,7
6,KNeighborsDist,0.668919,0.001662,0.004442,0.001662,0.004442,1,True,2
7,KNeighborsUnif,0.662162,0.002047,0.012023,0.002047,0.012023,1,True,1


In [162]:
#Show the best model
comb_TabPre.get_model_best()

'WeightedEnsemble_L2'

## CORRELATED

### ADDITIONAL DATA PREPROCESSING
(starting after combined dataset created and before stratified sampling above)

In [163]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    float64
 1   sex       920 non-null    float64
 2   cp        920 non-null    float64
 3   trestbps  920 non-null    float64
 4   chol      920 non-null    float64
 5   restecg   920 non-null    object 
 6   thalach   920 non-null    float64
 7   exang     920 non-null    object 
 8   oldpeak   920 non-null    float64
 9   num       920 non-null    float64
dtypes: float64(8), object(2)
memory usage: 72.0+ KB


In [164]:
# check correlation
combined_correlation = combined.corr()
combined_correlation['num'].sort_values()
# only trestbps has abs(corr) < 0.2

thalach    -0.385972
chol       -0.234679
trestbps    0.103828
age         0.282700
sex         0.307284
oldpeak     0.373382
cp          0.471712
num         1.000000
Name: num, dtype: float64

In [165]:
# eliminate columns with correlation less than abs(0.2)
drop_columns = ['trestbps']
combined_downsized = combined.drop(columns = drop_columns)
combined_downsized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   age      920 non-null    float64
 1   sex      920 non-null    float64
 2   cp       920 non-null    float64
 3   chol     920 non-null    float64
 4   restecg  920 non-null    object 
 5   thalach  920 non-null    float64
 6   exang    920 non-null    object 
 7   oldpeak  920 non-null    float64
 8   num      920 non-null    float64
dtypes: float64(7), object(2)
memory usage: 64.8+ KB


In [166]:
combined_downsized_strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in combined_downsized_strat.split(combined_downsized, combined["num"]):
  combined_downsized_strat_train = combined_downsized.loc[train_index]
  combined_downsized_strat_test = combined_downsized.loc[test_index]

In [167]:
combined_downsized_strat_test["num"].value_counts()/len(combined_downsized_strat_test)

1.0    0.554348
0.0    0.445652
Name: num, dtype: float64

In [168]:
combined_downsized_strat_train["num"].value_counts()/len(combined_downsized_strat_train)

1.0    0.552989
0.0    0.447011
Name: num, dtype: float64

In [169]:
# split stratified data
combined_downsized_strat_train_X = combined_downsized_strat_train.drop([heart_label], axis=1)
combined_downsized_strat_train_y = combined_downsized_strat_train[heart_label]
combined_downsized_strat_test_X = combined_downsized_strat_test.drop([heart_label], axis=1)
combined_downsized_strat_test_y = combined_downsized_strat_test[heart_label]
combined_downsized_strat_train_X.sample(5)

Unnamed: 0,age,sex,cp,chol,restecg,thalach,exang,oldpeak
617,47.0,1.0,3.0,0.0,0.0,120.0,1.0,0.0
49,53.0,1.0,3.0,197.0,2.0,152.0,0.0,1.2
52,44.0,1.0,4.0,290.0,2.0,153.0,0.0,0.0
831,61.0,1.0,2.0,283.0,0.0,122.795918,1.0,1.320833
151,42.0,0.0,4.0,265.0,2.0,122.0,0.0,0.6


### BINARY MODEL - correlated

In [170]:
# Create an instance and then fit the train data
comb_downsized_TabPre = TabularPredictor(label=heart_label, path='.').fit(combined_downsized_strat_train)

Beginning AutoGluon training ...
AutoGluon will save models to "./"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.2.0: Fri Nov 11 02:04:44 PST 2022; root:xnu-8792.61.2~4/RELEASE_ARM64_T8103
Train Data Rows:    736
Train Data Columns: 8
Label Column: num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    349.51 MB
	Train Data (Original)  Memory Usage: 0.09 MB (0.0% of available memory)
	I

In [171]:
comb_downsized_pred = comb_downsized_TabPre.predict(combined_downsized_strat_test_X)
comb_downsized_perf = comb_downsized_TabPre.evaluate_predictions(
    y_true=combined_downsized_strat_test_y, y_pred=comb_downsized_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.842391304347826
Evaluations on test data:
{
    "accuracy": 0.842391304347826,
    "balanced_accuracy": 0.8351267336202775,
    "mcc": 0.6809624459644162,
    "f1": 0.863849765258216,
    "precision": 0.8288288288288288,
    "recall": 0.9019607843137255
}


In [175]:
comb_downsized_perf2 =  comb_downsized_TabPre.evaluate(data=combined_downsized_strat_test, auxiliary_metrics=True, model='WeightedEnsemble_L2')

Evaluation: accuracy on test data: 0.842391304347826
Evaluations on test data:
{
    "accuracy": 0.842391304347826,
    "balanced_accuracy": 0.8351267336202775,
    "mcc": 0.6809624459644162,
    "roc_auc": 0.9149330463892874,
    "f1": 0.863849765258216,
    "precision": 0.8288288288288288,
    "recall": 0.9019607843137255
}


In [176]:
comb_downsized_perf3 =  comb_downsized_TabPre.evaluate(data=combined_downsized_strat_test, auxiliary_metrics=True, model='RandomForestEntr')

Evaluation: accuracy on test data: 0.842391304347826
Evaluations on test data:
{
    "accuracy": 0.842391304347826,
    "balanced_accuracy": 0.8351267336202775,
    "mcc": 0.6809624459644162,
    "roc_auc": 0.9149330463892874,
    "f1": 0.863849765258216,
    "precision": 0.8288288288288288,
    "recall": 0.9019607843137255
}


In [177]:
comb_downsized_perf4 =  comb_downsized_TabPre.evaluate(data=combined_downsized_strat_test, auxiliary_metrics=True, model='RandomForestGini')

Evaluation: accuracy on test data: 0.8369565217391305
Evaluations on test data:
{
    "accuracy": 0.8369565217391305,
    "balanced_accuracy": 0.8302247728359637,
    "mcc": 0.669475140994274,
    "roc_auc": 0.9206719273075084,
    "f1": 0.8584905660377359,
    "precision": 0.8272727272727273,
    "recall": 0.8921568627450981
}


In [172]:
#Show the leaderboard
comb_downsized_TabPre.leaderboard(combined_downsized_strat_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesEntr,0.853261,0.777027,0.054566,0.024395,0.287584,0.054566,0.024395,0.287584,1,True,6
1,RandomForestEntr,0.842391,0.810811,0.035884,0.024316,0.309347,0.035884,0.024316,0.309347,1,True,4
2,WeightedEnsemble_L2,0.842391,0.810811,0.037257,0.024707,0.413412,0.001373,0.000391,0.104065,2,True,8
3,RandomForestGini,0.836957,0.804054,0.055402,0.024281,0.297836,0.055402,0.024281,0.297836,1,True,3
4,ExtraTreesGini,0.836957,0.797297,0.07057,0.024703,0.284881,0.07057,0.024703,0.284881,1,True,5
5,XGBoost,0.809783,0.722973,0.007525,0.003518,0.02223,0.007525,0.003518,0.02223,1,True,7
6,KNeighborsUnif,0.668478,0.662162,0.014795,0.002446,0.009459,0.014795,0.002446,0.009459,1,True,1
7,KNeighborsDist,0.657609,0.655405,0.004528,0.001539,0.00467,0.004528,0.001539,0.00467,1,True,2


In [174]:
comb_downsized_TabPre.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr,0.810811,0.024316,0.309347,0.024316,0.309347,1,True,4
1,WeightedEnsemble_L2,0.810811,0.024707,0.413412,0.000391,0.104065,2,True,8
2,RandomForestGini,0.804054,0.024281,0.297836,0.024281,0.297836,1,True,3
3,ExtraTreesGini,0.797297,0.024703,0.284881,0.024703,0.284881,1,True,5
4,ExtraTreesEntr,0.777027,0.024395,0.287584,0.024395,0.287584,1,True,6
5,XGBoost,0.722973,0.003518,0.02223,0.003518,0.02223,1,True,7
6,KNeighborsUnif,0.662162,0.002446,0.009459,0.002446,0.009459,1,True,1
7,KNeighborsDist,0.655405,0.001539,0.00467,0.001539,0.00467,1,True,2


In [173]:
#Show the best model
comb_downsized_TabPre.get_model_best()

'WeightedEnsemble_L2'