# SETUP

In [None]:
# import libraries
from google.colab import files
import pandas as pd
import numpy as np
from sklearn import metrics
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

Hungarian Data

In [None]:
# Load the Hungarian data file (space-delimited, no header row).
hungarian_csv_path = "/reprocessed.hungarian.data"
hungarian_data = pd.read_csv(hungarian_csv_path, delimiter=' ', header=None)
# Assign column names by setting `.columns` directly:
# `set_axis(..., inplace=True)` was deprecated in pandas 1.5 and removed in pandas 2.0.
# Like set_axis, this raises ValueError if the file does not have exactly 14 columns.
hungarian_data.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
                          'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                          'ca', 'thal', 'num']

Cleveland Data

In [None]:
# Load the Cleveland data file (comma-delimited, no header row).
cleveland_csv_path = "/processed.cleveland.data"
cleveland_data = pd.read_csv(cleveland_csv_path, header=None)
# Assign column names by setting `.columns` directly:
# `set_axis(..., inplace=True)` was deprecated in pandas 1.5 and removed in pandas 2.0.
# Like set_axis, this raises ValueError if the file does not have exactly 14 columns.
cleveland_data.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
                          'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                          'ca', 'thal', 'num']

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data files loaded above are read from "/" rather than from the
# mounted drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Define label column: 'num' is the target column name assigned to both datasets above
heart_label = 'num'

# SLIGHTLY BETTER MODEL
no missing values & stratified sampling

## DATA PREPROCESSING

### MISSING VALUES

In [None]:
# make copy of original dataframe so cleveland_data stays untouched by the cleaning below
cleveland = cleveland_data.copy()

In [None]:
# check for missing values and column dtypes
# (an object dtype means the column holds non-numeric entries, e.g. the '?' placeholders handled next)
cleveland.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [None]:
# Replace the '?' placeholders with NaN so the columns can be imputed, then cast to float.
# The result is assigned back to the column instead of calling
# `cleveland[col].replace(..., inplace=True)`: that form mutates a temporary selection,
# which pandas warns about and which leaves `cleveland` unchanged under copy-on-write.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
for column in ('thal', 'ca'):
    cleveland[column] = cleveland[column].replace('?', np.nan).astype(float)
cleveland.tail() # to verify didn't mess up dataframe IDs

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0


In [None]:
from sklearn.impute import SimpleImputer
# 'thal' and 'ca' (the attributes with missing values) are discrete, so gaps are filled
# with each column's most frequent value (the mode).
imputeMode = SimpleImputer(strategy="most_frequent")
# fit_transform learns the per-column modes and fills the NaNs in one step; it returns a
# plain NumPy array, so the column names and row index are re-attached afterwards.
imputed = imputeMode.fit_transform(cleveland)
cleveland = pd.DataFrame(imputed, columns=cleveland.columns, index=cleveland.index)
cleveland.info() # confirm every column is now fully populated

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    float64
 12  thal      303 non-null    float64
 13  num       303 non-null    float64
dtypes: float64(14)
memory usage: 33.3 KB


In [None]:
# eyeball the first rows after imputation
cleveland.head() # check for anything obviously wonky

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0


### BINARY LABEL
(multiclass for "future research")

In [None]:
# Collapse the label to binary: 0 stays 0, any value greater than 0 becomes 1.
# A single .loc assignment is used instead of chained indexing (`df['num'][mask] = 1`),
# which raises SettingWithCopyWarning and does not modify the frame under copy-on-write.
cleveland.loc[cleveland['num'] > 0, 'num'] = 1
cleveland.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0


### STRATIFIED SAMPLING

(due to clear significance of sex)

In [None]:
# One stratified 80/20 shuffle split with a fixed seed, stratified on the 'num' label.
# NOTE(review): the text above motivates stratification by sex, but the split is
# stratified on 'num' — confirm which column was intended.
cleveland_strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the only (train, test) pair directly instead of looping.
train_index, test_index = next(cleveland_strat.split(cleveland, cleveland["num"]))
# split() yields positional row numbers, so rows are selected with .iloc; .loc would
# only be correct while the frame happens to have a default 0..n-1 index.
cleveland_strat_train = cleveland.iloc[train_index]
cleveland_strat_test = cleveland.iloc[test_index]

In [None]:
# Share of each label value in the test split.
# NOTE(review): the saved output below lists five label values while the train split
# lists two, so these cells were probably run in different kernel states —
# re-check after Restart & Run All.
cleveland_strat_test["num"].value_counts().div(len(cleveland_strat_test))

0.0    0.540984
1.0    0.180328
3.0    0.114754
2.0    0.114754
4.0    0.049180
Name: num, dtype: float64

In [None]:
# Share of each label value in the training split (compare with the test split above).
cleveland_strat_train["num"].value_counts().div(len(cleveland_strat_train))

0.0    0.541322
1.0    0.458678
Name: num, dtype: float64

Stratification matters less for the Cleveland and Statlog datasets, where the difference in how the sexes are represented is less dramatic, than for the other three datasets.

In [None]:
# Separate the label (y) from the features (X) for each stratified split.
cleveland_strat_train_y = cleveland_strat_train[heart_label]
cleveland_strat_train_X = cleveland_strat_train.drop(columns=heart_label)
cleveland_strat_test_y = cleveland_strat_test[heart_label]
cleveland_strat_test_X = cleveland_strat_test.drop(columns=heart_label)
# Show five random training rows (unseeded, so the rows differ between runs).
cleveland_strat_train_X.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
45,58.0,1.0,3.0,112.0,230.0,0.0,2.0,165.0,0.0,2.5,2.0,1.0,7.0
77,51.0,0.0,3.0,140.0,308.0,0.0,2.0,142.0,0.0,1.5,1.0,1.0,3.0
64,54.0,1.0,4.0,120.0,188.0,0.0,0.0,113.0,0.0,1.4,2.0,1.0,7.0
20,64.0,1.0,1.0,110.0,211.0,0.0,2.0,144.0,1.0,1.8,2.0,0.0,3.0
273,71.0,0.0,4.0,112.0,149.0,0.0,0.0,125.0,0.0,1.6,2.0,0.0,3.0


## Bustin down Pycaret style

In [None]:
pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.0-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting tbats>=1.1.0
  Downloading tbats-1.1.2-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-plot>=0.3.7
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting pyod>=1.0.8
  Downloading pyod-1.0.9.tar.gz (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.0/150.0 KB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wurlitzer
  Downloading wurlitzer-3.0.3-py3-none-any.whl (7.3 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━

In [None]:
from pycaret import classification

## Strat Tests...

In [None]:
# Set up a PyCaret classification experiment on the stratified training split, with
# normalization, feature transformation, and removal of features whose correlation
# exceeds 0.95.
# session_id fixes the random seed so PyCaret's internal split and CV folds are
# reproducible (the Hungarian experiment later in this notebook also uses 42).
# NOTE(review): `cleveClass` is reassigned by the next setup() cell, so this
# configuration is never compared — confirm whether it is still wanted.
cleveClass = classification.setup(
    data=cleveland_strat_train,
    target='num',
    normalize=True,
    transformation=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,8677
1,Target,num
2,Target type,Multiclass
3,Original data shape,"(242, 14)"
4,Transformed data shape,"(242, 14)"
5,Transformed train set shape,"(169, 14)"
6,Transformed test set shape,"(73, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Baseline PyCaret experiment on the stratified training split with default
# preprocessing; PyCaret makes its own internal train/hold-out split of this data.
# session_id fixes the random seed so that split and the CV folds are reproducible
# (the Hungarian experiment later in this notebook also uses 42).
cleveClass = classification.setup(
    data=cleveland_strat_train,
    target='num',
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,4111
1,Target,num
2,Target type,Multiclass
3,Original data shape,"(242, 14)"
4,Transformed data shape,"(242, 14)"
5,Transformed train set shape,"(169, 14)"
6,Transformed test set shape,"(73, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train and cross-validate PyCaret's model library on the cleveClass experiment
# and display the resulting leaderboard.
cleveClass.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6221,0.5837,0.6221,0.5738,0.581,0.3837,0.4052,0.398
ridge,Ridge Classifier,0.6154,0.0,0.6154,0.5337,0.5548,0.3342,0.3605,0.089
lda,Linear Discriminant Analysis,0.6044,0.5961,0.6044,0.5832,0.5833,0.3725,0.3841,0.117
lightgbm,Light Gradient Boosting Machine,0.6037,0.5668,0.6037,0.5447,0.5613,0.343,0.3562,0.175
rf,Random Forest Classifier,0.6033,0.5957,0.6033,0.5214,0.5446,0.3174,0.3379,0.427
gbc,Gradient Boosting Classifier,0.5982,0.5795,0.5982,0.5367,0.5598,0.3421,0.3541,1.674
xgboost,Extreme Gradient Boosting,0.5923,0.5868,0.5923,0.5571,0.564,0.3395,0.3512,0.289
et,Extra Trees Classifier,0.5919,0.5815,0.5919,0.5431,0.5576,0.331,0.3414,0.53
qda,Quadratic Discriminant Analysis,0.5563,0.5171,0.5563,0.4822,0.4976,0.3012,0.3238,0.076
dummy,Dummy Classifier,0.5386,0.35,0.5386,0.2905,0.3773,0.0,0.0,0.071


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Experiment that trains on the stratified training split and uses the stratified test
# split as the hold-out set. `train_size` is omitted because PyCaret ignores it when
# `test_data` is supplied.
# session_id fixes the random seed so the CV folds are reproducible
# (the Hungarian experiment later in this notebook also uses 42).
cleveClass2 = classification.setup(
    data=cleveland_strat_train,
    test_data=cleveland_strat_test,
    target='num',
    session_id=42,
)


Unnamed: 0,Description,Value
0,Session id,8970
1,Target,num
2,Target type,Multiclass
3,Original data shape,"(303, 14)"
4,Transformed data shape,"(303, 14)"
5,Transformed train set shape,"(242, 14)"
6,Transformed test set shape,"(61, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train and cross-validate PyCaret's model library on the cleveClass2 experiment
# (explicit stratified hold-out set) and display the leaderboard.
cleveClass2.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.6197,0.7679,0.6197,0.5641,0.5832,0.3777,0.3859,0.159
ridge,Ridge Classifier,0.599,0.0,0.599,0.5134,0.542,0.3068,0.3222,0.072
lr,Logistic Regression,0.5952,0.8138,0.5952,0.5483,0.5579,0.334,0.3432,0.457
rf,Random Forest Classifier,0.5867,0.8061,0.5867,0.5064,0.5253,0.2807,0.2999,0.537
lda,Linear Discriminant Analysis,0.5867,0.8169,0.5867,0.556,0.5618,0.3296,0.3362,0.088
et,Extra Trees Classifier,0.566,0.8087,0.566,0.5116,0.5287,0.273,0.2808,0.425
nb,Naive Bayes,0.5582,0.8096,0.5582,0.5556,0.5516,0.3204,0.3299,0.09
gbc,Gradient Boosting Classifier,0.5495,0.7836,0.5495,0.5057,0.5218,0.273,0.2778,1.337
xgboost,Extreme Gradient Boosting,0.5493,0.7845,0.5493,0.4941,0.515,0.2693,0.2767,0.254
dummy,Dummy Classifier,0.5415,0.5,0.5415,0.2935,0.3806,0.0,0.0,0.112


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Experiment on the full Cleveland frame (PyCaret makes its own train/hold-out split),
# with normalization, feature transformation, and removal of features whose correlation
# exceeds 0.95.
# session_id fixes the random seed so that split and the CV folds are reproducible
# (the Hungarian experiment later in this notebook also uses 42).
cleveClass3 = classification.setup(
    data=cleveland,
    target='num',
    normalize=True,
    transformation=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,4905
1,Target,num
2,Target type,Multiclass
3,Original data shape,"(303, 14)"
4,Transformed data shape,"(303, 14)"
5,Transformed train set shape,"(212, 14)"
6,Transformed test set shape,"(91, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train and cross-validate PyCaret's model library on the cleveClass3 experiment.
# NOTE(review): the saved output ends with a KeyboardInterrupt, so the leaderboard
# below may be incomplete.
cleveClass3.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.6072,0.0,0.6072,0.5197,0.5505,0.3182,0.333,0.393
rf,Random Forest Classifier,0.5865,0.8179,0.5865,0.5233,0.5381,0.294,0.3073,0.846
knn,K Neighbors Classifier,0.5742,0.78,0.5742,0.4916,0.5255,0.2827,0.2906,0.215
qda,Quadratic Discriminant Analysis,0.574,0.7678,0.574,0.5287,0.5378,0.2888,0.2969,0.366
lr,Logistic Regression,0.5618,0.8158,0.5618,0.5423,0.5377,0.2883,0.2946,0.518
ada,Ada Boost Classifier,0.5373,0.7165,0.5373,0.5043,0.5142,0.2687,0.2755,0.393
svm,SVM - Linear Kernel,0.5337,0.0,0.5337,0.511,0.5131,0.2589,0.2674,0.202
nb,Naive Bayes,0.5248,0.798,0.5248,0.5565,0.5287,0.2802,0.2903,0.192
dt,Decision Tree Classifier,0.504,0.6662,0.504,0.524,0.5039,0.2465,0.2529,0.101


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Import only the helpers used in this section instead of `import *`, so it is clear
# where each name comes from and nothing else is silently shadowed.
from pycaret.classification import create_model, predict_model, tune_model

# Train a ridge classifier with 10-fold cross-validation. These module-level helpers
# operate on the most recently set-up experiment (cleveClass3 when the cells are run
# in order).
ridgeModel = create_model('ridge', fold = 10)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5,0.0,0.5,0.3235,0.3929,0.1326,0.1514
1,0.5909,0.0,0.5909,0.5367,0.5583,0.3613,0.3657
2,0.5714,0.0,0.5714,0.4789,0.521,0.3152,0.322
3,0.5238,0.0,0.5238,0.4386,0.4773,0.2336,0.2418
4,0.619,0.0,0.619,0.5629,0.5886,0.3563,0.3595
5,0.6667,0.0,0.6667,0.451,0.5364,0.3581,0.4263
6,0.6667,0.0,0.6667,0.5215,0.5751,0.4324,0.4709
7,0.619,0.0,0.619,0.4286,0.5048,0.2258,0.2807
8,0.5714,0.0,0.5714,0.465,0.497,0.16,0.1807
9,0.619,0.0,0.619,0.4952,0.5453,0.3412,0.3656


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Hyperparameter search starting from the ridge model
# (the saved log below reports 10 candidates x 10 folds).
tuned_Ridgeclf = tune_model(ridgeModel)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5,0.0,0.5,0.3235,0.3929,0.1326,0.1514
1,0.5909,0.0,0.5909,0.5367,0.5583,0.3613,0.3657
2,0.5714,0.0,0.5714,0.4789,0.521,0.3102,0.3159
3,0.5238,0.0,0.5238,0.4218,0.4667,0.2105,0.2209
4,0.619,0.0,0.619,0.5629,0.5886,0.3563,0.3595
5,0.6667,0.0,0.6667,0.451,0.5364,0.3581,0.4263
6,0.6667,0.0,0.6667,0.5215,0.5751,0.4324,0.4709
7,0.619,0.0,0.619,0.4286,0.5048,0.2258,0.2807
8,0.619,0.0,0.619,0.5714,0.5333,0.2258,0.2783
9,0.619,0.0,0.619,0.4952,0.5453,0.3412,0.3656


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
# Score the tuned ridge model; with no `data` argument PyCaret predicts on the
# experiment's hold-out set.
predict_model(tuned_Ridgeclf)

## Hungarian

In [None]:
# Build a working frame from the raw Hungarian data with unknowns (-9.0) converted to NaN.
# `replace` returns a new DataFrame, so hungarian_data is left untouched without a
# separate copy and the cell can be re-run safely.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
hungarian_basic = hungarian_data.replace(-9.0, np.nan)

In [None]:
# Collapse the label to binary in place: 0 stays 0, any value greater than 0 becomes 1.
# A single .loc assignment is used instead of chained indexing (`df['num'][mask] = 1`),
# which raises SettingWithCopyWarning and does not modify the frame under copy-on-write.
hungarian_basic.loc[hungarian_basic['num'] > 0, 'num'] = 1
hungarian_basic.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,,,,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,,,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,,,,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,,,1.0
4,54.0,1.0,3.0,150.0,,0.0,0.0,122.0,0.0,0.0,,,,0.0


In [None]:
# try dropping slope, ca, and thal (NaN in the rows previewed above)
hungarian_basic = hungarian_basic.drop(columns=['slope', 'ca', 'thal']) # did not fix error

In [None]:
# Random (non-stratified) 80/20 split of the binary-labelled data with a fixed seed,
# followed by separating the label column from the features in each part.
from sklearn.model_selection import train_test_split
hungarian_basic_train, hungarian_basic_test = train_test_split(
    hungarian_basic, test_size=0.2, random_state=42
)
hungarian_basic_train_labels = hungarian_basic_train[heart_label]
hungarian_basic_train_nolabel = hungarian_basic_train.drop(columns=heart_label)
hungarian_basic_test_labels = hungarian_basic_test[heart_label]
hungarian_basic_test_nolabel = hungarian_basic_test.drop(columns=heart_label)
# Show five random training rows (unseeded, so the rows differ between runs).
hungarian_basic_train.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
105,57.0,1.0,2.0,140.0,260.0,1.0,0.0,140.0,0.0,0.0,0.0
274,55.0,1.0,3.0,120.0,220.0,0.0,2.0,134.0,0.0,0.0,0.0
173,53.0,1.0,2.0,140.0,320.0,0.0,0.0,162.0,0.0,0.0,0.0
198,42.0,1.0,2.0,150.0,268.0,0.0,0.0,136.0,0.0,0.0,0.0
250,49.0,1.0,4.0,130.0,341.0,0.0,0.0,120.0,1.0,1.0,1.0


More preprocessing...

In [None]:
# Rebuild the working frame from the raw Hungarian data:
#   1. convert unknowns (-9.0) to NaN (`np.nan`; the `np.NaN` alias was removed in NumPy 2.0)
#   2. drop slope, ca and thal (NaN in the rows previewed earlier in this section)
# Both steps return new frames, so hungarian_data is untouched and the cell is re-runnable.
hungarian = (
    hungarian_data
    .replace(-9.0, np.nan)
    .drop(columns=['slope', 'ca', 'thal'])
)

In [None]:
# Remove the row labelled 294. may not be necessary later # something may have gone wrong earlier, requiring it now
# NOTE(review): hard-coded row label; the saved .info() outputs below show 294 rows
# (0 to 293) afterwards, so this is presumably a stray trailing row — confirm against the file.
hungarian = hungarian.drop(index=294)

In [None]:
from sklearn.impute import SimpleImputer
# Impute the discrete columns with their most frequent value (mode).
# The columns are moved out of `hungarian` so that only the not-yet-imputed
# columns remain there for the later recombination.
discrete_cols = ['fbs', 'restecg', 'exang']
hungarian_disc = hungarian[discrete_cols].copy() # isolate discrete
hungarian = hungarian.drop(columns=discrete_cols)
imputeMode = SimpleImputer(strategy="most_frequent")
# fit_transform learns the per-column modes and fills the NaNs in one step; it returns
# a NumPy array, so column names and the row index are re-attached.
imputed_disc = imputeMode.fit_transform(hungarian_disc)
hungarian_disc_imp = pd.DataFrame(imputed_disc, columns=hungarian_disc.columns, index=hungarian_disc.index)
hungarian_disc_imp.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   fbs      294 non-null    float64
 1   restecg  294 non-null    float64
 2   exang    294 non-null    float64
dtypes: float64(3)
memory usage: 7.0 KB


In [None]:
# Impute the continuous columns with their mean.
# (The continuous, discrete and remaining columns are recombined in a later cell.)
from sklearn.impute import SimpleImputer
continuous_cols = ['trestbps', 'chol', 'thalach']
hungarian_cont = hungarian[continuous_cols].copy() # isolate continuous
hungarian = hungarian.drop(columns=continuous_cols)
imputeMean = SimpleImputer(strategy="mean") # mean imputer
# fit_transform learns the per-column means and fills the NaNs in one step; it returns
# a NumPy array, so column names and the row index are re-attached.
imputed_cont = imputeMean.fit_transform(hungarian_cont)
hungarian_cont_imp = pd.DataFrame(imputed_cont, columns=hungarian_cont.columns, index=hungarian_cont.index)
hungarian_cont_imp.info() # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   trestbps  294 non-null    float64
 1   chol      294 non-null    float64
 2   thalach   294 non-null    float64
dtypes: float64(3)
memory usage: 7.0 KB


In [None]:
# Recombine column-wise: imputed continuous columns, imputed discrete columns, and
# whatever columns are still left in `hungarian` after the pops above.
hungarian_imp = pd.concat((hungarian_cont_imp, hungarian_disc_imp, hungarian), axis=1)

In [None]:
# Collapse the label to binary: 0 stays 0, any value greater than 0 becomes 1.
# A single .loc assignment is used instead of chained indexing (`df['num'][mask] = 1`),
# which raises SettingWithCopyWarning and does not modify the frame under copy-on-write.
hungarian_imp.loc[hungarian_imp['num'] > 0, 'num'] = 1


In [None]:
# One stratified 80/20 shuffle split with a fixed seed, stratified on the 'num' label.
hungarian_imp_strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the only (train, test) pair directly instead of looping.
train_index, test_index = next(hungarian_imp_strat.split(hungarian_imp, hungarian_imp["num"]))
# split() yields positional row numbers, so rows are selected with .iloc. Rows were
# dropped from this frame earlier, and .loc would pick wrong or missing rows as soon
# as the index labels stop matching positions.
hungarian_imp_strat_train = hungarian_imp.iloc[train_index]
hungarian_imp_strat_test = hungarian_imp.iloc[test_index]

In [None]:
# Separate the label (y) from the features (X) for each stratified split.
hungarian_imp_strat_train_y = hungarian_imp_strat_train[heart_label]
hungarian_imp_strat_train_X = hungarian_imp_strat_train.drop(columns=heart_label)
hungarian_imp_strat_test_y = hungarian_imp_strat_test[heart_label]
hungarian_imp_strat_test_X = hungarian_imp_strat_test.drop(columns=heart_label)
# Show five random training rows (unseeded, so the rows differ between runs).
hungarian_imp_strat_train_X.sample(5)

Unnamed: 0,trestbps,chol,thalach,fbs,restecg,exang,age,sex,cp,oldpeak
174,140.0,187.0,172.0,0.0,0.0,0.0,49.0,1.0,3.0,0.0
251,135.0,491.0,135.0,0.0,0.0,0.0,44.0,1.0,4.0,0.0
265,140.0,193.0,145.0,0.0,0.0,1.0,47.0,1.0,3.0,1.0
38,120.0,250.848708,148.0,1.0,1.0,0.0,48.0,0.0,2.0,0.0
39,150.0,227.0,130.0,0.0,0.0,1.0,48.0,0.0,4.0,1.0


In [None]:
# Import only the helpers used in this section instead of `import *`, so it is clear
# where each name comes from and nothing else is silently shadowed.
from pycaret.classification import (
    compare_models,
    create_model,
    predict_model,
    setup,
    tune_model,
)

# PyCaret experiment for the Hungarian data: train on the stratified training split,
# use the stratified test split as the hold-out set, and fix the seed for reproducibility.
hungarian_pc_setup = setup(
    data=hungarian_imp_strat_train,
    test_data=hungarian_imp_strat_test,
    target='num',
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,num
2,Target type,Binary
3,Original data shape,"(294, 11)"
4,Transformed data shape,"(294, 11)"
5,Transformed train set shape,"(235, 11)"
6,Transformed test set shape,"(59, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train and cross-validate PyCaret's model library on the current (Hungarian)
# experiment and display the leaderboard.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8304,0.0,0.7208,0.8012,0.7516,0.6246,0.6326,0.044
lr,Logistic Regression,0.8303,0.8929,0.7208,0.8057,0.7506,0.624,0.6355,0.717
lda,Linear Discriminant Analysis,0.8263,0.9016,0.7208,0.7914,0.747,0.6163,0.624,0.058
nb,Naive Bayes,0.8085,0.8986,0.7681,0.7277,0.7392,0.5899,0.5983,0.079
et,Extra Trees Classifier,0.8043,0.8925,0.6611,0.7841,0.7001,0.5601,0.5755,0.438
qda,Quadratic Discriminant Analysis,0.8,0.8742,0.7431,0.7368,0.7256,0.5708,0.5848,0.057
gbc,Gradient Boosting Classifier,0.7877,0.8875,0.6639,0.7394,0.6776,0.5258,0.5414,0.257
xgboost,Extreme Gradient Boosting,0.7786,0.8656,0.6597,0.7328,0.6786,0.5133,0.5271,0.21
lightgbm,Light Gradient Boosting Machine,0.7746,0.8636,0.6736,0.7089,0.6684,0.503,0.5206,0.218
ada,Ada Boost Classifier,0.7745,0.7775,0.6028,0.7354,0.6525,0.4907,0.5034,0.242


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Train a ridge classifier on the current experiment
# (the saved output below shows 10 cross-validation folds).
ridge = create_model('ridge')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8333,0.0,0.6667,0.8571,0.75,0.6279,0.6391
1,0.875,0.0,0.7778,0.875,0.8235,0.7273,0.7303
2,0.7917,0.0,0.6667,0.75,0.7059,0.5455,0.5477
3,0.7083,0.0,0.4444,0.6667,0.5333,0.3333,0.3478
4,0.7917,0.0,0.7778,0.7,0.7368,0.5652,0.5674
5,0.8696,0.0,0.625,1.0,0.7692,0.6849,0.7217
6,0.8696,0.0,0.875,0.7778,0.8235,0.7206,0.7238
7,0.8696,0.0,0.75,0.8571,0.8,0.7039,0.7073
8,0.8696,0.0,0.875,0.7778,0.8235,0.7206,0.7238
9,0.8261,0.0,0.75,0.75,0.75,0.6167,0.6167


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Hyperparameter search starting from the ridge model
# (the saved log below reports 10 candidates x 10 folds).
tuned_ridge = tune_model(ridge)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8333,0.0,0.6667,0.8571,0.75,0.6279,0.6391
1,0.875,0.0,0.7778,0.875,0.8235,0.7273,0.7303
2,0.7917,0.0,0.6667,0.75,0.7059,0.5455,0.5477
3,0.7083,0.0,0.4444,0.6667,0.5333,0.3333,0.3478
4,0.8333,0.0,0.7778,0.7778,0.7778,0.6444,0.6444
5,0.8696,0.0,0.625,1.0,0.7692,0.6849,0.7217
6,0.8261,0.0,0.75,0.75,0.75,0.6167,0.6167
7,0.913,0.0,0.75,1.0,0.8571,0.7965,0.8135
8,0.913,0.0,1.0,0.8,0.8889,0.8189,0.8327
9,0.7826,0.0,0.625,0.7143,0.6667,0.5064,0.5089


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
# Score the tuned ridge model; with no `data` argument PyCaret predicts on the
# experiment's hold-out set (here the stratified test split passed to setup).
predict_model(tuned_ridge)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.8644,0.8308,0.7143,0.8824,0.7895,0.6911,0.6995


Unnamed: 0,trestbps,chol,thalach,fbs,restecg,exang,age,sex,cp,oldpeak,num,prediction_label
293,130.0,182.0,148.0,0.0,0.0,0.0,53.0,1.0,4.0,0.0,0.0,0
246,120.0,171.0,137.0,0.0,0.0,0.0,54.0,1.0,1.0,2.0,0.0,0
171,120.0,243.0,160.0,0.0,0.0,0.0,29.0,1.0,2.0,0.0,0.0,0
290,120.0,166.0,180.0,0.0,0.0,0.0,36.0,1.0,2.0,0.0,0.0,0
287,140.0,250.848709,140.0,0.0,0.0,0.0,59.0,1.0,4.0,0.0,0.0,0
207,120.0,308.0,180.0,0.0,2.0,0.0,35.0,1.0,2.0,0.0,0.0,0
140,160.0,331.0,94.0,0.0,0.0,1.0,52.0,1.0,4.0,2.5,1.0,1
257,130.0,394.0,150.0,0.0,2.0,0.0,55.0,0.0,2.0,0.0,0.0,0
104,118.0,186.0,124.0,0.0,0.0,0.0,46.0,1.0,4.0,0.0,1.0,0
83,160.0,196.0,165.0,0.0,0.0,0.0,52.0,1.0,2.0,0.0,0.0,0
