In [1]:
import numpy as np
import pandas as pd
import covidecg.data.utils as data_utils
import covidecg.features.utils as features_utils
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
import covidecg.models.mlp_models
import skorch
import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight
from skorch.callbacks import EpochScoring, EarlyStopping
import sklearn.svm
import sklearn.metrics


# Load the ECG array and its labels through the project loader
# (first argument: CSV path, second argument: directory path).
X, y = data_utils.load_stress_ecg_runs(
    '../data/interim/ecg_runs.csv',
    '../data/interim/ecg_runs',
)
print(X.shape)

# Encode the raw labels as int64 class indices.
label_encoder = sklearn.preprocessing.LabelEncoder()
y = label_encoder.fit_transform(y).astype(np.int64)

# Shuffled, stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

# 'balanced' class weights derived from the training labels, converted to a float
# tensor so they can be handed to CrossEntropyLoss to mitigate class imbalance.
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=np.unique(y_train), y=y_train),
    dtype=torch.float,
)

(856, 12, 5000)


In [6]:
def debug_shape(x):
    """Print ``x.shape`` and return ``x`` itself, unchanged.

    Pass-through helper for inspecting intermediate values (for example when
    wrapped in a FunctionTransformer between pipeline steps).
    """
    shape = x.shape
    print(shape)
    return x

def debug_dtype(x):
    """Print ``x.dtype`` and return ``x`` itself, unchanged.

    Pass-through helper for inspecting the dtype of intermediate values.
    """
    dtype = x.dtype
    print(dtype)
    return x

# Shared feature pipeline: signal cleaning -> LFCC feature extraction -> flatten_leads.
#
# NOTE(review): the outputs recorded in this notebook show X with shape (856, 12, 5000)
# but dataprep_pipe.fit_transform(X) with shape (10272, 12974), i.e. 12 rows per
# recording, and a non-zero NaN count. Those two facts match the recorded
# cross_val_score failures ("inconsistent numbers of samples: [8208, 684]" and
# "Input X contains NaN"). TODO: confirm that data_utils.flatten_leads keeps one row
# per recording and find out where the NaNs come from (the recorded warning line
# `return x / np.std(x)` hints at a zero standard deviation — verify).
dataprep_pipe = sklearn.pipeline.make_pipeline(
    data_utils.EcgSignalCleaner(),
    features_utils.EcgLfccFeatsExtractor(),
    # FunctionTransformer is part of sklearn.preprocessing's public API;
    # sklearn.pipeline merely imports it for internal use, so do not rely on that.
    sklearn.preprocessing.FunctionTransformer(data_utils.flatten_leads),
)
dataprep_pipe

In [7]:
# Single-lead variant: EcgLeadSelector('MDC_ECG_LEAD_II') (presumably keeps only
# lead II — confirm in data_utils) followed by the shared feature pipeline.
# dataprep_pipe is nested as the very same object, not a copy, so fitting this
# pipeline also fits dataprep_pipe in place.
dataprep_pipe_lead_ii = sklearn.pipeline.make_pipeline(
    data_utils.EcgLeadSelector('MDC_ECG_LEAD_II'), dataprep_pipe)
dataprep_pipe_lead_ii

In [20]:
# Fit the all-leads feature pipeline on the complete dataset X and keep the
# transformed features for inspection. This fits dataprep_pipe in place.
feats = dataprep_pipe.fit_transform(X)

x: float32
cleaned_signals: float64


  return x / np.std(x)


In [21]:
# Sanity check on the extracted features: print the matrix shape, then the
# total number of missing (NaN) entries across all rows and columns.
print(feats.shape)
featsdf = pd.DataFrame(feats)
print(featsdf.isna().to_numpy().sum())

(10272, 12974)
12974


In [None]:
# `targets` is not defined anywhere in this notebook (NameError on a fresh kernel);
# the label-encoded targets are stored in `y`.
print(y)

In [10]:
def _n_flat_features(prep_pipe):
    """Return the number of columns `prep_pipe` produces for the first training sample.

    Fits `prep_pipe` in place on that single sample as a side effect.
    """
    return prep_pipe.fit_transform(X_train[[0]]).shape[1]


def _build_mlp(n_inputs):
    """Create the skorch MLP classifier shared by the MLP experiments."""
    return skorch.NeuralNetClassifier(
        module=covidecg.models.mlp_models.MLP,
        # topology config
        module__input_size=n_inputs,  # number of features in flattened sample
        module__hidden_size=1000,
        # loss config: class-weighted cross entropy
        criterion=nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        # hyperparams
        batch_size=32,
        lr=1e-4,
        optimizer__momentum=0.9,
        max_epochs=20,
        iterator_train__shuffle=True,  # reshuffle the training data every epoch
        callbacks=[
            # additional score to observe
            EpochScoring(scoring='roc_auc', lower_is_better=False),
            # early stopping based on validation loss
            EarlyStopping(patience=3),
        ],
    )


# (experiment name, full pipeline) pairs; disabled experiments are kept as comments.
experiments = [
    # SVM (linear kernel)
    ('lfcc__all_leads__svm_linear',
     sklearn.pipeline.make_pipeline(dataprep_pipe, sklearn.svm.SVC(kernel='linear'))),
    ('lfcc__lead_ii__svm_linear',
     sklearn.pipeline.make_pipeline(dataprep_pipe_lead_ii, sklearn.svm.SVC(kernel='linear'))),

    # SVM (polynomial kernel)
    # ('lfcc__all_leads__svm_polyn',
    #  sklearn.pipeline.make_pipeline(dataprep_pipe, sklearn.svm.SVC(kernel='poly'))),
    ('lfcc__lead_ii__svm_polyn',
     sklearn.pipeline.make_pipeline(dataprep_pipe_lead_ii, sklearn.svm.SVC(kernel='poly'))),

    # MLP classifier
    # ('lfcc__all_leads__mlp',
    #  sklearn.pipeline.make_pipeline(dataprep_pipe, _build_mlp(_n_flat_features(dataprep_pipe)))),
    ('lfcc__lead_ii__mlp',
     sklearn.pipeline.make_pipeline(
         dataprep_pipe_lead_ii,
         _build_mlp(_n_flat_features(dataprep_pipe_lead_ii)))),
]

# Cross-validate every experiment pipeline (5 folds) and print the mean ROC AUC.
#
# The built-in 'roc_auc' scorer feeds continuous scores (decision_function or
# predict_proba) into the metric. make_scorer(roc_auc_score) instead scores the hard
# class labels returned by predict(), which reduces the ROC curve to a single
# operating point. Like the roc_auc_score call it replaces, this assumes a binary target.
#
# NOTE(review): cross-validation runs on the full X / y, so the X_test / y_test
# hold-out created earlier is part of the folds — confirm this is intended.
for _exp_name, _pipe in experiments:
    print(f"Fitting and evaluating {_exp_name}")
    cvscore = sklearn.model_selection.cross_val_score(
        _pipe, X=X, y=y, cv=5, scoring='roc_auc')
    print(f"Mean ROC/AUC Score after 5-fold cross validation: {np.mean(cvscore)}")
    print()

x: float32
cleaned_signals: float64
Fitting and evaluating lfcc__all_leads__svm_linear
x: float32
cleaned_signals: float64
x: float32
cleaned_signals: float64


  return x / np.std(x)


x: float32
cleaned_signals: float64


  return x / np.std(x)


x: float32
cleaned_signals: float64


  return x / np.std(x)


x: float32
cleaned_signals: float64


  return x / np.std(x)


ValueError: 
All the 5 fits failed.
It is is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1088, in check_X_y
    check_consistent_length(X, y)
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 383, in check_consistent_length
    raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [8208, 684]

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1070, in check_X_y
    X = check_array(
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 895, in check_array
    _assert_all_finite(
  File "/mnt/md0/user/scheuererra68323/covid-ecg/.venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 142, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html


In [None]:
# import matplotlib.pyplot as plt
# plt.plot(clf.history[:, 'train_loss'], label='train_loss')
# plt.plot(clf.history[:, 'valid_loss'], label='valid_loss')
# plt.legend()
# plt.show()

In [None]:
# sklearn.metrics.ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)  # replaces plot_confusion_matrix, which was removed in scikit-learn 1.2