# Setup

In [1]:
# Project root directory, relative to the directory the notebook is started from.
# The setup cell below changes into it when the `pipeline` package is not
# found in the current working directory.
project_root = ".."

In [2]:
import random, os
import numpy as np

# Make the project root the working directory so that the project-local
# `pipeline` package can be imported and relative paths resolve from the root.
# The check keeps the cell idempotent: once `pipeline` is visible from the
# current directory, re-running the cell does not change directory again.
if not os.path.isdir("pipeline"):
    os.chdir(project_root)
    # Fail early with a useful message instead of a bare AssertionError or a
    # later ImportError.
    assert os.path.isdir("pipeline"), (
        f"'pipeline' package not found in {os.getcwd()!r}; "
        "check the value of `project_root`"
    )

from pipeline.preprocessing import \
    build_feature_extractor, \
    TCDPdata, \
    gen_datesets, \
    cross_train

# Location of the dataset, given relative to the project root (the working
# directory once the chdir guard above has run).
dataset_root = "assets/the-circor-digiscope-phonocardiogram-dataset-1.0.3"

# Variables

In [3]:
from DL.models import MultiScaleCNN

class args:
    """Experiment configuration, used directly as a namespace (``args.<name>``).

    The class is not instantiated in this notebook: every setting is a class
    attribute and ``set_n_time_frames`` is a static helper that patches the
    model configuration once the data shape is known.
    """

    # Passed to build_feature_extractor together with `use_features`.
    cutoff_frequency = 2000 # use 0 to disable bandpass filter

    # Names of the feature sets passed to build_feature_extractor.
    use_features = [
        "mel_2d"
    ]

    # Dataset variants requested from gen_datesets; the returned X is keyed
    # by these names.
    # NOTE(review): presumably unnormalised / standardised / min-max scaled
    # versions of the same features -- confirm in pipeline.preprocessing.
    use_X = [
        "raw",
        "scaled",
        "minmax",
    ]

    # Passed to gen_datesets to control the train/test split.
    train_size = 0.8

    # Seed for `random`, `numpy.random` and gen_datesets.
    random_state = 2024

    # Models handed to cross_train, keyed by display name.
    # NOTE(review): "class"/"kwargs" are presumably combined as
    # class(**kwargs) inside cross_train -- confirm.
    use_models = {
        "CNN2D": {
            "class": MultiScaleCNN,
            "kwargs": {
                # (n_mels, time_frames, channels); the None placeholder is
                # filled in by set_n_time_frames once the data is loaded.
                "input_shape": [128, None, 1],
                "num_classes": 2,
                "learning_rate": 0.001,
                "epochs": 1,
                "batch_size": 32,
            }
        }
    }

    @staticmethod
    def set_n_time_frames(n_time_frames: int, model_name: str = "CNN2D") -> None:
        """Fill in the time dimension of a configured model's ``input_shape``.

        The ``input_shape`` list is modified in place, so the change is
        visible to everything that reads ``args.use_models`` afterwards.

        Declared as a static method so that it works both as
        ``args.set_n_time_frames(n)`` and on an instance of ``args``.

        Args:
            n_time_frames: Size of the time axis of the model input.
            model_name: Key in ``use_models`` whose configuration is updated.
                Defaults to ``"CNN2D"``, the model configured above.

        Raises:
            KeyError: If ``model_name`` is not a key of ``use_models``.
        """
        args.use_models[model_name]["kwargs"]['input_shape'][1] = n_time_frames

# Seed Python's and NumPy's global random number generators with the
# configured seed.
# NOTE(review): the deep-learning backend behind MultiScaleCNN is not seeded
# here; if it keeps its own RNG, training results can still differ between
# runs -- confirm and seed it as well if reproducibility matters.
random.seed(args.random_state)
np.random.seed(args.random_state)

# Extract Features

In [4]:
# Build the extractor for the configured feature sets and cutoff frequency,
# then run it over the dataset to obtain the feature array and the labels.
extract_features = build_feature_extractor(
    args.use_features,
    args.cutoff_frequency
)
features, labels = TCDPdata(dataset_root).getXy(extract_features)

## defensive checking (currently disabled)
# NOTE(review): the numbers below look like the expected sample count and
# positive-label count for the full dataset -- confirm before re-enabling.
# assert len(labels) == 3159
# assert sum(labels) == 1632
# `features` is treated as (files, channels, time) by the dataset-generation
# cell, so shape[1] is the size of the channel axis, not a flat feature count.
print('n features:', features.shape[1])
print('mean of labels:', labels.mean())

100%|██████████| 3159/3159 [02:40<00:00, 19.64it/s]


n features: 128
mean of labels: 0.51661918328585


# Generate Dataset

In [5]:
# `features` is a 3-D array: axis 0 indexes files, axis 1 channels, axis 2 time.
# Each channel (axis 1) is normalised on its own, which means the normaliser
# has to aggregate over the remaining axes -- files and time -- hence
# normalize_axis=(0, 2).
X, y = gen_datesets(
    features,
    labels,
    args.use_X,
    args.train_size,
    args.random_state,
    normalize_axis=(0, 2),
)

# Report the train/test array shapes of every dataset variant.
for x_type, splits in X.items():
    print(x_type, splits['train'].shape, splits['test'].shape)

raw (2527, 128, 505) (632, 128, 505)
scaled (2527, 128, 505) (632, 128, 505)
minmax (2527, 128, 505) (632, 128, 505)


In [6]:
# The size of the time axis is only known once the datasets exist, so the
# model configuration is completed here. Read it from the first available
# dataset variant rather than a hard-coded "raw" key, so the cell keeps working
# when "raw" is removed from args.use_X.
first_x_type = next(iter(X))
n_time_frames = X[first_x_type]['train'].shape[2]

# Every variant is trained with the same model configuration, so all of them
# must agree on the time dimension.
assert all(
    splits[part].shape[2] == n_time_frames
    for splits in X.values()
    for part in ('train', 'test')
), "dataset variants disagree on the number of time frames"

args.set_n_time_frames(n_time_frames)
print(f"Model has aligned its input layer with n_time_frames: {n_time_frames}")

Model has aligned its input layer with n_time_frames: 505


# Train Models

In [7]:
"""
In case you run the next cell accidently,
which can make you lose all the data.
You need to run the cell first before the next one.
"""
models = {}
scores = {}

In [8]:
# Guard 1: only train into empty containers, so existing results are never
# overwritten by an accidental re-run of this cell.
assert len(models) == 0 and len(scores) == 0, "rerun the cell above to start a new experiment"
# Guard 2: there must be something to train on and something to train.
assert len(args.use_X) >= 1 and len(args.use_models) >= 1, "at least one pair of train/test sets and one model is required"

# Train every configured model on every dataset variant and collect the
# fitted models together with their scores.
models, scores = cross_train(X, y, args.use_models)

Dataset: raw, Model: CNN2D, Training...
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 2s/step - accuracy: 0.5426 - loss: 1.7443
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 406ms/step
Performance on train set:
{'accuracy': 0.5686584711074829,
 'auc': 0.5705257312803523,
 'f1': 0.5587044534412956}
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 408ms/step
Performance on test set:
{'accuracy': 0.530063271522522,
 'auc': 0.5301937809824245,
 'f1': 0.5107084019769358}

Dataset: scaled, Model: CNN2D, Training...
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 2s/step - accuracy: 0.5090 - loss: 0.6936
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 412ms/step
Performance on train set:
{'accuracy': 0.520379900932312, 'auc': 0.5, 'f1': 0.6845393024466424}
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 396ms/step
Performance on test set:
{'accuracy': 0.5015822649002075, 'auc': 0.5, 'f1': 0.6680716543730243}

In [9]:
# Display the collected metrics, nested as
# dataset variant -> model name -> split ('train'/'test') -> metric.
scores

{'raw': {'CNN2D': {'train': {'accuracy': 0.5686584711074829,
    'f1': 0.5587044534412956,
    'auc': 0.5705257312803523},
   'test': {'accuracy': 0.530063271522522,
    'f1': 0.5107084019769358,
    'auc': 0.5301937809824245}}},
 'scaled': {'CNN2D': {'train': {'accuracy': 0.520379900932312,
    'f1': 0.6845393024466424,
    'auc': 0.5},
   'test': {'accuracy': 0.5015822649002075,
    'f1': 0.6680716543730243,
    'auc': 0.5}}},
 'minmax': {'CNN2D': {'train': {'accuracy': 0.5512465238571167,
    'f1': 0.6792986425339367,
    'auc': 0.5358619131875164},
   'test': {'accuracy': 0.5158227682113647,
    'f1': 0.6498855835240275,
    'auc': 0.5146161934805468}}}}