# Setup

In [1]:
# Relative path from the initial working directory to the project root
# (the directory that contains the `pipeline` package). The setup cell
# changes into it when `pipeline` is not found in the current directory.
project_root = ".."

In [2]:
import random, os
import numpy as np

# Make sure the working directory is the project root, i.e. the directory
# that holds `pipeline` (presumably so the project imports below and the
# relative asset paths resolve — TODO confirm the import mechanism).
# Re-running the cell is harmless: once `pipeline` is visible nothing happens.
if "pipeline" not in os.listdir():
    os.chdir(project_root)
    # Fail with a readable message instead of a bare AssertionError when
    # project_root does not lead to the expected directory.
    assert "pipeline" in os.listdir(), (
        f"'pipeline' not found in {os.getcwd()!r} after changing directory; "
        "check that project_root points at the project root"
    )

from pipeline.preprocessing import \
    build_feature_extractor, \
    TCDPdata, \
    gen_datesets, \
    cross_train

# Constant: location of the dataset, given relative to the working directory
# (the project root once the directory guard earlier in this cell has run).
dataset_root = "assets/the-circor-digiscope-phonocardiogram-dataset-1.0.3"

# Variables

In [10]:
from DL.models import MultiScaleCNN

class args:
    """Experiment configuration, used as a plain namespace of class attributes.

    The notebook reads the attributes straight from the class
    (``args.use_X`` and so on); it never creates an instance.
    """

    cutoff_frequency = 2000 # use 0 to disable bandpass filter

    # Feature names handed to build_feature_extractor.
    use_features = [
        "mel_2d"
    ]

    # Dataset variant names handed to gen_datesets
    # (presumably the keys of the resulting X dictionary — TODO confirm).
    use_X = [
        "raw",
        "scaled",
        "minmax",
    ]

    # Train split size forwarded to gen_datesets.
    train_size = 0.8

    # Seed used for the stdlib/NumPy RNGs and forwarded to gen_datesets.
    random_state = 2024

    # Models to train, keyed by display name. "class" is the model class and
    # "kwargs" its keyword arguments (presumably given to the constructor by
    # cross_train — TODO confirm).
    use_models = {
        "CNN2D": {
            "class": MultiScaleCNN,
            "kwargs": {
                # The middle entry is unknown until the datasets exist;
                # set_n_time_frames fills it in.
                "input_shape": [128, None, 1],  # (n_mels, time_frames, channels)
                "num_classes": 2,
                "learning_rate": 0.001,
                "epochs": 10,
                "batch_size": 32,
            }
        }
    }

    @staticmethod
    def set_n_time_frames(n_time_frames: int) -> None:
        """Write ``n_time_frames`` into slot 1 of the CNN2D ``input_shape``.

        Declared as a static method so that it works both as
        ``args.set_n_time_frames(n)`` and on an instance of ``args``.

        The ``input_shape`` list is modified in place, so every holder of
        ``args.use_models`` sees the new value.

        Args:
            n_time_frames: value stored as ``input_shape[1]``.
        """
        args.use_models["CNN2D"]["kwargs"]['input_shape'][1] = n_time_frames

# Seed the stdlib RNG first, then NumPy's global RNG, from the configured seed.
# NOTE(review): no other RNG is seeded here (e.g. one owned by the backend
# behind MultiScaleCNN) — confirm whether DL.models takes care of that.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(args.random_state)

# Extract Features

In [4]:
# Build the extractor from the configured feature names and filter cutoff,
# then run it over the dataset to obtain the feature array and its labels.
extract_features = build_feature_extractor(args.use_features, args.cutoff_frequency)
features, labels = TCDPdata(dataset_root).getXy(extract_features)

## defensive checking (currently disabled)
# assert len(labels) == 3159
# assert sum(labels) == 1632
n_features = features.shape[1]
label_mean = labels.mean()
print('n features:', n_features)
print('mean of labels:', label_mean)

100%|██████████| 3159/3159 [02:40<00:00, 19.64it/s]


n features: 128
mean of labels: 0.51661918328585


# Generate Dataset

In [5]:
# `features` is 3-dimensional: axis 0 indexes files, axis 1 channels, axis 2 time.
# Every channel (axis 1) is to be normalized on its own, so the normalizer
# works across the two remaining axes: normalize_axis=(0, 2).
X, y = gen_datesets(
    features,
    labels,
    args.use_X,
    args.train_size,
    args.random_state,
    normalize_axis=(0, 2),
)

# Print the train/test array shapes of each generated dataset.
for x_type, splits in X.items():
    print(x_type, splits['train'].shape, splits['test'].shape)

raw (2527, 128, 505) (632, 128, 505)
scaled (2527, 128, 505) (632, 128, 505)
minmax (2527, 128, 505) (632, 128, 505)


In [14]:
# Take the time dimension (axis 2) from the first generated dataset rather
# than from a hard-coded 'raw' key, so the cell keeps working when "raw" is
# removed from args.use_X.
first_x_type = next(iter(X))
n_time_frames = X[first_x_type]['train'].shape[2]

# A single input_shape is shared by every dataset, so they all have to agree
# on the number of time frames.
assert all(
    splits[part].shape[2] == n_time_frames
    for splits in X.values()
    for part in ('train', 'test')
), "all train/test sets must share the same number of time frames"

args.set_n_time_frames(n_time_frames)
print(f"Model has aligned its input layer with n_time_frames: {n_time_frames}")

Model has aligned its input layer with n_time_frames: 505


# Train Models

In [15]:
"""
This cell guards against accidentally re-running the next (training) cell,
which would otherwise make you lose all the results collected so far.
Run this cell first, and only then the next one.
"""
# Start a new experiment with empty result containers; the training cell
# asserts that both are empty before it runs.
models = {}
scores = {}

In [16]:
# Refuse to run over existing results: both containers must still be empty.
assert not (scores or models), "rerun the cell above to start a new experiment"
# There must be something to train on and something to train.
assert args.use_X and args.use_models, "at least one pair of train/test sets and one model is required"

# Hand the datasets and the model configuration to cross_train and keep
# what it returns as the new models and scores.
models, scores = cross_train(X, y, args.use_models)

Dataset: raw, Model: CNN2D, Training...
Epoch 1/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 1s/step - accuracy: 0.5146 - loss: 1.5779
Epoch 2/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 1s/step - accuracy: 0.5469 - loss: 0.6864
Epoch 3/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 1s/step - accuracy: 0.5715 - loss: 0.6736
Epoch 4/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 1s/step - accuracy: 0.5868 - loss: 0.6619
Epoch 5/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 1s/step - accuracy: 0.6292 - loss: 0.6395
Epoch 6/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 1s/step - accuracy: 0.6347 - loss: 0.6314
Epoch 7/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 1s/step - accuracy: 0.6330 - loss: 0.6398
Epoch 8/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1627s[0m 21s/step - accuracy: 0.6609 - loss: 0.5814
Epoch 

In [17]:
# Last expression of the cell, so the collected scores are rendered as output.
scores

{'raw': {'CNN2D': {'train': {'accuracy': 0.7273446917533875,
    'f1': 0.7428144830160508,
    'auc': 0.7260992734254414},
   'test': {'accuracy': 0.5443037748336792,
    'f1': 0.5596330275229358,
    'auc': 0.5441990886785839}}},
 'scaled': {'CNN2D': {'train': {'accuracy': 0.6098139882087708,
    'f1': 0.6863867684478372,
    'auc': 0.600860219101758},
   'test': {'accuracy': 0.5822784900665283,
    'f1': 0.66497461928934,
    'auc': 0.5815031796104351}}},
 'minmax': {'CNN2D': {'train': {'accuracy': 0.7867035865783691,
    'f1': 0.7906796116504855,
    'auc': 0.7872372598476578},
   'test': {'accuracy': 0.5458860993385315,
    'f1': 0.5378421900161031,
    'auc': 0.545946622602774}}}}