# Setup

In [1]:
import random, os
import numpy as np


# The imports that follow pull in `pipeline` as a top-level package and the
# dataset path is relative, so step up one directory when the current working
# directory has no entry named "pipeline" (e.g. when the notebook is started
# from a subfolder of the project).
if "pipeline" not in os.listdir(os.curdir):
    os.chdir(os.pardir)

from pipeline.preprocessing import \
    build_feature_extractor, \
    TCDPdata, \
    gen_datesets, \
    cross_train

# Location of the CirCor DigiScope phonocardiogram dataset (v1.0.3).
# The path is relative, so it is resolved against the current working directory.
dataset_root = "assets/the-circor-digiscope-phonocardiogram-dataset-1.0.3"

# Variables

In [19]:
from sklearn.neural_network import MLPClassifier

class args:
    """Experiment configuration, read by the later cells as ``args.<name>``."""

    # Passed to build_feature_extractor; use 0 to disable bandpass filter.
    # NOTE(review): unit is presumably Hz - confirm in pipeline.preprocessing.
    cutoff_frequency = 2000

    # Names of the feature groups passed to build_feature_extractor.
    use_features = ["chromagram", "melspectrogram", "mfcc", "csv"]

    # Names of the dataset variants passed to gen_datesets.
    use_X = ["raw", "scaled", "minmax"]

    # Passed to gen_datesets.
    # NOTE(review): presumably the fraction of samples used for training - confirm.
    train_size = 0.8

    # One seed shared by the global RNGs, gen_datesets and the model kwargs below.
    random_state = 2024

    # Model name -> {"class": estimator class, "kwargs": keyword arguments}.
    # NOTE(review): "kwargs" is presumably forwarded to "class" by cross_train - confirm.
    use_models = {
        "MLP": {
            "class": MLPClassifier,
            "kwargs": {
                "hidden_layer_sizes": (150, 150, 100, 100),
                "activation": "logistic",
                "max_iter": 100,
                "random_state": random_state,
            },
        },
    }

# Seed NumPy's and Python's global random number generators with the shared
# experiment seed so that code relying on either of them is repeatable.
np.random.seed(args.random_state)
random.seed(args.random_state)

# Extract Features

In [3]:
# Disabled alternative to the feature-extraction cell below: instead of
# recomputing the features, read them back from ./assets/feature.csv and fetch
# only the labels from the dataset (the extractor passed to getXy returns None).
# NOTE(review): the doubly-commented `to_csv` line looks like the one-off step
# that wrote that CSV in the first place - confirm before relying on it.
# import pandas as pd

# # pd.DataFrame(features).to_csv("./assets/feature.csv", header=False, index=False)
# _, labels = TCDPdata(dataset_root).getXy(lambda _: None)
# features = np.array(pd.read_csv("./assets/feature.csv", header=None))

# assert len(labels) == 3159
# assert sum(labels) == 1632
# print('n features:', features.shape[1])

In [4]:
# Run the configured feature extractor over the dataset under `dataset_root`.
features, labels = TCDPdata(dataset_root).getXy(
    build_feature_extractor(args.use_features, args.cutoff_frequency)
)

# Sanity checks on the loaded labels: expected count and expected sum.
# NOTE(review): the sum is presumably the number of positive samples - confirm.
assert len(labels) == 3159
assert sum(labels) == 1632

# The second axis of `features` is reported as the number of features.
print(f"n features: {features.shape[1]}")

  0%|          | 0/3159 [00:00<?, ?it/s]

100%|██████████| 3159/3159 [04:02<00:00, 13.01it/s]

n features: 254





# Generate Dataset

In [5]:
# Build the dataset variants requested in `args.use_X` from the extracted
# features and labels, using the configured train size and seed.
X, y = gen_datesets(
    features,
    labels,
    args.use_X,
    args.train_size,
    args.random_state,
)

# List the names of the variants that were produced.
print(*X.keys())

raw scaled minmax


# Train Models

In [20]:
"""
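This cell resets the experiment state (`models` and `scores`).
It is kept separate from the training cell below so that accidentally
re-running the training cell cannot silently discard existing results.
Run this cell first whenever you want to start a new experiment.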
"""
# Two fresh, independent containers; the training cell insists on both being empty.
models, scores = {}, {}

In [21]:
# Refuse to train over existing results: both containers must still be empty,
# which is only the case right after the reset cell above has been run.
assert len(models) == 0 and len(scores) == 0, "rerun the cell above to start a new experiment"

# The configuration must name at least one dataset variant and one model.
# NOTE(review): this checks `args.use_X`, not the `X` actually passed below;
# if `args` was edited after the datasets were generated the two can differ.
assert min(len(args.use_X), len(args.use_models)) > 0, "at least one pair of train/test sets and one model is required"

# Train the configured models on the generated datasets and keep the results.
models, scores = cross_train(X, y, args.use_models)

Dataset: raw, Model: MLP, Training...




Performance on train set:
{'accuracy': 0.999604273842501,
 'auc': 0.9995874587458745,
 'f1': 0.999619916381604}
Performance on test set:
{'accuracy': 0.6123417721518988,
 'auc': 0.612222722948275,
 'f1': 0.6270928462709284}

Dataset: scaled, Model: MLP, Training...




Performance on train set:
{'accuracy': 0.9928769291650178,
 'auc': 0.9929943279499053,
 'f1': 0.9931350114416476}
Performance on test set:
{'accuracy': 0.6503164556962026,
 'auc': 0.6501777577487358,
 'f1': 0.6656580937972769}

Dataset: minmax, Model: MLP, Training...
Performance on train set:
{'accuracy': 0.6913335971507717,
 'auc': 0.6839049304169962,
 'f1': 0.7449313276651407}
Performance on test set:
{'accuracy': 0.6313291139240507,
 'auc': 0.6306894997746734,
 'f1': 0.6938239159001315}

