# Setup

In [8]:
import random, os
import numpy as np


# The `pipeline` package imported below and the relative `dataset_root` path
# are both looked up from the working directory. If no `pipeline` entry is
# listed here, step up one level (presumably the notebook was started from a
# subfolder of the project).
if "pipeline" not in os.listdir(os.curdir):
    os.chdir(os.pardir)

from pipeline.preprocessing import \
    build_feature_extractor, \
    TCDPdata, \
    gen_datesets, \
    cross_train

# Constant: location of the CirCor DigiScope phonocardiogram dataset (v1.0.3).
# The path is relative, so it is resolved against the current working
# directory; it is handed to TCDPdata when the features are extracted.
dataset_root = "assets/the-circor-digiscope-phonocardiogram-dataset-1.0.3"

# Variables

In [21]:
from sklearn.neural_network import MLPClassifier

class args:
    """Experiment settings, read as plain class attributes by the cells below."""

    # Passed to build_feature_extractor; use 0 to disable the bandpass filter.
    cutoff_frequency = 2000

    # Feature groups requested from build_feature_extractor.
    use_features = ["chromagram", "melspectrogram", "mfcc", "csv"]

    # Variants of the feature matrix requested from gen_datesets.
    use_X = ["raw", "scaled", "minmax"]

    # Forwarded to gen_datesets as the size of the training split.
    train_size = 0.8

    # One seed shared by the global RNGs, gen_datesets and the MLP.
    random_state = 2024

    # Models handed to cross_train, keyed by model name. Each entry carries
    # the estimator class and the keyword arguments used to construct it.
    use_models = {
        "MLP": {
            "class": MLPClassifier,
            "kwargs": {
                # Two hidden layers; further 200-unit layers are left disabled.
                "hidden_layer_sizes": (100, 200),
                "activation": "logistic",
                "max_iter": 200,
                "random_state": random_state,
            },
        },
    }

# Seed NumPy's global RNG and Python's `random` module with the same value
# that is passed to the MLP as `random_state`.
np.random.seed(args.random_state)
random.seed(args.random_state)

# Extract Features

In [10]:
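# Optional shortcut: uncomment this cell to reload previously extracted
# features from ./assets/feature.csv instead of re-running the extraction below.
# import pandas as pd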

# # pd.DataFrame(features).to_csv("./assets/feature.csv", header=False, index=False)
# _, labels = TCDPdata(dataset_root).getXy(lambda _: None)
# features = np.array(pd.read_csv("./assets/feature.csv", header=None))

# assert len(labels) == 3159
# assert sum(labels) == 1632
# print('n features:', features.shape[1])

In [11]:
# Build the feature matrix and the labels for the whole dataset, using the
# feature groups and cutoff frequency configured in `args`.
features, labels = TCDPdata(dataset_root).getXy(
    build_feature_extractor(args.use_features, args.cutoff_frequency)
)

# Sanity-check the loaded data against the counts expected for this dataset.
# The messages report the observed value so a failure is diagnosable.
assert len(labels) == 3159, "expected 3159 labels, got {}".format(len(labels))
assert sum(labels) == 1632, "expected labels to sum to 1632, got {}".format(sum(labels))
print('n features:', features.shape[1])

  0%|          | 0/3159 [00:00<?, ?it/s]

100%|██████████| 3159/3159 [03:45<00:00, 14.01it/s]

n features: 254





# Generate Dataset

In [12]:
# Produce one dataset per variant listed in args.use_X, split according to
# args.train_size and seeded with args.random_state.
X, y = gen_datesets(
    features,
    labels,
    args.use_X,
    args.train_size,
    args.random_state,
)

# Show the names of the dataset variants that were produced.
print(" ".join(map(str, X.keys())))

raw scaled minmax


# Train Models

In [22]:
"""
Safeguard against accidentally re-running the training cell below,
which would overwrite the results of the current experiment.
Run this cell first to reset `models` and `scores` before running the next one.
"""
# Fresh, empty result containers; the training cell asserts that both are
# empty before it runs.
models, scores = {}, {}

In [23]:
# Guard: refuse to train while results from a previous run are still held, so
# they are not silently overwritten.
assert len(models) == 0 and len(scores) == 0, "rerun the cell above to start a new experiment"
# Guard: there must be something to train on and something to train.
assert not (len(args.use_X) == 0 or len(args.use_models) == 0), "at least one pair of train/test sets and one model is required"

# Train every configured model on every dataset variant and keep the fitted
# models together with their scores.
models, scores = cross_train(X, y, args.use_models)

Dataset: raw, Model: MLP, Training...




Performance on train set:
{'accuracy': 1.0, 'auc': 1.0, 'f1': 1.0}
Performance on test set:
{'accuracy': 0.6329113924050633,
 'auc': 0.632807570977918,
 'f1': 0.6452599388379204}

Dataset: scaled, Model: MLP, Training...




Performance on train set:
{'accuracy': 1.0, 'auc': 1.0, 'f1': 1.0}
Performance on test set:
{'accuracy': 0.6629746835443038,
 'auc': 0.6628961994892595,
 'f1': 0.6718027734976888}

Dataset: minmax, Model: MLP, Training...
Performance on train set:
{'accuracy': 0.6838148001582904,
 'auc': 0.685631329292625,
 'f1': 0.6784708249496981}
Performance on test set:
{'accuracy': 0.6186708860759493,
 'auc': 0.6188122778028141,
 'f1': 0.6016528925619835}

