# Setup

In [1]:
import random, os
import numpy as np


if "pipeline" not in os.listdir():
    os.chdir("..")

from pipeline.preprocessing import \
    build_feature_extractor, \
    TCDPdata, \
    gen_datesets, \
    cross_train

# constant
dataset_root = "assets/the-circor-digiscope-phonocardiogram-dataset-1.0.3"

# Variables

In [2]:
from sklearn.neural_network import MLPClassifier

class args:
    
    cutoff_frequency = 2000 # use 0 to disable bandpass filter

    use_features = [
        "chromagram",
        "melspectrogram",
        "mfcc",
        "csv"
    ]

    use_X = [
        "raw",
        "scaled",
        "minmax"
    ]

    train_size = 0.8

    random_state = 2024

    use_models = {
        "MLP": {
            "class": MLPClassifier,
            "kwargs": {
                "hidden_layer_sizes": (
                    100,
                    200,
                    # 200,
                    # 200
                ),
                "activation": 'logistic',
                "max_iter": 200,
                "random_state": random_state
            }
        }
    }

random.seed(args.random_state)
np.random.seed(args.random_state)

# Extract Features

In [3]:
# import pandas as pd

# # pd.DataFrame(features).to_csv("./assets/feature.csv", header=False, index=False)
# _, labels = TCDPdata(dataset_root).getXy(lambda _: None)
# features = np.array(pd.read_csv("./assets/feature.csv", header=None))

## defensive checking
# assert len(labels) == 3159
# assert sum(labels) == 1632
# print('n features:', features.shape[1])

In [4]:
features, labels = TCDPdata(dataset_root).getXy(build_feature_extractor(args.use_features, args.cutoff_frequency))

## defensive checking
# assert len(labels) == 3159
# assert sum(labels) == 1632
print('n features:', features.shape[1])
print('mean of labels:', features.shape[1])

100%|██████████| 3159/3159 [05:46<00:00,  9.13it/s]

n features: 254
mean of labels: 254





# Generate Dataset

In [5]:
X, y = gen_datesets(features, labels, args.use_X, args.train_size, args.random_state, normalize_axis=0)


for x_type in X.keys():
    print(x_type, X[x_type]['train'].shape, X[x_type]['test'].shape)

raw (2527, 254) (632, 254)
scaled (2527, 254) (632, 254)
minmax (2527, 254) (632, 254)


# Train Models

In [6]:
"""
In case you run the next cell accidently,
which can make you lose all the data.
You need to run the cell first before the next one.
"""
models = {}
scores = {}

In [7]:
assert len(scores) == len(models) == 0, "rerun the cell above to start a new experiment"
assert len(args.use_X) > 0 and len(args.use_models) > 0, "at least one pair of train/test sets and one model is required"

models, scores = cross_train(X, y, args.use_models)

Dataset: raw, Model: MLP, Training...




Performance on train set:
{'accuracy': 1.0, 'auc': 1.0, 'f1': 1.0}
Performance on test set:
{'accuracy': 0.6455696202531646,
 'auc': 0.6454058384657754,
 'f1': 0.6636636636636636}

Dataset: scaled, Model: MLP, Training...




Performance on train set:
{'accuracy': 1.0, 'auc': 1.0, 'f1': 1.0}
Performance on test set:
{'accuracy': 0.6645569620253164,
 'auc': 0.6644835010765611,
 'f1': 0.6728395061728395}

Dataset: minmax, Model: MLP, Training...
Performance on train set:
{'accuracy': 0.6838148001582904,
 'auc': 0.685631329292625,
 'f1': 0.6784708249496981}
Performance on test set:
{'accuracy': 0.6186708860759493,
 'auc': 0.6188122778028141,
 'f1': 0.6016528925619835}



In [8]:
scores

{'raw': {'MLP': {'train': {'accuracy': 1.0, 'f1': 1.0, 'auc': 1.0},
   'test': {'accuracy': 0.6455696202531646,
    'f1': 0.6636636636636636,
    'auc': 0.6454058384657754}}},
 'scaled': {'MLP': {'train': {'accuracy': 1.0, 'f1': 1.0, 'auc': 1.0},
   'test': {'accuracy': 0.6645569620253164,
    'f1': 0.6728395061728395,
    'auc': 0.6644835010765611}}},
 'minmax': {'MLP': {'train': {'accuracy': 0.6838148001582904,
    'f1': 0.6784708249496981,
    'auc': 0.685631329292625},
   'test': {'accuracy': 0.6186708860759493,
    'f1': 0.6016528925619835,
    'auc': 0.6188122778028141}}}}