Dieses Notebook untersucht die Performance des Modells, wenn jeweils nur Artefakte einer einzelnen Klasse (`eyem` bzw. `musc`) als Artefakt gelten.

In [1]:
import h5py
from utils.labeling import label_all_files

# Windowing parameters; they are forwarded unchanged to label_all_files.
window_size_sec = 1.0
window_overlap = 0.0
overlap_treshold = 0.3
# Directory that gets labeled and the HDF5 file that receives the label datasets.
dir_path = "../../../../tuar_processed"
hdf5_path = "./features/features.hdf5"
# Every class listed here gets its own labeling pass and its own dataset.
classes = ['eyem', 'musc']

# Storage options shared by every label dataset written below.
dataset_options = dict(compression="gzip", shuffle=True, chunks=True)

for artifact_class in classes:
    dataset_name = f'labels_{artifact_class}'
    # Only the current class is passed in the class list of this pass.
    label_generator = label_all_files(
        dir_path, window_size_sec, window_overlap, overlap_treshold, [artifact_class]
    )

    # Mode 'a' keeps whatever the file already contains and adds the labels to it.
    with h5py.File(hdf5_path, 'a') as hdf5_file:
        for session_name, channel_labels in label_generator:
            session_group = hdf5_file.require_group(session_name)

            # Each channel maps to a 2-tuple; only its second element (the labels) is stored.
            for channel_name, (_signal, window_labels) in channel_labels.items():
                channel_group = session_group.require_group(channel_name)

                # Remove a dataset left over from an earlier run so the cell can be re-executed.
                if dataset_name in channel_group:
                    del channel_group[dataset_name]

                channel_group.create_dataset(dataset_name, data=window_labels, **dataset_options)

Labeling files in ../../../../tuar_processed: 0it [00:00, ?it/s]
Labeling files in ../../../../tuar_processed/train: 100%|██████████| 320/320 [00:29<00:00, 10.93it/s]
Labeling files in ../../../../tuar_processed/val: 100%|██████████| 98/98 [00:08<00:00, 11.53it/s]
Labeling files in ../../../../tuar_processed/test: 100%|██████████| 118/118 [00:10<00:00, 11.31it/s]
Labeling files in ../../../../tuar_processed: 0it [00:00, ?it/s]
Labeling files in ../../../../tuar_processed/train: 100%|██████████| 320/320 [00:29<00:00, 10.83it/s]
Labeling files in ../../../../tuar_processed/val: 100%|██████████| 98/98 [00:07<00:00, 13.68it/s]
Labeling files in ../../../../tuar_processed/test: 100%|██████████| 118/118 [00:09<00:00, 12.88it/s]


In [4]:
import h5py

# Spot check: print a slice of the `labels_eyem` dataset of one session/channel.
# The file is opened explicitly read-only so that this inspection can never create
# or modify it (h5py releases before 3.0 fell back to append mode when no mode was given).
with h5py.File('features/features.hdf5', 'r') as f:
    print(f['aaaaapas_s005_t000']['FP2-F4']['labels_eyem'][579:600])

[1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


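Das Labeling scheint geklappt zu haben: Der Datensatz `labels_eyem` existiert und enthält im geprüften Ausschnitt sowohl Einsen als auch Nullen. Nun wird pro Artefaktklasse ein Modell auf diesen Labels trainiert.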

In [6]:
from utils.training import get_features_and_labels
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
from datetime import datetime


import os

# Label dataset to train on and the names of the splits used for fitting / evaluation.
label = 'labels_eyem'
split_val = 'val'
split_train = 'train'
feature_file = "./features/features.hdf5"
data_split_file = "./data_split.yaml"
# Feature names handed to get_features_and_labels.
features = ['mean', 'variance', 'std', 'ptp_amp', 'kurtosis', 'quantile', 'pow_freq_bands', 'hurst_exp', 'line_length',
            'rms', 'higuchi_fd', 'spect_entropy', 'svd_entropy', 'teager_kaiser_energy', 'wavelet_coef_energy',
            'zero_crossings']

# Random forest hyperparameters; the fixed seed keeps the run reproducible.
random_state = 42
max_depth = 25
class_weight = 'balanced'
# The inner format string uses double quotes: reusing the outer quote character inside
# an f-string is a SyntaxError on Python versions before 3.12.
model_save_path = f'./models/{datetime.now().strftime("%d-%m-%y %H-%M-%S")}.joblib'
# Create the target directory before training so a missing folder cannot make
# joblib.dump fail after the (slow) fit has already finished.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

X_train, y_train = get_features_and_labels(feature_file, features, split_train, data_split_file, label)
X_val, y_val = get_features_and_labels(feature_file, features, split_val, data_split_file, label)
clf = RandomForestClassifier(n_jobs=-1, class_weight=class_weight, max_depth=max_depth, random_state=random_state)
clf.fit(X_train, y_train)

# Persist the fitted model under a timestamped file name.
joblib.dump(clf, model_save_path)

y_pred = clf.predict(X_val)

print(f'Classification report on validation set:\n\n{classification_report(y_val, y_pred)}')


Extracting features and labels for sessions: 100%|██████████| 268/268 [00:06<00:00, 39.96it/s]
Extracting features and labels for sessions: 100%|██████████| 268/268 [00:01<00:00, 149.29it/s]


Classification report on validation set:

              precision    recall  f1-score   support

           0       0.97      0.96      0.96   1129523
           1       0.24      0.30      0.27     52603

    accuracy                           0.93   1182126
   macro avg       0.60      0.63      0.61   1182126
weighted avg       0.93      0.93      0.93   1182126



In [7]:
from utils.training import get_features_and_labels
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
from datetime import datetime


import os

# Label dataset to train on and the names of the splits used for fitting / evaluation.
label = 'labels_musc'
split_val = 'val'
split_train = 'train'
feature_file = "./features/features.hdf5"
data_split_file = "./data_split.yaml"
# Feature names handed to get_features_and_labels.
features = ['mean', 'variance', 'std', 'ptp_amp', 'kurtosis', 'quantile', 'pow_freq_bands', 'hurst_exp', 'line_length',
            'rms', 'higuchi_fd', 'spect_entropy', 'svd_entropy', 'teager_kaiser_energy', 'wavelet_coef_energy',
            'zero_crossings']

# Random forest hyperparameters; the fixed seed keeps the run reproducible.
random_state = 42
max_depth = 25
class_weight = 'balanced'
# The inner format string uses double quotes: reusing the outer quote character inside
# an f-string is a SyntaxError on Python versions before 3.12.
model_save_path = f'./models/{datetime.now().strftime("%d-%m-%y %H-%M-%S")}.joblib'
# Create the target directory before training so a missing folder cannot make
# joblib.dump fail after the (slow) fit has already finished.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

X_train, y_train = get_features_and_labels(feature_file, features, split_train, data_split_file, label)
X_val, y_val = get_features_and_labels(feature_file, features, split_val, data_split_file, label)
clf = RandomForestClassifier(n_jobs=-1, class_weight=class_weight, max_depth=max_depth, random_state=random_state)
clf.fit(X_train, y_train)

# Persist the fitted model under a timestamped file name.
joblib.dump(clf, model_save_path)

y_pred = clf.predict(X_val)

print(f'Classification report on validation set:\n\n{classification_report(y_val, y_pred)}')


Extracting features and labels for sessions: 100%|██████████| 268/268 [00:05<00:00, 53.38it/s]
Extracting features and labels for sessions: 100%|██████████| 268/268 [00:01<00:00, 179.54it/s]


Classification report on validation set:

              precision    recall  f1-score   support

           0       0.98      0.90      0.94   1118489
           1       0.29      0.70      0.41     63637

    accuracy                           0.89   1182126
   macro avg       0.64      0.80      0.67   1182126
weighted avg       0.94      0.89      0.91   1182126

