In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import os
from wettbewerb import load_references, get_3montages
import mne
from scipy import signal as sig
import ruptures as rpt
import json
import pywt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
training_folder = "../../shared_data/training_mini"

In [3]:
ids, channels, data, sampling_frequencies, reference_systems, eeg_labels = load_references(training_folder) 
# Importiere EKG-Dateien, zugehörige Diagnose, Sampling-Frequenz (Hz) und Name (meist fs=256 Hz)

100	 Dateien wurden geladen.


In [4]:
N_samples = 2000 # Numeber of samples per division
# Decompose the wave
wavelet = 'db4'
scaler = StandardScaler()
normalization = True
features = []
for i,_id in enumerate(ids):
    montage, montage_data, is_missing = get_3montages(channels[i], data[i])
    N_div = len(montage_data[0])//N_samples
    
    # Normalizing data
    if normalization:
        norm_montage0_data = scaler.fit_transform(montage_data[0].reshape(-1,1)).reshape(1,-1)[0]
        norm_montage1_data = scaler.fit_transform(montage_data[1].reshape(-1,1)).reshape(1,-1)[0]
        norm_montage2_data = scaler.fit_transform(montage_data[2].reshape(-1,1)).reshape(1,-1)[0]
    else:
        norm_montage0_data = montage_data[0]
        norm_montage1_data = montage_data[1]
        norm_montage2_data = montage_data[2]
    
    for i in range(N_div):
        features_per_div = np.zeros((15))
        montage0_array = norm_montage0_data[i*N_samples:(i+1)*N_samples]
        montage1_array = norm_montage1_data[i*N_samples:(i+1)*N_samples]
        montage2_array = norm_montage2_data[i*N_samples:(i+1)*N_samples]
        ca4, cd4, cd3, cd2, cd1 = pywt.wavedec(montage0_array, wavelet, level=4)
        montage0_dwt = [ca4, cd4, cd3, cd2, cd1]
        ca4, cd4, cd3, cd2, cd1 = pywt.wavedec(montage1_array, wavelet, level=4)
        montage1_dwt = [ca4, cd4, cd3, cd2, cd1]
        ca4, cd4, cd3, cd2, cd1 = pywt.wavedec(montage2_array, wavelet, level=4)
        montage2_dwt = [ca4, cd4, cd3, cd2, cd1]
        for w in range(len(montage0_dwt)):
            features_per_div[w] = np.sum(np.abs(np.diff(montage0_dwt[w])))/len(montage0_dwt[w]) 
            features_per_div[5+w] = np.sum(np.abs(np.diff(montage1_dwt[w])))/len(montage1_dwt[w])
            features_per_div[10+w] = np.sum(np.abs(np.diff(montage2_dwt[w])))/len(montage2_dwt[w])
        features.append(features_per_div)




In [5]:
labels = []
for i,_id in enumerate(ids):
    if eeg_labels[i][0]:
        onset = eeg_labels[i][1]
        offset = eeg_labels[i][2]
        sample_freq = sampling_frequencies[i]
        total_time = len(data[i][1])/sample_freq
        N_div = len(data[i][1])//N_samples
        for num in range(N_div):
            if (((total_time/N_div)*(num) <= onset) and ((total_time/N_div)*(num+1) > onset)) or (((total_time/N_div)*(num) >= onset) and ((total_time/N_div)*(num) < offset)):
                labels.append([1])
            else:
                labels.append([0])
    else:
        N_div = len(data[i][1])//N_samples
        for num in range(N_div):
            labels.append([0])
labels = np.reshape(labels, (1,-1))[0]
        

In [16]:
param_grid = {
    'n_estimators': [300, 500],
    'max_depth': [8,15],
    'min_samples_leaf': [1, 2]
}

In [17]:
kf = KFold(n_splits=4, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, scoring='accuracy', cv=kf)

In [18]:
grid_search.fit(features,labels)

GridSearchCV(cv=KFold(n_splits=4, random_state=42, shuffle=True),
             estimator=RandomForestClassifier(),
             param_grid={'max_depth': [8, 15], 'min_samples_leaf': [1, 2],
                         'n_estimators': [300, 500]},
             scoring='accuracy')

In [19]:
print(grid_search.best_params_)

{'max_depth': 15, 'min_samples_leaf': 1, 'n_estimators': 300}


In [9]:
print("Accuracy scores:", scores)
print("Mean accuracy:", scores.mean())

Accuracy scores: [0.968      0.97317647 0.97316384 0.96986817]
Mean accuracy: 0.9710521214135371


In [37]:
rf_classifier = rf = RandomForestClassifier(
    n_estimators=500,  # Number of trees in the forest
    max_features="sqrt",  # Number of features to consider at each split
    max_depth=8,  # Maximum depth of each tree
    min_samples_leaf=4,  # Minimum number of samples required to be at a leaf node
)

In [53]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf_classifier, features, labels, cv=kf, scoring="accuracy")  # Replace rf with your model
print("Accuracy scores:", scores)
print("Mean accuracy:", scores.mean())

Accuracy scores: [0.96597243 0.97124413 0.97329812 0.97388498 0.967723  ]
Mean accuracy: 0.9704245321346283


In [None]:
rf_classifier.fit(features, labels)