In [74]:
# -*- coding: utf-8 -*-



import csv
import matplotlib.pyplot as plt
import numpy as np
import os
from wettbewerb import load_references, get_3montages
import mne
from scipy import signal as sig
import ruptures as rpt
import json
import pywt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
import pandas as pd

def plot_roc_curve(fprs, tprs):
    """Plot the Receiver Operating Characteristic from a list
    of true positive rates and false positive rates."""
    
    # Initialize useful lists + the plot axes.
    tprs_interp = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    f, ax = plt.subplots(figsize=(14,10))
    
    # Plot ROC for each K-Fold + compute AUC scores.
    for i, (fpr, tpr) in enumerate(zip(fprs, tprs)):
        tprs_interp.append(np.interp(mean_fpr, fpr, tpr))
        tprs_interp[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        ax.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        
    # Plot the luck line.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)
    
    # Plot the mean ROC.
    mean_tpr = np.mean(tprs_interp, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)
    
    # Plot the standard deviation around the mean ROC.
    std_tpr = np.std(tprs_interp, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')
    
    # Fine tune and show the plot.
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.legend(loc="lower right")
    plt.show()
    return (f, ax)

def compute_roc_auc(index):
    y_predict = rf_classifier.predict_proba(features[index])[:,1]
    fpr, tpr, thresholds = roc_curve(labels[index], y_predict)
    auc_score = auc(fpr, tpr)
    return fpr, tpr, auc_score

In [2]:
### if __name__ == '__main__':  # bei multiprocessing auf Windows notwendig

training_folder  = "../training"


print('Loading Dataset')
ids, channels, data, sampling_frequencies, reference_systems, eeg_labels = load_references(training_folder) # Importiere EEG-Dateien, zugehörige Kanalbenennung, Sampling-Frequenz (Hz) und Name (meist fs=256 Hz), sowie Referenzsystem
print('Dataset loaded')

Loading Dataset
1018	 Dateien wurden geladen.
Dataset loaded


In [77]:
N_div = 2 # Numeber of subdivisions
# Decompose the wave
wavelet = 'db4'
dataset_montage_line_length_array = np.zeros((len(ids),15*N_div))
for i,_id in enumerate(ids):
    montage, montage_data, is_missing = get_3montages(channels[i], data[i])
    montage_line_length_array = np.zeros((15*N_div))
    for y in range(N_div):
        for j, signal_name in enumerate(montage):
            temp = np.zeros((N_div*5))
            ca4, cd4, cd3, cd2, cd1 = pywt.wavedec(montage_data[j], wavelet, level=4)
            dwt_array = [ca4, cd4, cd3, cd2, cd1]
            for w in range(len(dwt_array)):
                array_divided = np.array_split(dwt_array[w],N_div)
                
                    # temp[(y+(w*N_div))] = np.sum(np.abs(np.diff(array_divided[y])))/len(array_divided[y])
            montage_line_length_array[(j*N_div*5):(j*N_div*5)+N_div*5] = temp
                
    dataset_montage_line_length_array[i] = montage_line_length_array

features = dataset_montage_line_length_array

In [65]:
labels = np.zeros((len(eeg_labels),N_div))
for i,_id in enumerate(ids):
    if eeg_labels[i][0]:
        onset = eeg_labels[i][1]
        offset = eeg_labels[i][2]
        sample_freq = sampling_frequencies[i]
        total_time = len(data[i][1])/sample_freq
        for num in range(N_div):
            if (total_time/N_div)*(num+1) >= onset and (total_time/N_div)*(num+1) <= offset:
                labels[i][num] = 1
        
        

#labels = np.array(eeg_labels, dtype=int)[:,0]

In [72]:
rf_classifier = rf = RandomForestClassifier(
    n_estimators=500,  # Number of trees in the forest
    max_features="sqrt",  # Number of features to consider at each split
    max_depth=8,  # Maximum depth of each tree
    min_samples_leaf=4,  # Minimum number of samples required to be at a leaf node
)

cv = KFold(n_splits=5, random_state=123, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs, tprs, scores = [], [], []

In [73]:
for (train, test), i in zip(cv.split(features, labels), range(5)):
    rf_classifier.fit(features[train], labels[train])
    _, _, auc_score_train = compute_roc_auc(train)
    fpr, tpr, auc_score = compute_roc_auc(test)
    scores.append((auc_score_train, auc_score))
    fprs.append(fpr)
    tprs.append(tpr)

plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])

TypeError: list indices must be integers or slices, not tuple

In [75]:




results = cross_val_score(rf_classifier, features, labels, cv=cv)
print(results.mean())

0.6326427122573166
