## Preprocessing of Data and Augmentation
- smote_"name of file" e.g. smote_C4M1 (type = numpy array) for normalized data augmentation with SMOTE (Synthetic Minority Oversampling Technique)
- gauss_data_"name of file", e.g. gauss_data_C3M2 (type = numpy array), for normalized data augmentation with Gaussian noise

In [1]:
import numpy as np
import os
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.keras.layers import GaussianNoise
from skimage.util import random_noise
import torch
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE 
from collections import Counter
import plotly.express as px

import tensorflow_ranking as tfr

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Reshape, Conv1DTranspose, Conv2DTranspose, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, GRU, Softmax
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils
from tensorflow.keras import models, layers, utils, backend as K
from tensorflow.keras.optimizers import Adam

## Read in Data:

In [2]:
# Collect every file found in the recording directory of one patient.
# `readings` holds the file paths (always with forward slashes) and `names`
# the file names without the '.csv' extension; both lists share one order.
dirname = '../data/Exercises_SS22/sleeplab_dataset_10hz/patient_29_male_7_years'

# os.listdir() returns entries in arbitrary order, but the cells below pair
# readings[i] with names[i] and treat readings[-1] as the label file.
# Sorting makes that order reproducible across platforms and file systems.
# NOTE(review): assumes the label file sorts after all signal files - confirm.
filenames = sorted(os.listdir(dirname))

readings = [os.path.join(dirname, filename).replace('\\', '/') for filename in filenames]
names = [filename.replace('.csv', '') for filename in filenames]



In [3]:
# Load one single-column frame per signal file: the first line of each file
# is skipped and the column is named after the file it came from.
signal_paths = readings[:-1]  # every listed file except the last one
measurements = [
    pd.read_csv(path, skiprows=1, names=[column])
    for path, column in zip(signal_paths, names)
]
# The last listed file provides the sleep-stage annotations.
label = pd.read_csv(readings[-1], usecols=['Schlafstadium'])
# Place the signals side by side, one column per file.
data = pd.concat(measurements, axis=1)

In [4]:
# Encode the sleep-stage strings as integer class ids
# (WK -> 0, REM -> 1, N1 -> 2, N2 -> 3, N3 -> 4).
stage_names = ['WK', 'REM', 'N1', 'N2', 'N3']
stage_codes = [0, 1, 2, 3, 4]
converted_label = label.replace(stage_names, stage_codes)
converted_label

Unnamed: 0,Schlafstadium
0,0
1,0
2,0
3,0
4,0
...,...
1087,0
1088,0
1089,0
1090,0


In [5]:
# Z-score every signal column: subtract its mean, then divide by its
# standard deviation.
# NOTE(review): the statistics are computed over the whole recording, i.e.
# before the train/test split further below - confirm this is intended.
column_mean = data.mean()
column_std = data.std()
normalized_df = data.sub(column_mean, axis='columns').div(column_std, axis='columns')

In [6]:
# Display the normalized signals for a visual sanity check.
normalized_df

Unnamed: 0,BeinLi_10HZ,BeinRe_10HZ,C3M2_10HZ,C4M1_10HZ,EMG_10HZ,F3M2_10HZ,F4M1_10HZ,LEOGM2_10HZ,O1M2_10HZ,O2M1_10HZ,REOGM1_10HZ
0,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
1,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
2,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
3,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
4,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
...,...,...,...,...,...,...,...,...,...,...,...
327305,0.272937,0.379179,0.284994,0.397626,0.213423,-0.017849,0.735155,-0.475123,0.009262,0.321457,1.367751
327306,-0.246661,-0.361474,0.245397,0.333454,1.378628,0.529919,0.828369,0.250002,0.301864,0.417714,0.696438
327307,0.013138,0.502621,0.938339,1.017954,-0.174979,0.996536,1.154616,0.572280,0.260064,-0.376409,0.482838
327308,0.143037,0.132294,0.819549,0.419017,1.767030,0.935673,0.688549,0.491710,0.239164,-0.135765,0.116667


In [7]:
# Assign every row to a segment: segment id i covers 300 consecutive rows
# (presumably one 30 s epoch at 10 Hz - TODO confirm), one segment per label.
# The id vector is cut to the number of rows actually available.
n_segments = len(converted_label)
n_rows = normalized_df.shape[0]
segments = np.repeat(np.arange(n_segments), 300)[:n_rows]

# Pair each segment id with the original row number of the datapoint.
tuples = list(zip(segments, normalized_df.index))

# Two-level index: (segment id, original row number).
index = pd.MultiIndex.from_tuples(tuples, names=["Samples", "Datapoints"])
multi_index_df = normalized_df.set_index(index)

In [8]:
# Count the non-missing values of every column within each segment.
counted = multi_index_df.groupby(level="Samples").count()

In [9]:
# Segments with fewer than 300 counted values in the first signal column are
# incomplete; remove them from both the signals and the labels.
is_incomplete = counted["BeinLi_10HZ"] < 300
smallSampleIndices = counted[is_incomplete].index
if not smallSampleIndices.empty:
    multi_index_df = multi_index_df.drop(smallSampleIndices)
    converted_label = converted_label.drop(smallSampleIndices)

In [10]:
# Inspect the segment ids stored in the first index level.
# NOTE: MultiIndex levels can keep labels that no longer occur in any row
# (e.g. after a drop); see MultiIndex.remove_unused_levels.
multi_index_df.index.levels[0]

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091],
           dtype='int64', name='Samples', length=1092)

In [11]:
from sklearn.model_selection import train_test_split

In [14]:
# Split on segment ids (not on single rows) so that all datapoints of a
# segment land on the same side of the split.
# The ids are taken from the rows actually present in the frame. The stored
# MultiIndex levels may still list segments that were dropped earlier, and
# slicing them with [:-1] is only right if exactly one trailing segment was
# removed; it would discard a valid segment whenever nothing was dropped.
sample_ids = multi_index_df.index.unique(level=0)
train_ix, test_ix = train_test_split(sample_ids, random_state=42)

In [19]:
# Select the signal rows of the chosen segments (first index level) ...
train_X, test_X = (multi_index_df.loc[ids] for ids in (train_ix, test_ix))
# ... and the matching label rows.
train_y, test_y = (converted_label.loc[ids] for ids in (train_ix, test_ix))

In [40]:
# Preview the training rows, indexed by segment id and datapoint number.
train_X

Unnamed: 0_level_0,Unnamed: 1_level_0,BeinLi_10HZ,BeinRe_10HZ,C3M2_10HZ,C4M1_10HZ,EMG_10HZ,F3M2_10HZ,F4M1_10HZ,LEOGM2_10HZ,O1M2_10HZ,O2M1_10HZ,REOGM1_10HZ
Samples,Datapoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
92,27600,-0.246661,-0.114590,-0.209965,-0.607734,-0.563381,-1.093097,-1.385453,-0.434838,-0.659541,-0.544859,-0.859790
92,27601,-0.246661,-0.238032,0.700759,-0.458000,-0.369180,0.793659,-0.476621,0.350714,-0.764042,-1.266789,-1.012361
92,27602,0.272937,0.255737,0.205800,-1.420579,-0.174979,0.205316,-1.175723,0.552138,-1.307445,-0.641116,-1.592132
92,27603,0.143037,0.132294,0.245397,-0.458000,-1.145983,0.428481,-0.406711,0.612565,-1.809048,-0.641116,-1.103904
92,27604,0.792534,-0.114590,1.948055,1.189080,0.213423,1.929770,1.550774,1.478686,-0.325139,0.465843,-0.646190
...,...,...,...,...,...,...,...,...,...,...,...,...
860,258295,-0.766258,0.996390,0.205800,0.525970,-0.951782,-0.241014,0.665245,-0.092418,-0.011638,0.923065,0.330266
860,258296,-1.026056,7.045057,0.106808,-0.137140,0.796025,-0.403315,0.105964,-0.676547,0.427265,0.297393,-0.280019
860,258297,0.143037,-0.855243,-0.982101,-0.351047,0.407624,-1.559714,-0.453318,-2.005943,-0.429640,0.802744,0.635409
860,258298,-0.506459,0.626063,-0.447545,-0.586344,-0.369180,-0.849645,-0.663048,-1.260675,-0.199739,0.850872,0.391295


In [38]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset that yields one signal segment together with its label.

    Args:
        dataframe: signal values whose index starts with the segment id
            (e.g. the ("Samples", "Datapoints") MultiIndex built above).
        label_df: one row per segment, indexed by the same segment ids.
    """

    def __init__(self, dataframe, label_df):
        self.dataframe = dataframe
        self.label_df = label_df

    def __len__(self):
        """Return the number of segments (one label row per segment)."""
        return len(self.label_df)

    def __getitem__(self, idx):
        """Return ``(signals, label)`` for the segment at position ``idx``.

        ``signals`` is a float32 tensor holding all rows of the segment and
        ``label`` is the matching label row as a NumPy array.
        """
        # Look the segment up by the id of the idx-th label. A positional
        # ``dataframe.iloc[idx]`` would return a single datapoint row that
        # belongs to an unrelated segment, misaligning features and labels.
        sample_id = self.label_df.index[idx]
        segment = self.dataframe.loc[sample_id]
        output = torch.tensor(segment.values.astype(np.float32))
        label = self.label_df.iloc[idx].values
        return output, label

In [39]:
# Wrap the training split and fetch the item at position 1 as a smoke test.
dataset = CustomDataset(dataframe=train_X, label_df=train_y)
dataset[1]

(tensor([[-0.2467, -0.2380, -0.5069,  ...,  0.1974, -0.0395, -5.5285],
         [ 0.0131,  0.2557,  0.0276,  ...,  0.2601,  0.5380, -2.9348],
         [ 0.7925, -0.1146,  0.2256,  ..., -0.1788, -0.2080, -1.3175],
         ...,
         [ 2.8709, -1.2256, -6.0901,  ..., -6.8878,  7.6851,  6.2195],
         [-9.7293,  1.3667, -6.0901,  ..., -6.8878,  5.6637, 10.1864],
         [-2.4549, -0.7318, -6.0901,  ..., -6.8878,  7.6851,  0.4828]]),
 array([0], dtype=int64))