## Preprocessing of Data and Augmentation
- smote_"name of file" e.g. smote_C4M1 (type = numpy array) for normalized data augmentation with SMOTE (Synthetic Minority Oversampling Technique)
- gauss_data_"name of file" e.g gauss_data_C3M2 (type = numpy array) for normaliezed data augmentation with gaussian noise

In [1]:
import numpy as np
import os
import pandas as pd
from sklearn import preprocessing
import torch

## Read in Data:

In [2]:
dirname = "../data/Exercises_SS22/sleeplab_dataset_10hz"
folders = []

for folder in os.listdir(dirname):
    f = os.path.join(dirname, folder)
    x = f.replace('\\', '/')
    folders.append(x)

folders

['../data/Exercises_SS22/sleeplab_dataset_10hz/patient_29_male_7_years',
 '../data/Exercises_SS22/sleeplab_dataset_10hz/patient_75_female_5_years',
 '../data/Exercises_SS22/sleeplab_dataset_10hz/patient_80_female_5_years',
 '../data/Exercises_SS22/sleeplab_dataset_10hz/patient_89_female_6_years',
 '../data/Exercises_SS22/sleeplab_dataset_10hz/patient_91_female_7_years']

In [22]:
readings = []
names = []

for filename in os.listdir(folders[0]):
    f = os.path.join(folders[0], filename)
    x = f.replace('\\', '/')
    readings.append(x)
    f = filename.replace('.csv','')
    names.append(f)
    
measurements = [pd.read_csv(i, skiprows=1, names=[names[ix]]) for ix, i in enumerate(readings[:-1])]
label = pd.read_csv(readings[-1], usecols=['Schlafstadium'])
data = pd.concat(measurements, axis=1)
converted_label = label.replace(['WK', 'REM', 'N1', 'N2', 'N3'], [0, 1, 2, 3, 4])
normalized_df=(data-data.mean())/data.std()
segments = np.array([[i] * 300 for i in range(len(converted_label))]).flatten()[:normalized_df.shape[0]]

tuples = list(zip(segments, normalized_df.index))

index = pd.MultiIndex.from_tuples(tuples, names=["Samples", "Datapoints"])
multi_index_df = normalized_df.set_index(index)
counted = multi_index_df.groupby(level=0).count()
smallSampleIndices = counted.loc[counted.BeinLi_10HZ < 300].index
if len(smallSampleIndices) > 0:
    multi_index_df = multi_index_df.drop(smallSampleIndices)
    converted_label = converted_label.drop(smallSampleIndices)

In [23]:
readings_2 = []
names_2 = []


for filename in os.listdir(folders[1]):
    f = os.path.join(folders[1], filename)
    x = f.replace('\\', '/')
    readings_2.append(x)
    f = filename.replace('.csv','')
    names_2.append(f)
    # create single df
    
#read data
measurements_2 = [pd.read_csv(i, skiprows=1, names=[names_2[ix]]) for ix, i in enumerate(readings_2[:-1])]
label_2 = pd.read_csv(readings_2[-1], usecols=['Schlafstadium'])
label_2.index += 1091
# concat all single files
data_2 = pd.concat(measurements_2, axis=1)
# convert labels to ints
converted_label_2 = label_2.replace(['WK', 'REM', 'N1', 'N2', 'N3'], [0, 1, 2, 3, 4])
# normalize the data, because some paper said to do so
normalized_df_2 = (data_2-data_2.mean())/data_2.std()
segments_2 = np.array([[i] * 300 for i in range(len(converted_label_2))]).flatten()[:normalized_df_2.shape[0]]
segments_2 += 1091
normalized_df_2.index += 1091
# create multi index dataframe
tuples_2 = list(zip(segments_2, normalized_df_2.index))

index_2 = pd.MultiIndex.from_tuples(tuples_2, names=["Samples", "Datapoints"])
multi_index_df_2 = normalized_df_2.set_index(index_2)
# drop the sample, that isn't big enough
counted_2 = multi_index_df_2.groupby(level=0).count()
smallSampleIndices_2 = counted_2.loc[counted_2.BeinLi_10HZ < 300].index
if len(smallSampleIndices_2) > 0:
    multi_index_df_2 = multi_index_df_2.drop(smallSampleIndices_2)
    converted_label_2 = converted_label_2.drop(smallSampleIndices_2)



In [16]:
multi_index_df

Unnamed: 0_level_0,Unnamed: 1_level_0,BeinLi_10HZ,BeinRe_10HZ,C3M2_10HZ,C4M1_10HZ,EMG_10HZ,F3M2_10HZ,F4M1_10HZ,LEOGM2_10HZ,O1M2_10HZ,O2M1_10HZ,REOGM1_10HZ
Samples,Datapoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1091,1091,0.009903,0.001735,0.091553,-0.038920,0.017304,-0.067191,0.074941,0.104402,-0.320144,0.137338,0.526640
1091,1092,0.009903,0.001735,0.091553,-0.038920,0.017304,-0.067191,0.074941,0.104402,-0.320144,0.137338,0.526640
1091,1093,0.009903,0.001735,0.091553,-0.038920,0.017304,-0.067191,0.074941,0.104402,-0.320144,0.137338,0.526640
1091,1094,0.009903,0.001735,0.091553,-0.038920,0.017304,-0.067191,0.074941,0.104402,-0.320144,0.137338,0.526640
1091,1095,0.009903,0.001735,0.091553,-0.038920,0.017304,-0.067191,0.074941,0.104402,-0.320144,0.137338,0.526640
...,...,...,...,...,...,...,...,...,...,...,...,...
2311,367386,0.335723,-1.267304,-0.064566,0.185251,2.523777,0.040725,0.281393,0.362348,0.519440,0.055299,-0.367360
2311,367387,-0.424523,-0.088911,-0.162140,0.092491,0.184402,-0.252190,-0.585706,-0.042996,0.500783,0.000607,-0.866948
2311,367388,0.552936,-0.179556,-0.045051,-0.031190,-0.149794,-0.159690,-0.441189,0.030703,0.407496,0.246723,-0.577713
2311,367389,0.227116,0.001735,-0.493892,-0.123951,-1.319482,-0.329273,-0.812803,-0.208818,0.202264,-0.327549,-0.682889


In [29]:
double_frame = pd.concat([multi_index_df, multi_index_df_2])
double_label = pd.concat([converted_label, converted_label_2])

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
train_ix, test_ix = train_test_split(double_frame.index.levels[0][:-1], random_state=42)

In [30]:
train_X = double_frame.loc[train_ix]
train_y = double_label.loc[train_ix]
test_X = double_frame.loc[test_ix]
test_y = double_label.loc[test_ix]

In [32]:
train_X.reset_index(0)[train_X.reset_index(0).Samples==92].values

11

In [33]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataframe, label_df):
        self.dataframe = dataframe
        self.label_df = label_df

    def __len__(self):
        return len(self.label_df)

    def __getitem__(self, idx):
        output = torch.tensor(self.dataframe.loc[list(set(self.dataframe.reset_index(0).Samples))[idx]].values.astype(np.float32)).unsqueeze(0)
        label = self.label_df.loc[list(set(self.dataframe.reset_index(0).Samples))[idx]].values[0]
        return output, label

In [34]:
train_dataset = CustomDataset(train_X, train_y)
test_dataset = CustomDataset(test_X, test_y)

In [35]:
import torch.nn as nn
import torch.nn.functional as F

class ClassificationBase(nn.Module):
    
    def training_step(self, batch):
        images, labels = batch 
        out = self(images)                  # Generate predictions
        loss = F.cross_entropy(out, labels) # Calculate loss
        return loss
    
    def validation_step(self, batch):
        images, labels = batch 
        out = self(images)                    # Generate predictions
        loss = F.cross_entropy(out, labels)   # Calculate loss
        acc = accuracy(out, labels)           # Calculate accuracy
        return {'val_loss': loss.detach(), 'val_acc': acc}
        
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
    
    def epoch_end(self, epoch, result):
        print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['train_loss'], result['val_loss'], result['val_acc']))

In [36]:
class SleepClassification(ClassificationBase):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 20, kernel_size = 3, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.BatchNorm2d(20),
            nn.Conv2d(20, 40, kernel_size = 3, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.MaxPool2d(2,2),
        
            nn.Conv2d(40, 80, kernel_size = 3, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.BatchNorm2d(80),
            nn.Conv2d(80 ,80, kernel_size = 3, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.MaxPool2d(2,2),
            
            nn.Conv2d(80, 120, kernel_size = 2, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.BatchNorm2d(120),
            nn.Conv2d(120,120, kernel_size = 2, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.MaxPool2d(2,2),
            
            nn.Conv2d(120, 160, kernel_size = 2, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.BatchNorm2d(160),
            nn.Conv2d(160,160, kernel_size = 2, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.MaxPool2d(2,2),
            
            nn.Conv2d(160, 240, kernel_size = 2, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.BatchNorm2d(240),
            nn.Conv2d(240,240, kernel_size = 2, stride = 1, padding = 1),
            nn.LeakyReLU(),
            nn.MaxPool2d(2,2),
            
            nn.Flatten(),
            nn.Sigmoid()
        )
    
    def forward(self, xb):
        return self.network(xb)

In [37]:
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

  
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

  
def fit(epochs, lr, model, train_loader, val_loader, opt_func = torch.optim.SGD):
    
    history = []
    optimizer = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
        
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            train_losses.append(loss)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    
    return history

In [38]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

model = SleepClassification()

num_epochs = 10
opt_func = torch.optim.RAdam
lr = 0.001#fitting the model on training data and record the result after each epoch
history = fit(num_epochs, lr, model, train_dataloader, test_dataloader, opt_func)

Epoch [0], train_loss: 8.2700, val_loss: 8.1243, val_acc: 0.2582
Epoch [1], train_loss: 8.1020, val_loss: 8.0893, val_acc: 0.2303
Epoch [2], train_loss: 8.0813, val_loss: 8.0765, val_acc: 0.2237
Epoch [3], train_loss: 8.0747, val_loss: 8.0740, val_acc: 0.2188
Epoch [4], train_loss: 8.0736, val_loss: 8.0737, val_acc: 0.2089


KeyboardInterrupt: 