## Preprocessing of Data and Augmentation
- smote_"name of file" e.g. smote_C4M1 (type = numpy array) for normalized data augmentation with SMOTE (Synthetic Minority Oversampling Technique)
- gauss_data_"name of file" e.g gauss_data_C3M2 (type = numpy array) for normaliezed data augmentation with gaussian noise

In [54]:
import numpy as np
import os
import pandas as pd
from sklearn import preprocessing
import torch
from skimage.util import random_noise
from imblearn.over_sampling import SMOTE 

## Read in Data:

In [2]:
#list all files of patient x
dirname = '../data/Exercises_SS22/sleeplab_dataset_10hz/patient_29_male_7_years'
readings = []
names = []

for filename in os.listdir(dirname):
    f = os.path.join(dirname, filename)
    x = f.replace('\\', '/')
    readings.append(x)
    f = filename.replace('.csv','')
    names.append(f)



In [3]:
measurements = [pd.read_csv(i, skiprows=1, names=[names[ix]]) for ix, i in enumerate(readings[:-1])]
label = pd.read_csv(readings[-1], usecols=['Schlafstadium'])
data = pd.concat(measurements, axis=1)

In [4]:
converted_label = label.replace(['WK', 'REM', 'N1', 'N2', 'N3'], [0, 1, 2, 3, 4])
converted_label

Unnamed: 0,Schlafstadium
0,0
1,0
2,0
3,0
4,0
...,...
1087,0
1088,0
1089,0
1090,0


In [5]:
normalized_df=(data-data.mean())/data.std()

In [6]:
normalized_df

Unnamed: 0,BeinLi_10HZ,BeinRe_10HZ,C3M2_10HZ,C4M1_10HZ,EMG_10HZ,F3M2_10HZ,F4M1_10HZ,LEOGM2_10HZ,O1M2_10HZ,O2M1_10HZ,REOGM1_10HZ
0,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
1,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
2,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
3,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
4,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
...,...,...,...,...,...,...,...,...,...,...,...
327305,0.272937,0.379179,0.284994,0.397626,0.213423,-0.017849,0.735155,-0.475123,0.009262,0.321457,1.367751
327306,-0.246661,-0.361474,0.245397,0.333454,1.378628,0.529919,0.828369,0.250002,0.301864,0.417714,0.696438
327307,0.013138,0.502621,0.938339,1.017954,-0.174979,0.996536,1.154616,0.572280,0.260064,-0.376409,0.482838
327308,0.143037,0.132294,0.819549,0.419017,1.767030,0.935673,0.688549,0.491710,0.239164,-0.135765,0.116667


## Augmentation

In [52]:
#gaussian noise
mu, sigma = 0, 0.1
noise = np.random.normal(mu, sigma, normalized_df.shape) 
print(noise)

[[ 0.04691545 -0.0857807  -0.11220943 ... -0.01691034 -0.10955099
  -0.00667789]
 [ 0.06415512 -0.02397481  0.08261708 ... -0.08421746 -0.13257039
  -0.05271552]
 [ 0.05432276 -0.0369073  -0.09142435 ...  0.09884541 -0.16400839
  -0.08226225]
 ...
 [-0.07844628 -0.1255608   0.0417223  ...  0.04754584 -0.12772097
   0.04374802]
 [-0.07211789  0.13340585  0.11092657 ... -0.00969148 -0.00387397
  -0.08351316]
 [-0.01482754 -0.05896844  0.0219793  ...  0.1596599  -0.01685927
   0.07947358]]


In [53]:
gauss_data = normalized_df + noise
gauss_data

Unnamed: 0,BeinLi_10HZ,BeinRe_10HZ,C3M2_10HZ,C4M1_10HZ,EMG_10HZ,F3M2_10HZ,F4M1_10HZ,LEOGM2_10HZ,O1M2_10HZ,O2M1_10HZ,REOGM1_10HZ
0,0.060053,-0.076928,-0.005401,0.042041,0.211136,-0.046369,0.200746,0.192887,-0.362950,0.043456,0.628731
1,0.077293,-0.015123,0.189426,-0.167320,-0.078431,-0.033025,0.087234,0.152202,-0.430257,0.020436,0.582694
2,0.067461,-0.028055,0.015384,-0.196573,0.065638,-0.115377,0.111245,0.147219,-0.247194,-0.011002,0.553147
3,0.051997,-0.135014,0.118791,0.014082,0.120809,-0.066406,0.001541,0.076899,-0.205593,-0.061046,0.665649
4,-0.069476,-0.086993,0.011221,-0.229728,-0.052946,-0.070358,0.129642,0.019053,-0.306259,0.280593,0.603602
...,...,...,...,...,...,...,...,...,...,...,...
327305,0.288318,0.368765,0.406961,0.398130,0.183515,0.056307,0.803740,-0.413918,0.172532,0.338025,1.423307
327306,-0.249078,-0.470124,0.177696,0.208722,1.458434,0.646191,1.008028,0.208094,0.308148,0.519677,0.795329
327307,-0.065308,0.377060,0.980061,1.019871,-0.124315,0.981979,1.061116,0.661562,0.307610,-0.504130,0.526586
327308,0.070919,0.265700,0.930476,0.531317,1.755369,0.746124,0.699890,0.655913,0.229472,-0.139639,0.033153


In [7]:
segments = np.array([[i] * 300 for i in range(len(converted_label))]).flatten()[:normalized_df.shape[0]]

tuples = list(zip(segments, normalized_df.index))

index = pd.MultiIndex.from_tuples(tuples, names=["Samples", "Datapoints"])
multi_index_df = normalized_df.set_index(index)

In [8]:
multi_index_df

Unnamed: 0_level_0,Unnamed: 1_level_0,BeinLi_10HZ,BeinRe_10HZ,C3M2_10HZ,C4M1_10HZ,EMG_10HZ,F3M2_10HZ,F4M1_10HZ,LEOGM2_10HZ,O1M2_10HZ,O2M1_10HZ,REOGM1_10HZ
Samples,Datapoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
0,1,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
0,2,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
0,3,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
0,4,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
...,...,...,...,...,...,...,...,...,...,...,...,...
1091,327305,0.272937,0.379179,0.284994,0.397626,0.213423,-0.017849,0.735155,-0.475123,0.009262,0.321457,1.367751
1091,327306,-0.246661,-0.361474,0.245397,0.333454,1.378628,0.529919,0.828369,0.250002,0.301864,0.417714,0.696438
1091,327307,0.013138,0.502621,0.938339,1.017954,-0.174979,0.996536,1.154616,0.572280,0.260064,-0.376409,0.482838
1091,327308,0.143037,0.132294,0.819549,0.419017,1.767030,0.935673,0.688549,0.491710,0.239164,-0.135765,0.116667


In [9]:
counted = multi_index_df.groupby(level=0).count()

In [10]:
smallSampleIndices = counted.loc[counted.BeinLi_10HZ < 300].index
if len(smallSampleIndices) > 0:
    multi_index_df = multi_index_df.drop(smallSampleIndices)
    converted_label = converted_label.drop(smallSampleIndices)

In [65]:
multi_index_df

Unnamed: 0_level_0,Unnamed: 1_level_0,BeinLi_10HZ,BeinRe_10HZ,C3M2_10HZ,C4M1_10HZ,EMG_10HZ,F3M2_10HZ,F4M1_10HZ,LEOGM2_10HZ,O1M2_10HZ,O2M1_10HZ,REOGM1_10HZ
Samples,Datapoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
0,1,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
0,2,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
0,3,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
0,4,0.013138,0.008852,0.106808,-0.115749,0.019222,-0.078712,0.105964,0.129148,-0.346040,0.153007,0.635409
...,...,...,...,...,...,...,...,...,...,...,...,...
1090,327295,-0.246661,0.008852,0.284994,0.675704,0.019222,-0.443891,0.781762,-3.718044,-0.074338,0.297393,3.930950
1090,327296,0.013138,0.132294,-0.546537,0.419017,0.990226,-1.052522,0.269088,-3.577047,-0.157938,0.706486,2.588322
1090,327297,0.662634,0.255737,-0.704924,0.333454,-0.951782,-1.356837,0.432211,-3.234627,-0.680441,0.104878,2.679865
1090,327298,-0.506459,0.132294,-0.249562,0.825439,-0.563381,-0.484466,0.921583,-2.207366,-0.304239,0.369586,2.374722


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_ix, test_ix = train_test_split(multi_index_df.index.levels[0][:-1], random_state=42)

In [14]:
train_X = multi_index_df.loc[train_ix]
train_y = converted_label.loc[train_ix]
test_X = multi_index_df.loc[test_ix]
test_y = converted_label.loc[test_ix]

In [15]:
train_X.reset_index(0)[train_X.reset_index(0).Samples==92].values
list(set(train_X.reset_index(0).Samples))[10]

15

In [56]:
train_y

Unnamed: 0_level_0,Schlafstadium
Samples,Unnamed: 1_level_1
92,3
696,4
704,4
760,1
294,3
...,...
330,3
466,3
121,4
1044,3


In [16]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataframe, label_df):
        self.dataframe = dataframe
        self.label_df = label_df

    def __len__(self):
        return len(self.label_df)

    def __getitem__(self, idx):
        output = torch.tensor(self.dataframe.loc[list(set(self.dataframe.reset_index(0).Samples))[idx]].values.astype(np.float32))
        label = self.label_df.iloc[idx].values[0]
        return output, label

In [17]:
train_dataset = CustomDataset(train_X, train_y)
train_dataset.__getitem__(500)

(tensor([[-0.5065, -0.2380, -1.2989,  ...,  0.3437,  0.3937,  0.1777],
         [-0.6364, -0.1146, -0.2100,  ..., -0.0743, -0.0876, -0.9818],
         [-0.2467,  0.2557, -0.9029,  ..., -0.2206,  0.5380, -0.5852],
         ...,
         [ 0.4028, -0.2380,  0.6612,  ..., -0.5759,  0.0808,  0.2692],
         [ 0.1430, -0.3615,  0.2850,  ..., -0.3042, -0.4967,  0.0251],
         [-0.3766,  0.0089,  1.0571,  ..., -0.3251, -0.3042,  0.0556]]),
 3)

In [63]:
train_dataset = CustomDataset(train_X, train_y)
test_dataset = CustomDataset(test_X, test_y)

In [19]:
import torch.nn as nn
import torch.nn.functional as F

class ImageClassificationBase(nn.Module):
    
    def training_step(self, batch):
        images, labels = batch 
        out = self(images)                  # Generate predictions
        loss = F.cross_entropy(out, labels) # Calculate loss
        return loss
    
    def validation_step(self, batch):
        images, labels = batch 
        out = self(images)                    # Generate predictions
        loss = F.cross_entropy(out, labels)   # Calculate loss
        acc = accuracy(out, labels)           # Calculate accuracy
        return {'val_loss': loss.detach(), 'val_acc': acc}
        
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
    
    def epoch_end(self, epoch, result):
        print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['train_loss'], result['val_loss'], result['val_acc']))

In [20]:
class NaturalSceneClassification(ImageClassificationBase):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            
            nn.Conv2d(1, 32, kernel_size = 3, padding = 1),
            nn.ReLU(),
            nn.Conv2d(32,64, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
        
            nn.Conv2d(64, 128, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(128 ,128, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
            
            nn.Conv2d(128, 256, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(256,256, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
            
            nn.Flatten(),
            nn.Softmax()
        )
    
    def forward(self, xb):
        return self.network(xb)

In [21]:
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

  
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

  
def fit(epochs, lr, model, train_loader, val_loader, opt_func = torch.optim.SGD):
    
    history = []
    optimizer = opt_func(model.parameters(),lr)
    for epoch in range(epochs):
        
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            train_losses.append(loss)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    
    return history

In [26]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)

model = NaturalSceneClassification()

num_epochs = 30
opt_func = torch.optim.Adam
lr = 0.001#fitting the model on training data and record the result after each epoch
history = fit(num_epochs, lr, model, train_dataloader, test_dataloader, opt_func)

ValueError: Expected input batch_size (256) to match target batch_size (1).