In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

In [2]:
from data_split import DataSplitter

## Parameters

In [3]:
# Trials to exclude for extra verifications
trials2drop = ['1_11', '1_8', '1_18', '2_16', '3_30']

# dataset ratio (total = 1.0)
ratio_test = 0.2
ratio_train = 0.7
ratio_vali = round(1.0 - ratio_test - ratio_train, 2)
print(f'ratio: (train, validation, test) = ({ratio_train}, {ratio_vali}, {ratio_test}) ')

# Seed for trial selection
seedNo = 5#np.random.randint(0, 2**32)
print(f'seedNo.{seedNo}')

# Chennel 
chNo = 1

ratio: (train, validation, test) = (0.7, 0.1, 0.2) 
seedNo.5


## (1) Load the info (i.e. metadata)

In [4]:
# Load the labels
path_meta = '/Volumes/Sandisk_SD/Work/IZFP/Laser/InSignA_campaign2/Code/segmented_data/test'
metadf = pd.read_csv('{}/metadata.csv'.format(path_meta), sep = ',', header = 0)
# All labels extracted from the metadata
y = np.array(metadf.classID, dtype=int)

In [5]:
# Extract the relevant information
classes = list(metadf['class'].unique())# ['noGap', 'Gap0.1', 'Gap0.2', 'Gap0.3', 'noise']
classIDs = sorted(list(metadf['classID'].unique())) # make sure the IDs match the class names
T_seg = list(metadf['T_seg[ms]'].unique())

## (2) Split the trials 
Trials are split into (a) training set, (b) validation set and (c) test set.
With the current version, data split is done in the following steps:

* (1) Split all relevant trials into two groups: (a)+(b) and (c) <br>
    -> outputs: two lists <br>
        * list of trials for train and validation  = (a)+(b) <br>
        * list of trials for test = (c)<br> 
* (2) Split (a)+(b) into (a) and (b) <br>
    -> outputs: two lists <br>
        * list of trials for train = (a)<br>
        * list of trials for validation = (b) <br>
* (3) Reset the splitter <br>
    -> at the moment a reset is required to get data IDs for the selected trials <br>
    => just set splitter.metadf = metadf again

In [6]:
# Sepcify the data size for training / test data
# Excluded trials found in the metadata
trials2drop_meta = metadf.trial.unique()[np.argwhere(np.isin(metadf.trial.unique(), test_elements=trials2drop)).flatten()]
# All valid trials
N = len(metadf.trial.unique()) - len(trials2drop_meta)

print(f'All trials = {len(metadf.trial.unique())}')
print(f'Excluded trials in the metadata = {trials2drop_meta}')
print(f'Valid trials = {N}')


All trials = 60
Excluded trials in the metadata = ['2_16' '3_30']
Valid trials = 58


### (2-1) Split into train+vali and test

In [7]:
# Instantiate
splitter = DataSplitter()
splitter.metadf = metadf

# Return the list of trials
trials_rest, trials_test = splitter.trials_split(
    ratio=ratio_test, 
    ret_ID=False,
    trials2drop=trials2drop, 
    seed=seedNo
)


DataSplietter: metadf is modified and dropped some trials!


### (2-2) Split into train and validation

In [8]:
# Return the list of trials
trials_train, trials_vali = splitter.trials_split(
    ratio=round(ratio_vali/ratio_train, 5), 
    ret_ID=False,
    trials2drop=trials_test,
    seed=seedNo
)


DataSplietter: metadf is modified and dropped some trials!


In [9]:
print('Trials for training')
print(trials_train)
print('Trials for validation')
print(trials_vali)
print('Trials for test')
print(trials_test)

Trials for training
['1_4' '1_7' '1_9' '1_13' '1_16' '1_21' '1_22' '1_23' '1_24' '1_25' '1_26'
 '1_27' '2_6' '2_8' '2_9' '2_10' '2_12' '2_14' '2_17' '2_23' '2_26' '2_27'
 '2_30' '2_33' '2_43' '3_4' '3_6' '3_8' '3_10' '3_11' '3_13' '3_17' '3_18'
 '3_19' '3_20' '3_23' '3_24' '3_25' '3_27' '3_28' '3_32']
Trials for validation
['1_1' '2_24' '2_29' '3_3' '3_12' '3_16']
Trials for test
['1_3' '1_6' '2_2' '2_5' '2_22' '2_25' '2_31' '2_41' '3_7' '3_9' '3_26']


### (2-3) Reset the splitter

In [10]:
# Reset the meta dataframe
splitter.metadf = metadf

## (3) Load test set segments
-> use **all** segments

In [11]:
id_test = splitter.get_dataIDs(trials_test)

print('*** Test ***')
print(f'{len(trials_test)} trials, in total = {len(id_test)} segments')
print(f'trials = {trials_test}')
print(f'segments = {id_test[:5]}...')

*** Test ***
11 trials, in total = 352 segments
trials = ['1_3' '1_6' '2_2' '2_5' '2_22' '2_25' '2_31' '2_41' '3_7' '3_9' '3_26']
segments = [32 33 34 35 36]...


## (4) Load validation set segments
-> use **all** segments

In [12]:
id_vali = splitter.get_dataIDs(trials_vali)

print('*** Validation ***')
print(f'{len(trials_vali)} trials, in total = {len(id_vali)} segments')
print(f'trials = {trials_vali}')
print(f'segments = {id_vali[:5]}...')

*** Validation ***
6 trials, in total = 192 segments
trials = ['1_1' '2_24' '2_29' '3_3' '3_12' '3_16']
segments = [0 1 2 3 4]...


## (4) Load training set segments
-> select segments randomly, while keeping the same data size for all classes <br>
Why? Because some classes have more valid segments than other classes. <br>
If you want to use all training segments, repeat the same procedure as (3) and (4). <br>
(Don't forget to shuffle the output though)

In [13]:
# (1) Select some segemens randomly
# -> output is "sorted" according to the classes (segments of class 0 appear at the beginning) 
id_train = splitter.select_segments_balanced_class(trials_train, N_seg_class = 50, seed=30)

# (2) Shuffle, if necessary 
rng = np.random.default_rng(20)
id_train = rng.permutation(id_train)

# Training set
print(f'From {len(trials_train)} trials:') 
print(f'{trials_train} \n')
print(f'Total segments selected = {len(id_train)}')

for curr_ID in classIDs:
    print(f'class {curr_ID} = {len(np.argwhere(y[id_train]==curr_ID))}, segs = {id_train[curr_ID*50:curr_ID*50+10]}...')

From 41 trials:
['1_4' '1_7' '1_9' '1_13' '1_16' '1_21' '1_22' '1_23' '1_24' '1_25' '1_26'
 '1_27' '2_6' '2_8' '2_9' '2_10' '2_12' '2_14' '2_17' '2_23' '2_26' '2_27'
 '2_30' '2_33' '2_43' '3_4' '3_6' '3_8' '3_10' '3_11' '3_13' '3_17' '3_18'
 '3_19' '3_20' '3_23' '3_24' '3_25' '3_27' '3_28' '3_32'] 

Total segments selected = 150
class 0 = 50, segs = [ 195  246 1553  354  704  686  398  934 1048  626]...
class 1 = 50, segs = [ 790 1026  786   93 1807  779  176 1636 1820  356]...
class 2 = 50, segs = [ 447 1898 1571  332 1539  589  694 1729  640  181]...


## (5) Load the input data for each set

In [14]:
def load_signal(fileID, path, chNo,
                *args, **kwargs
                ):
    # (1) Load
    fname = f'ch{chNo}/data_{fileID[0]}.npy'
    data = np.load(f'{path}/{fname}')
    # (2) you can do any preprocessing here, just for an example sake, I just cut the signal here
    data_proc = data[:1000]
    return data_proc

In [15]:
# Data path
path = '/Volumes/Sandisk_SD/Work/IZFP/Laser/InSignA_campaign2/Code/segmented_data/test'

# Loading
X_train = np.apply_along_axis(
    func1d = load_signal, 
    axis = 0, 
    arr = id_train[np.newaxis, ...], 
    path = path, 
    chNo = chNo, 
)

X_vali = np.apply_along_axis(
    func1d = load_signal, 
    axis = 0, 
    arr = id_vali[np.newaxis, ...], 
    path = path, 
    chNo = chNo, 
)

X_test = np.apply_along_axis(
    func1d = load_signal, 
    axis = 0, 
    arr = id_test[np.newaxis, ...], 
    path = path, 
    chNo = chNo, 
)

print('Input data are loaded!')

Input data are loaded!


## (6) Load the labels

In [16]:
y_train = y[id_train]
y_vali = y[id_vali]
y_test = y[id_test]

In [17]:
# Check the labels of the training set
y_train[:150]

array([0, 2, 0, 0, 0, 1, 1, 1, 2, 0, 2, 2, 1, 2, 0, 2, 2, 2, 2, 0, 2, 2,
       1, 2, 2, 0, 2, 0, 0, 0, 1, 2, 2, 2, 2, 0, 1, 0, 0, 2, 1, 2, 0, 1,
       2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 1, 1,
       2, 2, 1, 2, 0, 1, 2, 2, 1, 1, 0, 1, 0, 1, 0, 2, 1, 2, 0, 2, 2, 2,
       1, 2, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 2, 0, 0, 1, 0, 1, 2, 0, 0, 2,
       2, 1, 0, 1, 2, 1, 1, 1, 1, 2, 1, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2,
       2, 0, 2, 1, 2, 2, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1])