In [1]:
%load_ext autoreload
%autoreload 2
import os
from pathlib import Path

import netCDF4
import numpy as np
import xarray as xr
from mne.io import read_info

from hmpai.pytorch.models import *
from hmpai.training import split_participants
from hmpai.pytorch.utilities import set_global_seed
from hmpai.pytorch.generators import MultiXArrayProbaDataset
from hmpai.pytorch.normalization import *
from hmpai.pytorch.transforms import *
from hmpai.pytorch.mamba import *
from hmpai.pytorch.training import train_and_test
# Root directory of the datasets, read from the DATA_PATH environment variable.
# Fail early with a clear message: Path(None) would otherwise raise an opaque TypeError.
data_path_env = os.getenv("DATA_PATH")
if data_path_env is None:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; point it at the data directory"
    )
DATA_PATH = Path(data_path_env)

In [2]:
# Resolve the locations of the t1, t2 and cmb files below DATA_PATH
t1_path, t2_path, cmb_path = (
    DATA_PATH / relative_path
    for relative_path in (
        "prp/stage_data_250hz_t1.nc",
        "prp/stage_data_250hz_t2.nc",
        "prp/Data_trial_250Hz.nc",
    )
)

# Open the three datasets with xarray, in the order t1, t2, cmb
t1, t2, cmb = (
    xr.open_dataset(dataset_path)
    for dataset_path in (t1_path, t2_path, cmb_path)
)

In [None]:
# Goal: create a new file that contains the EEG from cmb, and the HMP data from t2
# appended to t1, combining labels
SHORT_DELAY = 300  # ms
LONG_DELAY = 1200  # ms
SAMPLING_FREQUENCY = 250  # Hz

# Manual testing showed that the actual delay in the EEG data is the nominal delay
# (in samples) minus 6, i.e. 69 and 294 samples respectively, see delay_analysis.ipynb
DELAY_CORRECTION = 6  # samples

# Pure integer arithmetic (ms * Hz // 1000) so the sample count cannot be thrown off by
# float rounding before truncation, as int((delay / 1000) * fs) could be.
SHORT_PADDING = SHORT_DELAY * SAMPLING_FREQUENCY // 1000 - DELAY_CORRECTION
LONG_PADDING = LONG_DELAY * SAMPLING_FREQUENCY // 1000 - DELAY_CORRECTION

(21, 1315, 5, 500)
(21, 1308, 3, 499)


In [34]:
# Per dataset: take the trial_index values, replace NaN entries with the sentinel -1
# and cast to int so the arrays can be compared against integer trial numbers.
t1_epochs, t2_epochs, cmb_epochs = (
    np.nan_to_num(dataset.trial_index.values, nan=-1).astype(int)
    for dataset in (t1, t2, cmb)
)

In [59]:
# Zero-filled buffer for the combined probabilities.
# The first two axes follow cmb.data; the label axis holds every t1 label plus every
# t2 label except one; the last axis is the cmb length extended by LONG_PADDING.
shp = cmb.data.shape
n_combined_labels = (
    t1.probabilities.data.shape[2] + t2.probabilities.data.shape[2] - 1
)
new_probas = np.zeros(
    (shp[0], shp[1], n_combined_labels, shp[3] + LONG_PADDING)
)

In [81]:
def _positions_by_participant(trial_index, trial):
    """Group the epoch positions at which `trial` occurs by participant index.

    Args:
        trial_index: integer array of trial numbers; axis 0 is used as the participant
            index and axis 1 as the epoch position (assumed 2-D — TODO confirm).
        trial: trial number to look up.

    Returns:
        dict mapping participant index -> list of epoch positions (in ascending order)
        where `trial_index[participant, position] == trial`.
    """
    matches = np.where(trial_index == trial)
    grouped = {}
    for participant, position in zip(matches[0].tolist(), matches[1].tolist()):
        grouped.setdefault(participant, []).append(position)
    return grouped


# Fill new_probas: for every trial number that a participant has in t1, t2 AND cmb,
# write the t1 probabilities at the start of the cmb epoch and the t2 probabilities
# (without their first label row) shifted by the condition-dependent padding.
# NaN trial indices were mapped to -1 and are never visited since the range starts at 0.
for trial in range(t1_epochs.max() + 1):
    t1_positions = _positions_by_participant(t1_epochs, trial)
    t2_positions = _positions_by_participant(t2_epochs, trial)
    cmb_positions = _positions_by_participant(cmb_epochs, trial)

    # Only participants that have this trial in all three datasets can be combined
    shared_participants = sorted(
        t1_positions.keys() & t2_positions.keys() & cmb_positions.keys()
    )
    # Normally one position per dataset; every combination is kept (same write order
    # as the nested loops this replaces) in case a trial number is duplicated.
    matches = [
        (participant, t1_epoch, t2_epoch, cmb_epoch)
        for participant in shared_participants
        for t1_epoch in t1_positions[participant]
        for t2_epoch in t2_positions[participant]
        for cmb_epoch in cmb_positions[participant]
    ]

    for participant, t1_epoch, t2_epoch, cmb_epoch in matches:
        t1_data = t1.isel(participant=participant, epochs=t1_epoch)
        t2_data = t2.isel(participant=participant, epochs=t2_epoch)

        # Both tasks belong to the same trial, so their condition must agree. A mismatch
        # means the trial alignment is wrong: stop instead of leaving a silent all-zero row.
        t1_condition = t1_data.condition.item()
        t2_condition = t2_data.condition.item()
        if t1_condition != t2_condition:
            raise ValueError(
                f"Conditions not the same for participant {participant}, trial {trial}: "
                f"t1={t1_condition!r}, t2={t2_condition!r}"
            )
        padding = SHORT_PADDING if t1_condition == 'short' else LONG_PADDING

        t1_probas = t1_data.probabilities.data
        # The first label row of t2 is dropped when appending to the t1 labels
        t2_probas = t2_data.probabilities.data[1:]
        n_t1_labels = t1_probas.shape[0]
        t2_end = padding + t2_probas.shape[1]

        # Happens when this cell is re-run after new_probas was already truncated below
        if t2_end > new_probas.shape[-1]:
            raise ValueError(
                f"new_probas has only {new_probas.shape[-1]} samples but {t2_end} are needed; "
                "re-run the cell that allocates new_probas before running this cell again"
            )

        new_probas[participant, cmb_epoch, :n_t1_labels, :t1_probas.shape[1]] = t1_probas
        new_probas[participant, cmb_epoch, n_t1_labels:, padding:t2_end] = t2_probas

# Cut off at the length of the cmb recording (500 samples / 2 s at 250 Hz); anything
# that the padding pushed beyond it is dropped
new_probas = new_probas[..., :cmb.data.shape[3]]

In [None]:
# Attach the combined probabilities to cmb as the variable `probabilities`
probability_dims = ('participant', 'epochs', 'labels', 'samples')
cmb = cmb.assign(probabilities=(probability_dims, new_probas))

In [93]:
# Write the combined dataset to a new netCDF file below DATA_PATH
combined_path = DATA_PATH / "prp/stage_data_250hz_combined.nc"
cmb.to_netcdf(combined_path)