# Single Patient Timesplit: figuring out what to do

In [2]:
import os
import sys
sys.path.append(os.path.abspath('../..'))
# from tabluence.data.api.smartwatch.utilities.preparations import *
import pandas
import plotly_express as px
import numpy
import json
import pickle
import gzip
import matplotlib.pyplot as plt
from copy import deepcopy
from functools import reduce
from tqdm import tqdm
from datetime import datetime, timezone
from typing import Dict, List, Tuple, Union, Any, Iterator
import torch.utils.data.dataloader
from tabluence.data.api.smartwatch.utilities.timestamp import get_utc_date_from_utc_timestamp
from tabluence.data.api.smartwatch.data_manager.module import SmartwatchDataManager
from tabluence.deep_learning.data.dataset.smartwatch_study.single_slice import get_dataloaders, SmartwatchStudySingleSliceDataset, single_slice_collate_fn
from tabluence.deep_learning.data.pipeline.fusion.single_slice import SliceToSliceFusion

Building the dataloaders

In [3]:
dataloaders = get_dataloaders(
    batch_size=50,
    root_dir='../../resources/warrior_wellness/Analysis/local_repo/',
    data_manager_cache_filepath='../../resources/smartwatch_study/dataset_cache/window_1hour_stride_1second-datamanger001.pkl.gz',
    subject_splits={
        "train": [f'SWS_{i:02d}' for i in range(0,10)],
        "test": [f'SWS_{i:02d}' for i in range(10,15)]},
    dataset_config={
        'slice_lengths': [3600],
        'slice_time_step': 1,
        'label_milestone_per_window': 1.0,
        'metadata_cache_filepath': '../../resources/smartwatch_study/dataset_cache/window_1hour_stride_1second.pkl.gz',
        'no_cache': False,
        'parallel_threads': 10,
        'specific_stress_quantization_bins': [0.0, 0.5, 10.0],
        'overall_stress_quantization_bins': [0.0, 0.5, 10.0]
    },
    sampler_configs=dict(
       train=dict(
           negative_sample_count=1000,
            positive_sample_count=500,
            target_variable='overall_quantized_stress_value',
           split_name="train"
       ),
       test=dict(
               negative_sample_count=200,
            positive_sample_count=100,
            target_variable='overall_quantized_stress_value',
           split_name="test"
       )
        )
)

2022-08-01 17:21:38,372 - tabluence.deep_learning.data.dataset.smartwatch_study.single_slice.interface - INFO - initializing data manager...
2022-08-01 17:21:40,238 - tabluence.deep_learning.data.dataset.smartwatch_study.single_slice.interface - INFO - preparing the dataset...
2022-08-01 17:21:44,704 - tabluence.deep_learning.data.dataset.smartwatch_study.single_slice.dataset - INFO - 
	~> processing the metadata for building quantization based label layout
100%|██████████████████████████████| 1036509/1036509 [01:06<00:00, 15674.98it/s]
2022-08-01 17:22:50,832 - tabluence.deep_learning.data.dataset.smartwatch_study.single_slice.dataset - INFO - 
		~> label layout for general_quantized_stress_value stress category: [0.0, 0.5]
2022-08-01 17:22:50,832 - tabluence.deep_learning.data.dataset.smartwatch_study.single_slice.dataset - INFO - 
		~> label layout for induced_quantized_stress_value stress category: [0.0, 0.5]
2022-08-01 17:22:50,833 - tabluence.deep_learning.data.dataset.smartwatch

Getting access to the data manager object:

In [4]:
data_manager = dataloaders['train'].dataset.data_manager

Now for one of the subjects, we get the time-range for which we have data:

In [5]:
[t_start, t_end] = data_manager.get_utc_timestamp_range_for_subject('SWS_13')

This timeline for this subject will be split in an 80-20 ratio:

In [6]:
train_window = (t_start, t_start + 0.8 * (t_end - t_start))
test_window = (0.8 * (t_end - t_start), t_end)

In [9]:
number_of_windows = dict(train=0, test=0)

We can see an example batch metadata using the following code:

In [10]:
for meta in dataloaders['train'].dataset.metadata:
    break

In [11]:
meta

{'subject_id': 'SWS_01',
 'utc_timestamp_window': (1614924000.0, 1614927600.0),
 'overall_stress_value': 0.0,
 'general_stress_value': 0.0,
 'interpersonal_stress_value': 0.0,
 'utc_timestamp_for_stress_query': 1614927600.0,
 'induced_stress_value': 0.0,
 'general_quantized_stress_value': 0.0,
 'induced_quantized_stress_value': 0.0,
 'interpersonal_quantized_stress_value': 0.0,
 'overall_quantized_stress_value': 0.0}

In [12]:
len(dataloaders['train'].dataset.metadata)

1036509

The following function is the core idea of considering overlap between to intervals. The idea is that we do not want windows that overlap between test duration and train time span.

In [13]:
def get_overlap(x1, x2):
    if x1[1] <= x2[0] or x2[1] <= x1[0]:
        return 0
    intersection = min(x2[1], x1[1]) - max(x2[0], x1[0])
    total = max(x2[1], x1[1]) - min(x2[0], x1[0])
    
    return intersection

Here is an example:

In [14]:
get_overlap((1, 10), (5, 12))

5

Thus, we are interested ONLY in windows that fall "entirely" in their corresponding segments:

In [15]:
output = dict()

for subject_id in [f'SWS_{e:02d}' for e in range(1, 15) if not e == 10]:
    output[subject_id] = dict(train=[], test=[])
    t_start, t_end = data_manager.get_utc_timestamp_range_for_subject(subject_id)
    train_window = (t_start, t_start + 0.8 * (t_end - t_start))
    for meta in dataloaders['train'].dataset.metadata:
        if meta['subject_id'] == subject_id:
            intersection = get_overlap(meta['utc_timestamp_window'], train_window)
            if (intersection / (float(meta['utc_timestamp_window'][1] - meta['utc_timestamp_window'][0]))) == 1.0:
                output[subject_id]['train'].append(meta)
            elif (intersection / (float(meta['utc_timestamp_window'][1] - meta['utc_timestamp_window'][0]))) == 0.0:
                output[subject_id]['test'].append(meta)
            else:
                pass

Let's check the number:

In [16]:
len(output['SWS_02']['train'])

34725

In [17]:
for batch in dataloaders['train']:
    break

We can also look at a single batch slice, as it has information on the following data sources:

In [18]:
batch['slice'][0].keys()

dict_keys(['daily', 'respiration', 'stress', 'pulseOx'])

We can take a look at the subjects in each partition:

In [19]:
train_subject_ids = set()
for batch in tqdm(dataloaders['train']):
    train_subject_ids = train_subject_ids.union(set(
        [e['subject_id'] for e in batch['meta']]))

100%|███████████████████████████████████████████| 30/30 [00:09<00:00,  3.30it/s]


In [20]:
train_subject_ids

{'SWS_01',
 'SWS_02',
 'SWS_03',
 'SWS_04',
 'SWS_05',
 'SWS_06',
 'SWS_07',
 'SWS_08',
 'SWS_09'}

Example data source information:

In [21]:
batch['slice'][0]['stress'].head()

Unnamed: 0,utc_timestamp,stress_level_tsvalue,durationInSeconds,user_id,startTimeInSeconds,summaryId,startTimeOffsetInSeconds,calendarDate,utc_date,body_battery_tsvalue
5600,1620404280,-1,60540,SWS_05,1620370800,x3a513ce-6094e570-ec7c,-25200,2021-05-07,2021-05-07 16:18:00+00:00,54.0
5601,1620404460,-1,60540,SWS_05,1620370800,x3a513ce-6094e570-ec7c,-25200,2021-05-07,2021-05-07 16:21:00+00:00,54.0
5602,1620404640,-1,60540,SWS_05,1620370800,x3a513ce-6094e570-ec7c,-25200,2021-05-07,2021-05-07 16:24:00+00:00,54.0
5603,1620404820,-2,60540,SWS_05,1620370800,x3a513ce-6094e570-ec7c,-25200,2021-05-07,2021-05-07 16:27:00+00:00,54.0
5604,1620405000,-2,60540,SWS_05,1620370800,x3a513ce-6094e570-ec7c,-25200,2021-05-07,2021-05-07 16:30:00+00:00,54.0


Let's fuse this information into one, and try an early-fusion model on it:

In [22]:
fusion = SliceToSliceFusion(
    config={
                'timestamp_column': 'utc_timestamp',
                'sources': {
                    'all_timeseries': {
                        'daily': ['heart_rate_tsvalue'],
                        'pulseOx': ['spo2_tsvalue'],
                        'respiration': ['epoch_to_breath_tsvalue'],
                        'stress': ['stress_level_tsvalue']
                    }
                },
                'nan_fill_method': ['ffill', 'bfill', 'fill_constant_0']
            }
)

In [23]:
fused_batch = fusion(batch)

In [24]:
fused_batch['slice'][0].keys()

dict_keys(['all_timeseries'])

In [25]:
fused_batch['slice'][0]['all_timeseries'].head()

Unnamed: 0,utc_timestamp,heart_rate_tsvalue,spo2_tsvalue,epoch_to_breath_tsvalue,stress_level_tsvalue
0,1620404160,75,95.0,14.23,-1.0
1,1620404175,77,95.0,14.23,-1.0
2,1620404190,77,95.0,14.23,-1.0
3,1620404205,77,95.0,14.23,-1.0
4,1620404220,77,95.0,14.53,-1.0


In [26]:
from tabluence.deep_learning.pipeline.model import EarlyFusedSingleRNNSliceModel
from tabluence.deep_learning.data.tensorizer import CustomTensorizer

In [27]:
tensorizer = CustomTensorizer(
    config=dict(
        timestamp_column='utc_timestamp',
        value_config=dict(
            all_timeseries=dict(
                bring=[
                    'heart_rate_tsvalue',
                    'spo2_tsvalue',
                    'epoch_to_breath_tsvalue',
                    'stress_level_tsvalue'
                ]
            ),
        )),
    device=torch.device('cpu')
)

In [28]:
t_batch = tensorizer(fused_batch)

The fused and tensorized input would look like this:

In [30]:
t_batch['slice'][4]['all_timeseries'].shape

torch.Size([240, 4])

In [33]:
assert not torch.stack([torch.isnan(e['all_timeseries']).any() for e in t_batch['slice']]).any().item()

In [34]:
model = EarlyFusedSingleRNNSliceModel(
    tensorizer=tensorizer,
    config=dict(
        single_source='all_timeseries',
        main_rnn=dict(
            rnn_model="LSTM",
            rnn_args=dict(
                input_size=4,
                hidden_size=32,
                bidirectional=True,
                batch_first=True,
                bias=False,
                dropout=0.2
            ),
            project_args=dict(
                input_dim=64,
                projection_dim=32
            ),  # will be projected to this dimension if not None.
        ),
        task=dict(
            target_in_meta='overall_quantized_stress_value',
            type='classification',
            loss_class='CrossEntropyLoss',
            loss_args=dict(),
            label_layout=[0, 0.5, 10]
        )
    ))

  "num_layers={}".format(dropout, num_layers))


In [35]:
model(fused_batch, mode='train')

{'model_outputs': {'latent_representations': tensor([[-0.0096,  0.5900, -0.1067,  ..., -0.2954,  0.1868,  0.1734],
          [ 0.1070,  0.4004, -0.7262,  ...,  0.1173,  0.3949, -0.2548],
          [ 0.0186,  0.1601, -0.2656,  ..., -0.2858, -0.2714,  0.0910],
          ...,
          [-0.2599,  0.3213, -0.0394,  ..., -0.5897, -0.1214,  0.5590],
          [ 0.2039,  0.5406, -0.2500,  ..., -0.0175,  0.1893, -0.1058],
          [-0.0972,  0.5198,  0.0722,  ..., -0.1392,  0.0211,  0.3065]],
         grad_fn=<AddmmBackward0>),
  'logits': tensor([[-0.1088, -0.3289,  0.1109],
          [ 0.6571, -1.0732,  0.4330],
          [ 0.2694,  0.0743,  0.6358],
          [ 0.7517, -0.9526,  0.1351],
          [ 0.0844, -0.1456,  0.9376],
          [ 0.1084, -1.3155,  0.5628],
          [-0.1592, -0.9316,  0.4992],
          [-0.1427, -1.2851,  0.3928],
          [ 0.0633, -0.4916,  0.7749],
          [-0.3571, -1.2318, -0.0147],
          [-0.1199, -0.8002,  0.5573],
          [ 0.6839, -0.9750,  0.38