#### Initialize CV

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
import hypopredict.cv as cv
from hypopredict.params import TRAIN_DAYS
from hypopredict.core.person import Person

In [5]:
# hand-picked TEST days and DEMO days are held out already
# make sure the number of days is divisible by K for K-fold CV
non_test_days = TRAIN_DAYS.copy()

In [6]:
non_test_days

[11,
 13,
 14,
 21,
 22,
 24,
 42,
 43,
 44,
 51,
 52,
 54,
 61,
 63,
 71,
 72,
 74,
 82,
 92,
 93]

In [7]:
# Directory with the ECG feather files, taken from the ECG_PATH environment variable.
# os.getenv returns None when the variable is unset, so check the echoed value
# before passing it on to the splitter / chunker.
ECG_PATH = os.getenv('ECG_PATH')
ECG_PATH

'/Users/alexxela/code/hypopredict/data/feathers/'

In [8]:
# K-fold splitter over person-days: the 20 non-test days with n_splits=4
# give 5 days per fold. random_state is fixed so the shuffle is reproducible.
splitter = cv.CV_splitter(n_splits=4,
                       random_state=17,
                       ecg_dir=ECG_PATH,
                       glucose_src='local', # local is much faster
                       )

In [9]:
splitter

<hypopredict.cv.CV_splitter at 0x11719f710>

In [10]:
splitter.__dict__

{'n_splits': 4,
 'random_state': 17,
 'ecg_dir': '/Users/alexxela/code/hypopredict/data/feathers/',
 'glucose_src': 'local'}

In [11]:
splits = splitter.get_splits(non_test_days)

In [12]:
# people are shuffled
# days are shuffled because we test only full days
# and our features are day-based
splits

[array([71, 21, 14, 63, 24]),
 array([44, 61, 13, 52, 54]),
 array([43, 22, 92, 11, 51]),
 array([74, 42, 93, 82, 72])]

In [13]:
# validate needs GLUCOSE_PATH env var set
# The value is bound to a descriptive name rather than `_`: in IPython `_` is the
# automatic "last output" variable, and assigning to it stops it from being updated.
GLUCOSE_PATH = os.getenv('GLUCOSE_PATH')
GLUCOSE_PATH

'/Users/alexxela/code/hypopredict/data/dbt-glucose/'

In [14]:
checks, props = splitter.validate(splits)

--------------------------------------------------
Validating split: [71 21 14 63 24]

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-21-12_35_54-1HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-21-20_29_57-1HG.feather']

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-24-17_43_12-2HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-24-07_01_03-2HG.feather']
--------------------------------------------------
Validating split: [44 61 13 52 54]

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-44-06_32_58-1HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-44-16_49_30-1HG.feather']

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-61-15_08_04-0HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWav

In [15]:
checks, props

([np.True_, np.True_, np.True_, np.True_],
 array([0.1729, 0.0575, 0.2458, 0.0164]))

In [16]:
splitter.validate(splits, verbose=True)

--------------------------------------------------
Validating split: [71 21 14 63 24]

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-21-12_35_54-1HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-21-20_29_57-1HG.feather']

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-24-17_43_12-2HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-24-07_01_03-2HG.feather']

Split [71 21 14 63 24] is valid with 17.29% of y == 1

--------------------------------------------------
Validating split: [44 61 13 52 54]

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-44-06_32_58-1HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-44-16_49_30-1HG.feather']

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-61-15_08_04-0HG.feather'

([np.True_, np.True_, np.True_, np.True_],
 array([0.1729, 0.0575, 0.2458, 0.0164]))

In [17]:
splits

[array([71, 21, 14, 63, 24]),
 array([44, 61, 13, 52, 54]),
 array([43, 22, 92, 11, 51]),
 array([74, 42, 93, 82, 72])]

---
### Turn PersonDays into actual ECG chunks

In [18]:
from hypopredict import chunker

In [19]:
# Cut every day of the first fold into 1-hour ECG windows whose starts are
# 5 minutes apart (overlapping windows). The result is keyed by day id.
split_chunkified = chunker.chunkify(splits[0],
                                    chunk_size=pd.Timedelta(hours=1),
                                    step_size=pd.Timedelta(minutes=5),
                                    ecg_dir=ECG_PATH)


    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-21-12_35_54-1HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-21-20_29_57-1HG.feather']

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-24-17_43_12-2HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-24-07_01_03-2HG.feather']


---
#### We know how to generate labels for all chunks of one day
### Refactor into a function: list[days] --> dict[day: labels]

In [None]:
from hypopredict import labeler

In [36]:
split_chunkified.keys()

dict_keys([71, 21, 14, 63, 24])

In [None]:
def label_split(split_chunkified,
                glucose_src='local',
                forecast_window=pd.Timedelta(minutes=30)):
    """Label every chunk of every day in one CV split.

    Each day is labeled independently by ``labeler.label_day``; a progress
    line is printed per day.

    Args:
        split_chunkified: dict mapping day id -> list of ECG chunks for that
            day (in this notebook, the output of ``chunker.chunkify`` for one
            split, where each chunk is a time-indexed ECG DataFrame).
        glucose_src: str, source of glucose data; forwarded unchanged to
            ``labeler.label_day``.
        forecast_window: pd.Timedelta, length of the forecast window after a
            chunk's end time; forwarded unchanged to ``labeler.label_day``.

    Returns:
        dict mapping each day id to the value returned by
        ``labeler.label_day`` for that day, in the iteration order of
        ``split_chunkified``. In this notebook that value appears to be an
        integer array with one label per chunk.
    """
    split_labels = {}

    for day, chunks in split_chunkified.items():
        # progress trace, one line per day
        print(f"Labeling day {day} with {len(chunks)} chunks")

        split_labels[day] = labeler.label_day(
            day=day,
            chunks=chunks,
            forecast_window=forecast_window,
            glucose_src=glucose_src,
        )

    return split_labels

In [39]:
split_labels = label_split(split_chunkified,
                          glucose_src='local',
                          forecast_window=pd.Timedelta(minutes=30))

Labeling day 71 with 181 chunks
Labeling day 21 with 93 chunks
Labeling day 14 with 87 chunks
Labeling day 63 with 168 chunks
Labeling day 24 with 147 chunks


In [40]:
split_labels[71]

array([-111, -111, -111, -111, -111, -111, -111, -111, -111, -111, -111,
       -111, -111, -111, -111, -111, -111, -111, -111, -111,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

#### Getting chunks and labels for train

One split is now chunkified and labeled.
-> Drop the chunks with negative labels
--> Stack everything that remains into chunks_train, y_train

In [42]:
split_chunkified.keys()

dict_keys([71, 21, 14, 63, 24])

In [43]:
split_labels.keys()

dict_keys([71, 21, 14, 63, 24])

In [None]:
def filter_and_stack(split_chunkified, split_labels):
    """Drop chunks with negative labels and flatten a split into chunks and labels.

    A chunk is kept only when its label is >= 0; chunks with a negative label
    (such as the -111 values seen in this notebook's label arrays) are
    discarded. The surviving chunks and labels of all days are concatenated in
    the iteration order of ``split_chunkified``, preserving chunk order within
    each day.

    Args:
        split_chunkified: dict of day -> list of chunks.
        split_labels: dict of day -> sequence of labels, one per chunk and in
            the same order as that day's chunks.

    Returns:
        valid_chunks: list of the kept chunks across all days.
        valid_labels: np.array of the labels of the kept chunks, aligned
            index-by-index with ``valid_chunks``.

    Raises:
        KeyError: if a day in ``split_chunkified`` is missing from
            ``split_labels``.
        ValueError: if a day has a different number of chunks and labels.
    """
    valid_chunks = []
    valid_labels = []

    for day, chunks in split_chunkified.items():
        labels = np.asarray(split_labels[day])

        # Without this check a day with more labels than chunks would silently
        # produce misaligned chunks / labels.
        if len(chunks) != len(labels):
            raise ValueError(
                f"Day {day}: got {len(chunks)} chunks but {len(labels)} labels"
            )

        # keep those chunks and labels with labels >= 0, order preserved
        keep = labels >= 0
        valid_chunks.extend(chunk for chunk, kept in zip(chunks, keep) if kept)
        valid_labels.extend(labels[keep])

    return valid_chunks, np.array(valid_labels)

In [46]:
chunks_train, y_train = filter_and_stack(split_chunkified, split_labels)

In [47]:
chunks_train[:5], y_train[:5]

([                         EcgWaveform
  datetime                            
  2014-10-01 10:22:43.435         1958
  2014-10-01 10:22:43.439         1959
  2014-10-01 10:22:43.443         1960
  2014-10-01 10:22:43.447         1961
  2014-10-01 10:22:43.451         1963
  ...                              ...
  2014-10-01 11:22:43.415         2001
  2014-10-01 11:22:43.419         1995
  2014-10-01 11:22:43.423         1990
  2014-10-01 11:22:43.427         1985
  2014-10-01 11:22:43.431         1981
  
  [900000 rows x 1 columns],
                           EcgWaveform
  datetime                            
  2014-10-01 10:27:43.435         1961
  2014-10-01 10:27:43.439         1962
  2014-10-01 10:27:43.443         1962
  2014-10-01 10:27:43.447         1963
  2014-10-01 10:27:43.451         1963
  ...                              ...
  2014-10-01 11:27:43.415         1946
  2014-10-01 11:27:43.419         1948
  2014-10-01 11:27:43.423         1950
  2014-10-01 11:27:43.427       

In [48]:
len(chunks_train)

592

In [49]:
y_train.mean()

np.float64(0.10472972972972973)

---
### Labeling the chunks of a single day

The same call has to be applied to every day in a split.

In [None]:
# Forecast horizon passed to the labeler.
# NOTE(review): the constant name is misspelled ("WIINDOW"); it is left unchanged
# here because other cells in this notebook reference this exact name.
FORECAST_WIINDOW = pd.Timedelta(minutes=30)
# Labels are collected per day, keyed by day id.
labels_chunkified = dict()
labels_chunkified[71] = labeler.label_day(day=71,
                                    chunks=split_chunkified[71],
                                    forecast_window=FORECAST_WIINDOW,
                                    glucose_src=splitter.glucose_src)

In [None]:
labels_chunkified[71]

array([-111, -111, -111, -111, -111, -111, -111, -111, -111, -111, -111,
       -111, -111, -111, -111, -111, -111, -111, -111, -111,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [None]:
split_chunkified.keys()

dict_keys([71, 21, 14, 63, 24])

In [None]:
labels_chunkified[21] = labeler.label_day(day=21,
                                    chunks=split_chunkified[21],
                                    forecast_window=FORECAST_WIINDOW,
                                    glucose_src=splitter.glucose_src)

In [None]:
labels_chunkified[21]

array([-111, -111, -111, -111, -111, -111, -111, -111, -111, -111, -111,
       -111, -111, -111, -111, -111, -111, -111, -111, -111, -111, -111,
       -111, -111, -111, -111, -111, -111, -111, -111, -111, -111, -111,
       -111, -111, -111, -111, -111, -111, -111, -111, -111, -111, -111,
       -111, -111, -111, -111, -111, -111, -111, -111, -111, -111, -111,
       -111, -111, -111, -111, -111, -111, -111, -111, -111,    0,    0,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

In [None]:
labels_chunkified[14] = labeler.label_day(day=14,
                                    chunks=split_chunkified[14],
                                    forecast_window=FORECAST_WIINDOW,
                                    glucose_src=splitter.glucose_src)

In [None]:
labels_chunkified[14]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
labels_chunkified[63] = labeler.label_day(day=63,
                                    chunks=split_chunkified[63],
                                    forecast_window=FORECAST_WIINDOW,
                                    glucose_src=splitter.glucose_src)

In [None]:
labels_chunkified[63]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
labels_chunkified[24] = labeler.label_day(day=24,
                                    chunks=split_chunkified[24],
                                    forecast_window=FORECAST_WIINDOW,
                                    glucose_src=splitter.glucose_src)

In [None]:
labels_chunkified[24]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

---
#### Example: Create Labels for one CHUNK

This will then be refactored to cover all PersonDays in a split, and then all splits.

In [20]:
IDday = 71
ID = IDday // 10  # person ID is first digit of day identifier
person = Person(ID, ecg_dir=splitter.ecg_dir)


In [None]:
person.load_HG_data(glucose_src=splitter.glucose_src)

# second number in day is day of recording for that person
day = IDday % 10
person.load_ECG_day(day, warning=False)


In [None]:
person.__dict__.keys()

dict_keys(['ID', 'ecg', 'ecg_dir', 'glucose_src', 'hg_events'])

In [None]:
person.hg_events

Unnamed: 0_level_0,glucose,type,is_hg,onset,end
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-10-01 11:49:59,3.8,cgm,0,0,0
2014-10-01 11:54:59,4.0,cgm,0,0,0
2014-10-01 11:59:59,4.2,cgm,0,0,0
2014-10-01 12:04:59,4.5,cgm,0,0,0
2014-10-01 12:09:59,4.9,cgm,0,0,0
...,...,...,...,...,...
2014-10-05 03:20:01,11.1,cgm,0,0,0
2014-10-05 03:25:01,11.0,cgm,0,0,0
2014-10-05 03:30:01,10.9,cgm,0,0,0
2014-10-05 03:35:01,10.8,cgm,0,0,0


In [None]:
person.ecg[1]

Unnamed: 0_level_0,EcgWaveform
datetime,Unnamed: 1_level_1
2014-10-01 08:42:43.435,3798
2014-10-01 08:42:43.439,3798
2014-10-01 08:42:43.443,3798
2014-10-01 08:42:43.447,3798
2014-10-01 08:42:43.451,3798
...,...
2014-10-01 23:46:17.415,1831
2014-10-01 23:46:17.419,1831
2014-10-01 23:46:17.423,1831
2014-10-01 23:46:17.427,1830


In [None]:
day

1

In [None]:
# First and last ECG timestamps of the selected recording day.
ecg_start = person.ecg[day].index.min()
ecg_end = person.ecg[day].index.max()
# Keep only the glucose / HG rows that fall inside the ECG recording, extended by
# one forecast window past its end. The result is stored as a new attribute on `person`.
person.hg_events_with_ECG = person.hg_events.loc[ecg_start:ecg_end + FORECAST_WIINDOW] # so last chunk gets predictions if possible

In [None]:
# all glucose and HG measures during ECG recording period
person.hg_events_with_ECG

Unnamed: 0_level_0,glucose,type,is_hg,onset,end
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-10-01 11:49:59,3.8,cgm,0,0,0
2014-10-01 11:54:59,4.0,cgm,0,0,0
2014-10-01 11:59:59,4.2,cgm,0,0,0
2014-10-01 12:04:59,4.5,cgm,0,0,0
2014-10-01 12:09:59,4.9,cgm,0,0,0
...,...,...,...,...,...
2014-10-01 23:54:59,11.3,cgm,0,0,0
2014-10-01 23:59:59,11.2,cgm,0,0,0
2014-10-02 00:04:59,11.0,cgm,0,0,0
2014-10-02 00:09:59,10.9,cgm,0,0,0


In [None]:
FORECAST_WIINDOW = pd.Timedelta(minutes=30)
split_chunkified[71][0].index.max(), split_chunkified[71][0].index.max() + FORECAST_WIINDOW

(Timestamp('2014-10-01 09:42:43.431000'),
 Timestamp('2014-10-01 10:12:43.431000'))

In [None]:
chunk_0_end = split_chunkified[71][0].index.max()

In [None]:
# the slice is empty because glucose recording started later than the ECG recording
person.hg_events_with_ECG.loc[chunk_0_end + FORECAST_WIINDOW : chunk_0_end + FORECAST_WIINDOW]

Unnamed: 0_level_0,glucose,type,is_hg,onset,end
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [None]:
# only keep chunks within hg_events_with_ECG

# a chunk should end no earlier than this
bound_low = person.hg_events_with_ECG.index.min() - FORECAST_WIINDOW

In [None]:
bound_low, chunk_0_end

(Timestamp('2014-10-01 11:19:59'), Timestamp('2014-10-01 09:42:43.431000'))

In [None]:
# it should end at most here
bound_up = person.hg_events_with_ECG.index.max() - FORECAST_WIINDOW # cut it if we cant get target label

In [None]:
def get_chunk_end(chunk):
    """Return the largest value of the chunk's index, i.e. the chunk's end time."""
    chunk_index = chunk.index
    return chunk_index.max()

In [None]:
split_chunkified_71_ends = np.array(list(map(get_chunk_end, split_chunkified[71])))

In [None]:
with_labels_mask = (split_chunkified_71_ends >= bound_low) & (split_chunkified_71_ends <= bound_up)
split_chunkified_71_ends[with_labels_mask]

array([Timestamp('2014-10-01 11:22:43.431000'),
       Timestamp('2014-10-01 11:27:43.431000'),
       Timestamp('2014-10-01 11:32:43.431000'),
       Timestamp('2014-10-01 11:37:43.431000'),
       Timestamp('2014-10-01 11:42:43.431000'),
       Timestamp('2014-10-01 11:47:43.431000'),
       Timestamp('2014-10-01 11:52:43.431000'),
       Timestamp('2014-10-01 11:57:43.431000'),
       Timestamp('2014-10-01 12:02:43.431000'),
       Timestamp('2014-10-01 12:07:43.431000'),
       Timestamp('2014-10-01 12:12:43.431000'),
       Timestamp('2014-10-01 12:17:43.431000'),
       Timestamp('2014-10-01 12:22:43.431000'),
       Timestamp('2014-10-01 12:27:43.431000'),
       Timestamp('2014-10-01 12:32:43.431000'),
       Timestamp('2014-10-01 12:37:43.431000'),
       Timestamp('2014-10-01 12:42:43.431000'),
       Timestamp('2014-10-01 12:47:43.431000'),
       Timestamp('2014-10-01 12:52:43.431000'),
       Timestamp('2014-10-01 12:57:43.431000'),
       Timestamp('2014-10-01 13:02:43.43

In [None]:
# get the indices of split_chunkified_71_ends selected by with_labels_mask
chunks_w_labels_idx = np.where(with_labels_mask)[0].astype(int).tolist()

In [None]:
chunks_w_labels = [split_chunkified[71][i] for i in chunks_w_labels_idx]

In [None]:
hg_events_forecast = person.hg_events_with_ECG[get_chunk_end(chunks_w_labels[0]): get_chunk_end(chunks_w_labels[0]) + FORECAST_WIINDOW]

In [None]:
hg_events_forecast

Unnamed: 0_level_0,glucose,type,is_hg,onset,end
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-10-01 11:49:59,3.8,cgm,0,0,0


In [None]:
chunk_label = int(hg_events_forecast['is_hg'].max())

In [None]:
chunk_label

0