In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (here the local `hypopredict` package) are picked up live, without a kernel
# restart.
%load_ext autoreload
# Mode 2 is the setting that picks up module edits live for every import.
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import os

from hypopredict.cv import CV_splitter
from hypopredict import chunker
from hypopredict import labeler
from hypopredict.params import TRAIN_DAYS

from hypopredict.cv import CrossValidator

In [3]:
# Location of the ECG data, read from the environment so that no
# machine-specific path is hard-coded in the notebook.
# NOTE(review): this is None when ECG_PATH is not exported; the value is passed
# on as `ecg_dir` further down, so confirm the variable is set before running.
ECG_PATH = os.environ.get('ECG_PATH')

# ---------------------------------------------------------------------------
# Chunking strategy: forwarded as `chunk_size` / `step_size` to
# `crossval.chunkify_label_stack`.
CHUNK_SIZE = pd.Timedelta(hours=1)
STEP_SIZE = pd.Timedelta('10min')

# ---------------------------------------------------------------------------
# Labeling strategy: forwarded as `forecast_window` (90 minutes in total).
FORECAST_WINDOW = pd.Timedelta(hours=1, minutes=30)

# ---------------------------------------------------------------------------
# Rolling features: forwarded as `roll_window_size` / `roll_step_size`.
WINDOW_SIZE_FEATURES = pd.Timedelta('40min')
STEP_SIZE_FEATURES = pd.Timedelta('2min')


# Initialize the CV splitter: 5 splits, ECG data read from ECG_PATH, glucose
# data taken from the 'local' source, fixed seed for reproducible splits.
splitter = CV_splitter(n_splits = 5,
                       ecg_dir = ECG_PATH,
                       glucose_src='local',
                       random_state = 17)
# Get the splits over the training days.
splits = splitter.get_splits(TRAIN_DAYS)

crossval = CrossValidator(splits = splits)

# Length of the rolling-feature window in whole minutes, used to name the
# feature suffix. `Timedelta.components.minutes` is only the 0-59 "minutes"
# field of the duration, so a window of e.g. 90 minutes would have produced
# 'roll30min'; the total number of minutes is what the suffix should carry.
# For the current 40-minute window the result is unchanged ('roll40min').
roll_window_minutes = int(WINDOW_SIZE_FEATURES.total_seconds() // 60)

# Chunk, label and stack every split using the strategies configured above.
# The rolling aggregations listed in `agg_funcs` are requested over
# `roll_window_size` with a stride of `roll_step_size`.
splits_prepped = crossval.chunkify_label_stack(
    chunk_size=CHUNK_SIZE,
    step_size=STEP_SIZE,
    ecg_dir=ECG_PATH,
    glucose_src='local',
    forecast_window=FORECAST_WINDOW,
    roll_window_size=WINDOW_SIZE_FEATURES,
    roll_step_size=STEP_SIZE_FEATURES,
    suffix=f'roll{roll_window_minutes}min',
    agg_funcs=['mean', 'std', 'min', 'max', 'median', 'skew', 'kurtosis']
)


    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-21-12_35_54-1HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-21-20_29_57-1HG.feather']
Labeling day 71 with 91 chunks
Labeling day 21 with 47 chunks
Labeling day 14 with 44 chunks
Labeling day 63 with 84 chunks

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-24-17_43_12-2HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-24-07_01_03-2HG.feather']

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-44-06_32_58-1HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-44-16_49_30-1HG.feather']

    Files concatinated:
                 ['/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-61-15_08_04-0HG.feather', '/Users/alexxela/code/hypopredict/data/feathers/EcgWaveform-61-06_46_44-0HG.feather']
Labeling day 24 

In [4]:
# Mean label of each prepared split, i.e. the positive-class share per split
# (these are the ratios the cross-validation log reports for each validation
# fold). Uses a private CrossValidator helper.
split_mean_labels = crossval._get_split_mean_labels(splits_prepped)
print(split_mean_labels)

# Container type holding the prepared splits.
splits_prepped_type = type(splits_prepped)
print(splits_prepped_type)

[np.float64(0.0084), np.float64(0.6664), np.float64(0.1801), np.float64(0.1308), np.float64(0.0756)]
<class 'list'>


In [48]:
# XGBoost classifier evaluated in the cross-validation below.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from xgboost import XGBClassifier

# Hyperparameters collected in one mapping so they are easy to scan and tune.
# random_state uses the same seed (17) as the CV splitter.
xgb_params = {
    'n_estimators': 777,
    'max_depth': 5,
    'reg_lambda': 0.1,
    'learning_rate': 0.2,
    'eval_metric': 'logloss',
    'random_state': 17,
}
model = XGBClassifier(**xgb_params)

In [49]:
# Cross-validate the model on the prepared splits. Per the printed log, each
# iteration holds one split out for validation and resamples the remaining
# training folds towards the requested positive-class ratio (~0.3).
# NOTE(review): the exact meaning of `reduction_factor` is defined in
# CrossValidator.validate_model_cv — confirm there before tuning it.
resampling_options = dict(
    resample=True,
    desired_pos_ratio=0.3,
    reduction_factor=0.555,
)
cv_results = crossval.validate_model_cv(model, splits_prepped, **resampling_options)

Cross-Validation Iteration: Using split 0 as validation set

                        With mean positive class ratio: 0.008

                
Resampling training folds [1, 2, 3, 4] 
 to achieve ~0.3 positive class ratio...
RESAMPLED
Train positive class ratio: 0.381
--------------------------------------------------
Fitting model on training folds [1, 2, 3, 4]...
Evaluating model on training folds [1, 2, 3, 4]...

                    TRAIN PR-AUC: 0.6868, Average Precision: 0.3817
                

                ••••••••••••••••••••••••••••••••••••••••••••••••••••••••
                  
Evaluating model on VALIDATION fold 0...

                    VALIDATION PR-AUC: 0.0081, Average Precision: 0.0086
                

                ••••••••••••••••••••••••••••••••••••••••••••••••••••••••


                  
Cross-Validation Iteration: Using split 1 as validation set

                        With mean positive class ratio: 0.666

                
Resampling training folds [0, 2, 3, 4

In [None]:
# The model appears to be learning something: mean validation PR-AUC over all
# CV iterations ...
mean_val_pr_auc = np.mean(cv_results['val_pr_aucs'])
print(mean_val_pr_auc)
# ... versus the chance-level reference, the mean positive-class share of the
# splits. With the saved outputs this is 0.4689 / 0.2123, i.e. roughly 2.2x
# better than random chance.
chance_level = np.mean(crossval._get_split_mean_labels(splits_prepped))
print(chance_level)

0.46889999999999993
0.21226000000000003


In [None]:
# However, the first two splits are extremely imbalanced, and in opposite
# directions (positive share 0.0084 vs 0.6664), which very likely confuses
# the model. Even so, the mean validation PR-AUC on these two folds is still
# marginally above their mean positive share (0.3475 vs 0.3374 in the saved
# output), so it seems to be learning at least a little here too.
first_two = slice(None, 2)

val_pr_auc_first_two = np.mean(cv_results['val_pr_aucs'][first_two])
print(val_pr_auc_first_two)

# Per-split positive share of the two folds, then their mean (chance level).
print(crossval._get_split_mean_labels(splits_prepped)[first_two])
chance_first_two = np.mean(crossval._get_split_mean_labels(splits_prepped)[first_two])
print(chance_first_two)

0.34750000000000003
[np.float64(0.0084), np.float64(0.6664)]
0.3374


In [None]:
# On the splits with more reasonable class balance the model is clearly
# learning: with the saved outputs, 0.4993 / 0.1288 is about 3.9x better than
# random chance (close to, but not more than, 4x) — and that is with very
# simple features and naive random resampling.
# NOTE(review): assuming `val_pr_aucs` holds one value per split (5), the saved
# outputs of the three summary cells do not agree with each other:
# (2 * 0.3475 + 3 * 0.49925) / 5 is about 0.4386, not the 0.4689 printed for
# the overall mean. Together with the non-sequential execution counts this
# suggests the outputs come from different runs — re-run from a fresh kernel.
remaining = slice(2, None)

val_pr_auc_remaining = np.mean(cv_results['val_pr_aucs'][remaining])
print(val_pr_auc_remaining)

chance_remaining = np.mean(crossval._get_split_mean_labels(splits_prepped)[remaining])
print(chance_remaining)

0.49924999999999997
0.12883333333333333
