In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from hypopredict.fusion.ml_preproc import MLPreprocessor
import pandas as pd
import numpy as np

In [3]:
mlp = MLPreprocessor()

In [4]:
path = '/Users/alexxela/code/hypopredict/data/processed/ml_prepped_cv_splits_20251216_125112.pkl'
loaded_dict = MLPreprocessor().load_prepped_splits(path)

In [5]:
splits_prepped = loaded_dict['splits_prepped']
len(splits_prepped)

5

In [6]:
loaded_dict['params']

{'ecg_dir': '/Users/alexxela/code/hypopredict/data/feathers/',
 'glucose_src': 'local',
 'n_splits': 5,
 'chunk_size': Timedelta('0 days 01:00:00'),
 'step_size': Timedelta('0 days 00:10:00'),
 'forecast_window': Timedelta('0 days 01:30:00'),
 'roll_window_size': Timedelta('0 days 00:40:00'),
 'roll_step_size': Timedelta('0 days 00:02:00'),
 'suffix': 'rolling',
 'agg_funcs': ['mean', 'std', 'min', 'max', 'median', 'skew', 'kurtosis'],
 'random_state': 17}

---
#### Ready to CV ML Models!

In [None]:
# instantiate CV
from hypopredict.cv import CrossValidator

In [7]:
cv = CrossValidator(splits=splits_prepped)

In [8]:
# initialize XGBoost model
from xgboost import XGBClassifier
xgb = XGBClassifier(
    n_estimators=999,
    max_depth=2,
    reg_lambda=0.1,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=17
)

In [9]:
cv_results_xgb = cv.validate_model_cv(
                            xgb,
                            splits_prepped,
                            resample=True,
                            desired_pos_ratio=0.4,
                            reduction_factor=0.7
                )

In [12]:
print("XGBoost mean CV PR AUCs:")
print(np.mean(cv_results_xgb['val_pr_aucs']))
print("XGBoost mean CV ave Prec:")
print(np.mean(cv_results_xgb['val_ave_precisions']))
print("Baseline:")
print(np.mean(cv._get_split_mean_labels(splits_prepped)))


XGBoost mean CV PR AUCs:
0.48850000000000005
XGBoost mean CV ave Prec:
0.4168
Baseline:
0.21226000000000003


In [14]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#svm = SVC(kernel ='poly', degree=3, C=100)
#svm.fit(X_train_features, y_train)

svm = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(kernel="rbf", C=1.0, gamma="scale", probability=True))
])

In [16]:
cv_results_svm = cv.validate_model_cv(
                            svm,
                            splits_prepped,
                            resample=True,
                            desired_pos_ratio=0.4,
                            reduction_factor=0.7
                )

IndexError: index 5400106 is out of bounds for axis 0 with size 3568387

In [None]:
print("SVM mean CV PR AUCs:")
print(np.mean(cv_results_svm['val_pr_aucs']))
print("SVM mean CV ave Prec:")
print(np.mean(cv_results_svm['val_ave_precisions']))
print("Baseline:")
print(np.mean(cv._get_split_mean_labels(splits_prepped)))


---

#### Some usage

In [None]:
mlp.set_params(chunk_size=pd.Timedelta(minutes=10),
                roll_window_size=pd.Timedelta(minutes=30))

In [None]:
mlp.params

{'ecg_dir': '/Users/alexxela/code/hypopredict/data/feathers/',
 'glucose_src': 'local',
 'n_splits': 5,
 'chunk_size': Timedelta('0 days 00:10:00'),
 'step_size': Timedelta('0 days 00:10:00'),
 'forecast_window': Timedelta('0 days 01:30:00'),
 'roll_window_size': Timedelta('0 days 00:30:00'),
 'roll_step_size': Timedelta('0 days 00:02:00'),
 'suffix': 'rolling',
 'agg_funcs': ['mean', 'std', 'min', 'max', 'median', 'skew', 'kurtosis'],
 'random_state': 17}

In [None]:
mlp.reset_params()

In [None]:
mlp.params

{'ecg_dir': '/Users/alexxela/code/hypopredict/data/feathers/',
 'glucose_src': 'local',
 'n_splits': 5,
 'chunk_size': Timedelta('0 days 01:00:00'),
 'step_size': Timedelta('0 days 00:10:00'),
 'forecast_window': Timedelta('0 days 01:30:00'),
 'roll_window_size': Timedelta('0 days 00:40:00'),
 'roll_step_size': Timedelta('0 days 00:02:00'),
 'suffix': 'rolling',
 'agg_funcs': ['mean', 'std', 'min', 'max', 'median', 'skew', 'kurtosis'],
 'random_state': 17}

In [None]:
from hypopredict.params import TRAIN_DAYS

In [None]:
mlp.split(TRAIN_DAYS)

In [None]:
mlp.splits

array([[71, 21, 14, 63],
       [24, 44, 61, 13],
       [52, 54, 43, 22],
       [92, 11, 51, 74],
       [42, 93, 82, 72]])

In [None]:
# 20 mins to run
#mlp.preprocess()

Labeling day 71 with 91 chunks
Labeling day 21 with 47 chunks
Labeling day 14 with 44 chunks
Labeling day 63 with 84 chunks
Labeling day 24 with 74 chunks
Labeling day 44 with 68 chunks
Labeling day 61 with 117 chunks
Labeling day 13 with 77 chunks
Labeling day 52 with 82 chunks
Labeling day 54 with 45 chunks
Labeling day 43 with 49 chunks
Labeling day 22 with 80 chunks
Labeling day 92 with 87 chunks
Labeling day 11 with 74 chunks
Labeling day 51 with 71 chunks
Labeling day 74 with 83 chunks
Labeling day 42 with 74 chunks
Labeling day 93 with 71 chunks
Labeling day 82 with 104 chunks
Labeling day 72 with 66 chunks


In [None]:
type(mlp.splits_prepped)

list

In [None]:
len(mlp.splits_prepped)  # number of splits

5

In [None]:
import pickle
import datetime
now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")

filepath = f'/Users/alexxela/code/hypopredict/data/processed/'
filename = f'ml_prepped_cv_splits_{timestamp}.pkl'

dict_to_save = {"splits_prepped": mlp.splits_prepped, "params": mlp.params}

with open(filepath + filename, "wb") as f:
    pickle.dump(dict_to_save, f)

In [None]:
# get current time for filename
mlp.save_prepped_splits(f'/Users/alexxela/code/hypopredict/data/processed/ml_prepped_cv_splits_{timestamp}.pkl')

AttributeError: 'MLPreprocessor' object has no attribute 'save_prepped_splits'