In [5]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from src.config import INTERIM_DATA_DIR
from src.configurations import Configuration, FifteenMinute

# Locate the 15-minute resampled parquet file written by the pipeline
fifteen_minute = FifteenMinute()
resampled_parquet_file = INTERIM_DATA_DIR / fifteen_minute.file_name('parquet')

config = Configuration()

# Temporary clean-up: the dtype casts and the (id, datetime) index below should
# become unnecessary after the next execution of the pipeline - 2025/06/09
dtypes = {'id': int, 'iob count': int, 'cob count': int, 'bg count': int}
df = (
    pd.read_parquet(resampled_parquet_file)
    .drop(columns='system')
    .astype(dtypes)
    .set_index(['id', 'datetime'])
)
df.info()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 685122 entries, (np.int64(221634), Timestamp('2018-03-16 20:30:00')) to (np.int64(99908129), Timestamp('2018-02-01 04:45:00'))
Data columns (total 15 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   iob mean   669370 non-null  Float32
 1   cob mean   644780 non-null  Float32
 2   bg mean    685122 non-null  Float32
 3   iob min    669370 non-null  Float32
 4   cob min    644780 non-null  Float32
 5   bg min     685122 non-null  Float32
 6   iob max    669370 non-null  Float32
 7   cob max    644780 non-null  Float32
 8   bg max     685122 non-null  Float32
 9   iob std    337473 non-null  Float32
 10  cob std    328312 non-null  Float32
 11  bg std     344576 non-null  Float32
 12  iob count  685122 non-null  int64  
 13  cob count  685122 non-null  int64  
 14  bg count   685122 non-null  int64  
dtype

In [6]:
# Add the weekday/weekend classification to the df as two boolean indicator columns.
# DatetimeIndex.weekday numbers Monday as 0 and Sunday as 6, so values >= 5 are the
# weekend. One vectorised comparison replaces the per-row lambda.
is_weekend = df.index.get_level_values('datetime').weekday >= 5

# assign() overwrites columns that already exist, so re-running this cell cannot
# append a second pair of day_type_* columns, and both columns are always present
# even when the data happens to contain only one day type.
df = df.assign(day_type_weekday=~is_weekend, day_type_weekend=is_weekend)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,iob mean,cob mean,bg mean,iob min,cob min,bg min,iob max,cob max,bg max,iob std,cob std,bg std,iob count,cob count,bg count,day_type_weekday,day_type_weekend
id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
221634,2018-03-16 20:30:00,1.665,0.0,138.0,1.665,0.0,138.0,1.665,0.0,138.0,,,,1,1,1,True,False
221634,2018-03-16 21:00:00,1.04,0.932,124.0,1.04,0.932,124.0,1.04,0.932,124.0,,,,1,1,1,True,False
221634,2018-03-16 22:00:00,0.152,0.0,139.0,0.152,0.0,139.0,0.152,0.0,139.0,,,,1,1,1,True,False
221634,2018-03-21 16:30:00,3.541,0.0,329.0,3.384,0.0,328.0,3.697,0.0,330.0,0.221,0.0,1.414,2,2,2,True,False
221634,2018-03-21 17:00:00,2.299,0.0,311.0,2.299,0.0,311.0,2.299,0.0,311.0,,,,1,1,1,True,False


In [7]:
# Add rate of change columns based on iob, cob, bg mean columns
import numpy as np  # no longer used in this cell; kept in case later cells rely on `np`

# Interval between each reading and the previous reading of the same id.
# Grouping by id keeps this consistent with the per-id value diff below and leaves
# the first reading of every id as NaT without a separate masking step.
# NOTE(review): assumes rows are ordered by datetime within each id - confirm.
datetimes = df.index.get_level_values('datetime').to_series(index=df.index)
df['time_diff'] = datetimes.groupby(level='id').diff()

interval = pd.Timedelta('15min')
# Then add rate columns: the change since the previous reading, kept only where that
# reading is exactly one interval earlier; every other row is left missing.
for col in ['iob mean', 'cob mean', 'bg mean']:
    value_diff = df[col].groupby(level='id').diff()
    rate_of_change = value_diff.where(df['time_diff'] == interval)
    df[f'{col} rate_of_change'] = rate_of_change

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,iob mean,cob mean,bg mean,iob min,cob min,bg min,iob max,cob max,bg max,iob std,...,bg std,iob count,cob count,bg count,day_type_weekday,day_type_weekend,time_diff,iob mean rate_of_change,cob mean rate_of_change,bg mean rate_of_change
id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
221634,2018-03-16 20:30:00,1.665,0.0,138.0,1.665,0.0,138.0,1.665,0.0,138.0,,...,,1,1,1,True,False,NaT,,,
221634,2018-03-16 21:00:00,1.04,0.932,124.0,1.04,0.932,124.0,1.04,0.932,124.0,,...,,1,1,1,True,False,0 days 00:30:00,,,
221634,2018-03-16 22:00:00,0.152,0.0,139.0,0.152,0.0,139.0,0.152,0.0,139.0,,...,,1,1,1,True,False,0 days 01:00:00,,,
221634,2018-03-21 16:30:00,3.541,0.0,329.0,3.384,0.0,328.0,3.697,0.0,330.0,0.221,...,1.414,2,2,2,True,False,4 days 18:30:00,,,
221634,2018-03-21 17:00:00,2.299,0.0,311.0,2.299,0.0,311.0,2.299,0.0,311.0,,...,,1,1,1,True,False,0 days 00:30:00,,,


Now that we have some idea of the sorts of features we want, we can look at scripting the functions in our class to build the feature set.

In [19]:
from src.features import FeatureSet

# Columns that get a rate-of-change feature
mean_columns = ['iob mean', 'cob mean', 'bg mean']
# Columns that get scaled: the means plus the min / max / std aggregates
scale_columns = mean_columns + [
    'iob min', 'cob min', 'bg min',
    'iob max', 'cob max', 'bg max',
    'iob std', 'cob std', 'bg std',
]

features = FeatureSet(input_path=resampled_parquet_file)

# Feature engineering steps
features.add_time_based_features()
features.add_day_type()
features.add_rate_of_change(mean_columns)

# Scaling is applied after the features have been added
features.scale_features(scale_columns)



In [20]:
# Inspect the dtype of every column in the dataset built by FeatureSet.
# NOTE(review): the recorded output shows 'iob count'/'bg count' as Int64 but
# 'cob count' as int64 - confirm whether FeatureSet should normalise these.
features.dataset.dtypes

iob mean                           float64
cob mean                           float64
bg mean                            float64
iob min                            float64
cob min                            float64
bg min                             float64
iob max                            float64
cob max                            float64
bg max                             float64
iob std                            float64
cob std                            float64
bg std                             float64
iob count                            Int64
cob count                            int64
bg count                             Int64
hour_of_day                        float64
hour_sin                           float64
hour_cos                           float64
day_type_weekday                      bool
day_type_weekend                      bool
time_diff                  timedelta64[ns]
iob mean rate_of_change            Float32
cob mean rate_of_change            Float32
bg mean rat

In [21]:
# Summary statistics for the dataset built by FeatureSet.
# NOTE(review): in the recorded output the columns passed to scale_features span 0-1,
# while the rate_of_change columns do not - confirm that leaving them unscaled is intended.
features.dataset.describe()

Unnamed: 0,iob mean,cob mean,bg mean,iob min,cob min,bg min,iob max,cob max,bg max,iob std,...,iob count,cob count,bg count,hour_of_day,hour_sin,hour_cos,time_diff,iob mean rate_of_change,cob mean rate_of_change,bg mean rate_of_change
count,669370.0,644780.0,685122.0,669370.0,644780.0,685122.0,669370.0,644780.0,685122.0,337473.0,...,685122.0,685122.0,685122.0,685122.0,685122.0,685122.0,685008,551366.0,531873.0,562089.0
mean,0.171673,0.040515,0.234425,0.181411,0.037785,0.231102,0.169801,0.043216,0.237689,0.01221,...,1.742217,1.686241,1.779395,11.347829,0.02268828,-0.004965105,0 days 01:13:26.406348538,0.026162,0.045619,-0.10004
std,0.045906,0.084399,0.111256,0.045788,0.082937,0.110796,0.046104,0.08824,0.112058,0.027028,...,1.047411,1.080659,1.019488,6.892166,0.7050933,0.7087351,10 days 23:12:36.661222375,1.022814,9.913558,14.025735
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-1.0,-1.0,0 days 00:15:00,-11.63,-180.0,-232.0
25%,0.142379,0.0,0.156522,0.152601,0.0,0.154348,0.140082,0.0,0.158696,0.002191,...,1.0,1.0,1.0,5.0,-0.7071068,-0.7071068,0 days 00:15:00,-0.314,-1.833,-6.5
50%,0.157186,0.0,0.208696,0.166882,0.0,0.206522,0.155211,0.0,0.21087,0.005211,...,1.0,1.0,2.0,11.0,1.224647e-16,-1.83697e-16,0 days 00:15:00,-0.077,0.0,-0.667
75%,0.186006,0.042674,0.28913,0.195276,0.030043,0.284783,0.184508,0.046809,0.293478,0.011785,...,2.0,2.0,2.0,17.0,0.7071068,0.7071068,0 days 00:15:00,0.096,0.0,5.667007
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,14.0,14.0,14.0,23.0,1.0,1.0,6322 days 18:30:00,23.089001,200.0,237.0
