In [6]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from src.configurations import Configuration

# Load the filtered dataset and index it by (id, datetime) so that later
# cells can work per id along the time axis.
config = Configuration()
df = (
    pd.read_csv(config.final_filtered_csv)
    # Parse the timestamp strings before they become an index level.
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index(['id', 'datetime'])
)
df.info()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7694 entries, (np.int64(41131654), Timestamp('2019-09-12 18:00:00')) to (np.int64(86025410), Timestamp('2017-05-03 10:30:00'))
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   iob mean   7694 non-null   float64
 1   cob mean   7680 non-null   float64
 2   bg mean    7694 non-null   float64
 3   iob min    7694 non-null   float64
 4   cob min    7680 non-null   float64
 5   bg min     7694 non-null   float64
 6   iob max    7694 non-null   float64
 7   cob max    7680 non-null   float64
 8   bg max     7694 non-null   float64
 9   iob std    6575 non-null   float64
 10  cob std    6544 non-null   float64
 11  bg std     6575 non-null   float64
 12  iob count  7694 non-null   int64  
 13  cob count  7694 non-null   int64  
 14  bg count   7694 non-null   int64  
 15  offset     7694 n

In [7]:
# Add the weekday/weekend classification to the df as two boolean indicator
# columns. pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the
# weekend.
#
# The columns are assigned directly rather than via an intermediate
# 'day_type' column + pd.get_dummies, because:
#   * re-running the cell overwrites the two columns instead of appending a
#     second, duplicate-named pair;
#   * both columns always exist, even when the data happens to contain only
#     weekdays or only weekends (get_dummies emits a column only for the
#     categories that are actually present).
is_weekend = df.index.get_level_values('datetime').weekday >= 5
df['day_type_weekday'] = ~is_weekend
df['day_type_weekend'] = is_weekend
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,iob mean,cob mean,bg mean,iob min,cob min,bg min,iob max,cob max,bg max,iob std,cob std,bg std,iob count,cob count,bg count,offset,day,time,day_type_weekday,day_type_weekend
id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
41131654,2019-09-12 18:00:00,3.225,0.0,264.667,2.113,0.0,257.0,3.886,0.0,274.0,0.969,0.0,8.622,3,3,3,12,2019-09-12,18:00:00,True,False
41131654,2019-09-12 18:30:00,3.554,39.0,257.0,1.839,37.0,253.0,4.712,40.0,261.0,1.515,1.732,4.0,3,3,3,12,2019-09-12,18:30:00,True,False
41131654,2019-09-12 19:00:00,5.462,35.5,262.5,4.643,34.0,261.0,6.281,37.0,264.0,1.158,2.121,2.121,2,2,2,12,2019-09-12,19:00:00,True,False
41131654,2019-09-12 19:30:00,3.016,3.0,254.0,3.016,3.0,254.0,3.016,3.0,254.0,,,,1,1,1,12,2019-09-12,19:30:00,True,False
41131654,2019-09-12 21:30:00,-0.831,0.0,133.75,-1.189,0.0,126.0,0.018,0.0,149.0,0.571,0.0,10.844,4,4,4,12,2019-09-12,21:30:00,True,False


In [8]:
# Add rate of change columns based on iob, cob, bg mean columns
# First add a column for the interval between the previous value and the next. The first value for an id will be NaN
import numpy as np
# Gap between each row and the previous row *of the same id*. Diffing inside
# each id group leaves the first row of every id as NaT and never measures a
# gap across two different ids, even if an id's rows are not stored
# contiguously. This matches how the value differences are grouped below.
timestamps = df.index.get_level_values('datetime').to_series(index=df.index)
df['time_diff'] = timestamps.groupby(level='id').diff()

interval = pd.Timedelta('30min')
# Then add the change columns: the difference in value from the previous row
# of the same id, kept only where that previous row is exactly one interval
# earlier (NaN otherwise). Note this is the plain change per interval; it is
# not divided by the elapsed time.
for col in ['iob mean', 'cob mean', 'bg mean']:
    value_diff = df[col].groupby(level='id').diff()
    df[f'{col} rate_of_change'] = value_diff.where(df['time_diff'] == interval)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,iob mean,cob mean,bg mean,iob min,cob min,bg min,iob max,cob max,bg max,iob std,...,bg count,offset,day,time,day_type_weekday,day_type_weekend,time_diff,iob mean rate_of_change,cob mean rate_of_change,bg mean rate_of_change
id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
41131654,2019-09-12 18:00:00,3.225,0.0,264.667,2.113,0.0,257.0,3.886,0.0,274.0,0.969,...,3,12,2019-09-12,18:00:00,True,False,NaT,,,
41131654,2019-09-12 18:30:00,3.554,39.0,257.0,1.839,37.0,253.0,4.712,40.0,261.0,1.515,...,3,12,2019-09-12,18:30:00,True,False,0 days 00:30:00,0.329,39.0,-7.667
41131654,2019-09-12 19:00:00,5.462,35.5,262.5,4.643,34.0,261.0,6.281,37.0,264.0,1.158,...,2,12,2019-09-12,19:00:00,True,False,0 days 00:30:00,1.908,-3.5,5.5
41131654,2019-09-12 19:30:00,3.016,3.0,254.0,3.016,3.0,254.0,3.016,3.0,254.0,,...,1,12,2019-09-12,19:30:00,True,False,0 days 00:30:00,-2.446,-32.5,-8.5
41131654,2019-09-12 21:30:00,-0.831,0.0,133.75,-1.189,0.0,126.0,0.018,0.0,149.0,0.571,...,4,12,2019-09-12,21:30:00,True,False,0 days 02:00:00,,,


Now that we have some idea of the sorts of features we want, we can script the functions in our class that build the feature set. The features derived from the random variables and the time series break down as follows:
1. Time-based variables:
a) One-hot variable identifying weekday/weekends
b) The hour of the day of the time series
c) Trigonometric functions of the timestamps
2. Resampled means
a) Rate of change from previous interval
b) Rate of change from previous 2 hours
c) Hourly mean
3. Resampled maximums
a) Peaks greater than the mean for the time series
Note: the mean won't necessarily be the best threshold for identifying peaks here.

In [26]:
from src.features import FeatureSet

# Column groups, built as '<variable> <statistic>' names.
variables = ('iob', 'cob', 'bg')
mean_columns = [f'{variable} mean' for variable in variables]
scale_columns = [
    f'{variable} {statistic}'
    for statistic in ('mean', 'min', 'max', 'std')
    for variable in variables
]

features = FeatureSet(input_path=config.final_filtered_csv)

# Build every derived feature on the feature set.
features.add_time_based_features()
features.add_day_type()
features.add_rate_of_change(columns=mean_columns, interval='30min')
features.add_peaks_above_mean()

# Scale the resampled statistics last.
# NOTE(review): because scaling runs after add_rate_of_change, the
# '* rate_of_change' columns in the displayed frame are still in raw units
# while the columns they were derived from are scaled - confirm this is
# intended.
features.scale_features(scale_columns)

features.df


Unnamed: 0_level_0,Unnamed: 1_level_0,iob mean,cob mean,bg mean,iob min,cob min,bg min,iob max,cob max,bg max,iob std,...,hour_cos,day_type_weekday,day_type_weekend,time_diff,iob mean rate_of_change,cob mean rate_of_change,bg mean rate_of_change,iob max_peaks_above_mean,cob max_peaks_above_mean,bg max_peaks_above_mean
id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
41131654,2019-09-12 18:00:00,0.270057,0.000000,0.623390,0.243616,0.000000,0.602210,0.293407,0.000000,0.649171,0.127115,...,-1.836970e-16,True,False,NaT,,,,False,False,False
41131654,2019-09-12 18:30:00,0.285132,0.325000,0.602210,0.231076,0.308333,0.591160,0.330379,0.330579,0.613260,0.198741,...,-1.836970e-16,True,False,0 days 00:30:00,0.329,39.0,-7.667,False,True,False
41131654,2019-09-12 19:00:00,0.372554,0.295833,0.617403,0.359405,0.283333,0.613260,0.400609,0.305785,0.621547,0.151909,...,2.588190e-01,True,False,0 days 00:30:00,1.908,-3.5,5.500,True,False,True
41131654,2019-09-12 19:30:00,0.260481,0.025000,0.593923,0.284943,0.025000,0.593923,0.254465,0.024793,0.593923,,...,2.588190e-01,True,False,0 days 00:30:00,-2.446,-32.5,-8.500,False,False,False
41131654,2019-09-12 21:30:00,0.084215,0.000000,0.261740,0.092494,0.000000,0.240331,0.120272,0.000000,0.303867,0.074905,...,7.071068e-01,True,False,0 days 02:00:00,,,,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86025410,2017-05-03 08:30:00,0.143414,0.000000,0.382597,0.167826,0.000000,0.381215,0.140235,0.000000,0.383978,0.000656,...,-5.000000e-01,True,False,0 days 00:30:00,-0.042,0.0,-3.100,False,False,False
86025410,2017-05-03 09:00:00,0.144605,0.000000,0.353591,0.167918,0.000000,0.345304,0.142563,0.000000,0.361878,0.005247,...,-7.071068e-01,True,False,0 days 00:30:00,0.026,0.0,-10.500,True,False,False
86025410,2017-05-03 09:30:00,0.137136,0.000000,0.290055,0.159451,0.000000,0.287293,0.137639,0.000000,0.292818,0.009445,...,-7.071068e-01,True,False,0 days 00:30:00,-0.163,0.0,-23.000,False,False,False
86025410,2017-05-03 10:00:00,0.134708,0.000000,0.281768,0.156796,0.000000,0.265193,0.133477,0.000000,0.287293,0.004854,...,-8.660254e-01,True,False,0 days 00:30:00,-0.053,0.0,-3.000,False,False,False
