In [None]:
from unicodedata import category
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from src.config import INTERIM_DATA_DIR
from src.configurations import Configuration

# Project configuration object and the location of the interim extract to analyse.
config = Configuration()
file_name = '15min_iob_cob_bg.csv'
file_path = INTERIM_DATA_DIR / file_name

# The id and the three count columns are read as integers.
dtypes = dict.fromkeys(['id', 'iob count', 'cob count', 'bg count'], int)

# The first CSV column becomes the index and is parsed as dates.
df = pd.read_csv(
    file_path,
    index_col=0,
    parse_dates=True,
    dtype=dtypes,
)
# The 'system' column is discarded straight after loading.
df = df.drop(columns='system')

In [None]:
# Keep only the individuals with more than MIN_FULL_DAYS days' worth of data.
# NOTE: the day count is the total number of 15-minute rows per id divided by 96 (the
# number of 15-minute intervals in 24 hours). It does not check that any individual
# calendar day is fully populated.
INTERVALS_PER_DAY = 96
MIN_FULL_DAYS = 100

full_days = df.groupby('id').size() // INTERVALS_PER_DAY
mask_full_days = full_days.sort_values(ascending=False) > MIN_FULL_DAYS

# Re-index by (id, datetime) and sort on both levels. sort_index() does not accept
# level names as a positional argument; with no arguments it sorts a MultiIndex
# lexicographically by all of its levels, which is the intended id-then-datetime order.
# NOTE(review): assumes the index read from the CSV is named 'datetime' - confirm.
df = df.reset_index().set_index(['id', 'datetime']).sort_index()

# Boolean row mask selecting the ids that passed the threshold.
mask = df.index.get_level_values('id').isin(mask_full_days[mask_full_days].index)
df = df[mask]
print(f'People with > {MIN_FULL_DAYS} days in the dataset: {df.index.get_level_values("id").nunique()}')

In [None]:
# Remove every individual for whom any of the IOB, COB or BG count columns carries no
# data at all, i.e. every value for that id is either NaN or zero.
check_cols = ['iob count', 'cob count', 'bg count']
ids_with_only_nans_or_zeros = {}
for col in check_cols:
    # After filling NaN with 0, an id is "empty" for this column when all its values are 0.
    mask = df[col].fillna(0).eq(0).groupby(level='id').all()
    ids_with_only_nans_or_zeros[col] = mask[mask].index.tolist()
print(ids_with_only_nans_or_zeros)

# Union of the flagged ids across the three columns.
ids = set().union(*ids_with_only_nans_or_zeros.values())
print(ids)

rows_to_keep = ~df.index.get_level_values('id').isin(ids)
df = df[rows_to_keep]

In [None]:
# Label every row as 'weekday' or 'weekend' from its timestamp, then one-hot encode the label.
timestamps = df.index.get_level_values('datetime')
# .weekday numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
day_labels = timestamps.weekday.map(
    lambda day_number: 'weekday' if day_number < 5 else 'weekend'
)
df['day_type'] = day_labels.astype('category')

# Replaces 'day_type' with one indicator column per category, prefixed 'day_type_'.
df = pd.get_dummies(df, columns=['day_type'], prefix='day_type')
df.head()

In [None]:
# Add rate-of-change columns for the iob, cob and bg mean columns.
#
# time_diff is the gap between a row and the previous row of the same id. It is
# computed per id, so a value is never differenced across two individuals and the
# first row of every id is missing (NaT).
df['time_diff'] = (
    df.index.get_level_values('datetime')
    .to_series(index=df.index)  # carry df's own (id, datetime) index for alignment
    .groupby(level='id')
    .diff()
)

# Invariant check: the first row of each id has no predecessor, hence no time_diff.
first_idx = ~df.index.get_level_values('id').duplicated()
assert df.loc[first_idx, 'time_diff'].isna().all()

interval = pd.Timedelta('15min')
# The rate of change is the plain difference to the previous row of the same id (it is
# not divided by the elapsed time). It is kept only where the two rows are exactly one
# interval apart; everywhere else it is NaN.
for col in ['iob mean', 'cob mean', 'bg mean']:
    value_diff = df[col].groupby(level='id').diff()
    rate_of_change = value_diff.where(df['time_diff'] == interval)
    df[f'{col} rate_of_change'] = rate_of_change

In [None]:


# Recompute the rate-of-change columns using a time gap derived independently per id.
# `interval` is the 15-minute Timedelta defined in an earlier cell.
mean_cols = ['iob mean', 'cob mean', 'bg mean']

# Time difference between consecutive rows of the same id. The series is given df's own
# (id, datetime) index: a series indexed by datetime alone cannot be aligned with
# value_diff in `.where` below, because the same timestamp can occur for several ids.
# It does not depend on the column, so it is computed once, outside the loop.
time_diff = (
    df.index.get_level_values('datetime')
    .to_series(index=df.index)
    .groupby(level='id')
    .diff()
)

for col in mean_cols:
    # Difference to the previous row of the same id
    value_diff = df[col].groupby(level='id').diff()
    # Only keep the rate of change where the time difference is exactly 15 minutes
    rate_of_change = value_diff.where(time_diff == interval)
    df[f'{col} rate_of_change'] = rate_of_change

# Show the resulting columns once instead of printing intermediates on every iteration.
df[[f'{col} rate_of_change' for col in mean_cols]].head()

In [None]:
# Implement minmax scaling