# Candidate Selection
The candidate selection process follows the logic below. We look for people who meet these criteria:
1. They have a single timezone, so that there are no shifts in the time series, which would distort modelling and comparison.
2. They aren't missing any of the variables used (IOB, COB, IG) across the dataset.
3. They have continuous data over a broad window that can capture overnight fasting, which we will set arbitrarily to 19:00-12:00.
4. Periods will then be excluded if they lack IG for an hour or longer, given this may impact analysis.
5. Following this, the minimum number of nights with such data for a candidate will be set arbitrarily to 100, providing a threshold for sufficient data.


In [39]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from datetime import timedelta, time, datetime

from src.config import INTERIM_DATA_DIR
from src.configurations import Configuration, FifteenMinute
from src.helper import check_df_index

# Locate the pipeline's 15-minute resampled output (parquet).
fifteen_minute = FifteenMinute()
resampled_parquet_file = INTERIM_DATA_DIR / fifteen_minute.file_name('parquet')

config = Configuration()

# The dtype casting and index setting below should be fixed in the next
# execution of the pipeline - 2025/06/09
dtypes = dict.fromkeys(['id', 'iob count', 'cob count', 'bg count'], int)
df = (
    pd.read_parquet(resampled_parquet_file)
    .drop(columns='system')
    .astype(dtypes)
    .set_index(['id', 'datetime'])
)
df = check_df_index(df)
df.info()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 685122 entries, (np.int64(221634), Timestamp('2018-03-16 20:30:00')) to (np.int64(99908129), Timestamp('2018-02-01 04:45:00'))
Data columns (total 15 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   iob mean   669370 non-null  Float32
 1   cob mean   644780 non-null  Float32
 2   bg mean    685122 non-null  Float32
 3   iob min    669370 non-null  Float32
 4   cob min    644780 non-null  Float32
 5   bg min     685122 non-null  Float32
 6   iob max    669370 non-null  Float32
 7   cob max    644780 non-null  Float32
 8   bg max     685122 non-null  Float32
 9   iob std    337473 non-null  Float32
 10  cob std    328312 non-null  Float32
 11  bg std     344576 non-null  Float32
 12  iob count  685122 non-null  int64  
 13  cob count  685122 non-null  int64  
 14  bg count   685122 non-null  int64  
dtype

This step identifies anyone who is missing all data for any of the IOB, COB or BG columns, using the count columns, so that they can be removed.

In [40]:
# Count columns record how many raw readings fed each 15-minute interval.
check_cols = ['iob count', 'cob count', 'bg count']

# For each count column, list the ids whose values are all NaN or all zero,
# i.e. people with no readings at all for that variable.
ids_with_only_nans_or_zeros = {}
for count_col in check_cols:
    flagged = df.groupby('id')[count_col].apply(
        lambda counts: counts.isna().all() or (counts.fillna(0) == 0).all()
    )
    ids_with_only_nans_or_zeros[count_col] = flagged[flagged].index.tolist()
print(ids_with_only_nans_or_zeros)

# Union of the flagged ids across all checked columns.
ids = set().union(*ids_with_only_nans_or_zeros.values())
print(ids)

{'iob count': [], 'cob count': [49551394], 'bg count': []}
{49551394}


How many full days does each individual have? A full day is defined as 96 15-minute intervals. The following cell counts the individuals who have more than 100 such days.

In [41]:
# Express each person's row count as a number of 96-interval days.
rows_per_id = df.groupby('id', group_keys=False).size()
full_days = rows_per_id // 96
mask_full_days = full_days.sort_values(ascending=False) > 100

# Rebuild and sort the (id, datetime) index before selecting on the id level.
df = df.reset_index().set_index(['id','datetime']).sort_index()
ids_over_threshold = mask_full_days[mask_full_days].index
mask = df.index.get_level_values('id').isin(ids_over_threshold)
people_count = len(df[mask].index.get_level_values("id").unique())
print(f'People with > 100 days in the dataset: {people_count}')

People with > 100 days in the dataset: 19


However, full days aren't what we focus on; we need to identify periods between a start time on one day and an end time the next morning, so that we can limit the period of focus, e.g. 19:00 - 12:00. The following class was created for this purpose. The idea is that it splits the dataset into a list of dataframes, one for each night for an individual, so that we can then analyse the nights.


In [55]:
class Nights:
    """Split one individual's time series into overnight windows.

    A window runs from ``night_start`` on a calendar date up to, but not
    including, ``morning_end`` on the following date. One window is built for
    every calendar date present in the index; windows with no rows are
    discarded.

    Attributes:
        df: The input frame, sorted by its index.
        night_start: Time of day at which a window opens.
        morning_end: Time of day, on the next date, at which a window closes.
        sample_rate: Spacing between consecutive samples, in minutes.
        interval_count: Number of ``sample_rate``-minute intervals in one
            window, i.e. the number of timestamps a complete night holds.
        nights: List of per-night DataFrames.
    """

    def __init__(self, df, night_start=time(19, 0), morning_end=time(12, 0), sample_rate = 15):
        """Validate the index, store the settings and split the nights.

        Args:
            df: DataFrame indexed by a ``pd.DatetimeIndex``.
            night_start: Time of day at which each night window opens.
            morning_end: Time of day on the following date at which each
                window closes (exclusive).
            sample_rate: Sampling interval in minutes.

        Raises:
            ValueError: If ``df.index`` is not a ``pd.DatetimeIndex``.
        """
        if not isinstance(df.index, pd.DatetimeIndex):
            raise ValueError("DataFrame index must be a DatetimeIndex.")
        self.sample_rate = sample_rate
        self.df = df.sort_index()
        self.night_start = night_start
        self.morning_end = morning_end
        # Computed before the split and never reset afterwards: previously this
        # was overwritten with None once the nights had been split, which made
        # remove_incomplete_nights() discard every night.
        self.interval_count = self._window_interval_count()
        self.nights = self._split_nights()

    def _window_interval_count(self):
        """Return how many sample intervals fit in one overnight window."""
        # Any reference date works; only the length of the window matters.
        reference_day = datetime(2000, 1, 1).date()
        window_open = datetime.combine(reference_day, self.night_start)
        window_close = datetime.combine(reference_day + timedelta(days=1), self.morning_end)
        diff_minutes = (window_close - window_open).total_seconds() / 60
        return int(diff_minutes // self.sample_rate)

    def _split_nights(self):
        """Return the non-empty night windows, one per calendar date in the index."""
        nights = []
        dates = pd.to_datetime(self.df.index.date)
        for date in pd.unique(dates):
            date_obj = pd.Timestamp(date).date()
            night_start_dt = pd.Timestamp.combine(date_obj, self.night_start)
            # The window always closes on the day after it opens.
            morning_end_dt = pd.Timestamp.combine(date_obj + timedelta(days=1), self.morning_end)
            mask = (self.df.index >= night_start_dt) & (self.df.index < morning_end_dt)
            night_df = self.df.loc[mask]
            if not night_df.empty:
                nights.append(night_df)

        return nights

    def remove_incomplete_nights(self):
        """
        Removes any nights that do not have a timestamp at each sample interval.

        A night is kept only when its number of distinct timestamps equals
        ``interval_count``. The list held in ``self.nights`` is replaced.
        """
        self.nights = [
            night_df
            for night_df in self.nights
            if night_df.index.nunique() == self.interval_count
        ]

    def calculate_stats(self):
        """Placeholder; not implemented yet."""
        pass

    def get_nights(self):
        """Return the current list of per-night DataFrames."""
        return self.nights

In [43]:
# Build a list of (id, DataFrame) tuples, one per person, so that individuals
# can be iterated over or picked out by position. Each person's frame is
# re-indexed on 'datetime' alone, since the 'id' level is constant within it.
# The per-person frame is deliberately not bound to the name `df`, so the full
# dataset held in `df` is not overwritten by this cell.
df_tuple_by_id = [
    (id_val, person_df.reset_index().drop(columns='id').set_index('datetime'))
    for id_val, person_df in df.groupby('id')
]

print(f'Number of dfs: {len(df_tuple_by_id)}')

# Take the first person's frame as a worked example.
df_individual = df_tuple_by_id[0][1]
df_individual.head()

Number of dfs: 114


Unnamed: 0_level_0,iob mean,cob mean,bg mean,iob min,cob min,bg min,iob max,cob max,bg max,iob std,cob std,bg std,iob count,cob count,bg count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-03-16 20:30:00,1.665,0.0,138.0,1.665,0.0,138.0,1.665,0.0,138.0,,,,1,1,1
2018-03-16 21:00:00,1.04,0.932,124.0,1.04,0.932,124.0,1.04,0.932,124.0,,,,1,1,1
2018-03-16 22:00:00,0.152,0.0,139.0,0.152,0.0,139.0,0.152,0.0,139.0,,,,1,1,1
2018-03-21 16:30:00,3.541,0.0,329.0,3.384,0.0,328.0,3.697,0.0,330.0,0.221,0.0,1.414,2,2,2
2018-03-21 17:00:00,2.299,0.0,311.0,2.299,0.0,311.0,2.299,0.0,311.0,,,,1,1,1


In [56]:
# Split the example individual's data into overnight windows, then drop the
# nights that remove_incomplete_nights() judges to be incomplete.
nights = Nights(df=df_individual, sample_rate=15)
print(f'Preliminarily separated nights: {len(nights.get_nights())}')
nights.remove_incomplete_nights()
# The message previously said "complete nights", the opposite of what is removed.
print(f'Following the removal of incomplete nights: {len(nights.get_nights())}')

Preliminarily separated nights: 104
Following the removal of complete nights: 0


In [53]:
# Display the list of per-night DataFrames currently held by `nights`.
nights.nights

[                     iob mean  cob mean  bg mean  iob min  cob min  bg min  \
 datetime                                                                     
 2018-03-16 20:30:00     1.665       0.0    138.0    1.665      0.0   138.0   
 2018-03-16 21:00:00      1.04     0.932    124.0     1.04    0.932   124.0   
 2018-03-16 22:00:00     0.152       0.0    139.0    0.152      0.0   139.0   
 
                      iob max  cob max  bg max  iob std  cob std  bg std  \
 datetime                                                                  
 2018-03-16 20:30:00    1.665      0.0   138.0     <NA>     <NA>    <NA>   
 2018-03-16 21:00:00     1.04    0.932   124.0     <NA>     <NA>    <NA>   
 2018-03-16 22:00:00    0.152      0.0   139.0     <NA>     <NA>    <NA>   
 
                      iob count  cob count  bg count  
 datetime                                             
 2018-03-16 20:30:00          1          1         1  
 2018-03-16 21:00:00          1          1         1  
 

In [46]:
# Display the example individual's frame (the first entry of df_tuple_by_id).
df_individual

Unnamed: 0_level_0,iob mean,cob mean,bg mean,iob min,cob min,bg min,iob max,cob max,bg max,iob std,cob std,bg std,iob count,cob count,bg count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-03-16 20:30:00,1.665,0.0,138.0,1.665,0.0,138.0,1.665,0.0,138.0,,,,1,1,1
2018-03-16 21:00:00,1.04,0.932,124.0,1.04,0.932,124.0,1.04,0.932,124.0,,,,1,1,1
2018-03-16 22:00:00,0.152,0.0,139.0,0.152,0.0,139.0,0.152,0.0,139.0,,,,1,1,1
2018-03-21 16:30:00,3.541,0.0,329.0,3.384,0.0,328.0,3.697,0.0,330.0,0.221,0.0,1.414,2,2,2
2018-03-21 17:00:00,2.299,0.0,311.0,2.299,0.0,311.0,2.299,0.0,311.0,,,,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-07-11 01:00:00,0.848,0.0,133.0,0.848,0.0,133.0,0.848,0.0,133.0,,,,1,1,1
2018-07-11 01:30:00,0.759,0.0,124.0,0.759,0.0,124.0,0.759,0.0,124.0,,,,1,1,1
2018-07-11 01:45:00,0.635,0.0,123.5,0.563,0.0,123.0,0.706,0.0,124.0,0.101,0.0,0.707,2,2,2
2018-07-11 02:30:00,0.523,0.0,112.0,0.523,0.0,112.0,0.523,0.0,112.0,,,,1,1,1
