# Candidate Selection
The candidate selection process follows the logic below. Find people who meet these criteria:
1. That have a single timezone such that there are no shifts in the time series, which will distort modelling and comparison.
2. That aren't missing any of the variables used, IOB, COB, IG across the dataset.
3. That have continuous data over a broad window that can capture overnight fasting, which we will set arbitrarily to 19:00-12:00.
4. Periods will then be excluded if they lack IG for an hour or longer, given this may impact analysis.
5. Following this, the minimum number of nights with such data for a candidate will be set arbitrarily to 100, providing a threshold for sufficient data.


In [None]:
from unicodedata import category
%load_ext autoreload
%autoreload 2

import pandas as pd

from src.config import INTERIM_DATA_DIR
from src.configurations import Configuration

# Project configuration object (not used further in this cell).
config = Configuration()

# Locate the interim CSV; the file name suggests 15-minute IOB/COB/BG aggregates.
file_name = '15min_iob_cob_bg.csv'
file_path = INTERIM_DATA_DIR / file_name

# Read `id` and the three count columns as integers.
dtypes = dict.fromkeys(['id', 'iob count', 'cob count', 'bg count'], int)

# The first CSV column becomes the (date-parsed) index; the `system` column is discarded.
df = pd.read_csv(
    file_path,
    index_col=0,
    parse_dates=True,
    dtype=dtypes,
)
df = df.drop(columns='system')

In [None]:
# Keep only individuals with more than 100 days' worth of data.
# "Days" is approximated as (number of rows // 96), i.e. 96 15-minute intervals per
# day; the rows are not checked for belonging to the same calendar day.
full_days = df.groupby('id').size() // 96
mask_full_days = full_days.sort_values(ascending=False) > 100

# Re-index by (id, datetime) and sort on both levels.
# NOTE: the first parameter of `DataFrame.sort_index` is `axis`, so the levels have
# to be passed through `level=`; passing the list positionally raises an error.
df = (
    df.reset_index()
    .set_index(['id', 'datetime'])
    .sort_index(level=['id', 'datetime'])
)

# Retain only rows whose id passed the > 100 days threshold.
mask = df.index.get_level_values('id').isin(mask_full_days[mask_full_days].index)
df = df[mask]
print(f'People with > 100 days in the dataset: {len(df.index.get_level_values("id").unique())}')

In [None]:
# Remove every individual for whom any of the IOB, COB or BG count columns holds
# no information at all, i.e. the column is entirely NaN and/or zero for that id.
check_cols = ['iob count', 'cob count', 'bg count']


def _all_nan_or_zero(values):
    """Return True when every entry of ``values`` is NaN or 0."""
    # Filling NaN with 0 first means an all-NaN series satisfies the same check.
    return (values.fillna(0) == 0).all()


# Per column: the list of ids whose values are all NaN/zero.
ids_with_only_nans_or_zeros = {}
for col in check_cols:
    mask = df.groupby('id')[col].apply(_all_nan_or_zero)
    ids_with_only_nans_or_zeros[col] = mask[mask].index.tolist()
print(ids_with_only_nans_or_zeros)

# Union of the flagged ids across all checked columns.
ids = set().union(*ids_with_only_nans_or_zeros.values())
print(ids)
df = df[~df.index.get_level_values('id').isin(ids)]