# XGBoost model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old 'seaborn' alias; use whichever name
# this Matplotlib installation provides.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)

# Data preparation

In [3]:
# Read the glucose export, skipping the first line of the file, then index
# the rows by their parsed device timestamp (day-month-year hour:minute).
data = pd.read_csv(
    '../../data/SamDysch_glucose_2-5-2022.csv',
    skiprows=[0],
)
data.index = pd.to_datetime(
    data['Device Timestamp'],
    format="%d-%m-%Y %H:%M",
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# Keep only the historic glucose rows (record type 0) and only the
# blood-glucose column, which is exposed under the shorter name 'reading'.
to_keep = ['Historic Glucose mmol/L']
data = data.loc[data['Record Type'] == 0, to_keep].rename(
    columns={'Historic Glucose mmol/L': 'reading'}
)

data.head()

Unnamed: 0_level_0,reading
Device Timestamp,Unnamed: 1_level_1
2019-12-09 19:04:00,6.8
2019-12-09 19:20:00,7.6
2019-12-09 19:35:00,7.7
2019-12-09 19:50:00,7.2
2019-12-09 20:05:00,5.8


In [5]:
# discard every row that contains a missing value
data = data.dropna(axis='index', how='any')

# Set up hypo threshold

In [6]:
# Readings strictly below this value are labelled as hypo.
HYPO_THRESHOLD = 3.9
# 1 where the reading is below the threshold, 0 otherwise
data['is_hypo'] = data['reading'].lt(HYPO_THRESHOLD).astype(int)

In [7]:
# calendar features taken from the timestamp index
# ('day' holds the day of the week, not the day of the month)
data = data.assign(
    hour=data.index.hour,
    day=data.index.dayofweek,
    month=data.index.month,
)

# One-hot encode hours

In [8]:
# replace the 'hour' column with one indicator column per hour value
data = pd.get_dummies(data, columns=['hour'], prefix='hour')
print(data.columns)

Index(['reading', 'is_hypo', 'day', 'month', 'hour_0', 'hour_1', 'hour_2',
       'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9',
       'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15',
       'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21',
       'hour_22', 'hour_23'],
      dtype='object')


# Creating lagged and rolling variables
* Was I hypo 15 mins ago? 30 mins ago? Etc
* Rolling average of last N readings
* Sign of gradient of last N readings:
    * I.e., is BG rising, falling, or stable?
    
## Lagged features

In [9]:
# create lags
# To ensure that we do not make a lag between periods of sensor non-usage, create a new df with the lagged indices & merge onto original data frame
def create_lag(df, lag, interval_minutes=15, tolerance_minutes=None):
    """Attach the reading from ``lag`` sampling intervals earlier as a new column.

    A copy of the ``reading`` column is moved forward in time by
    ``lag * interval_minutes`` minutes and joined back onto ``df`` with a
    backward as-of merge, so each row picks up the most recent shifted
    reading at or before its own timestamp. Rows with no shifted reading
    inside the tolerance window get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a sorted datetime index and a ``reading`` column.
    lag : int
        Number of sampling intervals to look back.
    interval_minutes : int or float, default 15
        Length of one sampling interval in minutes.
    tolerance_minutes : int or float, optional
        Maximum distance, in minutes, between a row and the shifted reading
        matched to it. Defaults to ``interval_minutes * lag``.

    Returns
    -------
    pandas.DataFrame
        New frame equal to ``df`` plus a ``lagged_reading_{lag}`` column;
        ``df`` itself is not modified.
    """
    lag_minutes = interval_minutes * lag
    if tolerance_minutes is None:
        # historical default: the tolerance window grows with the lag
        tolerance_minutes = lag_minutes
    print(f'Creating lag of {lag_minutes} minutes')

    # shift the timestamps, not the rows, so the lag is measured in real time
    lagged_copy = (
        df[['reading']]
        .shift(lag, freq=pd.Timedelta(minutes=interval_minutes))
        .rename(columns={'reading': f'lagged_reading_{lag}'})
    )

    return pd.merge_asof(
        df,
        lagged_copy,
        left_index=True,
        right_index=True,
        direction='backward',
        tolerance=pd.Timedelta(minutes=tolerance_minutes),
    )

# NLAGS is an exclusive upper bound: lags 1 .. NLAGS - 1 each get a column
NLAGS = 8
lag_range = range(1, NLAGS)
for lag in lag_range:
    data = create_lag(data, lag)

Creating lag of 15 minutes
Creating lag of 30 minutes
Creating lag of 45 minutes
Creating lag of 60 minutes
Creating lag of 75 minutes
Creating lag of 90 minutes
Creating lag of 105 minutes


In [10]:
# Remove every row that still has a missing value (e.g. an unmatched lag)
# so the later feature calculations never see NaNs.
data = data.dropna(axis='index', how='any')

In [11]:
# binary flag per lag: 1 if that lagged reading was below the hypo threshold
for lag in range(1, NLAGS):
    lagged = data[f'lagged_reading_{lag}']
    data[f'is_lagged_hypo_{lag}'] = lagged.lt(HYPO_THRESHOLD).astype(int)

## Rolling features

In [12]:
# Change in BG between the lag-1 reading and each older lag: positive
# means BG rose over that window, negative means it fell.
for lag in range(2, NLAGS):
    data[f'diff_{lag}'] = data['lagged_reading_1'] - data[f'lagged_reading_{lag}']

# Gradients - how quickly is BG changing?
# Lag 1 and lag `lag` are (lag - 1) intervals apart, so that is the elapsed
# time to divide by; dividing by `lag` intervals understates the rate.
interval = 15  # minutes between consecutive lags
for lag in range(2, NLAGS):
    data[f'rate_{lag}'] = data[f'diff_{lag}'] / (interval * (lag - 1))

## train, test, validation split

In [13]:
# fractions of the rows assigned to the train, validation and test sets
TRAIN_SPLIT, VAL_SPLIT, TEST_SPLIT = 0.65, 0.2, 0.15

In [14]:
# Row counts implied by each fraction, truncated to whole rows.
n_rows = len(data)
itrain, ival, itest = (
    int(frac * n_rows) for frac in (TRAIN_SPLIT, VAL_SPLIT, TEST_SPLIT)
)

# Contiguous slices in row order: train first, then validation; the test
# set is everything after the validation slice.
val_end = itrain + ival
train_data = data.iloc[:itrain]
val_data = data.iloc[itrain:val_end]
test_data = data.iloc[val_end:]

# Variable selection

In [15]:
# names of every diff_* and rate_* column (diffs first, then rates)
rates_and_diffs = (
    [f'diff_{v}' for v in range(2, NLAGS)]
    + [f'rate_{v}' for v in range(2, NLAGS)]
)

# to fairly compare with baseline, drop any historical variables with time delta < 45 mins
vars_to_drop = [
    'month', 'day', 'reading',
    'is_lagged_hypo_1', 'is_lagged_hypo_2',
    'lagged_reading_1', 'lagged_reading_2',
    *rates_and_diffs,
]

# remove the same columns from each of the three splits
train_data, val_data, test_data = (
    split.drop(columns=vars_to_drop)
    for split in (train_data, val_data, test_data)
)

print(train_data.columns, val_data.columns, test_data.columns, sep='\n')

Index(['is_hypo', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5',
       'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'lagged_reading_3', 'lagged_reading_4', 'lagged_reading_5',
       'lagged_reading_6', 'lagged_reading_7', 'is_lagged_hypo_3',
       'is_lagged_hypo_4', 'is_lagged_hypo_5', 'is_lagged_hypo_6',
       'is_lagged_hypo_7'],
      dtype='object')
Index(['is_hypo', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5',
       'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'lagged_reading_3', 'lagged_reading_4', 'lagged_reading_5',
       'lagged_reading_6', 'lagged_reading_7', 'is_lagged_hypo_3',
       'is_lagged_hypo_4', 'i

In [16]:
target = 'is_hypo'

# features are every column except the target; labels are the target column
X_train, y_train = train_data.drop(columns=[target]), train_data[target]
X_val, y_val = val_data.drop(columns=[target]), val_data[target]
X_test, y_test = test_data.drop(columns=[target]), test_data[target]

print(X_train.columns)

Index(['hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6',
       'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'lagged_reading_3', 'lagged_reading_4', 'lagged_reading_5',
       'lagged_reading_6', 'lagged_reading_7', 'is_lagged_hypo_3',
       'is_lagged_hypo_4', 'is_lagged_hypo_5', 'is_lagged_hypo_6',
       'is_lagged_hypo_7'],
      dtype='object')


In [17]:
# Redefine train = train + validation for cross validation.
# The combined set is rebuilt from train_data rather than from X_train /
# y_train themselves, so re-running this cell cannot append the validation
# rows a second time.
X_train = pd.concat([train_data.drop(columns=[target]), X_val])
y_train = pd.concat([train_data[target], y_val])

# objective function setup

In [18]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from xgboost import XGBClassifier
import optuna

def objective(trial):
    """Return the mean cross-validated F2 score for one Optuna trial.

    ``n_estimators``, ``max_depth`` and ``eta`` are drawn from ``trial``;
    the remaining XGBoost settings are held fixed. The classifier is scored
    on the notebook-level ``X_train`` / ``y_train`` with a 5-split
    ``TimeSeriesSplit``.
    """
    # hyperparameter space (drawn in this order)
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'eta': trial.suggest_categorical('eta', [0.1, 0.15, 0.2, 0.3]),
    }

    # settings that are not tuned
    fixed = {
        'gamma': 0,
        'reg_lambda': 1,
        'reg_alpha': 0,
        'subsample': 0.5,
        'objective': 'binary:logistic',
    }

    classifier = XGBClassifier(**sampled, **fixed)

    # F-beta with beta=2, evaluated on each time-ordered fold
    f2 = make_scorer(fbeta_score, beta=2)
    folds = TimeSeriesSplit(n_splits=5)
    fold_scores = cross_val_score(classifier, X_train, y_train, cv=folds, scoring=f2)

    return fold_scores.mean()

# run optuna trials

In [19]:
# optimise
# A fixed sampler seed makes the sequence of suggested hyperparameters
# repeatable across kernel restarts. TPESampler is the sampler Optuna uses
# by default for single-objective studies, so only the seeding changes.
OPTUNA_SEED = 42
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
)
study.optimize(objective, n_trials=200)

[32m[I 2022-06-19 11:04:36,207][0m A new study created in memory with name: no-name-b9a5f4f3-7c4d-4b32-b969-c28fcf20839b[0m
[32m[I 2022-06-19 11:04:37,221][0m Trial 0 finished with value: 0.2913762094069756 and parameters: {'n_estimators': 29, 'max_depth': 2, 'eta': 0.2}. Best is trial 0 with value: 0.2913762094069756.[0m
[32m[I 2022-06-19 11:04:40,735][0m Trial 1 finished with value: 0.41857048940523855 and parameters: {'n_estimators': 63, 'max_depth': 7, 'eta': 0.3}. Best is trial 1 with value: 0.41857048940523855.[0m
[32m[I 2022-06-19 11:04:45,162][0m Trial 2 finished with value: 0.4190965889234852 and parameters: {'n_estimators': 86, 'max_depth': 6, 'eta': 0.3}. Best is trial 2 with value: 0.4190965889234852.[0m


In [21]:
# report the best objective value found and the hyperparameters behind it
print(
    f'Best value: {study.best_value}',
    f'Best parameters: {study.best_params}',
    sep='\n',
)

Best value: 0.4190965889234852
Best parameters: {'n_estimators': 86, 'max_depth': 6, 'eta': 0.3}
