In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the sibling src/ directory to the import search path. The `data` and
# `features` modules imported further down presumably live there — confirm
# against the repository layout. The path is relative, so it resolves against
# the kernel's current working directory.
sys.path.append('../src/')

In [3]:
import time
import catboost
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score

from data import load_rooms
from features import extract_features


# Preparation

In [4]:
# Data for every room; used below as a mapping (iterated with .items()).
rooms = load_rooms()

# Shared classifier that the experiment cells pass to train_eval().
# NOTE(review): the seed/randomness-related arguments look chosen to make runs
# repeatable — confirm their exact meaning against the CatBoost parameter docs.
model = catboost.CatBoostClassifier(
    iterations=100,
    verbose=False,
    custom_loss=[catboost.metrics.F1()],
    random_seed=1,
    random_strength=0,
    rsm=1,
    has_time=True,
    bootstrap_type='No',
)


In [5]:
def train_eval(model, df_data, window_size='60min', shifts=None, train_fraction=0.66):
    """Fit ``model`` on the leading part of one dataset and score it on the rest.

    The data is turned into features with ``extract_features``, rows containing
    NaN are dropped, and the remaining rows are split *in order* (no shuffling):
    the first ``train_fraction`` of the rows is used for training and the
    remainder for evaluation. Inputs are standardised with a scaler fitted on
    the training rows only, so no statistics of the evaluation rows leak into
    training.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted by this call.
        df_data: Raw data, passed through to ``extract_features``.
        window_size: Passed through to ``extract_features``.
        shifts: Passed through to ``extract_features``.
        train_fraction: Share of the rows (after dropping NaN) used for
            training; must lie strictly between 0 and 1. The default keeps the
            original 66/34 split.

    Returns:
        Tuple ``(train_ba, test_ba)``: balanced accuracy on the training and
        evaluation rows, as percentages rounded to one decimal.

    Raises:
        ValueError: If ``train_fraction`` is outside (0, 1), or if the split
            leaves the training or evaluation partition empty.
    """
    if not 0 < train_fraction < 1:
        raise ValueError(
            'train_fraction must be strictly between 0 and 1, got {!r}'.format(train_fraction)
        )

    # Extract features.
    df_features, target_names, input_names = extract_features(df_data, window_size, shifts)

    # Drop NaN.
    df_features = df_features.dropna()

    # Split X/y.
    df_X = df_features[input_names]
    df_y = df_features[target_names]

    # Train test split (ordered: earlier rows train, later rows evaluate).
    n_rows = len(df_X)
    train_size = round(n_rows * train_fraction)
    if train_size <= 0 or train_size >= n_rows:
        raise ValueError(
            'Cannot split {} usable rows into non-empty train and test parts '
            '(train_fraction={!r}).'.format(n_rows, train_fraction)
        )

    df_X_train = df_X.iloc[:train_size]
    df_y_train = df_y.iloc[:train_size]
    df_X_test = df_X.iloc[train_size:]
    df_y_test = df_y.iloc[train_size:]

    # Prepare input: fit the scaler on training rows only, apply it to both parts.
    scaler = StandardScaler()
    scaler.fit(df_X_train)
    X_train = scaler.transform(df_X_train)
    X_test = scaler.transform(df_X_test)

    # Prepare target: flatten to 1-D uint8 labels.
    # NOTE(review): ravel() assumes a single target column — confirm target_names.
    y_train = df_y_train.values.ravel().astype(np.uint8)
    y_test = df_y_test.values.ravel().astype(np.uint8)

    # Fit model.
    model.fit(X_train, y_train)

    # Predict train data and calculate score.
    train_pred = model.predict(X_train)
    train_ba = balanced_accuracy_score(y_train, train_pred)

    # Predict test data and calculate score.
    test_pred = model.predict(X_test)
    test_ba = balanced_accuracy_score(y_test, test_pred)

    return round(train_ba*100, 1), round(test_ba*100, 1)


# Baseline

In [6]:
# Baseline: train_eval() with its default arguments, once per room.
result_columns = ['label', 'Train BA', 'Validation BA']

baseline_by_room = {}
for room_name, room_data in rooms.items():
    train_ba, validation_ba = train_eval(model, room_data)
    baseline_row = ['Baseline', train_ba, validation_ba]
    baseline_by_room[room_name] = pd.DataFrame(
        [baseline_row], columns=result_columns
    ).set_index('label')

# Rooms become the outer column level, the two scores the inner level.
df_baseline_results = pd.concat(baseline_by_room, axis=1)
df_baseline_results


Unnamed: 0_level_0,Office L1,Office L1,Office L2,Office L2,Office S3,Office S3,Home 1,Home 1
Unnamed: 0_level_1,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Baseline,79.3,67.6,82.0,59.6,57.3,52.3,83.9,65.3


# Temporal Shift Features

In [7]:
# Cumulative historical shift features: the experiment for N hours contains the
# shifts for 1..N hours back. Per the labels, one hour corresponds to 6 shift steps.
steps_per_hour = 6
max_hours_back = 12

experiments = {'No shift features': []}
for hours in range(1, max_hours_back + 1):
    unit = 'hour' if hours == 1 else 'hours'
    experiments['+{} {} ago'.format(hours, unit)] = [
        -steps_per_hour * h for h in range(1, hours + 1)
    ]

# One result frame per room, one row per experiment.
shift_results_by_room = {}
for room_name, room_data in rooms.items():
    rows = []
    for label, shifts in experiments.items():
        train_ba, validation_ba = train_eval(model, room_data, shifts=shifts)
        rows.append([label, train_ba, validation_ba])
    shift_results_by_room[room_name] = pd.DataFrame(
        rows, columns=['label', 'Train BA', 'Validation BA']
    ).set_index('label')

df_shifts_results = pd.concat(shift_results_by_room, axis=1)
df_shifts_results


Unnamed: 0_level_0,Office L1,Office L1,Office L2,Office L2,Office S3,Office S3,Home 1,Home 1
Unnamed: 0_level_1,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
No shift features,79.3,67.6,82.0,59.6,57.3,52.3,83.9,65.3
+1 hour ago,83.4,71.6,85.0,63.3,62.1,54.8,85.8,65.0
+2 hours ago,86.0,74.0,86.8,65.0,64.4,55.3,86.7,65.7
+3 hours ago,87.7,74.1,88.5,65.5,65.5,55.7,88.1,67.2
+4 hours ago,88.6,74.0,89.6,66.3,67.1,56.8,88.6,66.7
+5 hours ago,89.2,75.7,90.2,66.1,69.2,56.3,89.2,67.1
+6 hours ago,89.6,75.7,90.9,67.3,69.4,56.4,90.3,66.1
+7 hours ago,90.7,76.6,91.3,68.5,71.1,56.7,89.8,66.5
+8 hours ago,91.1,77.4,91.5,69.9,72.4,56.3,90.0,66.1
+9 hours ago,94.5,77.4,95.9,69.5,83.6,55.3,94.2,66.9


In [8]:
# Validation balanced accuracy per room as historical shift features are added.
# Row i of the results frame is the experiment with i hours of history (row 0 has
# no shift features), so the x positions are simply the row numbers. Both the x
# positions and the room list are derived from the results frame, which keeps the
# figure aligned if the experiment list or the set of rooms changes.
n_experiments = len(df_shifts_results)
x_hours = list(range(n_experiments))

fig = go.Figure()
for room_name in df_shifts_results.columns.get_level_values(0).unique():
    fig.add_trace(go.Scatter(
        x=x_hours,
        y=df_shifts_results[room_name]['Validation BA'],
        mode='lines+markers',
        name=room_name,
    ))
fig.update_layout(
    xaxis_title='Hours of historical shift features (cumulative)',
    xaxis_dtick=1,
    xaxis_range=[0, n_experiments - 1],
    yaxis_title='Balanced accuracy',
    yaxis_range=[50, 85],
    width=500,
    height=500,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    legend_orientation='h',
    legend_xanchor='center',
    legend_x=0.5,
)
fig.show()

# Retrospective Analysis

In [9]:
# Retrospective analysis: keep the 8 hours of historical shifts fixed and add
# future shifts cumulatively (1..N hours later). Per the labels, one hour
# corresponds to 6 shift steps.
steps_per_hour = 6
hours_back = 8
max_hours_ahead = 12

past_shifts = [-steps_per_hour * h for h in range(1, hours_back + 1)]

experiments = {'Up to 8 hours ago': list(past_shifts)}
for hours in range(1, max_hours_ahead + 1):
    unit = 'hour' if hours == 1 else 'hours'
    future_shifts = [steps_per_hour * h for h in range(1, hours + 1)]
    experiments['+{} {} later'.format(hours, unit)] = past_shifts + future_shifts

# One result frame per room, one row per experiment.
retro_results_by_room = {}
for room_name, room_data in rooms.items():
    rows = []
    for label, shifts in experiments.items():
        train_ba, validation_ba = train_eval(model, room_data, shifts=shifts)
        rows.append([label, train_ba, validation_ba])
    retro_results_by_room[room_name] = pd.DataFrame(
        rows, columns=['label', 'Train BA', 'Validation BA']
    ).set_index('label')

df_retro_results = pd.concat(retro_results_by_room, axis=1)
df_retro_results


Unnamed: 0_level_0,Office L1,Office L1,Office L2,Office L2,Office S3,Office S3,Home 1,Home 1
Unnamed: 0_level_1,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Up to 8 hours ago,91.1,77.4,91.5,69.9,72.4,56.3,90.0,66.1
+1 hour later,96.7,80.4,98.1,71.8,85.0,57.9,95.9,67.0
+2 hours later,97.1,80.3,98.5,71.6,87.3,59.3,96.0,66.9
+3 hours later,97.4,80.5,98.3,71.4,87.6,59.2,96.4,68.1
+4 hours later,97.5,79.8,98.7,70.4,89.3,61.0,96.7,68.1
+5 hours later,97.8,80.7,98.6,72.4,90.2,59.9,96.6,68.7
+6 hours later,97.8,80.4,98.8,71.9,90.9,60.7,96.7,68.7
+7 hours later,97.7,81.2,98.9,70.9,91.3,60.8,97.0,69.5
+8 hours later,97.7,79.0,99.0,69.4,92.7,60.0,96.8,69.4
+9 hours later,97.8,78.6,99.1,68.2,91.0,60.1,96.7,70.1


In [10]:
# Validation balanced accuracy per room as future shift features are added.
# Row i of the results frame is the experiment with i hours of future shifts
# (row 0 has historical shifts only), so the x positions are the row numbers.
# Both the x positions and the room list are derived from the results frame,
# which keeps the figure aligned if the experiment list or the rooms change.
n_experiments = len(df_retro_results)
x_hours = list(range(n_experiments))

fig = go.Figure()
for room_name in df_retro_results.columns.get_level_values(0).unique():
    fig.add_trace(go.Scatter(
        x=x_hours,
        y=df_retro_results[room_name]['Validation BA'],
        mode='lines+markers',
        name=room_name,
    ))
fig.update_layout(
    xaxis_title='Hours of future shift features (cumulative)',
    xaxis_dtick=1,
    xaxis_range=[0, n_experiments - 1],
    yaxis_title='Balanced accuracy',
    yaxis_range=[50, 85],
    width=500,
    height=500,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    legend_orientation='h',
    legend_xanchor='center',
    legend_x=0.5,
)
fig.show()

# Feature Reduction

In [11]:
# Feature reduction: each experiment keeps a subset of the hourly offsets and
# uses it symmetrically, i.e. the same hours "ago" (negative shifts) and "later"
# (positive shifts). Per the labels, one hour corresponds to 6 shift steps.
steps_per_hour = 6
hours_by_label = {
    'TSF up to 8 hours ago+later': (1, 2, 3, 4, 5, 6, 7, 8),
    'TSF 1/2/3/4/5/7/8 hours ago+later': (1, 2, 3, 4, 5, 7, 8),
    'TSF 1/2/4/5/7/8 hours ago+later': (1, 2, 4, 5, 7, 8),
    'TSF 1/2/4/7/8 hours ago+later': (1, 2, 4, 7, 8),
    'TSF 1/2/4/8 hours ago+later': (1, 2, 4, 8),
    'TSF 1/4/8 hours ago+later': (1, 4, 8),
}

experiments = {}
for label, kept_hours in hours_by_label.items():
    past_shifts = [-steps_per_hour * h for h in kept_hours]
    future_shifts = [steps_per_hour * h for h in kept_hours]
    experiments[label] = past_shifts + future_shifts

# One result frame per room, one row per experiment.
reduced_results_by_room = {}
for room_name, room_data in rooms.items():
    rows = []
    for label, shifts in experiments.items():
        train_ba, validation_ba = train_eval(model, room_data, shifts=shifts)
        rows.append([label, train_ba, validation_ba])
    reduced_results_by_room[room_name] = pd.DataFrame(
        rows, columns=['label', 'Train BA', 'Validation BA']
    ).set_index('label')

df_reduced_results = pd.concat(reduced_results_by_room, axis=1)
df_reduced_results


Unnamed: 0_level_0,Office L1,Office L1,Office L2,Office L2,Office S3,Office S3,Home 1,Home 1
Unnamed: 0_level_1,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
TSF up to 8 hours ago+later,97.7,79.0,99.0,69.4,92.7,60.0,96.8,69.4
TSF 1/2/3/4/5/7/8 hours ago+later,97.8,80.2,98.8,69.8,91.0,61.5,96.8,69.7
TSF 1/2/4/5/7/8 hours ago+later,97.6,78.9,98.9,69.0,90.1,60.8,96.8,69.0
TSF 1/2/4/7/8 hours ago+later,97.1,80.6,98.8,69.5,89.7,61.0,96.9,69.5
TSF 1/2/4/8 hours ago+later,94.6,80.7,95.9,69.5,78.8,60.6,93.2,70.4
TSF 1/4/8 hours ago+later,94.6,79.4,95.6,67.6,76.8,59.8,93.1,70.1


# Model Iterations

In [None]:
# Effect of the number of CatBoost iterations on balanced accuracy.
# Keys are the row labels (kept as strings; the plot converts them back to int),
# values are the iteration counts passed to the classifier.
experiments = {
    '25': 25,
    '50': 50,
    '75': 75,
    '100': 100,
    '125': 125,
    '150': 150,
    '175': 175,
    '200': 200
}

d = {}
for room, data in rooms.items():
    data_list = []
    # The loop variable is deliberately NOT called `iter`: at notebook top level
    # that name would shadow the builtin iter() for every cell executed afterwards.
    for label, n_iterations in experiments.items():
        # A fresh classifier per setting; apart from `iterations`, the arguments
        # are the same as for the shared `model` used by the other experiments.
        model_iter = catboost.CatBoostClassifier(iterations=n_iterations, verbose=False, custom_loss=[catboost.metrics.F1()], random_seed=1, random_strength=0, rsm=1, has_time=True, bootstrap_type='No')

        # Shifts fixed to the 1/2/4/8 hours ago+later set from the feature-reduction experiment.
        train_ba, validation_ba = train_eval(model_iter, data, shifts=[-6, -12, -24, -48, 6, 12, 24, 48])
        data_list.append([label, train_ba, validation_ba])

    d[room] = pd.DataFrame(
        columns=['label', 'Train BA', 'Validation BA'],
        data=data_list
    ).set_index('label')

# Rooms become the outer column level, the two scores the inner level.
df_iterations_results = pd.concat(d, axis=1)
df_iterations_results


Unnamed: 0_level_0,Office L1,Office L1,Office L2,Office L2,Office S3,Office S3,Home 1,Home 1
Unnamed: 0_level_1,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA,Train BA,Validation BA
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
25,90.6,79.4,93.0,67.2,70.1,59.6,90.4,70.0
50,93.4,78.6,95.3,68.1,76.3,59.4,93.4,70.9
75,94.6,79.9,96.2,69.0,78.5,59.4,93.5,70.3
100,94.6,80.7,95.9,69.5,78.8,60.6,93.2,70.4
125,94.7,80.0,96.4,69.4,79.0,60.5,93.7,70.4
150,94.9,80.4,96.1,69.3,78.6,60.8,93.6,70.3
175,94.8,79.4,96.3,69.9,78.5,60.9,93.7,70.4
200,97.9,79.8,99.0,68.8,89.4,61.9,97.0,70.1


In [None]:
# Validation balanced accuracy per room against the number of CatBoost iterations.
# The index labels are strings, so they are converted to integers for the x axis.
iteration_counts = df_iterations_results.index.astype(int)

fig = go.Figure()
for room_name in ('Office L1', 'Office L2', 'Office S3', 'Home 1'):
    room_trace = go.Scatter(
        x=iteration_counts,
        y=df_iterations_results[room_name]['Validation BA'],
        mode='lines+markers',
        name=room_name,
    )
    fig.add_trace(room_trace)

layout_options = dict(
    xaxis_title='Number of CatBoost iterations',
    xaxis_dtick=25,
    xaxis_range=[25, 200.6],
    yaxis_title='Balanced accuracy',
    yaxis_range=[50, 85],
    width=500,
    height=500,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    legend_orientation='h',
    legend_xanchor='center',
    legend_x=0.5,
)
fig.update_layout(**layout_options)
fig.show()

# Training Time

In [6]:
# Wall-clock time of one complete train_eval() call per room, repeated 10 times.
# Each measurement covers everything train_eval() does (feature extraction,
# scaling, fitting, and predicting on both partitions), not the fit alone.
timing_shifts = (-6, -12, -24, -48, 6, 12, 24, 48)

run_times = []
for repetition in range(10):
    for room_data in rooms.values():
        started_at = time.perf_counter()
        train_eval(model, room_data, shifts=list(timing_shifts))
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        run_times.append(elapsed_ms)

mean_ms = np.mean(run_times).round()
std_ms = np.std(run_times).round()
print('Mean run time (ms):', mean_ms)
print('Std run time (ms):', std_ms)


Mean run time (ms): 3249.0
Std run time (ms): 247.0
