In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the sibling 'src' directory (resolved relative to the current working directory) to the
# module search path. Presumably this is what makes the project-local `data` and `features`
# modules importable in the next cell — verify against the repository layout.
sys.path.append('../src/')

In [3]:
import catboost
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from data import load_rooms
from features import extract_features, sliding_window_normalization

# Preparation

In [4]:
# Load the recordings of all rooms; the result is indexed by room name further down.
rooms = load_rooms()

# Classifier shared by the whole notebook. F1 is tracked as an additional metric;
# the seed is fixed so that repeated runs use the same configuration.
model = catboost.CatBoostClassifier(
    iterations=100,
    verbose=False,
    custom_loss=[catboost.metrics.F1()],
    random_seed=1,
    random_strength=0,
    rsm=1,
    has_time=True,
    bootstrap_type='No',
)

# Shifts handed to extract_features (their unit is defined by that function).
SHIFTS = [-6, -12, -24, -48, 6, 12, 24, 48]

# Length of the rolling window, in days, used when building the presence profiles.
PROFILE_WINDOW_SIZE = 7 * 4 * 4  # 112 days, approximately 4 months.


In [5]:
# Feature extraction: 'Office L1' provides the training data, 'Office L2' the test data.
# The column-name lists returned by the second call are the ones used below.
df_features_train, target_names, input_names = extract_features(
    rooms['Office L1'],
    shifts=SHIFTS,
)
df_features_test, target_names, input_names = extract_features(
    rooms['Office L2'],
    shifts=SHIFTS,
)

# --- Training set ---
# Normalize the input columns with a sliding window of 30 (unit defined by the helper),
# discard incomplete rows, then separate inputs from the target.
df_features_train[input_names] = sliding_window_normalization(df_features_train[input_names], 30)
df_features_train = df_features_train.dropna()
df_X_train = df_features_train[input_names]
df_y_train = df_features_train[target_names]
X_train = df_X_train.values
y_train = df_y_train.values.ravel().astype(np.uint8)

# --- Test set ---
# Exactly the same preparation steps as for the training set.
df_features_test[input_names] = sliding_window_normalization(df_features_test[input_names], 30)
df_features_test = df_features_test.dropna()
df_X_test = df_features_test[input_names]
df_y_test = df_features_test[target_names]
X_test = df_X_test.values
y_test = df_y_test.values.ravel().astype(np.uint8)

# Train on the first office and predict presence for the second one.
model.fit(X_train, y_train)
test_pred = model.predict(X_test)

# Average workday

In [6]:
# Combine ground truth and prediction in one frame (rename/assign both return new objects,
# so df_y_test itself is left unchanged).
df_profile = (
    df_y_test
    .rename(columns={'presence__last__w=10m': 'presence_gt'})
    .assign(presence_predicted=test_pred)
)

# Grouping keys: workday flag (Monday = 0 ... Friday = 4) and time of day.
df_profile = df_profile.assign(
    workday=df_profile.index.weekday.isin([0, 1, 2, 3, 4]),
    hour=df_profile.index.hour,
    minute=df_profile.index.minute,
)

# Rolling average within each (workday, hour, minute) group over the profile window.
profile_window = f'{PROFILE_WINDOW_SIZE}D'
df_profile = (
    df_profile
    .groupby(['workday', 'hour', 'minute'])
    .transform(lambda s: s.rolling(profile_window).mean())
)

# Aggregate to 30-minute bins labelled by their left edge.
df_profile = df_profile.resample('30min', label='left').mean()

# Absolute deviation of the predicted profile from the ground-truth profile: report mean and spread.
abs_error = (df_profile['presence_gt'] - df_profile['presence_predicted']).abs()
print('MAE:', abs_error.mean().round(3), '±', abs_error.std().round(3))

MAE: 0.043 ± 0.079


In [7]:
# Select example day for visualization (2022-09-09, a Friday).
# The interval is half-open, [2022-09-09 00:00, 2022-09-10 00:00): the lower bound must be
# inclusive, otherwise the 00:00 bin of the example day is silently dropped.
df_profile_workday = df_profile[(df_profile.index >= '2022-09-09') & (df_profile.index < '2022-09-10')]

# Visualize ground truth and predicted profile: one bar chart per profile column.
for title, column in {'Based on Ground Truth Presence': 'presence_gt', 'Based on Predicted Presence': 'presence_predicted'}.items():
    fig = make_subplots(rows=1, cols=1)

    fig.add_trace(
        go.Bar(
            name='profile',
            x=df_profile_workday.index,
            y=df_profile_workday[column],
        ),
        row=1,
        col=1,
    )

    # Fixed [0, 1] range keeps the two figures directly comparable.
    fig.update_yaxes(title_text='Presence Probability', range=[0, 1])
    fig.update_layout(title_text=title, title_x=0.5, margin={'t': 50, 'r': 0, 'b': 0, 'l': 0}, width=600, height=300)
    fig.show()


# Average day per weekday (Monday to Sunday)

In [8]:
# Combine ground truth and prediction in one frame (rename/assign both return new objects,
# so df_y_test itself is left unchanged).
df_profile = (
    df_y_test
    .rename(columns={'presence__last__w=10m': 'presence_gt'})
    .assign(presence_predicted=test_pred)
)

# Grouping keys: day of the week (Monday = 0) and time of day.
df_profile = df_profile.assign(
    day=df_profile.index.weekday,
    hour=df_profile.index.hour,
    minute=df_profile.index.minute,
)

# Rolling average within each (day, hour, minute) group over the profile window.
profile_window = f'{PROFILE_WINDOW_SIZE}D'
df_profile = (
    df_profile
    .groupby(['day', 'hour', 'minute'])
    .transform(lambda s: s.rolling(profile_window).mean())
)

# Aggregate to 30-minute bins labelled by their left edge.
df_profile = df_profile.resample('30min', label='left').mean()

# Absolute deviation of the predicted profile from the ground-truth profile: report mean and spread.
abs_error = (df_profile['presence_gt'] - df_profile['presence_predicted']).abs()
print('MAE:', abs_error.mean().round(3), '±', abs_error.std().round(3))


MAE: 0.052 ± 0.103


In [9]:
# Select example week for visualization (Monday 2022-09-05 through Sunday 2022-09-11).
# The interval is half-open, [2022-09-05 00:00, 2022-09-12 00:00): the lower bound must be
# inclusive, otherwise the Monday 00:00 bin is silently dropped.
df_profile_week = df_profile[(df_profile.index >= '2022-09-05') & (df_profile.index < '2022-09-12')]

# Visualize ground truth and predicted profile: one bar chart per profile column.
for title, column in {'Based on Ground Truth Presence': 'presence_gt', 'Based on Predicted Presence': 'presence_predicted'}.items():
    fig = make_subplots(rows=1, cols=1)

    fig.add_trace(
        go.Bar(
            name='profile',
            x=df_profile_week.index,
            y=df_profile_week[column],
        ),
        row=1,
        col=1,
    )

    # Fixed [0, 1] range keeps the two figures directly comparable.
    fig.update_yaxes(title_text='Presence Probability', range=[0, 1])
    fig.update_layout(title_text=title, title_x=0.5, margin={'t': 50, 'r': 0, 'b': 0, 'l': 0}, width=1000, height=300)
    fig.show()
