In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell runs; edits to the project's own source files
# then take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Extend the module search path with the sibling source directory. This has to
# run before the project-local imports in the next cell (presumably `data` and
# `features` live in ../src/ -- confirm against the repository layout).
# NOTE: the path is relative to the kernel's working directory.
import sys
sys.path.append('../src/')

In [3]:
import catboost
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score

from data import load_rooms
from features import extract_features, sliding_window_normalization


# Preparation

In [4]:
# Per-room datasets; the experiment cells iterate over this mapping with .items().
rooms = load_rooms()

# Classifier configuration, spelled out one option per line.
# The fixed seed combined with random_strength=0, rsm=1 and bootstrap_type='No'
# presumably aims at repeatable runs -- TODO confirm the intent with the author.
model_params = dict(
    iterations=100,
    verbose=False,
    custom_loss=[catboost.metrics.F1()],
    random_seed=1,
    random_strength=0,
    rsm=1,
    has_time=True,
    bootstrap_type='No',
)
model = catboost.CatBoostClassifier(**model_params)

# Offsets handed to train_eval as `shifts` (and from there to extract_features).
# Their unit is defined by extract_features, which is not visible in this notebook.
SHIFTS = [-6, -12, -24, -48, 6, 12, 24, 48]


In [5]:
def train_eval(model, df_data_train, df_data_test, window_size='60min', shifts=None,
               normalization=None, sliding_window_size=30):
    """Fit `model` on one dataset and score it on the training and a test dataset.

    Features are extracted from both datasets with `extract_features`, optionally
    normalized, rows containing NaN are dropped, the model is fitted on the
    training features and the balanced accuracy is computed for both sets.

    Parameters
    ----------
    model
        Estimator exposing `fit(X, y)` and `predict(X)`. It is fitted in place,
        so the object passed in is modified by this call.
    df_data_train, df_data_test
        Raw data for training and evaluation; forwarded unchanged to
        `extract_features` (the expected schema is defined there).
    window_size
        Forwarded to `extract_features`.
    shifts
        Forwarded to `extract_features`.
    normalization
        One of:
        * None       -- use the extracted features as they are.
        * 'SLIDING'  -- apply `sliding_window_normalization` to the input
                        columns of each dataset independently, before NaN rows
                        are dropped.
        * 'STANDARD' -- fit a `StandardScaler` on the training inputs only and
                        apply it to both training and test inputs.
    sliding_window_size
        Second argument passed to `sliding_window_normalization` when
        `normalization == 'SLIDING'`. Defaults to 30, the value that was
        previously hard-coded; its unit is defined by that helper.

    Returns
    -------
    tuple
        `(train_ba, test_ba)`: balanced accuracy on the training and the test
        data, expressed in percent and rounded to one decimal place.

    Raises
    ------
    ValueError
        If `normalization` is not None, 'SLIDING' or 'STANDARD'.
    """
    # Fail fast on an unknown mode. Without this check a misspelled value
    # (e.g. 'standard') would silently run the experiment without normalization.
    if normalization not in (None, 'SLIDING', 'STANDARD'):
        raise ValueError(
            f"Unknown normalization {normalization!r}; "
            "expected None, 'SLIDING' or 'STANDARD'."
        )

    # Extract features.
    # The column names returned by the second call replace those of the first;
    # both calls use the same settings, so they are assumed to be identical.
    df_features_train, target_names, input_names = extract_features(df_data_train, window_size, shifts)
    df_features_test, target_names, input_names = extract_features(df_data_test, window_size, shifts)

    # Sliding window normalization.
    # Before drop NaN so we can also use historical sensor data without ground truth.
    if normalization == 'SLIDING':
        df_features_train[input_names] = sliding_window_normalization(
            df_features_train[input_names], sliding_window_size)
        df_features_test[input_names] = sliding_window_normalization(
            df_features_test[input_names], sliding_window_size)

    # Drop NaN.
    df_features_train = df_features_train.dropna()
    df_features_test = df_features_test.dropna()

    # Split X/y.
    df_X_train = df_features_train[input_names]
    df_y_train = df_features_train[target_names]

    df_X_test = df_features_test[input_names]
    df_y_test = df_features_test[target_names]

    # Prepare input.
    if normalization == 'STANDARD':
        # Statistics come from the training data only, so nothing from the
        # evaluation set leaks into the scaling.
        scaler = StandardScaler()
        scaler.fit(df_X_train)
        X_train = scaler.transform(df_X_train)
        X_test = scaler.transform(df_X_test)
    else:
        X_train = df_X_train.values
        X_test = df_X_test.values

    # Prepare target: flatten to 1-D and cast the labels to uint8.
    y_train = df_y_train.values.ravel().astype(np.uint8)
    y_test = df_y_test.values.ravel().astype(np.uint8)

    # Fit model (modifies the caller's `model` object).
    model.fit(X_train, y_train)

    # Predict train data and calculate score.
    train_pred = model.predict(X_train)
    train_ba = balanced_accuracy_score(y_train, train_pred)

    # Predict test data and calculate score.
    test_pred = model.predict(X_test)
    test_ba = balanced_accuracy_score(y_test, test_pred)

    # Report as percentages with one decimal.
    return round(train_ba*100, 1), round(test_ba*100, 1)


# Direct Approach

In [6]:
# Cross-room evaluation with standard scaling: every room is used once as the
# training set and evaluated on each of the other rooms. Only the test score
# (second element of train_eval's result) is kept; the diagonal, where training
# and evaluation room coincide, is marked with 'X'.
d = {}
for room_train, data_train in rooms.items():
    row = []
    for room_eval, data_eval in rooms.items():
        if room_train == room_eval:
            row.append('X')
            continue
        scores = train_eval(model, data_train, data_eval, shifts=SHIFTS, normalization='STANDARD')
        row.append(scores[1])
    d[room_train] = row

# Rows are indexed by training room; there is one column per evaluation room.
column_labels = [f'Validation BA on {room}' for room in rooms.keys()]
df_direct_results = pd.DataFrame.from_dict(data=d, orient='index', columns=column_labels)
df_direct_results


Unnamed: 0,Validation BA on Office L1,Validation BA on Office L2,Validation BA on Office S3,Validation BA on Home 1
Office L1,X,81.4,55.7,65.4
Office L2,63.0,X,51.1,75.7
Office S3,62.0,69.8,X,70.6
Home 1,50.0,53.5,50.0,X


# Sliding Window Normalization Approach

In [7]:
# Cross-room evaluation with sliding window normalization: every room is used
# once as the training set and evaluated on each of the other rooms. Only the
# test score (second element of train_eval's result) is kept; the diagonal,
# where training and evaluation room coincide, is marked with 'X'.
d = {}
for room_train, data_train in rooms.items():
    row = []
    for room_eval, data_eval in rooms.items():
        if room_train == room_eval:
            row.append('X')
            continue
        scores = train_eval(model, data_train, data_eval, shifts=SHIFTS, normalization='SLIDING')
        row.append(scores[1])
    d[room_train] = row

# Rows are indexed by training room; there is one column per evaluation room.
column_labels = [f'Validation BA on {room}' for room in rooms.keys()]
df_sliding_results = pd.DataFrame.from_dict(data=d, orient='index', columns=column_labels)
df_sliding_results


Unnamed: 0,Validation BA on Office L1,Validation BA on Office L2,Validation BA on Office S3,Validation BA on Home 1
Office L1,X,84.6,66.8,80.6
Office L2,83.4,X,64.1,78.0
Office S3,60.1,62.2,X,65.7
Home 1,71.4,70.5,64.5,X
