In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from pathlib import Path
# Echo the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
from functools import reduce


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from statsmodels.nonparametric.smoothers_lowess import lowess
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the KETI sensor logs into one frame. Every room directory is expected
# to hold one CSV per sensor; each CSV is read as a header-less two-column
# file (unix timestamp, reading) and the five sensors of a room are aligned
# on the timestamp index.
KETI_DIR = Path('/kaggle/input/smart-building-system/KETI/')
SENSORS = ['light', 'temperature', 'co2', 'pir', 'humidity']

paths = list(KETI_DIR.rglob('*.*'))

# Group the files by their room directory instead of zipping five
# independently filtered lists: zip pairs files purely by position, so one
# room with a missing sensor file would silently combine readings that belong
# to different rooms. Sorting makes the row order (and therefore any
# positional sampling done later) independent of filesystem enumeration order.
sensor_files = {f'{sensor}.csv' for sensor in SENSORS}
room_dirs = sorted({path.parent for path in paths if path.name in sensor_files})

dfs = []
for room_dir in room_dirs:
    missing = [sensor for sensor in SENSORS if not (room_dir / f'{sensor}.csv').is_file()]
    if missing:
        # An incomplete room cannot be assembled consistently; report and skip it.
        print(f"Skipping room {room_dir.name}: missing sensor files {missing}")
        continue
    room_df = pd.concat(
        [
            pd.read_csv(
                room_dir / f'{sensor}.csv',
                names=['unix_time', sensor],
                index_col='unix_time',
            )
            for sensor in SENSORS
        ],
        axis=1,
    )
    room_df['room'] = room_dir.name
    dfs.append(room_df)

if not dfs:
    raise FileNotFoundError(f"No complete room directories found under {KETI_DIR}")
df = pd.concat(dfs)

In [None]:
# Summarise the combined frame (columns, dtypes, non-null counts, memory use).
df.info()

In [None]:
# Fraction of missing values per column (the mean of a boolean mask is the share of True).
df.isna().mean()

In [None]:
# Keep only the timestamps that carry a pir reading, so the data is on the
# 10-second grid rather than a mixture of 10s and 5s samples. A left join onto
# the pir frame would have been an equivalent way to restrict to those times.
has_pir = df['pir'].notna()
df = df[has_pir]

In [None]:
# Binary occupancy target: a pir reading of 0 is treated as "no occupants" and
# any non-zero reading as "occupied". Keeping the raw values and doing
# regression would be the alternative; it is deliberately not done here.
# The comparison is vectorised rather than a per-row Python lambda; it gives
# the same 0/1 labels (a NaN, if any were left, maps to 1 in both forms).
y = (df['pir'] != 0).astype('int64')
X = df.drop(columns=['pir'])

In [None]:
# Cast the room label to pandas' category dtype; lightgbm can handle categories.
X = X.assign(room=X['room'].astype('category'))

In [None]:
# Work on a 10,000-row subsample (presumably to keep tuning fast). Both draws
# use the same seed on equally long objects, so they select the same row
# positions and the features stay aligned with their labels.
X_small = X.sample(10000, random_state=0)
y_small = y.sample(10000, random_state=0)
Xt, Xv, yt, yv = train_test_split(X_small, y_small, random_state=0)
dt = lgb.Dataset(Xt, label=yt)
dv = lgb.Dataset(Xv, label=yv)

In [None]:
import optuna


def objective(trial):
    """Return the validation log-loss obtained with one sampled learning rate.

    Trains a binary LightGBM model on the notebook-level ``dt`` dataset, with
    early stopping (50 rounds) monitored on ``dv``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``eta`` uniformly from [1e-4, 1].

    Returns
    -------
    float
        Best ``binary_logloss`` reached on the "valid" set.
    """
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "eta": trial.suggest_float("eta", 1e-4, 1, log=False),
    }

    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose_eval` keyword arguments were removed from lgb.train in
    # LightGBM 4.0. A fresh callback is created for every trial so no
    # early-stopping state is shared between trainings.
    model = lgb.train(
        param,
        dt,
        num_boost_round=10000,
        valid_sets=[dt, dv],
        valid_names=["training", "valid"],
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )
    return model.best_score['valid']['binary_logloss']


# Seed the sampler so the search is reproducible, consistent with the fixed
# random_state used for sampling and splitting elsewhere in the notebook.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Learning rate reused by the cells below.
best_eta = trial.params['eta']

In [None]:
# Baseline model at the selected learning rate. Early stopping and the
# every-10-rounds evaluation log are supplied as callbacks because the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgb.train in LightGBM 4.0.
model = lgb.train(
    {"objective": "binary", "eta": best_eta},
    dt,
    num_boost_round=10000,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(period=10)],
)

In [None]:
# Plot the feature importances of the model trained in the previous cell.
lgb.plot_importance(model)

In [None]:
# Report feature pairs whose absolute Kendall rank correlation exceeds the
# threshold, strongest first.
threshold = 0.75

# Only numeric columns can be correlated; the categorical `room` column is
# excluded explicitly because recent pandas raises instead of silently
# skipping non-numeric columns.
corr_matrix = Xt.select_dtypes(include="number").corr(method="kendall")

# Keep the strict upper triangle so every unordered pair appears exactly once
# and self-pairs are excluded. (Taking every other element of the stacked
# matrix does not do this: (a, b) and (b, a) are not adjacent.)
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr = corr_matrix.where(upper_triangle).stack().dropna()

high_corr = corr[corr.abs() > threshold]
abs_high_corr = high_corr.abs()
pairs = abs_high_corr.sort_values(ascending=False).index.to_list()
print(f"Correlated features: {pairs if len(pairs) > 0 else None}")

In [None]:
# Feature names ordered from least to most important according to the current
# model; ties on importance fall back to the feature name.
importance_and_name = list(zip(model.feature_importance(), model.feature_name()))
importance_and_name.sort()
sorted_features = [name for _, name in importance_and_name]

In [None]:
# Greedy backward elimination: starting from the least important feature,
# keep dropping features for as long as the validation log-loss does not get
# worse, and stop at the first feature whose removal hurts.
metric = 'binary_logloss'
best_score = model.best_score["valid"][metric]
print(f"starting score: {best_score:.4f}")

# The subsample and the split do not depend on which columns are dropped
# (rows are chosen by position with fixed seeds), so build them once instead
# of on every iteration.
X_sub = X.sample(10000, random_state=0)
y_sub = y.sample(10000, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_sub, y_sub, random_state=0)

unimportant_features = []
# The most important feature is never a candidate: dropping it last would
# leave no columns to train on.
for feature in sorted_features[:-1]:
    candidate_drop = unimportant_features + [feature]
    # Local dataset names so the notebook-level `dt` / `dv` are not replaced
    # by reduced-feature versions.
    drop_dt = lgb.Dataset(X_train.drop(columns=candidate_drop), label=y_train)
    drop_dv = lgb.Dataset(X_valid.drop(columns=candidate_drop), label=y_valid)
    drop_model = lgb.train(
        {"objective": "binary", "eta": best_eta},
        drop_dt,
        valid_sets=[drop_dt, drop_dv],
        valid_names=["training", "valid"],
        num_boost_round=10000,
        # Callback form: `early_stopping_rounds` / `verbose_eval` were removed
        # from lgb.train in LightGBM 4.0.
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )
    score = drop_model.best_score["valid"][metric]
    if score > best_score:
        print(f"Dropping {feature} worsened score to {score:.4f}.")
        break
    # The drop did not hurt: make it permanent and continue from the new score.
    unimportant_features.append(feature)
    best_score = score
print(f"ending score: {best_score:.4f}")
print(
    f"dropped features: {unimportant_features if len(unimportant_features) > 0 else None}"
)

In [None]:
# Optuna's LightGBM integration exposes a drop-in `train` that tunes the
# remaining hyper-parameters. It is imported under its own alias so the
# notebook-level name `lgb` keeps pointing at plain LightGBM; rebinding `lgb`
# would make every later `lgb.train` run a full tuning session unless the
# original import is restored by hand.
import optuna.integration.lightgbm as optuna_lgb

metric = 'binary_logloss'

dt = lgb.Dataset(Xt, label=yt)
dv = lgb.Dataset(Xv, label=yv)

params = {
    "objective": "binary",
    "metric": metric,
    "verbosity": -1,
    "boosting_type": "gbdt",
    "eta": best_eta,
}

# Early stopping is passed as a callback because the `early_stopping_rounds`
# and `verbose_eval` keyword arguments are no longer accepted by current
# LightGBM releases.
# NOTE(review): the tuner reuses this callback object across its internal
# trainings; confirm the installed LightGBM resets early-stopping state at the
# start of each training.
model = optuna_lgb.train(
    params,
    dt,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    num_boost_round=10000,
    callbacks=[lgb.early_stopping(50, verbose=False)],
)

score = model.best_score["valid"][metric]

best_params = model.params
print("Best params:", best_params)
print(f"  {metric} = {score}")
print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

In [None]:
import lightgbm as lgb

In [None]:
# Make sure the tuned parameter set carries the learning rate chosen by the
# eta study (it may already be present; this assignment makes it explicit).
best_params['eta'] = best_eta

In [None]:
# Final fit on the full data set (no subsampling) with the tuned parameters.
# Note that this rebinds Xt / Xv / yt / yv / dt / dv / model to the full-data
# versions.
Xt, Xv, yt, yv = train_test_split(X, y, random_state=0)
dt = lgb.Dataset(Xt, label=yt)
dv = lgb.Dataset(Xv, label=yv)
model = lgb.train(
    best_params,
    dt,
    num_boost_round=10000,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    # Callback form: `early_stopping_rounds` / `verbose_eval` were removed
    # from lgb.train in LightGBM 4.0.
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(period=10)],
)

In [None]:
# Plot the feature importances of the current model, without grid lines.
lgb.plot_importance(model, grid=False)

In [None]:
# Display the feature-name attribute of the training Dataset.
dt.feature_name

In [None]:
import shap

# Explain the current model on a fixed 10,000-row sample. The sample is drawn
# once and reused for both the SHAP computation and the plots, so the values
# and the rows they describe are guaranteed to match.
X_shap = X.sample(10000, random_state=0)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_shap)

# Depending on the shap version, a binary classifier yields either a list with
# one array per class, a single (rows, features, classes) array, or a single
# (rows, features) array. Select the positive-class contributions in each case
# instead of assuming the list form.
if isinstance(shap_values, list):
    positive_shap = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    positive_shap = shap_values[:, :, 1]
else:
    positive_shap = shap_values

# Most important feature first. Names are taken from the model itself, the
# same source used when ranking features earlier in the notebook.
sorted_features = [
    feature
    for _, feature in sorted(
        zip(model.feature_importance(), model.feature_name()), reverse=True
    )
]
# to make sense of it all
for name in sorted_features:
    shap.dependence_plot(name, positive_shap, X_shap)