In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

In [5]:
df = pd.read_csv('data/train.csv')


In [6]:
df["timestamp"] = pd.to_datetime(df["timestamp"])
cutoff = pd.Timestamp("2016-05-20 17:00:00")
df_clean = df.loc[df["timestamp"] >= cutoff].copy()
df_clean = df_clean.query("building_id != 53").reset_index(drop=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698675 entries, 0 to 698674
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   building_id         698675 non-null  int64         
 1   timestamp           698675 non-null  datetime64[ns]
 2   meter_reading       698675 non-null  float64       
 3   primary_use         698675 non-null  object        
 4   square_feet         698675 non-null  int64         
 5   year_built          698675 non-null  int64         
 6   air_temperature     698452 non-null  float64       
 7   cloud_coverage      394159 non-null  float64       
 8   dew_temperature     698452 non-null  float64       
 9   precip_depth_1_hr   698591 non-null  float64       
 10  sea_level_pressure  691953 non-null  float64       
 11  wind_direction      678753 non-null  float64       
 12  wind_speed          698675 non-null  float64       
dtypes: datetime64[ns](1), float64

In [8]:
missing = df_clean.isnull().sum()

In [9]:
missing

building_id                0
timestamp                  0
meter_reading              0
primary_use                0
square_feet                0
year_built                 0
air_temperature            0
cloud_coverage        180152
dew_temperature            0
precip_depth_1_hr          0
sea_level_pressure      2191
wind_direction         13924
wind_speed                 0
dtype: int64

In [10]:
train_mask = df_clean.timestamp <  "2016-11-01"
val_mask   = (df_clean.timestamp >= "2016-11-01") & (df_clean.timestamp < "2016-12-01")
test_mask  = df_clean.timestamp >=  "2016-12-01"

df_clean["y_log"] = np.log1p(df_clean["meter_reading"])

num_cols = df_clean.columns.difference(
    ["building_id", "primary_use", "meter_reading", "y_log", "timestamp"]
)
cat_cols_onehot = ["primary_use"]   # one‑hot
cat_cols_lgbm   = ["building_id"]   # leave as category

X_raw = df_clean[num_cols.tolist() + cat_cols_onehot + cat_cols_lgbm].copy()
X_raw["building_id"] = X_raw["building_id"].astype("category")

y_raw = df_clean["y_log"].values

X_train_raw, y_train = X_raw[train_mask], y_raw[train_mask]
X_val_raw,   y_val   = X_raw[val_mask],   y_raw[val_mask]
X_test_raw,  y_test  = X_raw[test_mask],  y_raw[test_mask]

pre = ColumnTransformer(
    [("onehot", OneHotEncoder(handle_unknown="ignore"), cat_cols_onehot)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

X_train = pre.fit_transform(X_train_raw)
X_val   = pre.transform(X_val_raw)
X_test  = pre.transform(X_test_raw)



In [11]:
lgbm = lgb.LGBMRegressor(
    objective          = "tweedie",
    tweedie_variance_power = 1.1,     # 1.0 ≈ Poisson, 2.0 ≈ Gamma – 1.1 works well for energy
    n_estimators       = 10000,      
    learning_rate      = 0.03,
    num_leaves         = 512,
    feature_fraction   = 0.85,
    bagging_fraction   = 0.85,
    bagging_freq       = 5,
    min_data_in_leaf   = 50,
    random_state       = 42,
    metric             = "rmse",     
)

print("Training baseline LightGBM …")
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[
        early_stopping(stopping_rounds=2000, verbose=False),
        log_evaluation(period=200),
    ],
)

print(f"\nBest iteration              : {lgbm.best_iteration_}")
print(f"Validation RMSE (log-space) : {lgbm.best_score_['valid_0']['rmse']:.5f}")

y_hat_test_log = lgbm.predict(X_test, num_iteration=lgbm.best_iteration_)
# rmse_kwh = np.sqrt(((np.expm1(y_hat_test_log) - np.expm1(y_test))**2).mean())
rmse_kwh = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_hat_test_log)))



print(f"Test RMSE (kWh)             : {rmse_kwh:.2f}")


Training baseline LightGBM …
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001388 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 651
[LightGBM] [Info] Number of data points in the train set: 313826, number of used features: 17
[LightGBM] [Info] Start training from score 1.664781
[200]	valid_0's rmse: 0.470619
[400]	valid_0's rmse: 0.479531
[600]	valid_0's rmse: 0.485499
[800]	valid_0's rmse: 0.491106
[1000]	valid_0's rmse: 0.49542
[1200]	valid_0's rmse: 0.499635
[1400]	valid_0's rmse: 0.505517
[1600]	valid_0's rmse: 0.508488
[1800]	valid_0's rmse: 0.511592
[2000]	valid_0's rmse: 0.514782

Best iteration              : 118
Validation RMSE (log-space) : 0.46441
Test RMSE (kWh)             : 145.52


In [12]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val,   label=y_val)
dtest  = xgb.DMatrix(X_test,  label=y_test)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,
    "seed": 42,
    "tree_method": "hist",
}

print("Training baseline XGBoost …")
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=5000,
    evals=[(dval, "val")],
    early_stopping_rounds=200,
    verbose_eval=200,
)

best_iter = bst.best_iteration
val_rmse_log = bst.best_score
print(f"\nBest iteration              : {best_iter}")
print(f"Validation RMSE (log‑space) : {val_rmse_log:.5f}")

y_hat_test_log = bst.predict(dtest)
rmse_kwh = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_hat_test_log)))

print(f"Test RMSE (kWh)             : {rmse_kwh:.2f}")

Training baseline XGBoost …
[0]	val-rmse:1.31369
[200]	val-rmse:0.46024
[281]	val-rmse:0.46513

Best iteration              : 82
Validation RMSE (log‑space) : 0.44373
Test RMSE (kWh)             : 152.17


In [13]:
from itertools import product
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

grid = {
    "n_estimators":     [300, 600, 1200],
    "learning_rate":    [0.03, 0.1],
    "max_depth":        [4, 6, 8],
    "subsample":        [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}
keys, values = zip(*grid.items())
param_sets = [dict(zip(keys, v)) for v in product(*values)]

best_params, best_rmse = None, float("inf")

for params in param_sets:
    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        random_state=42,
        tree_method="hist",
        **params
    )
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    val_rmse = rmse(y_val, val_pred)        # log‑space RMSE

    print("params", params, " -> val_RMSE_log {:.5f}".format(val_rmse))

    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_params = params

print("\nBest log-space RMSE on Validation Set :", "%.5f" % best_rmse)
print("Best params :", best_params)

X_trainval = np.vstack([X_train, X_val])
y_trainval = np.concatenate([y_train, y_val])

final_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    tree_method="hist",
    **best_params
)
final_model.fit(X_trainval, y_trainval)

test_pred_log = final_model.predict(X_test)


params {'n_estimators': 300, 'learning_rate': 0.03, 'max_depth': 4, 'subsample': 0.8, 'colsample_bytree': 0.8}  -> val_RMSE_log 0.47295
params {'n_estimators': 300, 'learning_rate': 0.03, 'max_depth': 4, 'subsample': 0.8, 'colsample_bytree': 1.0}  -> val_RMSE_log 0.46900
params {'n_estimators': 300, 'learning_rate': 0.03, 'max_depth': 4, 'subsample': 1.0, 'colsample_bytree': 0.8}  -> val_RMSE_log 0.47828
params {'n_estimators': 300, 'learning_rate': 0.03, 'max_depth': 4, 'subsample': 1.0, 'colsample_bytree': 1.0}  -> val_RMSE_log 0.47552
params {'n_estimators': 300, 'learning_rate': 0.03, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8}  -> val_RMSE_log 0.45325
params {'n_estimators': 300, 'learning_rate': 0.03, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 1.0}  -> val_RMSE_log 0.46008
params {'n_estimators': 300, 'learning_rate': 0.03, 'max_depth': 6, 'subsample': 1.0, 'colsample_bytree': 0.8}  -> val_RMSE_log 0.45302
params {'n_estimators': 300, 'learning_rate': 0.

In [14]:
test_rmse_kwh = rmse(np.expm1(y_test), np.expm1(test_pred_log))
print("test RMSE (kWh): {:.2f}".format(test_rmse_kwh))


test RMSE (kWh): 128.66
