In [26]:
# --- Standard library ---
import datetime
from pathlib import Path
# --- Third-party ---
from lightgbm import LGBMRegressor
import polars as pl
from sklearn.metrics import mean_squared_error

# Directory holding the input CSV files, relative to the notebook's working directory.
DATA_DIR = Path("../data")

# --- Project-local ---
# NOTE: `lgb` here is the project module src/lgb.py, not the lightgbm package
# (which is often aliased as `lgb` elsewhere).
import src.lgb as lgb

In [27]:
# Run the entry point of the project module src.lgb (imported as `lgb`).
# NOTE(review): the value shown below matches the RMSE obtained by the
# mean_squared_error cell later in this notebook, so main() presumably runs the
# same load / train / evaluate steps — confirm against src/lgb.py.
lgb.main()

3896.71370


In [6]:
def feature_engineering(df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
    """Append calendar features derived from a temporal column.

    Args:
        df: Frame containing a date/datetime column named ``date_col``.
        date_col: Name of the column the features are extracted from.
            Defaults to ``"date"``, which is what the rest of the notebook uses.

    Returns:
        The result of ``df.with_columns(...)``: every original column plus
        ``year``, ``quarter``, ``month``, ``week_of_year``, ``hour``,
        ``day_of_week``, ``day_of_month`` and ``day_of_year``.
    """
    # Build the column expression once and reuse it for every extraction.
    # (The source column itself is carried over untouched by with_columns, so it
    # does not need to be re-selected.)
    ts = pl.col(date_col)
    return df.with_columns(
        ts.dt.year().alias("year"),
        ts.dt.quarter().alias("quarter"),
        ts.dt.month().alias("month"),
        ts.dt.week().alias("week_of_year"),
        ts.dt.hour().alias("hour"),
        ts.dt.weekday().alias("day_of_week"),
        ts.dt.day().alias("day_of_month"),
        ts.dt.ordinal_day().alias("day_of_year"),
    )

In [14]:
# Load the hourly series, parse the timestamp column, rename it to "date",
# and attach the calendar features — all in one chain so the cell is idempotent.
df = (
    pl.read_csv(DATA_DIR / "PJME_hourly.csv")
    .with_columns(pl.col("Datetime").str.to_datetime())
    .rename({"Datetime": "date"})
    .pipe(feature_engineering)
)

# Chronological hold-out: rows up to and including the split instant are used
# for training, everything strictly after it for testing.
split_date = datetime.datetime(2015, 1, 1)
date_expr = pl.col("date")
train = df.filter(date_expr <= split_date)
test = df.filter(date_expr > split_date)

In [20]:
# Shape of the test frame once the timestamp and the target column are removed.
test.drop(["date", "PJME_MW"]).shape

(31439, 8)

In [21]:
# (rows, columns) of the full training frame, timestamp and target included.
(train.height, train.width)

(113927, 10)

In [22]:
# Columns that must not be fed to the model: the raw timestamp and the target.
non_feature_cols = ["date", "PJME_MW"]

X_train = train.drop(non_feature_cols)
y_train = train["PJME_MW"]
X_test = test.drop(non_feature_cols)

# LightGBM regressor with library-default hyperparameters.
model = LGBMRegressor()
model.fit(X_train, y_train)
pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 406
[LightGBM] [Info] Number of data points in the train set: 113927, number of used features: 8
[LightGBM] [Info] Start training from score 32289.301412


In [24]:
# Root-mean-squared error on the hold-out set.
# The `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6; taking the square root of the MSE yields the same RMSE on every version.
mean_squared_error(y_true=test["PJME_MW"], y_pred=pred) ** 0.5

3896.7137039323848