In [None]:
import pandas as pd
import matplotlib.pyplot as plt

Data source: https://www.kaggle.com/datasets/uciml/electric-power-consumption-data-set?resource=download

In [None]:
# Location of the zipped, semicolon-separated dump read below.
# NOTE(review): absolute local path - point this at your own copy on other machines.
DATA_PATH = "/Users/sofeikov/Downloads/household_power_consumption.txt.zip"

# Only the timestamp parts and the target column are loaded; "?" entries become NaN.
raw = pd.read_csv(
    DATA_PATH,
    low_memory=False,
    usecols=["Date", "Time", "Global_active_power"],
    na_values=["?"],
    delimiter=";",
)

# Parse the timestamp with an explicit day-first format instead of letting the
# parser guess: guessing can read "1/2/2007" as January 2nd on some pandas versions.
# NOTE(review): assumes dates are dd/mm/yyyy and times hh:mm:ss - confirm against
# the dataset description.
timestamps = pd.to_datetime(
    raw["Date"] + " " + raw["Time"], format="%d/%m/%Y %H:%M:%S"
).rename("datetime")

df = (
    raw.drop(columns=["Date", "Time"])
    .rename(columns={"Global_active_power": "gap"})
    .set_index(timestamps)
)

# Average all readings that fall into the same calendar hour. Flooring the
# timestamps keeps the index as datetimes, so no string round trip is needed.
df = df.groupby(df.index.floor("h")).mean()
df.head()


In [None]:
# Zoom in on a short window of the hourly series (bounds are exclusive).
window_start, window_end = '2008-03-01', '2008-03-14'
mask = (df.index > window_start) & (df.index < window_end)
gap_window = df.loc[mask, "gap"]
gap_window.plot(figsize=(16,6))

# Feature exploration

In [None]:
# Distribution of the hourly mean of the target column.
gap_values = df["gap"]
gap_values.hist(bins=100)

In [None]:
# Derive calendar features from the datetime index; columns are appended in
# the order hour, day, month, dow, quarter.
df = df.assign(
    hour=df.index.hour,
    day=df.index.day,
    month=df.index.month,
    dow=df.index.day_of_week,
    quarter=df.index.quarter,
)

In [None]:
def plot_correlations(df: pd.DataFrame, target: str):
    """Draw one scatter plot per non-target column of ``df``.

    For every column other than ``target`` a new figure is opened with that
    column on the x-axis and ``target`` on the y-axis (marker size 1).

    Args:
        df: Frame holding the feature columns and the target column.
        target: Name of the column plotted on the y-axis and skipped as a feature.
    """
    feature_columns = [name for name in df.columns if name != target]
    for feature in feature_columns:
        _, ax = plt.subplots()
        ax.scatter(df[feature], df[target], s=1)
        ax.set_xlabel(feature)
        ax.set_ylabel(target)
# Scatter each non-target column of df against "gap".
plot_correlations(df, "gap")

# Train / validation / test split

In [None]:
# First and last timestamp in the data, used to pick a test cutoff.
time_index = df.index
time_index.min(), time_index.max()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def temporal_split(
    df: pd.DataFrame,
    cutoff_date: str,
    train_size: float = 0.85,
    random_state: int = 42,
    shuffle: bool = True,
):
    """Split ``df`` into train, validation and test frames.

    Rows whose index is on or after ``cutoff_date`` form the test set. The
    remaining rows are divided into train and validation with
    ``train_test_split``. With the default ``shuffle=True`` that division is
    random, so validation rows are interleaved in time with training rows;
    pass ``shuffle=False`` to keep the validation set as the chronological tail.

    Args:
        df: Frame indexed by timestamps comparable with ``cutoff_date``.
        cutoff_date: First timestamp that belongs to the test set.
        train_size: Fraction of the pre-cutoff rows assigned to training.
        random_state: Seed for the shuffle, fixed by default so that repeated
            runs produce the same partition.
        shuffle: Whether to shuffle the pre-cutoff rows before splitting.

    Returns:
        Tuple ``(train_df, valid_df, test_df)``; all three are copies of ``df`` rows.
    """
    test_mask = df.index >= cutoff_date
    test_df = df.loc[test_mask].copy()
    train_valid_set = df.loc[~test_mask].copy()

    train_df, valid_df = train_test_split(
        train_valid_set,
        train_size=train_size,
        random_state=random_state,
        shuffle=shuffle,
    )
    return train_df, valid_df, test_df
# Rows from 2010-01-01 onward are the test set; earlier rows are split into train/validation.
train_df, valid_df, test_df = temporal_split(df, '2010-01-01')

# Training

In [None]:
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Every column except the target is used as a model input.
train_features = [c for c in train_df if c != "gap"]

regressor = LGBMRegressor(n_estimators=100, importance_type="gain")

# Early stopping and evaluation logging are supplied as callbacks: the
# `early_stopping_rounds` and `verbose` keyword arguments of `fit` are no
# longer accepted by current LightGBM releases.
booster = regressor.fit(
    train_df[train_features],
    train_df["gap"],
    eval_set=[(valid_df[train_features], valid_df["gap"])],
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(period=500)],
)



In [None]:
# Predict on the held-out period, print the mean squared error and show the
# distribution of the residuals (prediction minus actual).
test_df["pred"] = booster.predict(test_df[train_features])
residuals = test_df["pred"] - test_df["gap"]
print((residuals ** 2).mean())
residuals.hist(bins=30)

In [None]:
# Overlay actual and predicted values for a short slice of the test period
# (both bounds exclusive).
min_date, max_date = '2010-04-01', '2010-04-07'
mask = (test_df.index > min_date) & (test_df.index < max_date)
plt.figure(figsize=(12,6))
for series_name in ("gap", "pred"):
    test_df.loc[mask, series_name].plot(label=series_name)
plt.legend()

In [None]:
# One bar per input column; the regressor was built with importance_type="gain".
importances = booster.feature_importances_
plt.bar(train_features, importances)

# Adding temporal component to the prediction

In [None]:
from typing import Union, List


def add_lag(
    df: pd.DataFrame,
    column: str,
    lag: Union[List[int], int],
    fill_value: float = 0,
):
    """Add lagged copies of ``column`` to ``df`` in place.

    For every requested lag ``l`` a column named ``"{column}_lag_{l}"`` is
    created whose row ``i`` holds the value of ``column`` at row ``i - l``.
    Lags are counted in rows, not in time units. The first ``l`` rows, which
    have no predecessor, are filled with ``fill_value``.

    Lag columns previously created for the same ``column`` are removed first,
    so the function can be re-run with a different set of lags. Lag columns
    belonging to other source columns are left untouched.

    Args:
        df: Frame to modify in place.
        column: Name of the column to lag.
        lag: A single positive lag or a list of positive lags, in rows.
        fill_value: Value used for rows without a predecessor (default 0).

    Raises:
        ValueError: If any lag is smaller than 1.
    """
    lags = [lag] if isinstance(lag, int) else list(lag)
    # Validate before touching the frame so a bad call leaves df unchanged.
    if any(step < 1 for step in lags):
        raise ValueError(f"lag values must be positive integers, got {lags}")

    prefix = f"{column}_lag_"
    stale = [name for name in df.columns if str(name).startswith(prefix)]
    df.drop(columns=stale, inplace=True)

    source = df[column]
    for step in lags:
        # to_numpy() assigns positionally, independent of index alignment.
        df[f"{prefix}{step}"] = source.shift(step, fill_value=fill_value).to_numpy()

# Lags are counted in rows of df, so the multiplier scales row offsets.
# NOTE(review): despite its name, `mins_multiplier` counts rows; df was aggregated
# per hour earlier, so one row presumably equals one hour - confirm.
mins_multiplier = 1
lags = [step * mins_multiplier for step in (1, 2, 3)]
add_lag(df, "gap", lags)
# Re-split so that the train/validation/test frames include the new lag columns.
train_df, valid_df, test_df = temporal_split(df, '2010-01-01')
df.head()

In [None]:
from lightgbm import early_stopping, log_evaluation

# Every column except the target is used as a model input.
train_features = [c for c in train_df if c != "gap"]
print(train_features)

regressor = LGBMRegressor(n_estimators=100, importance_type="gain")

# Early stopping and evaluation logging are supplied as callbacks: the
# `early_stopping_rounds` and `verbose` keyword arguments of `fit` are no
# longer accepted by current LightGBM releases.
booster = regressor.fit(
    train_df[train_features],
    train_df["gap"],
    eval_set=[(valid_df[train_features], valid_df["gap"])],
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(period=500)],
)


In [None]:
# One bar per input column; the regressor was built with importance_type="gain".
importances = booster.feature_importances_
plt.figure(figsize=(12,6))
plt.bar(train_features, importances)

In [None]:
# Predict on the held-out period, print the mean squared error and show the
# distribution of the residuals (prediction minus actual).
test_df["pred"] = booster.predict(test_df[train_features])
residuals = test_df["pred"] - test_df["gap"]
print((residuals ** 2).mean())
residuals.hist(bins=30)

In [None]:
# Overlay actual and predicted values for a short slice of the test period
# (both bounds exclusive).
min_date, max_date = '2010-04-01', '2010-04-07'
mask = (test_df.index > min_date) & (test_df.index < max_date)
plt.figure(figsize=(12,6))
for series_name in ("gap", "pred"):
    test_df.loc[mask, series_name].plot(label=series_name)
plt.legend()