In [1]:
import pandas as pd

# Read the processed power CSV with "Datetime" parsed as a datetime index,
# and order the rows by that index in the same expression.
df = pd.read_csv(
    "../data/processed/power_1min.csv",
    index_col="Datetime",
    parse_dates=["Datetime"],
).sort_index()

# Quick look at the first rows to confirm the columns and index loaded as expected.
df.head()


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [2]:
# Column the windows are built to predict.
TARGET = "Global_active_power"

# Univariate setup: the only input feature is the target series itself.
feature_cols = [TARGET]


In [3]:
# Contiguous positional split (no shuffling): the first 70% of rows are for
# training, the next 15% for validation, and the remainder for testing.
n = len(df)
train_end, val_end = (int(n * frac) for frac in (0.7, 0.85))

train_df, val_df, test_df = (
    df.iloc[:train_end],
    df.iloc[train_end:val_end],
    df.iloc[val_end:],
)

# Show (rows, columns) for each split.
tuple(part.shape for part in (train_df, val_df, test_df))


((1452681, 7), (311289, 7), (311289, 7))

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max from the training split only, then apply that same
# mapping to every split so validation/test values never influence the scaler.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_df[feature_cols])

train_scaled, val_scaled, test_scaled = (
    scaler.transform(part[feature_cols])
    for part in (train_df, val_df, test_df)
)


In [5]:
import numpy as np

def make_windows(data, lookback=60, horizon=1):
    """Slice a row-ordered array into supervised (window, target) pairs.

    Sample ``s`` pairs the ``lookback`` consecutive rows
    ``data[s:s + lookback]`` with the column-0 value located ``horizon`` rows
    after the last row of that window.

    Parameters
    ----------
    data : array-like, shape (n_steps, n_features)
        Rows in the order they should be windowed; column 0 is the target.
    lookback : int, default 60
        Number of consecutive rows in each input window. Must be >= 1.
    horizon : int, default 1
        Offset of the target row from the window's last row (1 = the row
        immediately after the window). Must be >= 0; 0 targets the window's
        own last row, so use >= 1 for forecasting.

    Returns
    -------
    X : ndarray, shape (n_samples, lookback, n_features)
        Independent, writable copy of the windows.
    y : ndarray, shape (n_samples, 1)
        Target values, with
        ``n_samples = max(0, n_steps - lookback - horizon + 1)``.
        When the input is too short for a single window, both arrays are
        empty but keep their trailing dimensions.

    Raises
    ------
    ValueError
        If ``lookback < 1`` or ``horizon < 0``.
    """
    from numpy.lib.stride_tricks import sliding_window_view

    if lookback < 1 or horizon < 0:
        raise ValueError(
            f"lookback must be >= 1 and horizon >= 0 "
            f"(got lookback={lookback}, horizon={horizon})"
        )

    data = np.asarray(data)
    n_samples = len(data) - lookback - horizon + 1

    # Too short for even one window: return correctly shaped empty arrays so
    # downstream code relying on X.shape[1:] keeps working.
    if n_samples <= 0:
        empty_X = np.empty((0, lookback) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0, 1), dtype=data.dtype)
        return empty_X, empty_y

    # sliding_window_view appends the window axis last, giving
    # (n_windows, n_features, lookback); keep only windows that still have a
    # target `horizon` rows ahead, then move the window axis next to the
    # sample axis.
    windows = sliding_window_view(data, lookback, axis=0)[:n_samples]
    # The view is read-only and aliases `data`; force a C-ordered copy.
    X = np.array(np.moveaxis(windows, -1, 1), order="C")

    # Target for sample s sits at row s + lookback + horizon - 1.
    y = np.array(data[lookback + horizon - 1:, 0]).reshape(-1, 1)
    return X, y

# Window geometry: LOOKBACK rows per input window, target HORIZON row(s) ahead.
LOOKBACK, HORIZON = 60, 1

# Window each scaled split independently so no window straddles a split boundary.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    make_windows(split, LOOKBACK, HORIZON)
    for split in (train_scaled, val_scaled, test_scaled)
)

# Sanity check on the training tensors' shapes.
X_train.shape, y_train.shape


((1452621, 60, 1), (1452621, 1))

In [6]:
import os
import joblib

# Make sure the output directory exists before writing anything into it.
os.makedirs("../runs", exist_ok=True)

# Bundle all six window arrays into one compressed archive, keyed by their
# variable names.
_arrays_to_save = {
    "X_train": X_train, "y_train": y_train,
    "X_val": X_val, "y_val": y_val,
    "X_test": X_test, "y_test": y_test,
}
np.savez_compressed("../runs/dataset.npz", **_arrays_to_save)

# Persist the fitted scaler next to the dataset archive.
joblib.dump(scaler, "../runs/minmax_scaler.pkl")


['../runs/minmax_scaler.pkl']

In [7]:
# Round-trip check: reopen the saved archive and confirm the training arrays'
# shapes match what was written.
d = np.load("../runs/dataset.npz")
tuple(d[name].shape for name in ("X_train", "y_train"))


((1452621, 60, 1), (1452621, 1))