In [3]:
import pandas as pd

# Load the raw training table, parse the date column, and order rows by date.
df = (
    pd.read_csv("../data/train.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
)

# Series to model: one store, and whichever product family appears first in
# the date-sorted table.
STORE_ID = 1
PRODUCT_FAMILY = df["family"].iloc[0]

# Keep only the rows of that store/family pair and index their sales by date.
in_target_series = df["store_nbr"].eq(STORE_ID) & df["family"].eq(PRODUCT_FAMILY)
ts_df = df.loc[in_target_series, ["date", "sales"]].set_index("date")

ts_df.head()


Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2013-01-01,0.0
2013-01-02,2.0
2013-01-03,3.0
2013-01-04,3.0
2013-01-05,5.0


In [4]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Fit the scaler on the leading (oldest) part of the series only. Fitting on
# the full series would let the min/max of the most recent rows -- the ones
# the chronological 80/20 split later holds out for testing -- leak into the
# scaling applied to the training data.
SCALER_FIT_FRACTION = 0.8
n_fit_rows = int(len(ts_df) * SCALER_FIT_FRACTION)

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(ts_df[["sales"]].iloc[:n_fit_rows])

# Transform the whole series with the statistics learned above. Rows after the
# fit window may fall outside [0, 1] if they exceed the fitted min/max.
scaled_sales = scaler.transform(ts_df[["sales"]])

scaled_sales[:5]


array([[0.        ],
       [0.10526316],
       [0.15789474],
       [0.15789474],
       [0.26315789]])

In [6]:
def create_sequences(data, window_size=30):
    """Turn a series into sliding-window inputs and next-step targets.

    Sample ``i`` pairs the window ``data[i:i + window_size]`` with the target
    ``data[i + window_size]``, i.e. the element right after the window.

    Parameters
    ----------
    data : array-like
        Sequence indexed along its first axis; converted with ``np.asarray``.
    window_size : int, default 30
        Number of consecutive steps in each input window.

    Returns
    -------
    X : np.ndarray
        Shape ``(n_samples, window_size, *data.shape[1:])``.
    y : np.ndarray
        Shape ``(n_samples, *data.shape[1:])``.
        ``n_samples`` is ``max(len(data) - window_size, 0)``.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    data = np.asarray(data)
    n_samples = len(data) - window_size
    trailing_shape = data.shape[1:]

    if n_samples <= 0:
        # Series too short for even one window: return empty arrays that still
        # carry the window/feature dimensions instead of bare shape-(0,) arrays.
        return (
            np.empty((0, window_size) + trailing_shape, dtype=data.dtype),
            np.empty((0,) + trailing_shape, dtype=data.dtype),
        )

    X = np.stack([data[i:i + window_size] for i in range(n_samples)])
    # The targets are simply the series shifted by one window; copy so the
    # result does not alias the input array.
    y = data[window_size:].copy()
    return X, y


In [7]:
# Number of past time steps fed to the model for each prediction.
WINDOW_SIZE = 30

# Build (window, next value) pairs from the scaled series.
X, y = create_sequences(scaled_sales, window_size=WINDOW_SIZE)

# Report the shapes of the resulting inputs and targets.
for label, array in (("X shape:", X), ("y shape:", y)):
    print(label, array.shape)


X shape: (1654, 30, 1)
y shape: (1654, 1)


In [8]:
# Chronological split: the first 80% of the samples train the model and the
# remaining 20% are held out for testing (no shuffling).
split_index = int(len(X) * 0.8)
train_rows = slice(None, split_index)
test_rows = slice(split_index, None)

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

X_train.shape, X_test.shape


((1323, 30, 1), (331, 30, 1))

In [9]:
# Persist each split as its own .npy file under ../data.
for path, array in (
    ("../data/X_train.npy", X_train),
    ("../data/X_test.npy", X_test),
    ("../data/y_train.npy", y_train),
    ("../data/y_test.npy", y_test),
):
    np.save(path, array)
