In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# NOTE(review): kagglehub.login() presumably prompts for / reads Kaggle API
# credentials - confirm against the kagglehub docs. It runs before the
# competition download performed in the next cell.
import kagglehub
kagglehub.login()


In [None]:

# Download the competition files through kagglehub and keep the returned value
# (presumably the local directory holding the CSVs - confirm against kagglehub docs).
store_sales_time_series_forecasting_path = kagglehub.competition_download('store-sales-time-series-forecasting')

print('Data source import complete.')


# Sales Prediction with LightGBM (Store Sales)

Enhanced version:
- More lag/rolling features (short and long-term)
- Exponentially weighted moving averages
- Holiday & promotion interaction features
- Proper categorical encoding

## Exploratory Time Series Analysis

To understand trends and autocorrelation, let's visualize an example store-family combination's sales.

In [None]:
!pip install --quiet lightgbm statsmodels

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from datetime import timedelta
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf

## Load Data

Load the train, test, stores, oil prices, and holidays datasets.  
Parse the date columns for proper time series handling.

In [None]:
# Default location of the competition files when running on Kaggle.
INPUT_DIR = "/kaggle/input/store-sales-time-series-forecasting"
if not os.path.isdir(INPUT_DIR):
    # The Kaggle mount is absent (e.g. Colab / local run): fall back to the
    # directory returned by kagglehub in the download cell, if that cell was run.
    # globals().get keeps this cell runnable even when the download cell was skipped.
    INPUT_DIR = globals().get("store_sales_time_series_forecasting_path") or INPUT_DIR

# Parse `date` up front so every frame can be merged and sorted on real timestamps.
train = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"), parse_dates=["date"])
test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"), parse_dates=["date"])
stores = pd.read_csv(os.path.join(INPUT_DIR, "stores.csv"))
oil = pd.read_csv(os.path.join(INPUT_DIR, "oil.csv"), parse_dates=["date"])
holidays = pd.read_csv(os.path.join(INPUT_DIR, "holidays_events.csv"), parse_dates=["date"])

## Quick Preprocessing

- Create a unique identifier for store-family combinations  
- Forward-fill oil prices  
- Filter holidays to national only and create a flag  
- Merge exogenous features (oil, holidays) into train/test  
- Prepare store metadata (rename `cluster` to `store_cluster`, cast `store_nbr` to int)

In [None]:
# One identifier per (store, family) series, built the same way on both splits
for split in (train, test):
    split["unique_id"] = split["store_nbr"].astype(str) + "_" + split["family"]

# Oil: put the series on a daily calendar, carry the last known price forward,
# and expose it under the name `oil_price`
oil = (
    oil.set_index("date")
    .asfreq("D")
    .ffill()
    .reset_index()
    .rename(columns={"dcoilwtico": "oil_price"})
)

# Holidays: keep one row per national-holiday date, flagged with 1
holidays = (
    holidays.loc[holidays["locale"] == "National", ["date"]]
    .drop_duplicates()
    .assign(holiday_flag=1)
)

def merge_exog(df):
    """Left-join the daily oil prices and the holiday flag onto `df` by `date`.

    Reads the module-level `oil` and `holidays` frames. Dates without a
    matching holiday row get `holiday_flag` 0; dates without an oil row keep
    NaN in the oil column. Returns a new frame; `df` itself is not modified.
    """
    enriched = df.merge(oil, on="date", how="left").merge(holidays, on="date", how="left")
    enriched["holiday_flag"] = enriched["holiday_flag"].fillna(0)
    return enriched

# Attach oil prices and the holiday flag to both splits
train, test = merge_exog(train), merge_exog(test)

# Store metadata: rename `cluster` to `store_cluster` and cast the join key to int
stores = stores.rename(columns={"cluster": "store_cluster"}).astype({"store_nbr": int})

## Exploratory Analysis Plots

- Plot the full sales series for a representative store-family combination  
- Zoom in on the first 36 days to see short-term trends  
- Plot autocorrelation to examine temporal dependencies  
- Perform Augmented Dickey-Fuller test to check stationarity

In [None]:
# Use the store-family series with the largest total sales as the running example
total_sales_by_series = train.groupby("unique_id")["sales"].sum()
example_uid = total_sales_by_series.idxmax()
ts = train.loc[train["unique_id"] == example_uid].sort_values("date")


def _plot_sales_line(dates, sales, title, rotate_xticks=False):
    """Draw one blue sales line chart with the shared labels, grid and layout."""
    plt.figure(figsize=(12, 5))
    plt.plot(dates, sales, color="blue", linestyle="-")
    plt.title(title)
    plt.xlabel("Date")
    plt.ylabel("Sales")
    if rotate_xticks:
        plt.xticks(rotation=45)
    plt.grid(True, linestyle="--", alpha=0.6)
    plt.tight_layout()
    plt.show()


# Full series plot
_plot_sales_line(ts["date"], ts["sales"], f"Sales over time for {example_uid}")

# Zoom-in plot (first 36 points)
_plot_sales_line(
    ts["date"].iloc[:36],
    ts["sales"].iloc[:36],
    f"Zoom-in on first 36 days for {example_uid}",
    rotate_xticks=True,
)

# Autocorrelation plot
# plot_acf opens its own figure unless an Axes is passed, so create the Axes
# explicitly: this applies the intended size and avoids a stray blank figure.
acf_fig, acf_ax = plt.subplots(figsize=(12, 4))
plot_acf(ts["sales"], lags=50, ax=acf_ax)
acf_ax.set_title(f"Autocorrelation of Sales for {example_uid}")
plt.show()

# Augmented Dickey-Fuller test on the example series
adf_result = adfuller(ts["sales"])
adf_statistic, adf_pvalue, adf_critical_values = adf_result[0], adf_result[1], adf_result[4]
print(f"ADF Statistic: {adf_statistic:.4f}")
print(f"p-value: {adf_pvalue:.4f}")
for level in adf_critical_values:
    print(f"Critical Value ({level}): {adf_critical_values[level]:.4f}")

## Prepare Panel Data

- Concatenate train and test to create a full panel  
- Sort by unique_id and date  
- Merge store metadata  
- Generate basic date features (day of week, month, year, week of year)  
- Label encode categorical features

In [None]:
# Test rows have no target; NaN in `sales` is what later separates them from train rows
test["sales"] = np.nan

# Stack both splits into one panel ordered by series and then by date, so that
# per-series shifts and rolling windows see rows in chronological order
df_all = (
    pd.concat([train, test], sort=False)
    .sort_values(["unique_id", "date"])
    .reset_index(drop=True)
)

# Merge store metadata (left join keeps the panel's row order)
df_all = df_all.merge(stores, on="store_nbr", how="left")

# Basic date features
df_all["dow"] = df_all["date"].dt.dayofweek
df_all["day"] = df_all["date"].dt.day
df_all["month"] = df_all["date"].dt.month
df_all["year"] = df_all["date"].dt.year
df_all["weekofyear"] = df_all["date"].dt.isocalendar().week.astype(int)

# Label encode categorical columns
for c in ["family","city","state","type","store_cluster"]:
    raw_values = df_all[c]
    # Substitute "NA" based on the ORIGINAL nulls: once a column is cast to str,
    # NaN has already become the literal "nan" and a later fillna never fires.
    labels = raw_values.astype(str).where(raw_values.notna(), "NA")
    le = LabelEncoder()
    df_all[c] = le.fit_transform(labels)

## Create Lags, Rolling Windows, EWMA

- Generate lag features for short and long-term dependencies  
- Compute rolling mean and std for multiple windows  
- Compute exponentially weighted moving averages  
- Create lagged promotion features and holiday interactions  
- Fill missing values with median per series or zero

In [None]:
LAGS = [1,2,3,7,14,21,28,56]
ROLL_WINDOWS = [3,7,14,28,56]

# Grouping key reused for every per-series window below.
series_key = df_all["unique_id"]

# Plain lags of the target, shifted inside each series.
# NOTE: test rows have NaN sales, so for a test date more than `lag` days past
# the last training date `lag_{lag}` is NaN and gets the median fill further down.
for lag in LAGS:
    df_all[f"lag_{lag}"] = df_all.groupby("unique_id")["sales"].shift(lag)

# Previous-day sales per series. groupby(...).shift() returns an ordinary Series,
# so it must be grouped AGAIN before rolling/ewm; otherwise the windows run over
# the whole stacked panel and mix the end of one series into the start of the next.
sales_prev = df_all.groupby("unique_id")["sales"].shift(1)
sales_prev_by_series = sales_prev.groupby(series_key)

for w in ROLL_WINDOWS:
    sales_window = sales_prev_by_series.rolling(window=w, min_periods=1)
    # groupby-rolling results carry a (unique_id, row) MultiIndex; dropping level 0
    # restores the row index so the assignment aligns with df_all.
    df_all[f"rmean_{w}"] = sales_window.mean().reset_index(level=0, drop=True)
    df_all[f"rstd_{w}"] = sales_window.std().reset_index(level=0, drop=True)
    df_all[f"ewm_{w}"] = sales_prev_by_series.transform(lambda x: x.ewm(span=w, adjust=False).mean())

# Promotion history (previous day and trailing 7-day mean), per series
promo_prev = df_all.groupby("unique_id")["onpromotion"].shift(1)
df_all["promo_lag_1"] = promo_prev
df_all["promo_roll_7"] = (
    promo_prev.groupby(series_key).rolling(window=7, min_periods=1).mean().reset_index(level=0, drop=True)
)

# Holiday history: flag of the previous day and count over the trailing 7 days, per series
holiday_prev = df_all.groupby("unique_id")["holiday_flag"].shift(1)
df_all["days_to_holiday"] = holiday_prev.fillna(0)
df_all["days_to_holiday_7"] = (
    holiday_prev.groupby(series_key).rolling(7, min_periods=1).sum().reset_index(level=0, drop=True)
)

# Missing-value handling:
#  - features derived from sales fall back to the series' median sales, then 0
#  - promotion / holiday features are not in sales units, so they fall back to 0
group_median = df_all.groupby("unique_id")["sales"].transform("median")
sales_feature_cols = [c for c in df_all.columns if c.startswith(("lag_", "rmean_", "rstd_", "ewm_"))]
exog_feature_cols = [c for c in df_all.columns if c.startswith(("promo_", "days_to_holiday"))]
lag_cols = sales_feature_cols + exog_feature_cols
for col in sales_feature_cols:
    df_all[col] = df_all[col].fillna(group_median).fillna(0.0)
for col in exog_feature_cols:
    df_all[col] = df_all[col].fillna(0.0)

# Oil prices still missing after the forward fill get the overall median price
df_all["oil_price"] = df_all["oil_price"].fillna(df_all["oil_price"].median())

## Train/Validation Split

- Use last `h` days as validation, rest as training  
- Identify categorical features for LightGBM  
- Create LightGBM datasets with categorical handling

In [None]:
# Forecast horizon = number of distinct dates in the test set
h = test["date"].nunique()

# Rows with a target are training history; rows without one are the test horizon
has_target = df_all["sales"].notna()
train_feat = df_all[has_target].copy()
test_feat = df_all[~has_target].copy()

# Everything except identifiers, the date and the target is a model input
non_feature_cols = ["id","date","sales","unique_id","store_nbr"]
features = [f for f in df_all.columns if f not in non_feature_cols]

# The last `h` days of history form the validation window
val_start = train_feat["date"].max() - pd.Timedelta(days=h-1)
before_val = train_feat["date"] < val_start
within_val = train_feat["date"] >= val_start

X_train = train_feat.loc[before_val, features]
y_train = train_feat.loc[before_val, "sales"]

X_val = train_feat.loc[within_val, features]
y_val = train_feat.loc[within_val, "sales"]

# Columns LightGBM should treat as categorical (only those actually used as features)
candidate_categoricals = ["family","city","state","type","store_cluster","dow","month","year","weekofyear"]
cat_features = [c for c in candidate_categoricals if c in features]

lgb_train = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=cat_features,
    free_raw_data=False,
)
lgb_val = lgb.Dataset(
    X_val,
    label=y_val,
    categorical_feature=cat_features,
    reference=lgb_train,
    free_raw_data=False,
)

# LightGBM hyper-parameters
params = dict(
    objective="rmse",
    metric="rmse",
    learning_rate=0.036,
    num_leaves=196,
    min_data_in_leaf=494,
    feature_fraction=0.605,
    bagging_freq=1,
    bagging_fraction=0.606,
    seed=2025,
    verbosity=-1,
    n_jobs=-1,
)

## Train LightGBM

Train a LightGBM model with early stopping on validation set.  
Logs RMSE every 50 rounds.

In [None]:
# Stop once the validation metric has not improved for 50 rounds; log every 50 rounds
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=50),
]

model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=["train","valid"],
    callbacks=training_callbacks,
)

## Validation RMSLE

Evaluate model predictions on the validation set using RMSLE (Root Mean Squared Log Error).

In [None]:
# Predict the validation window at the best iteration; floor at 0 so log1p is defined
val_pred = np.clip(model.predict(X_val, num_iteration=model.best_iteration), 0, None)
log_error = np.log1p(val_pred) - np.log1p(y_val)
rmsle = np.sqrt(np.mean(log_error**2))
print(f"Validation RMSLE: {rmsle:.6f}")

## Feature Importance

Visualize the top 20 features according to gain from the LightGBM model.

In [None]:
# Gain-based importance for every feature, largest first
imp = (
    pd.DataFrame({"feature": model.feature_name(), "gain": model.feature_importance("gain")})
    .sort_values("gain", ascending=False)
    .reset_index(drop=True)
)

# Reverse the top 20 so the most important feature ends up at the top of the bar chart
top_features = imp.head(20).iloc[::-1]
plt.figure(figsize=(8,6))
plt.barh(top_features["feature"], top_features["gain"])
plt.title("Top 20 feature importances (gain)")
plt.tight_layout()
plt.show()

## Predict on Test Set and Save Submission

- Predict sales on the test set using the trained model  
- Fill missing predictions with the median sales per series  
- Clip predictions to avoid negative values  
- Save submission CSV

In [None]:
# Predict the test horizon at the best iteration and floor negatives at 0
X_test = test_feat[features].copy()
preds_test = model.predict(X_test, num_iteration=model.best_iteration)
preds_test = np.clip(preds_test, 0, None)

test_feat["pred_sales"] = preds_test

# Merge back to the original test rows. validate="one_to_one" raises if
# (unique_id, date) is duplicated on either side, instead of silently adding rows.
test_out = test.merge(
    test_feat[["unique_id","date","pred_sales"]],
    on=["unique_id","date"],
    how="left",
    validate="one_to_one",
)

# Fallback for rows without a prediction: the series' median training sales,
# or 0.0 for a series that is absent from train. Vectorised map/fillna replaces
# the row-by-row DataFrame.apply.
median_sales = train.groupby("unique_id")["sales"].median().to_dict()
fallback_sales = test_out["unique_id"].map(median_sales).fillna(0.0)
test_out["sales"] = test_out["pred_sales"].fillna(fallback_sales)
test_out["sales"] = test_out["sales"].clip(lower=0.0)

submission = test_out[["id","sales"]].copy()
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv with", submission.shape[0], "rows")

## Plot Example Forecast

- Plot historical sales and predicted sales for the same example store-family combination  
- Seeing both on one axis makes it easier to judge forecast quality at a glance

In [None]:
# Same example as in the exploratory section: the series with the largest total sales
example_uid = train.groupby("unique_id")["sales"].sum().idxmax()

is_example_train = train["unique_id"] == example_uid
is_example_test = test_out["unique_id"] == example_uid
hist = train[is_example_train].sort_values("date")
preds_example = test_out[is_example_test].sort_values("date")

# History and test-period predictions on one axis
plt.figure(figsize=(12,5))
plt.plot(hist["date"], hist["sales"], label="History", color="tab:blue")
plt.plot(preds_example["date"], preds_example["sales"], label="Predicted (test)", color="tab:orange")
plt.title(f"History & predictions for {example_uid}")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()