# Walmart Sales Forecasting System

This notebook implements a comprehensive sales forecasting system based on research paper methodologies.

## Overview
- **Dataset**: Walmart Store Sales Forecasting (Kaggle)
- **Goal**: Compare baseline models vs research-style models
- **Evaluation**: RMSE, MAE, MAPE

## Step 1: Load Dataset

In [None]:
import sys
import os
sys.path.append('../')

from src.data_loader import load_dataset
import pandas as pd
import numpy as np

# Load dataset
# load_dataset() is the project loader; it is unpacked into four objects here.
# Only train_df and test_df are inspected in this cell.
train_df, test_df, features_df, stores_df = load_dataset()

# Quick sanity check of what was loaded (shapes and the training columns).
print(f"\nTrain dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")
print(f"\nTrain columns: {list(train_df.columns)}")

## Step 2: Exploratory Data Analysis

In [None]:
from src.eda import generate_eda_report, check_missing_values, detect_outliers, plot_sales_trends

# Generate comprehensive EDA report for the training frame; '../results' is
# passed as the output directory.
# NOTE(review): check_missing_values, detect_outliers and plot_sales_trends are
# imported in this cell but never called here — presumably generate_eda_report
# covers them; confirm, or drop the unused imports.
generate_eda_report(train_df, save_dir='../results')

## Step 3: Feature Engineering

In [None]:
from src.feature_engineering import create_all_features, get_feature_columns

# Create all features
# A copy of train_df is passed so the raw training frame is not the object
# handed to the feature builder.
# NOTE(review): lags / rolling_windows are presumably expressed in weeks (the
# target column is Weekly_Sales) — confirm against create_all_features.
train_df_featured = create_all_features(
    train_df.copy(),
    target_col='Weekly_Sales',
    lags=[1, 2, 4, 12],
    rolling_windows=[4, 8, 12]
)

# Drop rows whose target is missing and renumber the index.
# NOTE(review): this filters on 'Weekly_Sales' only. Rows in which lag/rolling
# feature columns are NaN are NOT removed by this call. Confirm that the
# downstream models tolerate those NaNs, or extend `subset` to the lag columns.
train_df_featured = train_df_featured.dropna(subset=['Weekly_Sales']).reset_index(drop=True)

print(f"\nDataset shape after feature engineering: {train_df_featured.shape}")
print(f"\nFeature columns: {len(get_feature_columns(train_df_featured))}")

## Step 4: Time-Based Train-Test Split

In [None]:
from src.utils import time_based_split, prepare_ml_data
from src.feature_engineering import get_feature_columns

# Split the engineered frame on the 'Date' column with train_ratio=0.8.
# NOTE(review): presumably the earliest 80% of dates end up in train_data and
# the remainder in val_data — confirm in src.utils.time_based_split.
train_data, val_data = time_based_split(
    train_df_featured,
    date_col='Date',
    train_ratio=0.8,
)

# The feature list is derived once from the training part and reused for the
# validation part, so both matrices are built from the same columns.
feature_cols = get_feature_columns(train_data)
X_train, y_train = prepare_ml_data(train_data, feature_cols=feature_cols)
X_val, y_val = prepare_ml_data(val_data, feature_cols=feature_cols)

print(f"\nTraining features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")

## Step 5: Baseline Models

In [None]:
from src.models import NaiveForecast, MovingAverage
from src.evaluation import evaluate_model

# Baseline models: both are fitted on train_data and scored on val_data
# against y_val. evaluate_model returns a mapping that is indexed below with
# the keys 'RMSE', 'MAE' and 'MAPE'.
# naive_pred / ma_pred and the *_results names are reused by the comparison
# and visualization cells further down.

# Naive Forecast
print("\n" + "="*60)
print("Training Baseline Models")
print("="*60)

naive_model = NaiveForecast()
naive_model.fit(train_data, target_col='Weekly_Sales')
naive_pred = naive_model.predict(val_data)
naive_results = evaluate_model(y_val, naive_pred, "Naive Forecast")
print(f"\nNaive Forecast - RMSE: {naive_results['RMSE']:.2f}, MAE: {naive_results['MAE']:.2f}, MAPE: {naive_results['MAPE']:.2f}%")

# Moving Average (window=4; labelled "4 weeks" in the results)
ma_model = MovingAverage(window=4)
ma_model.fit(train_data, target_col='Weekly_Sales')
ma_pred = ma_model.predict(val_data)
ma_results = evaluate_model(y_val, ma_pred, "Moving Average (4 weeks)")
print(f"Moving Average - RMSE: {ma_results['RMSE']:.2f}, MAE: {ma_results['MAE']:.2f}, MAPE: {ma_results['MAPE']:.2f}%")

## Step 6: Research-Style Models

### 6.1 Statistical Model: SARIMA

In [None]:
from src.models import SARIMAModel

# Note: SARIMA can be slow for many time series
# We'll use a sample for demonstration
print("\n" + "="*60)
print("Training SARIMA Model")
print("="*60)
print("Note: This may take a while. Using sample data for speed...")

# Use sample stores/depts for faster training: the first 5 store ids in order
# of appearance in train_data. sample_train / sample_val are reused by the
# Prophet cell so both statistical models see the same subset.
sample_stores = train_data['Store'].unique()[:5]
sample_train = train_data[train_data['Store'].isin(sample_stores)].copy()
sample_val = val_data[val_data['Store'].isin(sample_stores)].copy()

sarima_model = SARIMAModel(order=(1, 1, 1), seasonal_order=(1, 1, 1, 52))
sarima_model.fit(sample_train, target_col='Weekly_Sales')

# Predict only when there is something to predict. Previously predict() ran
# before this check, so an empty validation sample could fail before the
# fallback branch was ever reached.
if not sample_val.empty:
    sarima_pred = sarima_model.predict(sample_val)
    sarima_y_val = sample_val['Weekly_Sales'].values
    sarima_results = evaluate_model(sarima_y_val, sarima_pred, "SARIMA")
    print(f"\nSARIMA - RMSE: {sarima_results['RMSE']:.2f}, MAE: {sarima_results['MAE']:.2f}, MAPE: {sarima_results['MAPE']:.2f}%")
else:
    # Keep the names defined so later cells can reference them safely, and
    # record infinite errors so the model still appears in the comparison.
    sarima_pred = np.array([])
    sarima_y_val = np.array([])
    sarima_results = {'Model': 'SARIMA', 'RMSE': np.inf, 'MAE': np.inf, 'MAPE': np.inf}

### 6.2 Statistical Model: Prophet

In [None]:
from src.models import ProphetModel

print("\n" + "="*60)
print("Training Prophet Model")
print("="*60)
print("Note: This may take a while. Using sample data for speed...")

# Use same sample for consistency (sample_train / sample_val come from the
# SARIMA cell).
# NOTE(review): the target is Weekly_Sales, so the series is presumably sampled
# once per week; a weekly seasonality component may not be identifiable on such
# data — confirm whether weekly_seasonality=True is intended.
prophet_model = ProphetModel(yearly_seasonality=True, weekly_seasonality=True)
prophet_model.fit(sample_train, target_col='Weekly_Sales', date_col='Date')

# Predict only when there is something to predict. Previously predict() ran
# before this check, so an empty validation sample could fail before the
# fallback branch was ever reached.
if not sample_val.empty:
    prophet_pred = prophet_model.predict(sample_val, date_col='Date')
    prophet_y_val = sample_val['Weekly_Sales'].values
    prophet_results = evaluate_model(prophet_y_val, prophet_pred, "Prophet")
    print(f"\nProphet - RMSE: {prophet_results['RMSE']:.2f}, MAE: {prophet_results['MAE']:.2f}, MAPE: {prophet_results['MAPE']:.2f}%")
else:
    # Keep the names defined so later cells can reference them safely, and
    # record infinite errors so the model still appears in the comparison.
    prophet_pred = np.array([])
    prophet_y_val = np.array([])
    prophet_results = {'Model': 'Prophet', 'RMSE': np.inf, 'MAE': np.inf, 'MAPE': np.inf}

### 6.3 Machine Learning Model: LightGBM

In [None]:
from src.models import LightGBMModel

print("\n" + "="*60)
print("Training LightGBM Model")
print("="*60)

# Fit on the training matrix; the validation matrix is also handed to fit().
# NOTE(review): X_val / y_val are presumably used for early stopping inside
# LightGBMModel.fit. If so, the metrics below are computed on the same data
# that guided training and may be optimistic — confirm.
lgb_model = LightGBMModel()
lgb_model.fit(X_train, y_train, X_val, y_val)
lgb_pred = lgb_model.predict(X_val)
lgb_results = evaluate_model(y_val, lgb_pred, "LightGBM")
print(f"\nLightGBM - RMSE: {lgb_results['RMSE']:.2f}, MAE: {lgb_results['MAE']:.2f}, MAPE: {lgb_results['MAPE']:.2f}%")

### 6.4 Machine Learning Model: XGBoost

In [None]:
from src.models import XGBoostModel

print("\n" + "="*60)
print("Training XGBoost Model")
print("="*60)

# Fit on the training matrix; the validation matrix is also handed to fit().
# NOTE(review): X_val / y_val are presumably used for early stopping inside
# XGBoostModel.fit. If so, the metrics below are computed on the same data
# that guided training and may be optimistic — confirm.
xgb_model = XGBoostModel()
xgb_model.fit(X_train, y_train, X_val, y_val)
xgb_pred = xgb_model.predict(X_val)
xgb_results = evaluate_model(y_val, xgb_pred, "XGBoost")
print(f"\nXGBoost - RMSE: {xgb_results['RMSE']:.2f}, MAE: {xgb_results['MAE']:.2f}, MAPE: {xgb_results['MAPE']:.2f}%")

### 6.5 Deep Learning Model: LSTM

In [None]:
from src.models import LSTMModel

print("\n" + "="*60)
print("Training LSTM Model")
print("="*60)

# LSTM requires special handling: unlike the tree models it is given the
# data frames plus the feature column list rather than the X/y matrices.
# NOTE(review): predictions are scored against the full y_val. If
# LSTMModel.predict needs sequence_length (12) prior rows to build each input
# window, lstm_pred could be shorter than y_val — confirm the lengths match.
lstm_model = LSTMModel(sequence_length=12, units=50, epochs=20, batch_size=32)
lstm_model.fit(train_data, target_col='Weekly_Sales', feature_cols=feature_cols)
lstm_pred = lstm_model.predict(val_data, feature_cols=feature_cols)
lstm_results = evaluate_model(y_val, lstm_pred, "LSTM")
print(f"\nLSTM - RMSE: {lstm_results['RMSE']:.2f}, MAE: {lstm_results['MAE']:.2f}, MAPE: {lstm_results['MAPE']:.2f}%")

## Step 7: Model Comparison

In [None]:
from src.evaluation import compare_models, plot_predictions, plot_all_predictions

# Collect all results (one metrics mapping per model, baselines first)
# NOTE(review): sarima_results and prophet_results were computed on the
# sampled-store validation subset (sample_val), while the other entries were
# computed on the full validation target (y_val), so the rows are not scored
# on the same data.
all_results = [
    naive_results,
    ma_results,
    sarima_results,
    prophet_results,
    lgb_results,
    xgb_results,
    lstm_results
]

# Compare models; the CSV path for the comparison is passed as save_path.
comparison_df = compare_models(all_results, save_path='../results/model_comparison.csv')

## Step 8: Visualization

In [None]:
# Assemble the {model name: {'y_true', 'y_pred'}} mapping consumed by
# plot_all_predictions. The models below were all scored against the full
# validation target y_val.
# NOTE(review): lstm_pred is not included here even though LSTM appears in the
# model comparison — confirm whether that omission is intentional.
predictions_dict = {
    model_name: {'y_true': y_val, 'y_pred': model_pred}
    for model_name, model_pred in (
        ('Naive Forecast', naive_pred),
        ('Moving Average', ma_pred),
        ('LightGBM', lgb_pred),
        ('XGBoost', xgb_pred),
    )
}

# SARIMA and Prophet were evaluated on the sampled-store subset, so they carry
# their own ground-truth arrays and are only added when that subset is non-empty.
if len(sample_val) > 0:
    predictions_dict.update({
        'SARIMA': {'y_true': sarima_y_val, 'y_pred': sarima_pred},
        'Prophet': {'y_true': prophet_y_val, 'y_pred': prophet_pred},
    })

plot_all_predictions(predictions_dict, save_path='../results/all_predictions.png')

# Stand-alone figures for the two gradient-boosting models.
plot_predictions(y_val, lgb_pred, 'LightGBM', save_path='../results/lightgbm_predictions.png')
plot_predictions(y_val, xgb_pred, 'XGBoost', save_path='../results/xgboost_predictions.png')

## Summary

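The forecasting pipeline is now complete. The model comparison table and the prediction plots are saved in the `results/` directory. Note that SARIMA and Prophet were trained and evaluated on a five-store sample, so their metrics are not directly comparable with those of the models evaluated on the full validation set.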