In [12]:
# Prophet Model Training for Indian Retail Sales Forecasting
# This script trains and saves ONLY the Prophet model to models/prophet_model.pkl

import pandas as pd
import numpy as np
import pickle
import os
import warnings
from datetime import datetime, timedelta

# Silence every warning category for the rest of the process so the training
# log below stays readable.
# NOTE(review): this is a blanket filter, so deprecation and data-quality
# warnings raised by any imported library are hidden as well.
warnings.filterwarnings('ignore')

# Import Prophet, installing it on first use when it is missing.
try:
    from prophet import Prophet
except ImportError:
    import subprocess
    import sys

    print("Prophet not installed. Installing...")
    # Invoke pip through the running interpreter so the package is installed
    # into the environment this script actually imports from, and let
    # check_call raise CalledProcessError if the installation fails instead
    # of silently continuing to a second, less informative ImportError.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'prophet'])
    from prophet import Prophet

def load_data(filepath='../data/cleaned_data.csv'):
    """Load the cleaned retail sales CSV and parse its ``Date`` column.

    Args:
        filepath: Path of the CSV file to read. Defaults to the location the
            script has always used, so existing ``load_data()`` calls behave
            as before.

    Returns:
        The loaded DataFrame with ``Date`` converted to datetime, or ``None``
        when the file is missing or cannot be loaded (the reason is printed).
    """
    try:
        df = pd.read_csv(filepath)
        print(f"Loaded {os.path.basename(filepath)} successfully. Shape: {df.shape}")

        # Convert Date column to datetime; a missing column or unparsable
        # value falls through to the generic handler below.
        df['Date'] = pd.to_datetime(df['Date'])

        return df
    except FileNotFoundError:
        # Report the path that was actually tried (resolved against the
        # current working directory) rather than a hard-coded guess.
        print(f"Error: data file not found: {filepath}")
        print(f"Please ensure the file exists at: {os.path.abspath(filepath)}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: callers only check for None.
        print(f"Error loading data: {e}")
        return None

def prepare_prophet_data(df):
    """Build the two-column (``ds``, ``y``) frame that Prophet is fitted on.

    When a ``Store`` column is present, sales are summed per date across all
    stores; otherwise the ``Date``/``Sales`` columns are taken as they are.
    Rows with missing values are dropped, one row per date is kept, and the
    result is sorted by date with a clean 0..n-1 index.

    Args:
        df: DataFrame containing at least ``Date`` and ``Sales`` columns.

    Returns:
        DataFrame with columns ``ds`` (datetime) and ``y`` (absolute sales),
        or ``None`` when a required column is missing.
    """
    print("Preparing data for Prophet model...")

    # Check if we have the required columns
    required_cols = ['Date', 'Sales']
    if any(col not in df.columns for col in required_cols):
        print(f"Missing required columns. Available columns: {df.columns.tolist()}")
        return None

    # Aggregate daily sales across all stores if Store column exists
    if 'Store' in df.columns:
        print("Aggregating sales across all stores...")
        daily_sales = df.groupby('Date')['Sales'].sum().reset_index()
        print(f"Total stores in dataset: {df['Store'].nunique()}")
    else:
        # If no Store column, assume data is already aggregated
        daily_sales = df[['Date', 'Sales']].copy()

    # Rename columns for Prophet (requires 'ds' and 'y')
    daily_sales.columns = ['ds', 'y']

    # Ensure date column is datetime
    daily_sales['ds'] = pd.to_datetime(daily_sales['ds'])

    # Drop incomplete rows BEFORE de-duplicating so a row with a missing value
    # cannot shadow a valid observation for the same date. The stable sort
    # makes "first row per date" mean first in input order, and the index is
    # reset last so the returned frame has no gaps.
    daily_sales = (
        daily_sales.dropna()
        .sort_values('ds', kind='mergesort')
        .drop_duplicates(subset=['ds'])
        .reset_index(drop=True)
    )

    # Ensure positive sales values
    # NOTE(review): abs() turns negative totals into positive ones rather than
    # discarding them - confirm this is the intended treatment.
    daily_sales['y'] = daily_sales['y'].abs()

    print(f"Prophet data prepared. Shape: {daily_sales.shape}")
    print(f"Date range: {daily_sales['ds'].min()} to {daily_sales['ds'].max()}")
    print(f"Sales range: {daily_sales['y'].min():,.2f} to {daily_sales['y'].max():,.2f}")
    print(f"Average daily sales: {daily_sales['y'].mean():,.2f}")

    return daily_sales

def create_custom_holidays():
    """Return the hand-maintained festival calendar as a Prophet holidays frame.

    Returns:
        DataFrame with columns ``holiday``, ``ds``, ``lower_window`` and
        ``upper_window``: Diwali rows first, then Holi, then Eid.
    """
    # One entry per festival: (name, dates, lower_window, upper_window).
    # NOTE(review): every date listed here falls in 2022-2023; confirm they
    # overlap the period of the data the model is trained on.
    festival_specs = (
        ('diwali', ['2022-10-24', '2023-11-12'], -5, 5),
        ('holi', ['2022-03-18', '2023-03-08'], -2, 2),
        ('eid', ['2022-05-03', '2022-07-10', '2023-04-22', '2023-06-29'], -2, 2),
    )

    # The scalar name/window values are broadcast across each festival's dates.
    festival_frames = [
        pd.DataFrame({
            'holiday': name,
            'ds': pd.to_datetime(dates),
            'lower_window': days_before,
            'upper_window': days_after,
        })
        for name, dates, days_before, days_after in festival_specs
    ]

    return pd.concat(festival_frames, ignore_index=True)

def train_prophet_model(data):
    """Configure a Prophet model and fit it on ``data``.

    The model uses linear growth with additive yearly and weekly seasonality,
    the custom festival calendar from ``create_custom_holidays()``, the
    built-in country holidays for 'IN', and two extra seasonal components
    ('monthly' and 'quarterly').

    Args:
        data: Training frame handed straight to ``Prophet.fit``.

    Returns:
        The fitted Prophet model.
    """
    print("Training Prophet model for Indian retail market...")

    prophet_config = {
        'growth': 'linear',
        'yearly_seasonality': True,
        'weekly_seasonality': True,
        'daily_seasonality': False,
        'seasonality_mode': 'additive',
        'interval_width': 0.95,
        # Prior scales: trend changepoints, seasonality, holidays.
        'changepoint_prior_scale': 0.05,
        'seasonality_prior_scale': 10.0,
        'holidays_prior_scale': 10.0,
        'mcmc_samples': 0,
        'holidays': create_custom_holidays(),
    }
    forecaster = Prophet(**prophet_config)

    # Built-in country holidays are added on top of the custom calendar.
    forecaster.add_country_holidays(country_name='IN')

    # Extra seasonal components: (name, period in days, Fourier order).
    extra_seasonalities = (
        ('monthly', 30.5, 5),
        ('quarterly', 91.25, 8),
    )
    for label, period_days, order in extra_seasonalities:
        forecaster.add_seasonality(name=label, period=period_days, fourier_order=order)

    forecaster.fit(data)
    return forecaster

def save_model(model, filepath):
    """Pickle ``model`` to ``filepath``, creating parent directories if needed.

    Args:
        model: The object to serialize (the trained Prophet model).
        filepath: Destination file. May be a bare filename, in which case the
            file is written to the current working directory.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    # os.path.dirname returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when there is one.
    target_dir = os.path.dirname(filepath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # NOTE(review): pickle files are tied to the library versions that wrote
    # them and must never be loaded from untrusted sources; Prophet also
    # offers JSON serialization, which may be a more portable format.
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)

    print(f"Prophet model saved to: {filepath}")

def main():
    """Run the full pipeline: load data, prepare it, train Prophet, save it.

    Each stage prints its progress. The function returns early (with a
    message) when the data cannot be loaded, cannot be prepared, or is too
    small to fit a model. On success the model is pickled to
    ``../models/prophet_model.pkl``.
    """
    print("=== Prophet Model Training for Indian Retail Sales ===\n")

    # Load data
    df = load_data()
    if df is None:
        print("Failed to load data. Exiting...")
        return

    print(f"Data loaded successfully. Shape: {df.shape}")
    print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
    if 'Store' in df.columns:
        print(f"Number of stores: {df['Store'].nunique()}")
    # A missing Sales column is reported properly by prepare_prophet_data
    # below; do not crash with a KeyError in this summary before that check.
    if 'Sales' in df.columns:
        print(f"Total sales: {df['Sales'].sum():,.2f}\n")
    else:
        print()

    # Prepare data for Prophet
    prophet_data = prepare_prophet_data(df)
    if prophet_data is None:
        print("Failed to prepare data for Prophet. Exiting...")
        return

    print(f"Prophet data prepared. Shape: {prophet_data.shape}")
    print(f"Daily sales range: {prophet_data['y'].min():,.2f} to {prophet_data['y'].max():,.2f}\n")

    # Prophet refuses to fit a frame with fewer than two usable rows, so stop
    # with a clear message instead of an exception from inside the library.
    if len(prophet_data) < 2:
        print("Error: At least 2 days of data are required to train Prophet. Exiting...")
        return

    # Check if we have enough data points
    if len(prophet_data) < 30:
        print("Warning: Dataset has less than 30 days of data. Prophet may not perform well.")

    # Train Prophet model
    model = train_prophet_model(prophet_data)
    print("Prophet model training completed!\n")

    # Save model to models directory
    model_path = '../models/prophet_model.pkl'
    save_model(model, model_path)

    print("\n=== Prophet Model Training Completed Successfully! ===")
    print(f"Model saved at: {model_path}")
    print("Ready for use in your forecasting application!")

# Run the training pipeline only when this file is executed directly, so that
# importing it (e.g. to reuse the helper functions) has no training side effect.
if __name__ == "__main__":
    main()

=== Prophet Model Training for Indian Retail Sales ===

Loaded cleaned_data.csv successfully. Shape: (1017209, 25)
Data loaded successfully. Shape: (1017209, 25)
Date range: 2013-01-01 00:00:00 to 2015-07-31 00:00:00
Number of stores: 1115
Total sales: 5,873,180,623.00

Preparing data for Prophet model...
Aggregating sales across all stores...
Total stores in dataset: 1115
Prophet data prepared. Shape: (942, 2)
Date range: 2013-01-01 00:00:00 to 2015-07-31 00:00:00
Sales range: 97,235.00 to 15,623,548.00
Average daily sales: 6,234,798.96
Prophet data prepared. Shape: (942, 2)
Daily sales range: 97,235.00 to 15,623,548.00

Training Prophet model for Indian retail market...


20:52:24 - cmdstanpy - INFO - Chain [1] start processing
20:52:24 - cmdstanpy - INFO - Chain [1] done processing


Prophet model training completed!

Prophet model saved to: ../models/prophet_model.pkl

=== Prophet Model Training Completed Successfully! ===
Model saved at: ../models/prophet_model.pkl
Ready for use in your forecasting application!
