In [9]:

"""
XGBoost Model for Retail Sales Forecasting
Trains an XGBoost model and saves it as xgb_model.pkl
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

def create_features(df):
    """Create time-based and retail-specific features"""
    df = df.copy()
    
    # Handle date column - check for different possible date column names
    date_cols = ['Date', 'date', 'DATE']
    date_col = None
    for col in date_cols:
        if col in df.columns:
            date_col = col
            break
    
    if date_col:
        df['Date'] = pd.to_datetime(df[date_col])
        
        # Time features
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['Quarter'] = df['Date'].dt.quarter
        df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)
        
        # Seasonal features
        df['IsFestivalSeason'] = df['Month'].isin([10, 11, 12]).astype(int)
        df['IsSummerSeason'] = df['Month'].isin([3, 4, 5]).astype(int)
        
        # Sort for lag features
        if 'Store' in df.columns:
            df = df.sort_values(['Store', 'Date'])
            
            # Lag features
            for lag in [1, 7, 30]:
                df[f'Sales_Lag_{lag}'] = df.groupby('Store')['Sales'].shift(lag)
            
            # Rolling statistics
            for window in [7, 30]:
                df[f'Sales_Mean_{window}'] = df.groupby('Store')['Sales'].rolling(window, min_periods=1).mean().reset_index(0, drop=True)
        else:
            # Simple lag features without store grouping
            for lag in [1, 7, 30]:
                df[f'Sales_Lag_{lag}'] = df['Sales'].shift(lag)
    
    return df.dropna()

def encode_categoricals(df):
    """Encode categorical variables"""
    encoders = {}
    categorical_cols = ['StoreType', 'Assortment', 'StateHoliday', 'SchoolHoliday']
    
    for col in categorical_cols:
        if col in df.columns:
            le = LabelEncoder()
            df[f'{col}_Encoded'] = le.fit_transform(df[col].astype(str))
            encoders[col] = le
    
    return df, encoders

def main():
    print("🚀 Starting XGBoost Model Training...")
    
    # Load data
    try:
        df = pd.read_csv('../data/cleaned_data.csv')
    except FileNotFoundError:
        print("Error: cleaned_data.csv not found. Please ensure the data file exists.")
        return
    
    print(f"Dataset loaded: {df.shape[0]:,} records, {df.shape[1]} columns")
    
    # Check for required columns
    if 'Sales' not in df.columns:
        print("Error: 'Sales' column not found in dataset")
        return
    
    # Feature engineering
    df_featured = create_features(df)
    print(f"After feature engineering: {df_featured.shape[0]:,} records")
    
    # Encode categorical variables
    df_encoded, encoders = encode_categoricals(df_featured)
    
    # Select features for training
    base_features = ['Month', 'DayOfWeek', 'Quarter', 'IsWeekend', 'IsFestivalSeason', 'IsSummerSeason']
    
    # Add lag and rolling features if they exist
    lag_features = [col for col in df_encoded.columns if col.startswith(('Sales_Lag_', 'Sales_Mean_'))]
    base_features.extend(lag_features)
    
    # Add other numerical features
    optional_features = ['Store', 'Year', 'Promo', 'CompetitionDistance']
    base_features.extend([col for col in optional_features if col in df_encoded.columns])
    
    # Add encoded categorical features
    encoded_features = [col for col in df_encoded.columns if col.endswith('_Encoded')]
    base_features.extend(encoded_features)
    
    # Filter to only available numeric features
    available_features = []
    for col in base_features:
        if col in df_encoded.columns:
            if df_encoded[col].dtype in ['int64', 'float64', 'bool', 'int32', 'float32']:
                available_features.append(col)
    
    print(f"Selected {len(available_features)} features for training")
    
    # Prepare training data
    X = df_encoded[available_features].fillna(0)
    y = df_encoded['Sales']
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
    
    print(f"Training set: {X_train.shape[0]:,} samples")
    print(f"Test set: {X_test.shape[0]:,} samples")
    
    # Train XGBoost model
    print("Training XGBoost model...")
    xgb_model = xgb.XGBRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    
    xgb_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = xgb_model.predict(X_train)
    y_pred_test = xgb_model.predict(X_test)
    
    # Calculate metrics
    def calculate_metrics(y_true, y_pred):
        return {
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'mae': mean_absolute_error(y_true, y_pred),
            'r2': r2_score(y_true, y_pred),
            'mape': np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        }
    
    train_metrics = calculate_metrics(y_train, y_pred_train)
    test_metrics = calculate_metrics(y_test, y_pred_test)
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': available_features,
        'importance': xgb_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Create models directory
    os.makedirs('../models', exist_ok=True)
    
    # Save model (single file as requested)
    model_path = '../models/xgb_model.pkl'
    model_data = {
        'model': xgb_model,
        'features': available_features,
        'encoders': encoders,
        'metrics': {'train': train_metrics, 'test': test_metrics},
        'feature_importance': feature_importance
    }
    
    with open(model_path, 'wb') as f:
        pickle.dump(model_data, f)
    
    # Print results
    print("\n" + "="*50)
    print("    XGBOOST MODEL TRAINING COMPLETE")
    print("="*50)
    print(f"📊 Dataset Size: {df_encoded.shape[0]:,} records")
    print(f"🎯 Features Used: {len(available_features)}")
    print(f"📈 Train R²: {train_metrics['r2']:.4f}")
    print(f"📉 Test R²: {test_metrics['r2']:.4f}")
    print(f"💰 Test RMSE: ₹{test_metrics['rmse']:,.0f}")
    print(f"📊 Test MAE: ₹{test_metrics['mae']:,.0f}")
    print(f"📋 Test MAPE: {test_metrics['mape']:.2f}%")
    print("="*50)
    print(f"✅ Model saved to: {model_path}")
    print("\n🔍 Top 5 Important Features:")
    for i, (_, row) in enumerate(feature_importance.head(5).iterrows(), 1):
        print(f"  {i}. {row['feature']}: {row['importance']:.4f}")
    
    # Test model loading
    print("\n🧪 Testing model loading...")
    with open(model_path, 'rb') as f:
        loaded_data = pickle.load(f)
    
    loaded_model = loaded_data['model']
    test_pred = loaded_model.predict(X_test[:5])
    original_pred = xgb_model.predict(X_test[:5])
    
    if np.allclose(test_pred, original_pred):
        print("✅ Model loading test successful!")
    else:
        print("❌ Model loading test failed!")
    
    print("\n🚀 Model ready for deployment!")

if __name__ == "__main__":
    main()

🚀 Starting XGBoost Model Training...
Dataset loaded: 1,017,209 records, 25 columns
After feature engineering: 983,759 records
Selected 19 features for training
Training set: 787,007 samples
Test set: 196,752 samples
Training XGBoost model...

    XGBOOST MODEL TRAINING COMPLETE
📊 Dataset Size: 983,759 records
🎯 Features Used: 19
📈 Train R²: 0.9677
📉 Test R²: 0.9635
💰 Test RMSE: ₹735
📊 Test MAE: ₹471
📋 Test MAPE: inf%
✅ Model saved to: ../models/xgb_model.pkl

🔍 Top 5 Important Features:
  1. IsWeekend: 0.3152
  2. StateHoliday_Encoded: 0.2890
  3. DayOfWeek: 0.1087
  4. Sales_Mean_7: 0.0786
  5. Promo: 0.0761

🧪 Testing model loading...
✅ Model loading test successful!

🚀 Model ready for deployment!
