# Model Training with Shop Assortment Features

This notebook implements Phase 2 of the plan outlined in `SHOP_ASSORTMENT_FEATURE_ENGINEERING_PLAN.md`.

**Objective:** Retrain the XGBoost model using the augmented dataset that includes new shop-level and product-relative-to-shop features, and evaluate its performance.

## 1. Setup and Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import pickle
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

import warnings
# Silence every Python warning for the rest of the session so that library
# notices do not clutter cell output.
# NOTE(review): a blanket 'ignore' also hides deprecation notices about
# arguments used later in this notebook — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion.
print("Libraries imported successfully")

## 2. Load Augmented Data

Load the `agg_data_with_shop_features.csv` file created by the `shop_assortment_feature_engineering.ipynb` notebook.

In [None]:
# Location of the augmented dataset produced by the feature engineering notebook.
data_path = "../src/data/processed/agg_data_with_shop_features.csv"
print(f"[DATA] Loading augmented data from: {data_path}...")

try:
    augmented_data = pd.read_csv(data_path)
except FileNotFoundError:
    # Fall back to an empty frame so that later cells can detect the failure and skip.
    print(f"ERROR: {data_path} not found. Please ensure the feature engineering notebook was run successfully.")
    augmented_data = pd.DataFrame()
else:
    # Quick overview of what was loaded: row count, column names, preview, shape.
    record_count = len(augmented_data)
    column_names = list(augmented_data.columns)
    print(f"Loaded {record_count} records.")
    print(f"Columns: {column_names}")
    display(augmented_data.head())
    print(f"Shape: {augmented_data.shape}")

## 3. Feature Preparation & Encoding

Prepare features (X) and target (y, y_log). This includes identifying all categorical columns (original + new) and applying Label Encoding.

In [None]:
if augmented_data.empty:
    # Nothing was loaded: define empty placeholders so later cells can skip cleanly.
    print("Skipping Feature Preparation as augmented_data is empty.")
    X = pd.DataFrame()
    y_log = pd.Series()
    y_strata = pd.Series()
else:
    print("[FEATURES] Preparing features and targets...")

    # Target: raw selection counts plus a log1p-transformed copy used for training.
    y = augmented_data['selection_count']
    y_log = np.log1p(y)

    # Feature matrix: every column except the target, copied so the loaded frame is untouched.
    X = augmented_data.drop(columns=['selection_count']).copy()

    # The original 11 grouping columns, all treated as categorical.
    original_grouping_cols = [
        'employee_shop', 'employee_branch', 'employee_gender',
        'product_main_category', 'product_sub_category', 'product_brand',
        'product_color', 'product_durability', 'product_target_gender',
        'product_utility_type', 'product_type'
    ]

    # Shop-level categorical columns are only included when they exist in the data.
    new_categorical_features = [
        candidate
        for candidate in (
            'shop_most_frequent_main_category_selected',
            'shop_most_frequent_brand_selected',
        )
        if candidate in X.columns
    ]

    all_categorical_cols = list(set(original_grouping_cols + new_categorical_features))

    print("\n[ENCODE] Label encoding categorical features...")
    label_encoders = {}
    for feature in X.columns:
        column = X[feature]
        if feature in all_categorical_cols or column.dtype == 'object':
            # Listed categoricals and any object-typed column are cast to str and label encoded.
            print(f"  Encoding: {feature}")
            encoder = LabelEncoder()
            X[feature] = encoder.fit_transform(column.astype(str))
            label_encoders[feature] = encoder
        elif not pd.api.types.is_numeric_dtype(column):
            # Neither categorical nor numeric: left as-is and reported.
            print(f"  Skipping encoding for non-categorical or already numeric: {feature} (dtype: {column.dtype})")
        elif column.isnull().any():
            # Numeric column with gaps: impute with the column median.
            print(f"  Filling NaNs in numeric column: {feature}")
            X[feature] = column.fillna(column.median())

    # Anything still missing after the per-column pass is zero-filled.
    if X.isnull().values.any():
        print("\n[NAN_FILL] Filling remaining NaNs in X with 0 (median or mean might be better for some features)... ")
        X = X.fillna(0)

    # Bin the raw target into five strata for stratified splitting.
    strata_edges = [0, 1, 2, 5, 10, np.inf]
    y_strata = pd.cut(y, bins=strata_edges, labels=[0, 1, 2, 3, 4], include_lowest=True)

    strata_counts = y_strata.value_counts().sort_index().to_dict()
    print(f"\nFeatures shape: {X.shape}")
    print(f"Target shapes: Original={y.shape}, Log={y_log.shape}")
    print(f"Stratification distribution:\n{strata_counts}")
    display(X.head())
    print(f"\nSample-to-feature ratio: {len(X) / X.shape[1]:.1f}:1")

## 4. Model Training and Evaluation

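Train the XGBoost model on the log-transformed target using the configuration carried over from `breakthrough_training.ipynb`, evaluate it with 5-fold stratified cross-validation, and then check it on a 20% hold-out validation split.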

In [None]:
if not X.empty:
    # Baseline R² on the log-transformed target that this run is compared against.
    BASELINE_CV_R2 = 0.2947

    # Optimal XGBoost configuration (from breakthrough_training.ipynb)
    model = XGBRegressor(
        n_estimators=1000,
        max_depth=6,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0.3,
        reg_lambda=0.3,
        gamma=0.1,
        min_child_weight=8,
        random_state=42,
        n_jobs=-1
    )

    print("\n[TRAIN] Training XGBoost model with augmented features on log-transformed target...")

    # Stratified Cross-Validation
    cv_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Rows whose stratum is NaN cannot be stratified. Restrict every array to the
    # rows with a valid stratum so that CV and the hold-out split use the same data.
    valid_indices = y_strata.dropna().index
    X_cv = X.loc[valid_indices]
    y_cv = y.loc[valid_indices]
    y_log_cv = y_log.loc[valid_indices]
    y_strata_cv = y_strata.loc[valid_indices]

    if len(X_cv) > 0:
        # Folds are stratified on the binned target while the model is scored on the log target.
        cv_scores = cross_val_score(model, X_cv, y_log_cv, cv=cv_stratified.split(X_cv, y_strata_cv), scoring='r2')
        r2_cv_mean = cv_scores.mean()
        r2_cv_std = cv_scores.std()
        print(f"Stratified CV R² (on y_log): {r2_cv_mean:.4f} ± {r2_cv_std:.4f}")
    else:
        print("Not enough data for CV after handling NaNs in strata.")
        r2_cv_mean, r2_cv_std = 0, 0

    # Hold out 20% for validation and fit on the remaining 80%. The split uses the
    # same NaN-filtered rows as CV; passing the unfiltered strata to `stratify`
    # would fail whenever a stratum is missing.
    X_train, X_val, y_log_train, y_log_val, y_train, y_val = train_test_split(
        X_cv, y_log_cv, y_cv, test_size=0.2, random_state=42, stratify=y_strata_cv
    )

    model.fit(X_train, y_log_train)

    # Predictions on validation set (log scale)
    y_log_pred_val = model.predict(X_val)
    r2_val_log = r2_score(y_log_val, y_log_pred_val)
    print(f"Validation R² (on y_log): {r2_val_log:.4f}")

    # Overfitting check
    # NOTE(review): this is the gap between two out-of-sample estimates (hold-out
    # vs. CV mean), not a train-vs-validation gap — confirm this is the intended metric.
    overfitting = r2_val_log - r2_cv_mean
    print(f"Overfitting (Validation R² - CV R²): {overfitting:+.4f}")

    # Back-transform predictions to the original scale (for MAE, RMSE) and clip at zero.
    y_pred_val_original_scale = np.expm1(y_log_pred_val)
    y_pred_val_original_scale = np.maximum(0, y_pred_val_original_scale)

    mae_original = mean_absolute_error(y_val, y_pred_val_original_scale)
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, because that
    # keyword is no longer accepted by recent scikit-learn releases.
    rmse_original = np.sqrt(mean_squared_error(y_val, y_pred_val_original_scale))
    r2_original_val = r2_score(y_val, y_pred_val_original_scale) # R2 on original scale for reference

    print(f"\nMetrics on Original Scale (Validation Set):")
    print(f"  MAE: {mae_original:.4f}")
    print(f"  RMSE: {rmse_original:.4f}")
    print(f"  R² (original scale): {r2_original_val:.4f}")

    print(f"\n--- Compare with Baseline R² ({BASELINE_CV_R2:.4f} on log-transformed target) ---")
    improvement = r2_cv_mean - BASELINE_CV_R2
    print(f"Improvement in CV R² (log target) over baseline: {improvement:+.4f}")
else:
    print("Skipping Model Training as X is empty.")

## 5. Feature Importance Analysis (New Model)

In [None]:
if not X.empty and 'model' in locals():
    print("\n[ANALYSIS] Feature Importance for Augmented Model")

    # Importances come from the model fitted on the training split in the
    # previous cell; it is not refit on the full dataset here.
    feature_importance = model.feature_importances_
    feature_names = X.columns # Use columns from the prepared X

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)

    print("\nFeature Importance Ranking (New Model):")
    # Number rows by their position in the sorted frame. After sort_values the
    # frame's index still holds each feature's original column position, so it
    # cannot be used as the rank.
    for rank, row in enumerate(importance_df.itertuples(index=False), start=1):
        print(f"  {rank:2d}. {row.feature:<45} {row.importance:.4f}")

    # Figure height grows with the number of features so labels stay readable.
    plt.figure(figsize=(12, max(8, len(importance_df) * 0.3) ))
    sns.barplot(x='importance', y='feature', data=importance_df, palette='viridis')
    plt.title('XGBoost Feature Importance - Augmented Model')
    plt.tight_layout()
    plt.show()

    top_features = importance_df.head(10)
    print(f"\n[TOP 10] Most Important Features (New Model):")
    for row in top_features.itertuples(index=False):
        print(f"   {row.feature:<45} {row.importance:.4f}")
else:
    print("Skipping Feature Importance as X is empty or model not trained.")

## 6. Save New Model and Encoders (Optional)

If the new model's stratified-CV R² exceeds the save threshold (0.30 by default), save it along with its label encoders and a metadata file.

In [None]:
# Example: Only save if R2_cv_mean > 0.3 (e.g., better than baseline)
save_model_threshold = 0.30 # Adjust as needed

if not X.empty and 'model' in locals() and 'r2_cv_mean' in locals() and r2_cv_mean > save_model_threshold:
    print(f"\n[SAVE] New model performance ({r2_cv_mean:.4f}) exceeds threshold ({save_model_threshold:.4f}). Saving model...")
    
    model_dir = '../models/augmented_model/'
    os.makedirs(model_dir, exist_ok=True)
    
    # Save the trained model (trained on X_train, y_log_train)
    new_model_path = os.path.join(model_dir, 'augmented_xgb_model.pkl')
    joblib.dump(model, new_model_path)
    print(f"[OK] Augmented model saved to: {new_model_path}")
    
    # Save label encoders used for this augmented dataset
    new_encoders_path = os.path.join(model_dir, 'augmented_label_encoders.pkl')
    with open(new_encoders_path, 'wb') as f:
        pickle.dump(label_encoders, f) # label_encoders from cell 3
    print(f"[OK] Augmented label encoders saved to: {new_encoders_path}")
    
    # Save metadata for the new model
    new_metadata = {
        'model_type': 'XGBoost Regressor',
        'description': 'Model trained with shop assortment features',
        'source_data': data_path,
        'target_transformation': 'log1p',
        'cv_methodology': 'Stratified by selection count (5 splits)',
        'performance_log_target': {
            'stratified_cv_r2_mean': r2_cv_mean,
            'stratified_cv_r2_std': r2_cv_std if 'r2_cv_std' in locals() else None,
            'validation_r2': r2_val_log if 'r2_val_log' in locals() else None,
            'overfitting': overfitting if 'overfitting' in locals() else None
        },
        'performance_original_scale_validation': {
            'mae': mae_original if 'mae_original' in locals() else None,
            'rmse': rmse_original if 'rmse_original' in locals() else None,
            'r2': r2_original_val if 'r2_original_val' in locals() else None
        },
        'features': list(X.columns),
        'training_data_size': len(X_train) if 'X_train' in locals() else len(X),
        'feature_importance': dict(zip(importance_df['feature'], importance_df['importance'])) if 'importance_df' in locals() else None
    }
    new_metadata_path = os.path.join(model_dir, 'augmented_model_metadata.pkl')
    with open(new_metadata_path, 'wb') as f:
        pickle.dump(new_metadata, f)
    print(f"[OK] Augmented model metadata saved to: {new_metadata_path}")
else:
    if 'r2_cv_mean' in locals():
        print(f"\n[NO SAVE] New model performance ({r2_cv_mean:.4f}) does not exceed threshold ({save_model_threshold:.4f}). Model not saved.")
    else:
        print("\n[NO SAVE] Model not trained or performance metrics unavailable. Model not saved.")

## Next Steps

1. Analyze the R² score and feature importances.
2. If performance has improved significantly, this model can become the new production candidate.
3. If improvement is modest, consider Phase 3 (Advanced Approaches) from the plan.