# Gavefabrikken Demand Prediction - Model Training & Analysis

This notebook demonstrates the complete pipeline for training the XGBoost demand prediction model using historical gift selection data.

## Overview
- **Data Loading**: Load historical gift selection data with robust encoding handling
- **Data Preprocessing**: Aggregate selection events and prepare features
- **Model Training**: Train XGBoost model for demand prediction
- **Evaluation**: Analyze model performance and feature importance
- **Business Insights**: Extract actionable insights from the results

## 1. Setup and Imports

In [None]:
import sys
import os

# Add the project root to the path
# NOTE: '..' is resolved against the current working directory, so this
# assumes the notebook is launched from a directory directly beneath the
# project root (presumably the one that contains the `src` package imported
# further down).
project_root = os.path.abspath('..')
if project_root not in sys.path:  # keeps re-running this cell from adding duplicates
    sys.path.append(project_root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

# Import our custom modules
# (must come after the sys.path tweak above, otherwise `src` may not resolve)
from src.ml.model import DemandPredictor

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
# IPython magic, not plain Python: this line only works inside a
# notebook/IPython session, where it renders figures inline in the cell output.
%matplotlib inline

print("✅ All imports successful!")

## 2. Data Loading with Robust Encoding Handling

In [None]:
# Load the historical selection events, trying candidate encodings in order.
historical_data_path = "../src/data/historical/present.selection.historic.csv"

print("📂 Loading historical data with robust encoding handling...")

# Fail fast with an accurate error: a missing file is not an encoding problem,
# and retrying it once per encoding would only bury the real cause under a
# misleading "could not load with any encoding" message.
if not Path(historical_data_path).is_file():
    raise FileNotFoundError(f"Historical data file not found: {historical_data_path}")

# UTF-16 is only attempted when the file announces it with a byte-order mark;
# guessing UTF-16 for an arbitrary file can "succeed" and yield garbage.
with open(historical_data_path, 'rb') as raw_file:
    file_prefix = raw_file.read(2)
has_utf16_bom = file_prefix in (b'\xff\xfe', b'\xfe\xff')

# Order matters: strict encodings first, permissive ones last.
# 'latin-1' (alias 'iso-8859-1') maps every possible byte to a character, so it
# can never raise UnicodeDecodeError - anything listed after it is unreachable.
# 'cp1252' rejects a handful of undefined bytes, so it goes before 'latin-1'.
encodings_to_try = (['utf-16'] if has_utf16_bom else []) + ['utf-8', 'cp1252', 'latin-1']

raw_data = None
last_error = None

for encoding in encodings_to_try:
    try:
        raw_data = pd.read_csv(historical_data_path, encoding=encoding)
    except UnicodeDecodeError as e:
        last_error = e
        print(f"❌ Failed with {encoding} encoding")
    except Exception as e:
        # Best effort: report the problem and let the next encoding have a go.
        last_error = e
        print(f"❌ Error with {encoding}: {e}")
    else:
        print(f"✅ Successfully loaded with {encoding} encoding")
        break

if raw_data is None:
    # Chain the last failure so the traceback shows why the final attempt failed.
    raise ValueError(f"Could not load CSV with any encoding: {encodings_to_try}") from last_error

print(f"\n📊 Raw Data Shape: {raw_data.shape}")
print(f"📊 Total selection events: {len(raw_data)}")
print(f"📊 Features: {raw_data.shape[1]} columns")

print("\n🔍 First 5 rows:")
raw_data.head()

In [None]:
# Clean the raw selection events in place: trim text, fill gaps, normalise case.
print("🧹 Cleaning data...")

# Remove surrounding quotes/whitespace from text columns.
string_columns = raw_data.select_dtypes(include=['object']).columns
for col in string_columns:
    # Remember which cells are genuinely missing *before* casting: astype(str)
    # turns NaN into the literal text "nan", which fillna() below would no
    # longer recognise as missing.
    is_missing = raw_data[col].isna()
    # Whitespace is trimmed both before and after the quotes so that values
    # such as ' "foo" ' are fully unwrapped.
    cleaned = raw_data[col].astype(str).str.strip().str.strip('"').str.strip()
    raw_data[col] = cleaned.where(~is_missing)

# Handle missing values (applies to every column, text or not)
raw_data = raw_data.fillna("NONE")

# Standardize categorical values
# NOTE: this also lower-cases the "NONE" placeholder in these columns.
categorical_columns = ['employee_gender', 'product_target_gender', 'product_utility_type', 'product_durability', 'product_type']
for col in categorical_columns:
    if col in raw_data.columns:
        raw_data[col] = raw_data[col].str.lower()

print("✅ Data cleaned successfully!")
print(f"\n📊 Data Summary:")
print(f"Total selection events: {len(raw_data)}")
# value_counts() counts rows (selection events), not distinct employees.
print(f"Selection events by employee gender: {raw_data['employee_gender'].value_counts().to_dict()}")
print(f"Unique product categories: {raw_data['product_main_category'].nunique()}")
print(f"Unique brands: {raw_data['product_brand'].nunique()}")

## 3. Data Aggregation

In [None]:
# Collapse individual selection events into one row per feature combination.
print("🔄 Aggregating selection events...")

# The grouping key: every categorical attribute of the employee and the product.
grouping_columns = [
    'employee_shop',
    'employee_branch',
    'employee_gender',
    'product_main_category',
    'product_sub_category',
    'product_brand',
    'product_color',
    'product_durability',
    'product_target_gender',
    'product_utility_type',
    'product_type',
]

# The size of each group is the number of events sharing that combination;
# it becomes the 'selection_count' column.
combination_sizes = raw_data.groupby(grouping_columns).size()
aggregated_data = combination_sizes.rename('selection_count').reset_index()

event_total = len(raw_data)
combination_total = len(aggregated_data)

print(f"\n📊 Aggregation Results:")
print(f"Original events: {event_total} → Unique combinations: {combination_total}")
print(f"This means {event_total - combination_total} events were duplicates")

# How many combinations were selected once, twice, three times, ...
print("\n📈 Selection Count Distribution:")
selection_dist = aggregated_data['selection_count'].value_counts().sort_index()
print(selection_dist)

print("\n🔍 First 10 aggregated combinations:")
aggregated_data.head(10)

## 4. Feature Engineering

In [None]:
# Build the model inputs: integer-coded features (X) and the count target (y).
print("⚙️ Creating training features with label encoding...")

# Target is the aggregated count; features are a private copy of the key columns
# so that encoding them does not touch aggregated_data.
y = aggregated_data['selection_count']
X = aggregated_data[grouping_columns].copy()

# Fit one LabelEncoder per text column and keep it, keyed by column name,
# so the same mapping can be reused later.
object_columns = [name for name in X.columns if X[name].dtype == 'object']
label_encoders = {}
for column in object_columns:
    encoder = LabelEncoder()
    X[column] = encoder.fit_transform(X[column].astype(str))
    label_encoders[column] = encoder

row_count = len(X)
feature_count = len(X.columns)

print(f"\n📊 Training Data Shape:")
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"Sample-to-feature ratio: {row_count / feature_count:.1f}:1")

print(f"\n📊 Target Variable Statistics:")
print(f"Mean selections: {y.mean():.2f}")
print(f"Max selections: {y.max()}")
print(f"Min selections: {y.min()}")

print("\n🔍 Encoded Features (first 5 rows):")
X.head()

In [None]:
# Print a short preview of each column's original-value → integer-code mapping.
print("🏷️ Label Encoder Mappings:")
print("=" * 50)

preview_size = 5  # number of classes listed per column before summarising the rest

for column, encoder in label_encoders.items():
    classes = encoder.classes_
    print(f"\n{column}:")
    # A fitted LabelEncoder encodes each class as its position in classes_,
    # so enumerating the array reproduces the codes.
    for encoded, original in enumerate(classes[:preview_size]):
        print(f"  '{original}' → {encoded}")
    remaining = len(classes) - preview_size
    if remaining > 0:
        print(f"  ... and {remaining} more")

## 5. Model Training

In [None]:
# Fit the demand predictor on the encoded features and report its metrics.
print("🚀 Training XGBoost model...")

model = DemandPredictor()
training_stats = model.train(X, y, validation_split=0.2)

print("\n✅ Training completed!")

# Display training results.
# When the 'small_dataset_warning' flag is set, only training metrics are
# reported; otherwise validation metrics follow the training metrics.
used_full_dataset = training_stats.get('small_dataset_warning', False)

if used_full_dataset:
    print("\n⚠️ Small dataset detected - used full dataset for training")
    print("📊 Training Metrics:")
else:
    print("\n📊 Training Metrics:")

for metric, value in training_stats['train_metrics'].items():
    print(f"  {metric.upper()}: {value:.4f}")

if not used_full_dataset:
    print("\n📊 Validation Metrics:")
    for metric, value in training_stats['validation_metrics'].items():
        print(f"  {metric.upper()}: {value:.4f}")

## 6. Feature Importance Analysis

In [None]:
# Report each feature's importance, ranked from most to least important.
print("🎯 Feature Importance Analysis")
print("=" * 50)

feature_importance = model.get_feature_importance()

# Rank explicitly rather than relying on the order in which the model happens
# to return its mapping; sorted() is stable, so an already-ranked mapping
# keeps exactly the same order.
ranked_importance = sorted(
    feature_importance.items(), key=lambda item: item[1], reverse=True
)

print("\n🏆 Feature Importance Rankings:")
for rank, (feature, importance) in enumerate(ranked_importance, 1):
    print(f"  {rank:2d}. {feature}: {importance:.6f} ({importance*100:.2f}%)")

# Check if we have meaningful feature importance
# (an empty mapping is reported as 0 instead of raising)
max_importance = ranked_importance[0][1] if ranked_importance else 0
print(f"\n📊 Maximum feature importance: {max_importance:.6f}")

if max_importance > 0.01:
    print("\n✅ Great! We have meaningful feature importance values.")
    print("This indicates the model learned patterns from the data.")
else:
    print("\n⚠️ Feature importance values are very low.")
    print("This might indicate insufficient training data or data quality issues.")

In [None]:
# Horizontal bar chart of the most important features (at most ten).
plt.figure(figsize=(12, 8))

# Pick the top features by importance value. The mapping is sorted here so the
# "top N" really are the N largest, whatever order the model returned them in;
# sorted() is stable, so an already-ranked mapping is left unchanged.
top_n = min(10, len(feature_importance))
ranked_importance = sorted(
    feature_importance.items(), key=lambda item: item[1], reverse=True
)
top_features = dict(ranked_importance[:top_n])
features = list(top_features.keys())
importance_scores = list(top_features.values())

# Create horizontal bar plot
y_pos = np.arange(len(features))
plt.barh(y_pos, importance_scores, color='steelblue', alpha=0.7)
plt.yticks(y_pos, features)
plt.xlabel('Feature Importance Score')
plt.title(f'Top {top_n} Feature Importance (XGBoost Model)')
plt.gca().invert_yaxis()  # Highest importance at top

# Add value labels just right of each bar. The offset (1% of the longest bar)
# is loop-invariant, so it is computed once; default=0 covers the empty case.
label_offset = max(importance_scores, default=0) * 0.01
for i, v in enumerate(importance_scores):
    plt.text(v + label_offset, i, f'{v:.3f}', va='center')

plt.tight_layout()
plt.show()

print(f"📊 Plotted top {top_n} features by importance")

## 7. Model Evaluation and Predictions

In [None]:
# Predict for every aggregated combination and compare with the observed counts.
# NOTE(review): the predictions are made on X, the same frame the model is
# trained on in this notebook, so the error figures below look like in-sample
# numbers rather than a held-out estimate - confirm.
print("🔮 Making predictions and evaluating model...")

predictions = model.predict(X)

# Side-by-side table of actual vs. predicted, with signed and absolute error.
actual_counts = y.values
signed_error = actual_counts - predictions
comparison_df = pd.DataFrame({
    'Actual': actual_counts,
    'Predicted': predictions,
    'Difference': signed_error,
    'Abs_Error': np.abs(signed_error),
})

abs_error = comparison_df['Abs_Error']

print("\n📊 Prediction Statistics:")
print(f"Mean Absolute Error: {abs_error.mean():.3f}")
print(f"Max Error: {abs_error.max():.3f}")
print(f"Mean Actual: {comparison_df['Actual'].mean():.3f}")
print(f"Mean Predicted: {comparison_df['Predicted'].mean():.3f}")

print("\n🔍 Sample Predictions (first 10):")
print(comparison_df.head(10).round(3))

In [None]:
# Two diagnostic panels side by side: actual-vs-predicted and residuals.
fig, (scatter_ax, residual_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: points on the dashed red diagonal are perfect predictions.
lowest, highest = y.min(), y.max()
scatter_ax.scatter(y, predictions, alpha=0.7, color='darkblue')
scatter_ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
scatter_ax.set_xlabel('Actual Selection Count')
scatter_ax.set_ylabel('Predicted Selection Count')
scatter_ax.set_title('Actual vs Predicted Values')
scatter_ax.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction,
# with a dashed zero line for reference.
residuals = y - predictions
residual_ax.scatter(predictions, residuals, alpha=0.7, color='darkgreen')
residual_ax.axhline(y=0, color='r', linestyle='--')
residual_ax.set_xlabel('Predicted Selection Count')
residual_ax.set_ylabel('Residuals (Actual - Predicted)')
residual_ax.set_title('Residuals Plot')
residual_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Business Insights

In [None]:
# Translate the model output and raw counts into plain-language takeaways.
print("🏢 BUSINESS INSIGHTS FROM MODEL")
print("=" * 50)

# Top features with business interpretation.
# The mapping is ranked here so the five shown really are the five most
# important, whatever order the model returned them in (sorted() is stable,
# so an already-ranked mapping is left unchanged).
print("\n🎯 Key Factors Driving Gift Selection:")
top_5_features = sorted(
    feature_importance.items(), key=lambda item: item[1], reverse=True
)[:5]

# Human-readable explanation per feature; features without an entry fall back
# to a generic sentence in the loop below.
business_interpretations = {
    'product_main_category': 'Gift type/category is the primary driver',
    'product_durability': 'Durability (consumable vs durable) affects choice',
    'employee_branch': 'Location/branch influences preferences',
    'employee_shop': 'Shop/company culture affects selections',
    'employee_gender': 'Gender influences gift preferences',
    'product_target_gender': 'Target demographic matters for selection',
    'product_utility_type': 'Utility type (practical/aesthetic) drives choice',
    'product_brand': 'Brand preference affects selection',
    'product_sub_category': 'Specific subcategory influences choice'
}

for i, (feature, importance) in enumerate(top_5_features, 1):
    interpretation = business_interpretations.get(feature, 'Significant factor in gift selection')
    print(f"  {i}. {feature} ({importance*100:.1f}%): {interpretation}")

# Category analysis: the five main categories with the most selection events
print("\n📊 Popular Product Categories:")
category_popularity = raw_data['product_main_category'].value_counts().head(5)
for category, count in category_popularity.items():
    print(f"  • {category}: {count} selections")

# Gender preferences: share of all selection events per employee gender
print("\n👥 Employee Demographics:")
gender_dist = raw_data['employee_gender'].value_counts()
for gender, count in gender_dist.items():
    print(f"  • {gender.title()}: {count} selections ({count/len(raw_data)*100:.1f}%)")

## 9. Model Persistence

In [None]:
# Persist the trained model and the fitted label encoders next to each other.
model_save_path = "../models/demand_predictor_production.pkl"
print(f"💾 Saving model to {model_save_path}...")

# Make sure the target directory exists before anything is written into it.
models_dir = Path(model_save_path).parent
models_dir.mkdir(parents=True, exist_ok=True)

# The model writes itself through its own save_model() method.
model.save_model(model_save_path)
print("✅ Model saved successfully!")

# The encoders are pickled separately so the same category → integer mapping
# can be reloaded later.
import pickle
encoders_path = "../models/label_encoders.pkl"
Path(encoders_path).write_bytes(pickle.dumps(label_encoders))
print(f"✅ Label encoders saved to {encoders_path}")

summary_lines = (
    f"\n📋 Model Summary:",
    f"• Training samples: {len(X)}",
    f"• Features: {len(X.columns)}",
    f"• Model type: XGBoost Regressor",
    f"• Ready for production use: ✅",
)
for summary_line in summary_lines:
    print(summary_line)

## 10. Summary and Next Steps

In [None]:
# Final summary: recap dataset size, headline model quality and saved artefacts.
print("📋 TRAINING COMPLETE - SUMMARY")
print("=" * 50)

print("\n✅ Successfully trained XGBoost demand prediction model!")
print("\n📊 Dataset Statistics:")
print(f"• Raw selection events: {len(raw_data)}")
print(f"• Unique combinations: {len(aggregated_data)}")
print(f"• Features used: {len(X.columns)}")
print(f"• Sample-to-feature ratio: {len(X) / len(X.columns):.1f}:1")

# Report validation R² when validation metrics were produced; when the
# 'small_dataset_warning' flag is set only training metrics are read.
print("\n🎯 Model Performance:")
if not training_stats.get('small_dataset_warning', False):
    val_r2 = training_stats['validation_metrics']['r2']
    print(f"• Validation R²: {val_r2:.4f}")
else:
    train_r2 = training_stats['train_metrics']['r2']
    print(f"• Training R²: {train_r2:.4f}")

# default=0 keeps an empty importance mapping from raising ValueError, in line
# with the guarded computation in the feature-importance analysis cell.
max_importance = max(feature_importance.values(), default=0)

# Same thresholds as before, spelled out instead of a nested conditional expression.
if max_importance > 0.1:
    importance_quality = 'Good'
elif max_importance > 0.01:
    importance_quality = 'Moderate'
else:
    importance_quality = 'Low'

print(f"• Max feature importance: {max_importance:.3f}")
print(f"• Feature importance quality: {importance_quality}")

print("\n🚀 Ready for Production:")
print(f"• Model saved: {model_save_path}")
print(f"• Encoders saved: {encoders_path}")
print("• API integration: Ready")
print("• Prediction pipeline: Complete")

print("\n🎯 Next Steps:")
print("• Integrate model with FastAPI endpoints")
print("• Connect to three-step processing pipeline")
print("• Deploy for real-time demand predictions")
print("• Monitor model performance with new data")

print("\n🎉 Training pipeline completed successfully!")