# 05 - Submission Generator

This notebook:
- Loads trained models
- Generates predictions on the test set
- Creates the final submission files
- Validates submission format

This is the final step to generate competition-ready predictions.

In [None]:
import sys
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Make the project's modules in ../src importable before the local imports below.
sys.path.append('../src')
warnings.filterwarnings('ignore')

from data_loading import load_processed_data
from model_training import load_model
from utils import create_submission

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook.
sns.set_style('whitegrid')

## 1. Load Test Data

In [None]:
# Read the already-engineered test set from the processed-data parquet file.
test = load_processed_data('../data/processed/test_with_geospatial.parquet')

# Report the frame's dimensions. The column count is taken before any
# ID/date/target columns are removed, so it can exceed the model's feature count.
test_shape = test.shape
print(f"Test data shape: {test_shape}")
print(f"Features: {test_shape[1]}")

## 2. Prepare Test Features

In [None]:
# Keep the row identifiers for the submission file: prefer 'uid', then 'id'.
for id_col in ('uid', 'id'):
    if id_col in test.columns:
        test_ids = test[id_col].copy()
        break
else:
    # Neither identifier column exists, so fall back to positions 0..n-1.
    print("Warning: No ID column found. Creating sequential IDs.")
    test_ids = pd.Series(range(len(test)), name='uid')

In [None]:
# Identifier, date and label columns are not model inputs. Only the ones that
# are actually present in the test frame are dropped.
non_feature_candidates = ['uid', 'id', 'date', 'timestamp', 'target']
drop_cols = []
for candidate in non_feature_candidates:
    if candidate in test.columns:
        drop_cols.append(candidate)

# Build the feature matrix as a new frame; `test` itself is left untouched.
X_test = test.drop(columns=drop_cols)

print(f"Test features shape: {X_test.shape}")
print(f"Dropped columns: {drop_cols}")

## 3. Load Trained Models

In [None]:
# Locations of the two persisted models.
# NOTE(review): the .pkl extension suggests pickle files — only load artifacts
# produced by this project, since unpickling untrusted files can run arbitrary code.
full_model_path = '../models/xgboost_full.pkl'
validated_model_path = '../models/xgboost_final.pkl'

# Model trained on the full training data; used for the primary submission.
model_full = load_model(full_model_path)

# Validated model, loaded optionally so its predictions can be compared.
model_validated = load_model(validated_model_path)

## 4. Generate Predictions

In [None]:
# Score the test features with the full model.
print("Generating predictions with full model...")
predictions_full = model_full.predict(X_test)

# Print a count and basic summary statistics as a quick sanity check.
full_summary_lines = [
    f"Predictions generated: {len(predictions_full)}",
    f"Prediction stats:",
    f"  Min: {predictions_full.min():.4f}",
    f"  Max: {predictions_full.max():.4f}",
    f"  Mean: {predictions_full.mean():.4f}",
    f"  Median: {np.median(predictions_full):.4f}",
    f"  Std: {predictions_full.std():.4f}",
]
print("\n".join(full_summary_lines))

In [None]:
# Score the same test features with the validated model, for comparison only.
print("\nGenerating predictions with validated model...")
predictions_validated = model_validated.predict(X_test)

# Same summary statistics as for the full model, so the two can be eyeballed.
validated_summary_lines = [
    f"Prediction stats:",
    f"  Min: {predictions_validated.min():.4f}",
    f"  Max: {predictions_validated.max():.4f}",
    f"  Mean: {predictions_validated.mean():.4f}",
    f"  Median: {np.median(predictions_validated):.4f}",
    f"  Std: {predictions_validated.std():.4f}",
]
print("\n".join(validated_summary_lines))

## 5. Compare Predictions

In [None]:
# Compare the two models' test-set predictions: overlaid histograms on the
# left, a pairwise scatter against the y = x reference line on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Distribution comparison
for model_preds, model_label in (
    (predictions_full, 'Full Model'),
    (predictions_validated, 'Validated Model'),
):
    axes[0].hist(model_preds, bins=50, alpha=0.5, label=model_label, edgecolor='black')
axes[0].set_xlabel('Predicted Value')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Prediction Distribution Comparison')
axes[0].legend()

# Scatter comparison
axes[1].scatter(predictions_full, predictions_validated, alpha=0.3, s=10)
# Span the y = x reference over the combined range of both models. Using only
# the full model's range leaves validated-model points outside it without a
# reference.
diag_low = min(predictions_full.min(), predictions_validated.min())
diag_high = max(predictions_full.max(), predictions_validated.max())
axes[1].plot([diag_low, diag_high], [diag_low, diag_high], 'r--', lw=2)
axes[1].set_xlabel('Full Model Predictions')
axes[1].set_ylabel('Validated Model Predictions')
axes[1].set_title('Model Predictions Comparison')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing parent directories, so make sure the
# figures folder exists before writing.
comparison_path = Path('../outputs/figures/prediction_comparison.png')
comparison_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(comparison_path, dpi=300, bbox_inches='tight')
plt.show()

# Pearson correlation between the two prediction vectors; values near 1 mean
# the models rank and scale the test rows almost identically.
correlation = np.corrcoef(predictions_full, predictions_validated)[0, 1]
print(f"\nCorrelation between models: {correlation:.4f}")

## 6. Ensemble Predictions (Optional)

Predictions from multiple models can be averaged, which may improve performance.

In [None]:
# Equal-weight blend: the element-wise mean of the two models' predictions.
predictions_ensemble = np.add(predictions_full, predictions_validated) / 2

# Summary statistics of the blended predictions.
ensemble_summary_lines = [
    "Ensemble prediction stats:",
    f"  Min: {predictions_ensemble.min():.4f}",
    f"  Max: {predictions_ensemble.max():.4f}",
    f"  Mean: {predictions_ensemble.mean():.4f}",
    f"  Median: {np.median(predictions_ensemble):.4f}",
    f"  Std: {predictions_ensemble.std():.4f}",
]
print("\n".join(ensemble_summary_lines))

## 7. Create Submission Files

In [None]:
# Build one two-column frame (uid, target) per prediction variant. All three
# share the same identifiers and differ only in the prediction vector.
submission_full, submission_validated, submission_ensemble = (
    pd.DataFrame({'uid': test_ids, 'target': variant_predictions})
    for variant_predictions in (
        predictions_full,
        predictions_validated,
        predictions_ensemble,
    )
)

print("Submission DataFrames created")
print(f"Shape: {submission_full.shape}")

In [None]:
# Timestamp used to version this run's files so earlier submissions are never
# overwritten.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing anything.
submissions_dir = Path('../outputs/submissions')
submissions_dir.mkdir(parents=True, exist_ok=True)

# Save one versioned file per prediction variant.
versioned_submissions = {
    'full': submission_full,
    'validated': submission_validated,
    'ensemble': submission_ensemble,
}
for variant, frame in versioned_submissions.items():
    frame.to_csv(submissions_dir / f'submission_{variant}_{timestamp}.csv', index=False)

# Also save the full-model predictions under the fixed name used as the
# primary submission (overwritten on every run).
submission_full.to_csv(submissions_dir / 'submission.csv', index=False)

print(f"\nSubmission files saved:")
for variant in versioned_submissions:
    print(f"  - submission_{variant}_{timestamp}.csv")
print(f"  - submission.csv (primary)")

## 8. Validate Submission Format

In [None]:
# Re-read the primary submission from disk so the checks run against the file
# that was actually written, not the in-memory frame.
submission_check = pd.read_csv('../outputs/submissions/submission.csv')

n_duplicate_ids = int(submission_check['uid'].duplicated().sum())

print("Submission Validation:")
print(f"  Shape: {submission_check.shape}")
print(f"  Columns: {list(submission_check.columns)}")
print(f"  Missing values: {submission_check.isnull().sum().sum()}")
print(f"  Duplicate IDs: {n_duplicate_ids}")
print(f"  Data types: {submission_check.dtypes.to_dict()}")

# Display first few rows
print("\nFirst 10 rows:")
print(submission_check.head(10))

# Negative values are only a warning: whether they are valid depends on the
# target, which this notebook does not define.
if (submission_check['target'] < 0).any():
    print("\nWarning: Negative predictions detected!")

# Collect every hard failure so the success message is printed only when the
# file passes all checks, not merely when no prediction is missing.
validation_errors = []
if len(submission_check) != len(test_ids):
    validation_errors.append(
        f"Row count {len(submission_check)} does not match test set size {len(test_ids)}!"
    )
if submission_check['uid'].isnull().any():
    validation_errors.append("Missing IDs detected!")
if n_duplicate_ids > 0:
    validation_errors.append(f"{n_duplicate_ids} duplicate IDs detected!")
if submission_check['target'].isnull().any():
    validation_errors.append("Missing predictions detected!")
if np.isinf(submission_check['target']).any():
    validation_errors.append("Infinite predictions detected!")

if validation_errors:
    for message in validation_errors:
        print(f"\nError: {message}")
else:
    print("\n✓ Submission format validated successfully!")

## 9. Prediction Distribution Visualization

In [None]:
# Four views of the full-model predictions: histogram, box plot, normal Q-Q
# plot and empirical cumulative distribution.
# `stats` (scipy.stats) comes from the notebook's import cell rather than
# being imported mid-notebook.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Histogram with mean and median markers
axes[0, 0].hist(predictions_full, bins=50, edgecolor='black')
axes[0, 0].set_xlabel('Predicted Value')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Final Predictions Distribution')
axes[0, 0].axvline(predictions_full.mean(), color='red', linestyle='--', label='Mean')
axes[0, 0].axvline(np.median(predictions_full), color='green', linestyle='--', label='Median')
axes[0, 0].legend()

# Box plot (vertical is matplotlib's default orientation, so no flag is needed)
axes[0, 1].boxplot(predictions_full)
axes[0, 1].set_ylabel('Predicted Value')
axes[0, 1].set_title('Predictions Box Plot')
axes[0, 1].grid(True, alpha=0.3)

# Q-Q plot against a normal distribution
stats.probplot(predictions_full, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Q-Q Plot')
axes[1, 0].grid(True, alpha=0.3)

# Empirical cumulative distribution: the i-th smallest prediction is plotted
# at height i / n.
sorted_preds = np.sort(predictions_full)
cumulative = np.arange(1, len(sorted_preds) + 1) / len(sorted_preds)
axes[1, 1].plot(sorted_preds, cumulative)
axes[1, 1].set_xlabel('Predicted Value')
axes[1, 1].set_ylabel('Cumulative Probability')
axes[1, 1].set_title('Cumulative Distribution')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing parent directories, so make sure the
# figures folder exists before writing.
analysis_path = Path('../outputs/figures/final_predictions_analysis.png')
analysis_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(analysis_path, dpi=300, bbox_inches='tight')
plt.show()

## 10. Summary Report

In [None]:
# Assemble the end-of-run report as a list of lines and print it in one go.
# The text is the same as printing each line separately.
banner = "="*70
report_lines = [
    banner,
    "SUBMISSION GENERATION SUMMARY",
    banner,
    f"\nDate: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"\nTest Set Size: {len(test_ids)}",
    f"Number of Features: {X_test.shape[1]}",
    f"\nModels Used:",
    f"  1. XGBoost Full (trained on 100% data)",
    f"  2. XGBoost Validated (80/20 split)",
    f"  3. Ensemble (average of above)",
    f"\nPrediction Statistics (Full Model):",
    f"  Count: {len(predictions_full)}",
    f"  Mean: {predictions_full.mean():.4f}",
    f"  Std: {predictions_full.std():.4f}",
    f"  Min: {predictions_full.min():.4f}",
    f"  25%: {np.percentile(predictions_full, 25):.4f}",
    f"  50%: {np.percentile(predictions_full, 50):.4f}",
    f"  75%: {np.percentile(predictions_full, 75):.4f}",
    f"  Max: {predictions_full.max():.4f}",
    f"\nSubmission Files:",
    f"  Primary: outputs/submissions/submission.csv",
    # `timestamp` is the version tag created when the submission files were saved.
    f"  Versioned: outputs/submissions/submission_*_{timestamp}.csv",
    f"\n" + banner,
    "Submission generation complete!",
    banner,
]
print("\n".join(report_lines))

## Summary

This notebook successfully:
1. Loaded fully engineered test data
2. Generated predictions using trained models
3. Created multiple submission files:
   - Full model (recommended)
   - Validated model
   - Ensemble model
4. Validated submission format
5. Analyzed prediction distributions

**Next Steps:**
- Submit `submission.csv` to competition
- Monitor leaderboard performance
- Iterate on features/models if needed

**Recommended Submission:** `submission.csv`, which is a copy of the full-model file `submission_full_<timestamp>.csv`