# 📋 Create Training Manifest

Create a comprehensive manifest of the training run.

In [None]:
def _installed_version(dist_name):
    """Return the installed version of distribution *dist_name*, or 'unknown'."""
    try:
        from importlib.metadata import PackageNotFoundError, version
    except ImportError:  # importlib.metadata is only available on Python 3.8+
        return 'unknown'
    try:
        return version(dist_name)
    except PackageNotFoundError:
        return 'unknown'


def create_training_manifest(artifacts, timestamp, symbol, cfg, repo_path):
    """Build the manifest for a training run and save it as JSON.

    The manifest is written to ``<repo_path>/runs/colab-<timestamp>/manifest.json``;
    the directory is created when it does not exist yet.

    Besides its arguments, this function reads notebook-level names that must
    already be defined: ``X``, ``y_binary`` and ``timestamps`` (dataset info),
    ``MODEL_SAVE_REPO_PATH``, ``MODEL_SAVE_DRIVE_PATH``, ``DRIVE_MOUNTED``,
    ``XGB_AVAILABLE``, the ``lgb``/``xgb``/``pd``/``np`` modules and the helpers
    ``get_git_commit_hash`` and ``calculate_dataset_hash``.

    Args:
        artifacts: Sequence of dicts; each is read for ``'path'``,
            ``'model_id'`` and ``'size_mb'``.
        timestamp: Run identifier, used in the output directory name and
            stored in the manifest.
        symbol: Stored under ``run_info.symbol``.
        cfg: Stored under ``run_info.config``.
        repo_path: Root directory under which ``runs/`` is created.

    Returns:
        A ``(manifest, manifest_file)`` tuple: the manifest dict and the path
        of the written JSON file as a string.
    """

    # Create runs directory
    runs_path = Path(repo_path) / 'runs' / f'colab-{timestamp}'
    runs_path.mkdir(parents=True, exist_ok=True)

    # Collect model summaries from each artifact's meta.json
    model_summaries = []
    for artifact in artifacts:
        meta_file = Path(artifact['path']) / 'meta.json'
        if not meta_file.exists():
            # Such an artifact is still counted in 'total_models' below, so
            # report the gap instead of dropping it silently.
            print(f"⚠️  No meta.json found for {artifact['model_id']} - omitted from manifest models")
            continue
        with open(meta_file, 'r', encoding='utf-8') as f:
            meta = json.load(f)
        model_summaries.append({
            'model_id': artifact['model_id'],
            'type': meta.get('model_type', 'unknown'),
            'task': meta.get('task_type', 'unknown'),
            'performance': meta.get('performance', {}),
            'size_mb': artifact['size_mb']
        })

    # Create manifest
    manifest = {
        'run_info': {
            'timestamp': timestamp,
            'environment': 'google_colab',
            'symbol': symbol,
            'config': cfg,
            'commit_hash': get_git_commit_hash(repo_path),
            'dataset_hash': calculate_dataset_hash(X, y_binary)
        },
        'dataset_info': {
            'n_samples': len(X),
            'n_features': len(X.columns),
            'feature_names': list(X.columns),
            'class_distribution': y_binary.value_counts().to_dict(),
            'time_range': {
                'start': str(timestamps.iloc[0]),
                'end': str(timestamps.iloc[-1])
            }
        },
        'models': model_summaries,
        'artifacts': {
            'repository_path': MODEL_SAVE_REPO_PATH,
            'drive_path': MODEL_SAVE_DRIVE_PATH if DRIVE_MOUNTED else None,
            'total_models': len(artifacts),
            'total_size_mb': sum(a['size_mb'] for a in artifacts)
        },
        'system_info': {
            'python_version': sys.version.split()[0],
            'packages': {
                'lightgbm': getattr(lgb, '__version__', 'unknown'),
                'xgboost': getattr(xgb, '__version__', 'unknown') if XGB_AVAILABLE else 'not_available',
                # Record the version that is actually installed rather than a guess
                'sklearn': _installed_version('scikit-learn'),
                'pandas': pd.__version__,
                'numpy': np.__version__
            }
        }
    }

    # Save manifest; default=str stringifies any value json cannot encode natively
    manifest_file = runs_path / 'manifest.json'
    with open(manifest_file, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2, default=str)

    print(f"✅ Training manifest saved to: {manifest_file}")
    return manifest, str(manifest_file)

# Build the manifest for this run and remember where it was written
manifest, manifest_path = create_training_manifest(
    artifacts=repo_artifacts,
    timestamp=training_timestamp,
    symbol=SYMBOL,
    cfg=CFG,
    repo_path=REPO_PATH,
)

# 📋 Display Results Summary

Show a comprehensive summary of the training results.

In [None]:
def display_training_summary(manifest, artifacts):
    """Print a human-readable report of a training run.

    Reads ``run_info``, ``dataset_info`` and ``models`` from *manifest*, and
    ``model_id``, ``size_mb`` and ``files`` from each entry of *artifacts*.
    Also reads the notebook globals ``MODEL_SAVE_REPO_PATH``, ``DRIVE_MOUNTED``
    and (when Drive is mounted) ``MODEL_SAVE_DRIVE_PATH``.
    """
    banner = "=" * 60

    print("\n" + banner)
    print("🎯 TRAINING SUMMARY")
    print(banner)

    # Run info
    run_info = manifest['run_info']
    print(f"\n📅 Training Run: {run_info['timestamp']}")
    print(f"💱 Symbol: {run_info['symbol']}")
    if run_info['config']['fast_test']:
        mode_label = 'Fast Test'
    else:
        mode_label = 'Full Training'
    print(f"🧪 Mode: {mode_label}")
    dataset_info = manifest['dataset_info']
    print(f"📊 Dataset: {dataset_info['n_samples']} samples, {dataset_info['n_features']} features")

    # Per-model metrics: AUC for classifiers, RMSE and R² for everything else
    print("\n🏆 Model Performance:")
    print("-" * 40)

    for entry in manifest['models']:
        label = entry['model_id'].replace('_', ' ').title()
        metrics = entry.get('performance', {})

        if entry['task'] == 'classification':
            metric_text = f"AUC: {metrics.get('auc', 0):.4f}"
        else:
            metric_text = f"RMSE: {metrics.get('rmse', 0):.6f} | R²: {metrics.get('r2', 0):.4f}"
        print(f"{label:<20} | {metric_text} | Size: {entry['size_mb']:.1f} MB")

    # Artifact totals and storage locations
    total_size_mb = sum(item['size_mb'] for item in artifacts)
    print("\n📦 Artifacts Summary:")
    print(f"  Models Trained: {len(artifacts)}")
    print(f"  Total Size: {total_size_mb:.1f} MB")
    print(f"  Repository Path: {MODEL_SAVE_REPO_PATH}")
    if DRIVE_MOUNTED:
        print(f"  Google Drive Path: {MODEL_SAVE_DRIVE_PATH}")

    # Files saved for each model
    print("\n📂 Saved Artifacts:")
    for item in artifacts:
        print(f"  📁 {item['model_id']}/")
        for filename in item['files']:
            print(f"     📄 {filename}")

    print("\n" + banner)

# Print the run summary
display_training_summary(manifest, repo_artifacts)

# Show selected metadata fields of the first saved model, when there is one
if repo_artifacts:
    sample_artifact = repo_artifacts[0]
    sample_meta_path = Path(sample_artifact['path']) / 'meta.json'

    if sample_meta_path.exists():
        print(f"\n📄 Sample Metadata ({sample_artifact['model_id']}):")
        print("-" * 30)
        with open(sample_meta_path, 'r') as f:
            sample_meta = json.load(f)

        # Only these fields are shown; nested dicts are pretty-printed as JSON
        key_fields = ['model_name', 'task_type', 'timestamp', 'performance', 'n_features', 'n_samples']
        for field in key_fields:
            if field not in sample_meta:
                continue
            value = sample_meta[field]
            rendered = json.dumps(value, indent=4) if isinstance(value, dict) else value
            print(f"  {field}: {rendered}")

# ✅ Validate Model Artifacts

Test that saved models can be loaded and used for inference.

In [None]:
def validate_model_artifact(artifact_path, model_id, X_sample):
    """Check that a saved model artifact loads and can produce a prediction.

    The artifact directory must contain ``model.pkl``, ``scaler.pkl`` and
    ``meta.json``. The scaler is applied to *X_sample* and the model predicts
    on the result. If ``model_type`` in the metadata contains "xgb", the
    scaled data is wrapped in an ``xgb.DMatrix`` first.

    Args:
        artifact_path: Directory holding the artifact files.
        model_id: Accepted for the caller's convenience; not used here.
        X_sample: Feature rows passed to ``scaler.transform``.

    Returns:
        ``(True, description)`` on success, where the description shows the
        first prediction, or ``(False, reason)`` when files are missing or
        any step raises.
    """
    try:
        artifact_dir = Path(artifact_path)
        required = {
            name: artifact_dir / name
            for name in ('model.pkl', 'scaler.pkl', 'meta.json')
        }

        if any(not path.exists() for path in required.values()):
            return False, "Missing required files"

        with open(required['meta.json'], 'r') as meta_fh:
            metadata = json.load(meta_fh)

        # NOTE: joblib.load unpickles the files, so only validate trusted artifacts
        model = joblib.load(required['model.pkl'])
        scaler = joblib.load(required['scaler.pkl'])

        X_scaled = scaler.transform(X_sample)

        # Models flagged as XGBoost in the metadata are fed a DMatrix
        is_xgboost = 'xgb' in metadata.get('model_type', '').lower()
        model_input = xgb.DMatrix(X_scaled) if is_xgboost else X_scaled
        prediction = model.predict(model_input)

        if metadata.get('task_type') != 'classification':
            return True, f"Predicted return = {prediction[0]:.6f}"

        # Values outside [0, 1] are treated as raw scores and squashed with a sigmoid
        if prediction.min() < 0 or prediction.max() > 1:
            prediction = 1 / (1 + np.exp(-prediction))
        return True, f"P(positive) = {prediction[0]:.4f}"

    except Exception as e:
        # Any failure is reported to the caller rather than raised
        return False, f"Error: {e}"

print("🔍 Validating model artifacts...")
print("=" * 40)

# A single row from the test split is enough to exercise inference
X_sample = binary_splits['X_test'].iloc[:1]

validation_results = []
for artifact in repo_artifacts:
    model_id = artifact['model_id']
    print(f"\n🔄 Testing {model_id}...")

    success, message = validate_model_artifact(artifact['path'], model_id, X_sample)

    # Pick the marker and recorded status from the outcome
    icon, status = ("✅", 'success') if success else ("❌", 'failed')
    print(f"{icon} {model_id}: {message}")
    validation_results.append({'model_id': model_id, 'status': status, 'message': message})

# Tally the outcomes
successful_validations = len([r for r in validation_results if r['status'] == 'success'])
total_validations = len(validation_results)

print(f"\n📊 Validation Summary: {successful_validations}/{total_validations} models passed")

print(
    "✅ All model artifacts validated successfully!"
    if successful_validations == total_validations
    else "⚠️  Some artifacts failed validation - check the errors above"
)

# 📥 Download Artifacts

Create a ZIP file of all artifacts and download it to your local machine.

In [None]:
def create_artifacts_zip(artifacts, manifest_path, symbol, timestamp, output_dir="/content"):
    """Bundle the manifest, all model artifacts and a README into one ZIP file.

    Archive layout:
        runs/colab-<timestamp>/manifest.json
        models/<symbol>/<timestamp>/<model_id>/<file>   (regular files only)
        README.md

    Args:
        artifacts: Sequence of dicts; each is read for ``'path'``,
            ``'model_id'`` and ``'size_mb'``. May be empty.
        manifest_path: Path of the manifest file to include.
        symbol: Used in the ZIP file name and archive paths.
        timestamp: Used in the ZIP file name and archive paths.
        output_dir: Existing directory in which the ZIP is created. Defaults
            to ``/content``.

    Returns:
        The path of the created ZIP file as a string.
    """

    import zipfile
    from pathlib import Path

    zip_filename = f"trading_bot_models_{symbol}_{timestamp}.zip"
    zip_path = Path(output_dir) / zip_filename

    print(f"📦 Creating ZIP archive: {zip_filename}")

    # One bullet per model for the README
    model_lines = "\n".join(
        f"- {a['model_id']} ({a['size_mb']:.1f} MB)" for a in artifacts
    )

    # "{{model_id}}" is escaped throughout so the README shows a literal
    # placeholder instead of interpolating a loop variable.
    summary_content = f"""# Trading Bot ML Training Results

**Timestamp:** {timestamp}
**Symbol:** {symbol}
**Environment:** Google Colab

## Models Trained:
{model_lines}

## Usage Instructions:

1. Extract this ZIP file to your local repository
2. Models are organized as: models/{symbol}/{timestamp}/{{model_id}}/
3. Each model directory contains:
   - model.pkl: The trained model
   - scaler.pkl: Feature scaler
   - meta.json: Model metadata and performance

## Loading Models:

```python
import joblib
import json

# Load model
model = joblib.load('models/{symbol}/{timestamp}/{{model_id}}/model.pkl')
scaler = joblib.load('models/{symbol}/{timestamp}/{{model_id}}/scaler.pkl')

# Load metadata
with open('models/{symbol}/{timestamp}/{{model_id}}/meta.json', 'r') as f:
    metadata = json.load(f)

# Make predictions
X_scaled = scaler.transform(your_features)
predictions = model.predict(X_scaled)
```

Generated by Trading Bot ML Training Pipeline
"""

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Add manifest
        zipf.write(manifest_path, f"runs/colab-{timestamp}/manifest.json")
        print("  ✅ Added manifest")

        # Add all model artifacts
        for artifact in artifacts:
            model_path = Path(artifact['path'])
            model_id = artifact['model_id']

            # Sorted so the archive order does not depend on directory listing order
            for file in sorted(model_path.glob('*')):
                if file.is_file():
                    # Create archive path maintaining structure
                    archive_path = f"models/{symbol}/{timestamp}/{model_id}/{file.name}"
                    zipf.write(file, archive_path)

            print(f"  ✅ Added {model_id} artifacts")

        # Write the README straight into the archive: no shared temp file is
        # left behind and the text is always stored as UTF-8.
        zipf.writestr("README.md", summary_content)

    zip_size_mb = zip_path.stat().st_size / (1024 * 1024)
    print(f"📦 ZIP created: {zip_filename} ({zip_size_mb:.1f} MB)")

    return str(zip_path)

def trigger_download(file_path):
    """Start a browser download of *file_path* via ``google.colab.files``.

    When ``google.colab`` cannot be imported, or the download raises, a
    warning and the file's location are printed instead; nothing is raised.
    """
    def _report_location():
        print(f"File available at: {file_path}")

    try:
        from google.colab import files as colab_files
        print(f"📥 Starting download: {Path(file_path).name}")
        colab_files.download(file_path)
        print("✅ Download initiated - check your browser downloads")
    except ImportError:
        print("⚠️  Not running in Colab - download skipped")
        _report_location()
    except Exception as exc:
        print(f"⚠️  Download failed: {exc}")
        _report_location()

# Build the downloadable archive when there is something to package
if not repo_artifacts:
    print("❌ No artifacts to download")
else:
    zip_path = create_artifacts_zip(repo_artifacts, manifest_path, SYMBOL, training_timestamp)

    print("\n🎯 Download ready!")
    print("Click the download link above or run the next cell to trigger download.")

    # Remembered so the download cell can find the archive
    ARTIFACTS_ZIP_PATH = zip_path

In [None]:
# Run this cell to download the ZIP file; ARTIFACTS_ZIP_PATH only exists
# once the archive has been created by the previous cell
if 'ARTIFACTS_ZIP_PATH' not in globals():
    print("⚠️  No ZIP file available for download")
else:
    trigger_download(ARTIFACTS_ZIP_PATH)

# 🎉 Training Complete!

## ✅ What Was Accomplished:

1. **Repository Setup**: Cloned your trading bot repository and set up the environment
2. **Dependencies**: Installed all required ML packages (LightGBM, XGBoost, etc.)
3. **Data Preparation**: Created training dataset with engineered features
4. **Model Training**: Trained baseline LightGBM and XGBoost models
5. **Model Evaluation**: Tested models on hold-out test set
6. **Artifact Storage**: Saved models with metadata to repository and Google Drive
7. **Validation**: Verified all artifacts can be loaded and used for inference
8. **Download**: Created downloadable ZIP with all artifacts

## 📁 Artifact Structure:

```
models/
├── {symbol}/
│   └── {timestamp}/
│       ├── lightgbm_binary/
│       │   ├── model.pkl
│       │   ├── scaler.pkl
│       │   └── meta.json
│       └── lightgbm_regression/
│           ├── model.pkl
│           ├── scaler.pkl
│           └── meta.json
runs/
└── colab-{timestamp}/
    └── manifest.json
```

## 🚀 Next Steps:

1. **Download** the ZIP file using the cell above
2. **Extract** to your local repository
3. **Integrate** with your backtesting system
4. **Deploy** using your existing inference pipeline
5. **Monitor** model performance in production

## 🔧 Advanced Development:

- Add **CatBoost** and **Random Forest** to the ensemble
- Implement **stacked ensembles** for better performance
- Use **Optuna** for hyperparameter optimization
- Add **time series cross-validation** with purging
- Implement **SHAP analysis** for model interpretability

**Happy Trading! 📈**