# 05 — Stage 2 Training (XGBoost Meta-Learner)
Extract latent vectors from the trained Stage-1 models, combine them with
handcrafted features and regime probabilities, and train XGBoost on the result.

In [None]:
# Install the notebook's runtime dependencies quietly; numpy is constrained to
# >=1.26.0,<2.2.0 and pandas is pinned to 2.2.2.
!pip install -q torch xgboost ccxt PyWavelets pandas-ta hmmlearn numba scikit-learn pyyaml 'numpy>=1.26.0,<2.2.0' 'pandas==2.2.2' tqdm pyarrow

In [None]:
# Mount Google Drive; the data and checkpoint directories below live under it.
from google.colab import drive
drive.mount('/content/drive')

import sys, os, json
REPO_DIR = '/content/scalp2'
# Clone the project repository only when REPO_DIR does not exist yet.
# NOTE(review): `<YOUR_USERNAME>` looks like a template placeholder — replace it
# with the real GitHub account before running this cell.
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/<YOUR_USERNAME>/scalp2.git {REPO_DIR}
# Put the repository first on the import path so the `scalp2` imports resolve.
sys.path.insert(0, REPO_DIR)

# INFO-level logging with timestamp, logger name and level on every record.
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s')

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import RobustScaler

# Load the project configuration from the repository's config.yaml.
from scalp2.config import load_config
config = load_config(f'{REPO_DIR}/config.yaml')

# Locations on the mounted Drive for the processed data and the checkpoints.
DATA_DIR = '/content/drive/MyDrive/scalp2/data/processed'
CHECKPOINT_DIR = '/content/drive/MyDrive/scalp2/checkpoints'

In [None]:
# Read the labeled dataset and the feature-column list stored in DATA_DIR.
df = pd.read_parquet(f'{DATA_DIR}/BTC_USDT_labeled.parquet')
with open(f'{DATA_DIR}/feature_columns.json', 'r') as columns_file:
    feature_cols = json.load(columns_file)

# Extract the model inputs, the labels (`tb_label_cls`) and the returns
# (`tb_return`) as plain arrays for positional slicing later on.
features_array = df[feature_cols].values
labels_array, returns_array = (
    df[column].values for column in ('tb_label_cls', 'tb_return')
)
print(f'Dataset: {len(df)} rows, {len(feature_cols)} features')

In [None]:
from scalp2.training.walk_forward import PurgedWalkForwardCV
from scalp2.models.hybrid import HybridEncoder
from scalp2.training.trainer import Stage1Trainer
from scalp2.training.stage2_trainer import Stage2Trainer
from scalp2.regime.hmm import RegimeDetector
from scalp2.utils.serialization import load_fold_artifacts
from scalp2.utils.metrics import evaluate_predictions

# The walk-forward splitter and the Stage-2 trainer are shared by every fold.
cv = PurgedWalkForwardCV(config.training.walk_forward)
stage2 = Stage2Trainer(config, checkpoint_dir=CHECKPOINT_DIR)
n_features = len(feature_cols)

# One result dict per fold, each extended with its evaluation metrics.
all_test_results = []

for fold in cv.split(len(df)):
    print(f'\n--- Fold {fold.fold_idx} ---')

    # Row windows of this fold's train / validation / test partitions.
    train_rows = slice(fold.train_start, fold.train_end)
    val_rows = slice(fold.val_start, fold.val_end)
    test_rows = slice(fold.test_start, fold.test_end)

    # Stage-1 artifacts previously saved for this fold.
    artifacts = load_fold_artifacts(CHECKPOINT_DIR, fold.fold_idx)

    # Rebuild the encoder, restore its weights and wrap it in a Stage-1 trainer.
    model = HybridEncoder(n_features, config.model)
    model.load_state_dict(artifacts['model_state'])
    trainer = Stage1Trainer(model, config.training, checkpoint_dir=CHECKPOINT_DIR)

    # Use the saved regime detector when present; otherwise fit a new one on
    # this fold's training rows.
    regime_detector = artifacts.get('regime_detector')
    if regime_detector is None:
        regime_detector = RegimeDetector(config.regime)
        regime_detector.fit(df.iloc[train_rows])

    # Apply the fold's saved scaler to each partition, in train/val/test order.
    scaler = artifacts['scaler']
    train_scaled, val_scaled, test_scaled = (
        scaler.transform(features_array[rows]).astype(np.float32)
        for rows in (train_rows, val_rows, test_rows)
    )

    # Stage 2: train on this fold and collect the test-set outputs.
    result = stage2.train_one_fold(
        trainer, regime_detector,
        train_scaled, labels_array[train_rows],
        val_scaled, labels_array[val_rows],
        test_scaled, labels_array[test_rows],
        df.iloc[train_rows],
        df.iloc[val_rows],
        df.iloc[test_rows],
        feature_cols, fold.fold_idx,
    )

    # Returns for the test window: skip the first `seq_len` rows, then trim to
    # the number of test labels returned by Stage 2.
    # NOTE(review): presumably the offset mirrors the sequence windowing done
    # inside the trainers — confirm against Stage2Trainer.
    n_test_labels = len(result['test_labels'])
    test_returns = returns_array[fold.test_start + config.model.seq_len:fold.test_end][:n_test_labels]

    metrics = evaluate_predictions(
        result['test_probabilities'], result['test_labels'],
        test_returns,
        config.execution.confidence_threshold,
    )
    print(f'Fold {fold.fold_idx} metrics: {metrics}')

    result['metrics'] = metrics
    all_test_results.append(result)

    # Drop the per-fold model objects and release cached CUDA memory before
    # the next fold.
    del model, trainer
    torch.cuda.empty_cache()

In [None]:
# Aggregate results
# Only folds whose metrics report at least one trade contribute to the summary.
traded_fold_metrics = [
    r['metrics']
    for r in all_test_results
    if 'n_trades' in r['metrics'] and r['metrics']['n_trades'] > 0
]
metrics_df = pd.DataFrame(traded_fold_metrics)

print('\n=== Aggregate Walk-Forward Results ===')
if metrics_df.empty:
    # With no qualifying folds the frame has no columns: `describe()` would
    # raise ValueError and the column lookups below would raise KeyError.
    print('No folds produced any trades; nothing to aggregate.')
else:
    print(metrics_df.describe().round(4))
    print(f'\nMean Sharpe: {metrics_df["sharpe"].mean():.4f}')
    print(f'Mean Win Rate: {metrics_df["win_rate"].mean():.4f}')
    print(f'Mean Trades/Day: {metrics_df["trades_per_day"].mean():.2f}')