# 04 — Stage 1 Training (TCN + GRU)
Train the HybridEncoder using purged walk-forward CV with AMP.
Saves model checkpoints and latent vectors per fold.

In [None]:
# Install third-party Python packages via an IPython shell escape; -q reduces pip's output.
# NOTE(review): presumably these are the runtime requirements of the scalp2 package — confirm.
!pip install -q torch xgboost ccxt PyWavelets pandas-ta hmmlearn numba scikit-learn pyyaml tensorboard tqdm pyarrow

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import sys, os, json
REPO_DIR = '/content/scalp2'
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/<YOUR_USERNAME>/scalp2.git {REPO_DIR}
sys.path.insert(0, REPO_DIR)

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s')

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import RobustScaler

from scalp2.config import load_config
config = load_config(f'{REPO_DIR}/config.yaml')

DATA_DIR = '/content/drive/MyDrive/scalp2/data/processed'
CHECKPOINT_DIR = '/content/drive/MyDrive/scalp2/checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"}')
print(f'CUDA available: {torch.cuda.is_available()}')

In [None]:
# Read the labeled dataset and the feature-column selection stored next to it.
df = pd.read_parquet(f'{DATA_DIR}/BTC_USDT_labeled.parquet')
with open(f'{DATA_DIR}/feature_columns.json', 'r') as columns_file:
    feature_cols = json.load(columns_file)

# Summarize the dataset size and the count of each tb_label_cls value.
label_counts = df["tb_label_cls"].value_counts().sort_index().to_dict()
print(f'Dataset: {len(df)} rows, {len(feature_cols)} features')
print(f'Labels: {label_counts}')

In [None]:
from scalp2.training.walk_forward import PurgedWalkForwardCV
from scalp2.models.hybrid import HybridEncoder
from scalp2.training.trainer import Stage1Trainer
from scalp2.regime.hmm import RegimeDetector
from scalp2.utils.serialization import save_fold_artifacts
from scalp2.utils.memory import log_gpu_memory, estimate_batch_memory

# Build the walk-forward splitter from config and report how many folds
# it yields for this dataset length.
cv = PurgedWalkForwardCV(config.training.walk_forward)
n_folds = cv.n_folds(len(df))
print(f'Walk-forward: {n_folds} folds')

# Pre-flight memory check before any training starts.
n_features = len(feature_cols)
mem_est = estimate_batch_memory(
    config.training.batch_size,
    config.model.seq_len,
    n_features,
    # NOTE(review): hard-coded value; looks like an approximate model
    # parameter count — confirm against model.count_parameters().
    315000,
    config.training.use_amp,
)
total_mb = mem_est["total_estimated_mb"]
fits_t4 = mem_est["fits_t4"]
print(f'Estimated GPU memory: {total_mb:.0f} MB (fits T4: {fits_t4})')

In [None]:
# Resume support: gather the indices of folds that already have a
# 'fold_<idx>...' directory under CHECKPOINT_DIR.
# NOTE(review): a fold counts as done as soon as its directory exists —
# confirm the directory is only created once the fold's artifacts are saved.
completed_folds = set()
existing_entries = os.listdir(CHECKPOINT_DIR) if os.path.exists(CHECKPOINT_DIR) else []
for entry_name in existing_entries:
    if not entry_name.startswith('fold_'):
        continue
    if not os.path.isdir(os.path.join(CHECKPOINT_DIR, entry_name)):
        continue
    try:
        completed_folds.add(int(entry_name.split('_')[1]))
    except ValueError:
        # The part after 'fold_' is not an integer; ignore this directory.
        continue
print(f'Already completed folds: {sorted(completed_folds)}')

In [None]:
# Main training loop: one fresh scaler, regime detector and model per fold.
all_results = []
features_array = df[feature_cols].values
labels_array = df['tb_label_cls'].values
returns_array = df['tb_return'].values


def _fold_window(start, stop):
    """Return the (features, labels, returns) rows in the half-open range [start, stop)."""
    return (
        features_array[start:stop],
        labels_array[start:stop],
        returns_array[start:stop],
    )


for fold in cv.split(len(df)):
    # Folds found by the resume check are not retrained.
    if fold.fold_idx in completed_folds:
        print(f'Skipping fold {fold.fold_idx} (already completed)')
        continue

    print(f'\n{"="*60}')
    print(f'FOLD {fold.fold_idx}/{n_folds-1}')
    print(f'Train: [{fold.train_start}:{fold.train_end}] Val: [{fold.val_start}:{fold.val_end}] Test: [{fold.test_start}:{fold.test_end}]')
    print(f'{"="*60}')

    # Slice out this fold's train and validation rows.
    train_feat, train_labels, train_returns = _fold_window(fold.train_start, fold.train_end)
    val_feat, val_labels, val_returns = _fold_window(fold.val_start, fold.val_end)

    # Scaling statistics come from the training rows only; validation rows
    # are transformed with those same statistics.
    scaler = RobustScaler()
    train_feat_scaled = scaler.fit_transform(train_feat).astype(np.float32)
    val_feat_scaled = scaler.transform(val_feat).astype(np.float32)

    # The regime detector is fitted on the training rows of the DataFrame.
    regime_detector = RegimeDetector(config.regime)
    train_df_for_regime = df.iloc[fold.train_start:fold.train_end]
    regime_detector.fit(train_df_for_regime)

    # A new model is constructed for every fold.
    model = HybridEncoder(n_features, config.model)
    print(f'Model parameters: {model.count_parameters():,}')

    # Train on this fold's scaled data.
    trainer = Stage1Trainer(model, config.training, checkpoint_dir=CHECKPOINT_DIR)
    result = trainer.train_one_fold(
        train_feat_scaled,
        train_labels,
        train_returns,
        val_feat_scaled,
        val_labels,
        val_returns,
        fold_idx=fold.fold_idx,
        seq_len=config.model.seq_len,
    )

    # Persist everything for this fold; the 'history' entry of the result
    # is left out of the saved metadata.
    result_without_history = {
        key: value for key, value in result.items() if key != 'history'
    }
    save_fold_artifacts(
        CHECKPOINT_DIR,
        fold.fold_idx,
        model.state_dict(),
        scaler,
        np.array([]),  # top_feature_indices set in stage 2
        feature_cols,
        regime_detector,
        metadata={'result': result_without_history},
    )

    all_results.append(result)
    log_gpu_memory(f'Fold {fold.fold_idx} done')

    # Drop references to this fold's model and trainer, then release cached GPU memory.
    del model
    del trainer
    torch.cuda.empty_cache()

print(f'\nAll folds complete. Results: {len(all_results)} folds trained.')