In [None]:
import torch
import logging
from torch.utils.data import DataLoader
from horgues3.models import Horgues3Model, PlackettLuceLoss
from horgues3.dataset import Horgues3Dataset
import numpy as np
from horgues3.betting import calculate_betting_probabilities, get_betting_combinations, format_betting_results

# Create this notebook's named logger, then configure the root logger at INFO.
# basicConfig() does nothing if the root logger already has handlers.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
# Training dataset covering 2015-06-01 through 2025-05-31.
# max_horses=18 is passed straight to Horgues3Dataset (presumably the per-race
# horse-slot count -- confirm against the dataset implementation).
dataset = Horgues3Dataset(start_ymd='20150601', end_ymd='20250531', max_horses=18)

# Run the preparation pipeline in order:
# fetch -> fit preprocessors -> preprocess -> build per-race data.
(
    dataset
    .fetch_data()
    .fit_preprocessors()
    .preprocess_data()
    .build_race_data()
)

# num_workers=0 loads batches in the main process (original note: use 0 on Windows).
train_loader = DataLoader(dataset, batch_size=128, shuffle=True, num_workers=0)

# Sanity check: log the structure of the first sample.
sample = dataset[0]
logger.info("Sample data structure:")
for key, value in sample.items():
    if not isinstance(value, torch.Tensor):
        # Non-tensor entries are logged verbatim.
        logger.info(f"{key}: {value}")
        continue
    # Tensors are summarised by shape and dtype rather than dumped.
    logger.info(f"{key}: {value.shape} - {value.dtype}")

In [None]:
# Hold out a single day of races for validation.
validation_date = '20250601'
logger.info(f"Preparing validation data for {validation_date}...")

# Single-day dataset: the start and end dates are both the validation date.
validation_dataset = Horgues3Dataset(
    start_ymd=validation_date, end_ymd=validation_date, max_horses=18
)

# Fetch and preprocess, reusing the preprocessors taken from the training
# dataset instead of fitting new ones on the validation day.
(
    validation_dataset
    .fetch_data()
    .set_preprocessors(dataset.get_preprocessors())
    .preprocess_data()
    .build_race_data()
)

# Validation loader: fixed order (no shuffling), loaded in the main process.
validation_loader = DataLoader(
    validation_dataset, batch_size=128, shuffle=False, num_workers=0
)

In [None]:
# Device selection.
# The original code auto-detected CUDA and then immediately overwrote the
# result with CPU, so the detection was dead code. FORCE_CPU makes that
# override explicit; it defaults to True so the resolved device is unchanged.
# Set it to False to use CUDA when it is available.
FORCE_CPU = True
if FORCE_CPU or not torch.cuda.is_available():
    device = torch.device('cpu')
else:
    device = torch.device('cuda')
logger.info(f"Using device: {device}")

# Model constructor keyword arguments come from the training dataset.
feature_config = dataset.get_feature_config()

# Initialise the model, loss function and optimizer.
model = Horgues3Model(**feature_config).to(device)
criterion = PlackettLuceLoss(temperature=1.0)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

# Training hyper-parameters.
num_epochs = 10
log_interval = 10  # emit a progress log every `log_interval` batches

# Only parameters with requires_grad=True are counted.
logger.info(f"Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
logger.info(f"Training samples: {len(dataset):,}")
logger.info(f"Batch size: {train_loader.batch_size}")
logger.info(f"Batches per epoch: {len(train_loader)}")

In [None]:
# Training loop: one pass over train_loader per epoch, followed by a validation
# pass that formats betting probabilities for every race in validation_loader.
total_batches = 0

for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_batches = 0
    running_loss = 0.0  # loss accumulated since the last progress log

    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Move the batch to the target device
        x_num = batch['x_num'].to(device)
        x_cat = batch['x_cat'].to(device)
        rankings = batch['rankings'].to(device)
        mask = batch['mask'].to(device)

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        scores = model(x_num, x_cat, mask)

        # Loss computation
        loss = criterion(scores, rankings, mask)

        # Backward pass
        loss.backward()

        # Clip the global gradient norm to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Parameter update
        optimizer.step()

        # Record the loss. Read the scalar once: every .item() call copies the
        # value to the host, which forces a synchronisation on CUDA devices.
        loss_value = loss.item()
        epoch_loss += loss_value
        running_loss += loss_value
        num_batches += 1
        total_batches += 1

        # Progress log every `log_interval` batches
        if (batch_idx + 1) % log_interval == 0:
            avg_loss = running_loss / log_interval
            logger.info(f'Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx + 1}/{len(train_loader)}, Avg Loss: {avg_loss:.4f}')
            running_loss = 0.0

    # Average loss for the epoch. Guard against an empty loader so the cell
    # does not die with ZeroDivisionError before reaching validation.
    if num_batches > 0:
        avg_epoch_loss = epoch_loss / num_batches
    else:
        avg_epoch_loss = float('nan')
        logger.warning("train_loader produced no batches; epoch average loss is undefined")
    logger.info(f'Epoch {epoch + 1}/{num_epochs} completed - Average Loss: {avg_epoch_loss:.4f}')

    # Validation
    logger.info(f"Running validation for epoch {epoch + 1}...")
    model.eval()

    # Lines of the validation report; joined and logged once at the end
    validation_results = []
    validation_results.append("")
    validation_results.append(f"\n{'='*80}")
    validation_results.append(f"VALIDATION RESULTS - EPOCH {epoch + 1}")
    validation_results.append(f"{'='*80}")

    with torch.no_grad():
        for val_batch_idx, val_batch in enumerate(validation_loader):
            # Move the batch to the target device
            x_num = val_batch['x_num'].to(device)
            x_cat = val_batch['x_cat'].to(device)
            mask = val_batch['mask'].to(device)

            # Predict
            scores = model(x_num, x_cat, mask)

            # DEBUG: log the scores of the first race in the first validation
            # batch, during the first epoch only, to keep the output short.
            if val_batch_idx == 0 and epoch == 0 and scores.shape[0] > 0:
                first_race_scores_all = scores[0].cpu().detach()
                first_race_mask_bool = mask[0].cpu().detach().bool()
                valid_scores_first_race = first_race_scores_all[first_race_mask_bool]

                logger.info(f"Debug (Epoch {epoch + 1}): Raw scores for first race in batch (all): {first_race_scores_all}")
                logger.info(f"Debug (Epoch {epoch + 1}): Mask for first race: {first_race_mask_bool.int()}")
                if len(valid_scores_first_race) > 0:
                    logger.info(f"Debug (Epoch {epoch + 1}): Valid scores for first race in batch: {valid_scores_first_race}")
                    # Standard deviation needs at least two values
                    if len(valid_scores_first_race) > 1:
                        logger.info(f"Debug (Epoch {epoch + 1}): Std dev of valid scores: {torch.std(valid_scores_first_race)}")
                else:
                    logger.info(f"Debug (Epoch {epoch + 1}): No valid horses in first race based on mask.")

            # Convert the whole batch to NumPy once instead of once per race
            scores_np = scores.cpu().numpy()
            mask_np = mask.cpu().numpy()

            # Process every race in the batch
            batch_size = scores.shape[0]
            for i in range(batch_size):
                race_id = val_batch['race_id'][i]
                race_scores = scores_np[i]
                race_mask = mask_np[i]
                num_horses = val_batch['num_horses'][i].item()

                # Compute betting probabilities; the helper is given a
                # leading batch axis of size 1.
                probabilities = calculate_betting_probabilities(
                    horse_strengths=race_scores.reshape(1, -1),
                    mask=race_mask.reshape(1, -1),
                    temperature=1.0,
                )

                # Drop the size-1 batch axis again for every bet type
                for bet_type in probabilities:
                    probabilities[bet_type] = probabilities[bet_type][0]

                # Format the result for the report
                result = format_betting_results(race_id, probabilities, num_horses)
                validation_results.append(result)

    validation_results.append(f"{'='*80}")
    validation_results.append("")

    # Emit the validation report as a single log record
    logger.info('\n'.join(validation_results))

logger.info("Training completed!")