In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from torch.optim import AdamW
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import logging
import os
import sys

from foregues.dataset import ForeguesDataset
from foregues.models import ForeguesModel

# Logging setup: INFO level on the root logger, plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Reproducibility: seed the NumPy and PyTorch global RNGs before any model
# weights are initialised or any DataLoader shuffles, so that a
# "Restart Kernel -> Run All" produces the same run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device selection: use CUDA when PyTorch reports it as available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用デバイス: {device}")

In [None]:
# Load the raw 1-minute EUR/USD bars. The file is semicolon-separated and has
# no header row, so column names are supplied here; the timestamp column is
# parsed with an explicit format and used as the index.
data = pd.read_csv(
    'data/histdata/HISTDATA_COM_ASCII_EURUSD_M1_2023/DAT_ASCII_EURUSD_M1_2023.csv',
    sep=';',
    header=None,
    names=['timestamp', 'open', 'high', 'low', 'close', 'volume'],
    index_col='timestamp',
    parse_dates=['timestamp'],
    date_format='%Y%m%d %H%M%S',
)

# Resample to 5-minute OHLCV bars: first open, highest high, lowest low,
# last close, summed volume. Rows containing any NaN after aggregation
# are dropped.
ohlcv_rules = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
}
five_minute_bins = data.resample('5min')
resampled_data = five_minute_bins.agg(ohlcv_rules).dropna()

print(f"リサンプリング後のデータサイズ: {len(resampled_data)}")
print(f"データ期間: {resampled_data.index[0]} から {resampled_data.index[-1]}")

In [None]:
# Labeling thresholds and window sizes (window sizes are counted in 5-minute bars).
TH1 = 0.0030  # 30 pips
TH2 = 0.0040  # 40 pips
SEQUENCE_LENGTH = 1440  # 5 days
PREDICTION_PERIOD = 288  # 24 hours

# Conversion factors for 5-minute bars, used only for the human-readable summary.
BARS_PER_HOUR = 12   # 60 min / 5 min
BARS_PER_DAY = 288   # 24 h * 12 bars/h

# True division (not floor division) so that the ".1f" format shows the real
# fractional value if the window sizes are changed to non-multiples.
print(f"閾値設定: th1={TH1} ({TH1*10000:.1f}pips), th2={TH2} ({TH2*10000:.1f}pips)")
print(f"シーケンス長: {SEQUENCE_LENGTH} (約{SEQUENCE_LENGTH/BARS_PER_DAY:.1f}日)")
print(f"予測期間: {PREDICTION_PERIOD} (約{PREDICTION_PERIOD/BARS_PER_HOUR:.1f}時間)")

In [None]:
# Build the dataset from the 5-minute bars using the thresholds and window
# sizes configured earlier in the notebook.
dataset_kwargs = dict(
    data=resampled_data,
    th1=TH1,
    th2=TH2,
    sequence_length=SEQUENCE_LENGTH,
    prediction_period=PREDICTION_PERIOD,
)
dataset = ForeguesDataset(**dataset_kwargs)

print(f"データセットサイズ: {len(dataset)}")

# Inspect the label distribution. Every sample is materialised once to read
# its 'labels' entry, so this can be slow on large datasets.
labels = []
for sample_idx in range(len(dataset)):
    labels.append(dataset[sample_idx]['labels'])
label_counts = np.bincount(labels, minlength=3)

print("ラベル分布:")
for class_id, class_desc in enumerate(('何もしない', '買い', '売り')):
    class_count = label_counts[class_id]
    print(f"  クラス{class_id} ({class_desc}): {class_count} ({class_count/len(labels)*100:.1f}%)")

In [None]:
# Model hyperparameters. The feature definitions are taken from the dataset
# so that the model matches the data it will be fed.
model_params = dict(
    sequence_names=['price_history'],
    feature_aliases={},  # feature alias mapping (none configured)
    numerical_features=dataset.numerical_features,
    categorical_features=dataset.get_categorical_vocab_sizes(),
    num_classes=3,
    d_token=192,
    num_bins=8,
    binning_temperature=0.8,
    binning_init_range=2.5,
    ft_n_layers=3,
    ft_n_heads=8,
    seq_n_layers=3,
    seq_n_heads=8,
    dropout=0.1,
)

# Print the scalar settings verbatim; the two feature collections are
# summarised by their size only.
summarised_keys = ('numerical_features', 'categorical_features')
print("モデル設定:")
for param_name in model_params:
    if param_name in summarised_keys:
        continue
    print(f"  {param_name}: {model_params[param_name]}")
print(f"  数値特徴量数: {len(model_params['numerical_features'])}")
print(f"  カテゴリ特徴量数: {len(model_params['categorical_features'])}")

In [None]:
# Split the dataset chronologically (train:val:test = 70:15:15).
#
# A random split is unsuitable here: each sample is a long window over the
# price series, so consecutive samples share almost all of their bars. Randomly
# scattering them puts near-duplicates of training samples into the validation
# and test sets and inflates the reported metrics. Contiguous index ranges keep
# validation and test strictly later than training.
# NOTE(review): assumes ForeguesDataset indices are in time order — confirm.
# NOTE(review): samples right at the two boundaries still overlap with the
# neighbouring split; consider dropping a gap of a few hundred samples there.
total_size = len(dataset)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
test_size = total_size - train_size - val_size

train_end = train_size
val_end = train_size + val_size

train_dataset = torch.utils.data.Subset(dataset, range(0, train_end))
val_dataset = torch.utils.data.Subset(dataset, range(train_end, val_end))
test_dataset = torch.utils.data.Subset(dataset, range(val_end, total_size))

print(f"データ分割:")
print(f"  訓練データ: {len(train_dataset)}")
print(f"  検証データ: {len(val_dataset)}")
print(f"  テストデータ: {len(test_dataset)}")

In [None]:
# Build the three DataLoaders. They share batch size, worker count and
# pinned-memory setting; only the training loader shuffles.
BATCH_SIZE = 16
NUM_WORKERS = 2

shared_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_kwargs)

print("データローダー作成完了")

In [None]:
# Instantiate the model from the configured hyperparameters and move it to
# the selected device.
model = ForeguesModel(**model_params).to(device)

# Count parameters in a single pass: all of them, and the subset that
# requires gradients.
total_params = 0
trainable_params = 0
for param in model.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count

print("モデル作成完了")
print(f"総パラメータ数: {total_params:,}")
print(f"訓練可能パラメータ数: {trainable_params:,}")

In [None]:
# Loss: weighted cross-entropy to counter class imbalance. Class 0 keeps
# weight 1.0 while classes 1 and 2 are weighted 2.0.
class_weights = torch.tensor([1.0, 2.0, 2.0], dtype=torch.float32, device=device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Optimizer: AdamW over all model parameters.
adamw_settings = {
    'lr': 1e-4,
    'weight_decay': 1e-5,
    'betas': (0.9, 0.999),
}
optimizer = AdamW(model.parameters(), **adamw_settings)

# LR scheduler: halve the learning rate when the monitored value (stepped
# with the validation loss in the training loop) has not decreased for
# 3 consecutive epochs.
plateau_settings = {'mode': 'min', 'factor': 0.5, 'patience': 3}
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, **plateau_settings)

print("損失関数・オプティマイザー設定完了")

In [None]:
# Training and validation loops
def _features_to_device(features, device):
    """Return a new ``{name: tensor}`` dict with every tensor moved to ``device``.

    Falsy inputs (``None`` or an empty dict) are returned unchanged so they can
    be passed straight through to the model.
    """
    if not features:
        return features
    # non_blocking only has an effect for pinned host memory -> CUDA copies;
    # in every other case the copy behaves like a normal synchronous .to().
    return {name: tensor.to(device, non_blocking=True) for name, tensor in features.items()}


def _batch_to_device(batch, device):
    """Unpack one collated batch and move its tensors to ``device``.

    Args:
        batch: dict with the keys ``'x_num'``, ``'x_cat'``, ``'sequence_data'``
            and ``'labels'``. ``'sequence_data'`` maps a sequence name to a dict
            that may contain ``'x_num'``, ``'x_cat'`` (feature dicts) and
            ``'mask'`` (a tensor); any other keys are passed through untouched.
        device: target ``torch.device``.

    Returns:
        Tuple ``(x_num, x_cat, sequence_data, labels)``. The input ``batch`` is
        not modified; new containers are built for the moved tensors.
    """
    x_num = _features_to_device(batch['x_num'], device)
    x_cat = _features_to_device(batch['x_cat'], device)
    labels = batch['labels'].to(device, non_blocking=True)

    sequence_data = batch['sequence_data']
    if sequence_data:
        moved_sequences = {}
        for seq_name, seq_fields in sequence_data.items():
            moved = dict(seq_fields)  # shallow copy: leave the caller's batch intact
            if 'x_num' in moved:
                moved['x_num'] = _features_to_device(moved['x_num'], device)
            if 'x_cat' in moved:
                moved['x_cat'] = _features_to_device(moved['x_cat'], device)
            if 'mask' in moved:
                moved['mask'] = moved['mask'].to(device, non_blocking=True)
            moved_sequences[seq_name] = moved
        sequence_data = moved_sequences

    return x_num, x_cat, sequence_data, labels


def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: module called as ``model(x_num=..., x_cat=..., sequence_data=...)``
            and returning class logits.
        dataloader: iterable of batch dicts (see ``_batch_to_device``).
        criterion: loss taking ``(logits, labels)``.
        optimizer: optimizer stepping ``model``'s parameters.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean_batch_loss, accuracy_percent)``. The loss is the unweighted mean
        of the per-batch losses. If the loader yields no batches, a warning is
        logged and ``(0.0, 0.0)`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    progress_bar = tqdm(dataloader, desc="Training")

    for batch in progress_bar:
        x_num, x_cat, sequence_data, labels = _batch_to_device(batch, device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(x_num=x_num, x_cat=x_cat, sequence_data=sequence_data)
        loss = criterion(outputs, labels)

        # Backward pass; clip the global gradient norm to 1.0 before stepping.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Running statistics
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        predicted = outputs.detach().argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Progress bar: current batch loss and running accuracy.
        progress_bar.set_postfix({
            'Loss': f'{batch_loss:.4f}',
            'Acc': f'{100.*correct/total:.2f}%'
        })

    if num_batches == 0 or total == 0:
        logging.getLogger(__name__).warning("train_epoch: dataloader yielded no samples")
        return 0.0, 0.0

    return total_loss / num_batches, 100. * correct / total


def validate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: module called as ``model(x_num=..., x_cat=..., sequence_data=...)``
            and returning class logits.
        dataloader: iterable of batch dicts (see ``_batch_to_device``).
        criterion: loss taking ``(logits, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean_batch_loss, accuracy_percent, all_predictions, all_labels)``
        where the last two are flat lists with one entry per sample, in loader
        order. If the loader yields no batches, a warning is logged and
        ``(0.0, 0.0, [], [])`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            x_num, x_cat, sequence_data, labels = _batch_to_device(batch, device)

            outputs = model(x_num=x_num, x_cat=x_cat, sequence_data=sequence_data)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            num_batches += 1
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if num_batches == 0 or total == 0:
        logging.getLogger(__name__).warning("validate_epoch: dataloader yielded no samples")
        return 0.0, 0.0, all_predictions, all_labels

    return total_loss / num_batches, 100. * correct / total, all_predictions, all_labels

print("訓練関数定義完了")

In [None]:
# Run training
NUM_EPOCHS = 20
BEST_MODEL_PATH = '../models/best_model.pt'

# Create the checkpoint directory up front: torch.save does not create missing
# parent directories, and failing on the first save would waste a full epoch.
os.makedirs(os.path.dirname(BEST_MODEL_PATH), exist_ok=True)

best_val_loss = float('inf')
train_losses = []
val_losses = []
train_accs = []
val_accs = []

print("訓練開始...")

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    print("-" * 50)

    # Train for one epoch
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)

    # Evaluate on the validation split
    val_loss, val_acc, val_predictions, val_labels = validate_epoch(model, val_loader, criterion, device)

    # Let the plateau scheduler see this epoch's validation loss
    scheduler.step(val_loss)

    # Record history for the learning-curve plots
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accs.append(train_acc)
    val_accs.append(val_acc)

    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")
    print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")

    # Checkpoint whenever the validation loss improves. 'epoch' is stored
    # zero-based (the loop index), unlike the one-based number printed above.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss,
            'val_acc': val_acc,
            'model_params': model_params,
            'preprocessing_params': dataset.get_preprocessing_params()
        }, BEST_MODEL_PATH)
        print(f"新しいベストモデルを保存 (Val Loss: {val_loss:.4f})")

print("訓練完了!")

In [None]:
# Plot the learning curves: loss on the left panel, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# (axis, training series, validation series, metric name, y-axis label)
curve_panels = (
    (ax1, train_losses, val_losses, 'Loss', 'Loss'),
    (ax2, train_accs, val_accs, 'Accuracy', 'Accuracy (%)'),
)

for axis, train_series, val_series, metric, y_label in curve_panels:
    axis.plot(train_series, label=f'Train {metric}', color='blue')
    axis.plot(val_series, label=f'Validation {metric}', color='red')
    axis.set_title(f'Training and Validation {metric}')
    axis.set_xlabel('Epoch')
    axis.set_ylabel(y_label)
    axis.legend()
    axis.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Final evaluation on the held-out test split
print("テストデータでの評価...")

# Reload the best checkpoint written by the training loop.
# weights_only=False is passed explicitly: the checkpoint also stores plain
# Python objects ('model_params', 'preprocessing_params'), and the default of
# this flag differs between PyTorch versions. This unpickles arbitrary
# objects, which is acceptable only because the file was produced by this
# notebook; do not point this at an untrusted checkpoint.
checkpoint = torch.load('../models/best_model.pt', map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])

# Evaluate
test_loss, test_acc, test_predictions, test_labels = validate_epoch(model, test_loader, criterion, device)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

# Fix the class ids explicitly. Without `labels=`, scikit-learn infers the
# classes from the data, so a class missing from both the test labels and the
# predictions makes classification_report raise (target_names length mismatch)
# and shrinks the confusion matrix below the 3 tick labels used for the plot.
report_class_ids = [0, 1, 2]
report_class_names = ['何もしない', '買い', '売り']

# Per-class precision / recall / F1
print("\n分類レポート:")
print(classification_report(test_labels, test_predictions,
                            labels=report_class_ids,
                            target_names=report_class_names,
                            zero_division=0))

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(test_labels, test_predictions, labels=report_class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=report_class_names,
            yticklabels=report_class_names)
plt.title('混同行列 (テストデータ)')
plt.ylabel('実際のクラス')
plt.xlabel('予測クラス')
plt.show()

In [None]:
# Per-class breakdown: group the test predictions by their true class and
# report, for each class, the share of its samples that were predicted
# correctly.
class_names = ['何もしない', '買い', '売り']
class_predictions = dict((class_id, []) for class_id in range(3))
class_true_labels = dict((class_id, []) for class_id in range(3))

for actual, guessed in zip(test_labels, test_predictions):
    class_predictions[actual].append(guessed)
    class_true_labels[actual].append(actual)

print("クラス別詳細分析:")
for class_id, class_name in enumerate(class_names):
    preds_for_class = class_predictions[class_id]
    sample_count = len(preds_for_class)
    if sample_count == 0:
        print(f"{class_name}: 0 サンプル")
        continue

    hits = 0
    for truth, guess in zip(class_true_labels[class_id], preds_for_class):
        if truth == guess:
            hits += 1
    class_acc = hits / sample_count
    print(f"{class_name}: {sample_count} サンプル, 精度: {class_acc*100:.1f}%")

In [None]:
# Save the final model bundle: weights, model/preprocessing configuration,
# the training settings and the test-set results.
# NOTE(review): `model` holds whatever weights are currently loaded; if the
# evaluation cell ran first, these are the best-checkpoint weights rather than
# the last-epoch weights — confirm that this is the intended "final" model.
final_model_path = '../models/foregues_model_final.pt'

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(final_model_path), exist_ok=True)

# Pin the class ids so the report always has one entry per class name, even
# when a class is absent from both the test labels and the predictions
# (otherwise scikit-learn raises on the target_names length mismatch).
final_report = classification_report(
    test_labels, test_predictions,
    labels=list(range(len(class_names))),
    target_names=class_names,
    output_dict=True,
    zero_division=0,
)

torch.save({
    'model_state_dict': model.state_dict(),
    'model_params': model_params,
    'preprocessing_params': dataset.get_preprocessing_params(),
    'training_params': {
        'th1': TH1,
        'th2': TH2,
        'sequence_length': SEQUENCE_LENGTH,
        'prediction_period': PREDICTION_PERIOD,
        'batch_size': BATCH_SIZE,
        'num_epochs': NUM_EPOCHS
    },
    'test_results': {
        'test_loss': test_loss,
        'test_accuracy': test_acc,
        'classification_report': final_report
    }
}, final_model_path)

print(f"最終モデルを保存: {final_model_path}")
print("訓練とテストが完了しました！")