In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls -al /content/drive/MyDrive/IS_Safem0de

total 1565
drwx------ 2 root root    4096 Nov  3 04:15 colab
drwx------ 2 root root    4096 Nov  3 03:28 data
-rw------- 1 root root   74953 Nov  4 14:01 dqn_fx.pt
-rw------- 1 root root   76397 Nov  4 15:18 dqn_fx_v2.pt
-rw------- 1 root root 1433206 Oct 16 03:59 EURUSD_H4.csv
drwx------ 2 root root    4096 Nov  3 02:47 paper
drwx------ 2 root root    4096 Nov  3 02:48 prelim_midterm


In [None]:
data_path = '/content/drive/MyDrive/IS_Safem0de/EURUSD_H4.csv'
import pandas as pd

df = pd.read_csv(data_path)
df.tail()

Unnamed: 0,2009-10-05 20:00,1.46469,1.46609,1.46436,1.46558,2247
25842,2025-10-15 08:00,1.16355,1.16453,1.16195,1.16199,16243
25843,2025-10-15 12:00,1.16201,1.16472,1.16103,1.16369,25939
25844,2025-10-15 16:00,1.16371,1.16448,1.16269,1.16441,14745
25845,2025-10-15 20:00,1.16442,1.16519,1.16418,1.16456,5554
25846,2025-10-16 00:00,1.16455,1.16754,1.16425,1.16659,14450


In [None]:
import numpy as np
import pandas as pd

def ensure_datetime_index(df):
    d = df.copy()
    if 'datetime' in d.columns:
        d['datetime'] = pd.to_datetime(d['datetime'], errors='coerce')
        d = d.set_index('datetime')
    if not isinstance(d.index, pd.DatetimeIndex):
        d.index = pd.to_datetime(d.index, errors='coerce')
    d = d[~d.index.isna()].sort_index()
    d = d[~d.index.duplicated(keep='first')]
    return d

In [None]:
# ตรวจชื่อคอลัมน์
df.columns = ['datetime', 'open', 'high', 'low', 'close', 'volume']

# แปลง datetime
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values('datetime')

# ตั้ง index เป็นเวลา (สะดวกกับการ plot)
df.set_index('datetime', inplace=True)

# 1) ทำให้ index เป็น Datetime ที่เรียงและไม่ซ้ำ
df = ensure_datetime_index(df)

# 2) TDD checks เบื้องต้น
assert isinstance(df.index, pd.DatetimeIndex)
assert df.index.is_monotonic_increasing
assert not df.index.has_duplicates
assert {'open','high','low','close'}.issubset(df.columns)

print(df.index.min(), '→', df.index.max(), 'rows=', len(df))
display(df.head(3)); display(df.tail(3))

2009-10-06 00:00:00 → 2025-10-16 00:00:00 rows= 25847


Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-10-06 00:00:00,1.46563,1.47178,1.46541,1.47035,10300
2009-10-06 04:00:00,1.47051,1.47486,1.46951,1.47169,12566
2009-10-06 08:00:00,1.47176,1.47458,1.47068,1.47264,18816


Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-10-15 16:00:00,1.16371,1.16448,1.16269,1.16441,14745
2025-10-15 20:00:00,1.16442,1.16519,1.16418,1.16456,5554
2025-10-16 00:00:00,1.16455,1.16754,1.16425,1.16659,14450


In [None]:
# ===== STEP 1: Data QA + Features + Time Split (no leakage) =====
import numpy as np
import pandas as pd

# --- 1) Quick sanity checks (OHLC invariants & gaps) ---
def sanity_checks(d: pd.DataFrame):
    # OHLC must obey: low <= [open,close,high] <= high
    bad_low  = (d['low']  > d[['open','close','high']].min(axis=1)).sum()
    bad_high = (d['high'] < d[['open','close','low'] ].max(axis=1)).sum()
    assert bad_low == 0 and bad_high == 0, f"OHLC invariant broken (low>min or high<max): low_bad={bad_low}, high_bad={bad_high}"

    # Volume non-negative (บาง broker อาจให้ 0)
    assert (d['volume'] >= 0).all(), "Found negative volume."

    # ดู pattern ของช่วงเวลา (H4 ควรเจอ 4 ชั่วโมงเป็นหลัก แต่มี gap เสาร์-อาทิตย์)
    diffs_h = d.index.to_series().diff().dropna().dt.total_seconds().div(3600)
    print("Top time steps (hours):")
    print(diffs_h.round(2).value_counts().head(5))

sanity_checks(df)

# --- 2) Minimal, robust features (หลีกเลี่ยง leakage) ---
def rsi(close: pd.Series, period: int = 14) -> pd.Series:
    delta = close.diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    roll_up = up.ewm(alpha=1/period, adjust=False).mean()
    roll_down = down.ewm(alpha=1/period, adjust=False).mean()
    rs = roll_up / (roll_down + 1e-12)
    return 100 - (100 / (1 + rs))

def atr(df_: pd.DataFrame, period: int = 14) -> pd.Series:
    hl = df_['high'] - df_['low']
    hc = (df_['high'] - df_['close'].shift()).abs()
    lc = (df_['low']  - df_['close'].shift()).abs()
    tr = pd.concat([hl, hc, lc], axis=1).max(axis=1)
    return tr.ewm(alpha=1/period, adjust=False).mean()

def make_features(d: pd.DataFrame) -> pd.DataFrame:
    out = d.copy()

    # ใช้ log return เพื่อความนิ่งทางสถิติ
    out['logret_1'] = np.log(out['close']).diff(1)

    # โมเมนตัมหลายสเกล (เป้าหมายให้ state มองเห็นทิศที่ผ่านมา)
    out['mom_1'] = out['close'].pct_change(1)
    out['mom_3'] = out['close'].pct_change(3)
    out['mom_6'] = out['close'].pct_change(6)

    # ความผันผวนโดยประมาณ (EW std ของ log return)
    out['vol_24'] = out['logret_1'].rolling(24, min_periods=24).std()  # ~4 วันของ H4
    # ATR ช่วยสะท้อน range ที่แท้จริง
    out['atr_14'] = atr(out, period=14)

    # RSI คลาสสิก
    out['rsi_14'] = rsi(out['close'], period=14)

    # สองเส้นค่าเฉลี่ยและส่วนต่าง (หลีกเลี่ยง cross label leakage โดยใช้ rolling)
    out['sma_10'] = out['close'].rolling(10, min_periods=10).mean()
    out['sma_50'] = out['close'].rolling(50, min_periods=50).mean()
    out['sma_diff'] = out['sma_10'] - out['sma_50']

    # ลบแถวแรกๆ ที่ยังคำนวณไม่ครบ
    out = out.dropna().copy()
    return out

feat_df = make_features(df)

# TDD checks เบื้องต้น
assert {'open','high','low','close','volume'}.issubset(df.columns)
assert feat_df.index.is_monotonic_increasing and not feat_df.index.has_duplicates
for col in ['logret_1','mom_1','mom_3','mom_6','vol_24','atr_14','rsi_14','sma_10','sma_50','sma_diff']:
    assert col in feat_df.columns and not feat_df[col].isna().any()

print("Feature rows:", len(feat_df))
display(feat_df.tail(3))

# --- 3) Time-based split: train/valid/test (70/15/15) ---
def time_split(d: pd.DataFrame, train_ratio=0.70, valid_ratio=0.15):
    n = len(d)
    i1 = int(n * train_ratio)
    i2 = int(n * (train_ratio + valid_ratio))
    train = d.iloc[:i1].copy()
    valid = d.iloc[i1:i2].copy()
    test  = d.iloc[i2:].copy()
    return train, valid, test

train_df, valid_df, test_df = time_split(feat_df, 0.70, 0.15)
print("Split sizes →", len(train_df), len(valid_df), len(test_df))
print("Date ranges:")
print("  train:", train_df.index.min(), "→", train_df.index.max())
print("  valid:", valid_df.index.min(), "→", valid_df.index.max())
print("  test :", test_df.index.min(),  "→", test_df.index.max())

# เก็บรายการชื่อฟีเจอร์ที่จะส่งให้เอเจนต์ RL (ไม่รวมคอลัมน์ราคา)
STATE_FEATS = ['mom_1','mom_3','mom_6','vol_24','atr_14','rsi_14','sma_diff','logret_1']

# จะ normalize ทีหลัง โดย fit เฉพาะบน train เพื่อกัน leakage
print("STATE_FEATS:", STATE_FEATS)

Top time steps (hours):
datetime
4.0     24997
48.0      830
16.0        7
24.0        5
72.0        4
Name: count, dtype: int64
Feature rows: 25798


Unnamed: 0_level_0,open,high,low,close,volume,logret_1,mom_1,mom_3,mom_6,vol_24,atr_14,rsi_14,sma_10,sma_50,sma_diff
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2025-10-15 16:00:00,1.16371,1.16448,1.16269,1.16441,14745,0.000619,0.000619,0.000731,0.003343,0.001803,0.002734,58.954438,1.160755,1.162501,-0.001746
2025-10-15 20:00:00,1.16442,1.16519,1.16418,1.16456,5554,0.000129,0.000129,0.002212,0.003654,0.001802,0.002611,59.273231,1.161716,1.162316,-0.0006
2025-10-16 00:00:00,1.16455,1.16754,1.16425,1.16659,14450,0.001742,0.001743,0.002492,0.003769,0.001827,0.002659,63.414557,1.162857,1.162224,0.000633


Split sizes → 18058 3870 3870
Date ranges:
  train: 2009-10-16 00:00:00 → 2020-12-28 20:00:00
  valid: 2020-12-29 00:00:00 → 2023-05-23 00:00:00
  test : 2023-05-23 04:00:00 → 2025-10-16 00:00:00
STATE_FEATS: ['mom_1', 'mom_3', 'mom_6', 'vol_24', 'atr_14', 'rsi_14', 'sma_diff', 'logret_1']


In [None]:
# ===== STEP 2: Scaler (fit บน train เท่านั้น) + Gym-like TradingEnv =====
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Tuple
from sklearn.preprocessing import StandardScaler

# --- 2.1 Fit/Transform แบบกัน leakage ---
scaler = StandardScaler().fit(train_df[STATE_FEATS])
X_train = scaler.transform(train_df[STATE_FEATS])
X_valid = scaler.transform(valid_df[STATE_FEATS])
X_test  = scaler.transform(test_df[STATE_FEATS])

P_train = train_df['close'].to_numpy()
P_valid = valid_df['close'].to_numpy()
P_test  = test_df['close'].to_numpy()

R_train = train_df['logret_1'].to_numpy()
R_valid = valid_df['logret_1'].to_numpy()
R_test  = test_df['logret_1'].to_numpy()

# TDD checks
assert X_train.shape[0] == len(P_train) == len(R_train)
assert X_valid.shape[0] == len(P_valid) == len(R_valid)
assert X_test.shape[0]  == len(P_test)  == len(R_test)
assert X_train.shape[1] == len(STATE_FEATS)

print("Shapes ->",
      "train", X_train.shape, "| valid", X_valid.shape, "| test", X_test.shape)

# --- 2.2 Trading environment (gym-like, ไม่พึ่งพา gym) ---
# Actions: 0=SHORT (-1), 1=FLAT (0), 2=LONG (+1)
# Reward_t = pos_{t-1} * logret_t - trade_cost_t
# Trade cost: คิด half-spread เมื่อเข้า/ออก และ full-spread เมื่อ flip ทันที (exit+enter)

@dataclass
class TradingEnvConfig:
    spread_pips: float = 1.5      # EURUSD เฉลี่ย 0.8–1.5 pips; ใช้ 1.5 pips แบบ conservative
    pip_size: float = 1e-4        # EURUSD pip = 0.0001
    max_steps: int = None         # ไม่กำหนด = วิ่งจนจบชุดข้อมูล
    reward_clip: float = None     # เช่น 0.01 เพื่อลด outlier; None = ไม่ clip
    seed: int = 42

class TradingEnv:
    def __init__(self, X: np.ndarray, prices: np.ndarray, logrets: np.ndarray, cfg: TradingEnvConfig):
        assert len(X) == len(prices) == len(logrets)
        self.X = X
        self.prices = prices
        self.rets = logrets
        self.cfg = cfg
        self.rng = np.random.default_rng(cfg.seed)
        self.n = len(X)
        self.n_features = X.shape[1]
        self.reset()

    def _trade_cost_frac(self, old_pos: int, new_pos: int, price: float) -> float:
        # cost (fraction of price) ≈ spread/price. ใช้กฎ: enter/exit = half-spread, flip = full-spread
        if new_pos == old_pos:
            return 0.0
        half = 0.5 * (self.cfg.spread_pips * self.cfg.pip_size) / max(price, 1e-12)
        if old_pos == 0 or new_pos == 0:
            return half
        else:
            return 2 * half  # flip (ออก+เข้า)

    def reset(self) -> np.ndarray:
        self.t = 0
        self.pos = 0               # เริ่มกลาง (FLAT)
        self.equity = 0.0
        self.done = False
        # คืน observation แรก
        return self.X[self.t].astype(np.float32)

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, dict]:
        if self.done:
            raise RuntimeError("Episode already done. Call reset().")
        assert action in (0,1,2)
        new_pos = action - 1       # map -> {-1,0,1}

        # ค่าคอมมิชชัน/สเปรด ณ เวลาตัดสินใจ (คิดเป็นสัดส่วนของราคา)
        cost_frac = self._trade_cost_frac(self.pos, new_pos, self.prices[self.t])

        # กำไรขาดทุนของช่วง t = pos_{t} * ret_{t+1?}
        # ที่นี่เราให้ reward ใช้ ret ของแท่ง 'ปัจจุบัน' (close-to-close) กับ pos ก่อนหน้า
        # หมายเหตุ: แท่งแรก pos=0 จึง reward=0 ตามธรรมชาติ
        reward = (self.pos * self.rets[self.t]) - cost_frac

        # optional clip
        if self.cfg.reward_clip is not None:
            reward = float(np.clip(reward, -self.cfg.reward_clip, self.cfg.reward_clip))

        # อัพเดทสถานะ
        self.pos = new_pos
        self.equity += reward
        self.t += 1

        # จบเมื่อหมดข้อมูลหรือถึง max_steps
        if self.t >= self.n - 1:
            self.done = True
        if self.cfg.max_steps is not None and self.t >= self.cfg.max_steps:
            self.done = True

        obs = self.X[self.t].astype(np.float32) if not self.done else self.X[-1].astype(np.float32)
        info = {"t": self.t, "pos": self.pos, "equity": self.equity, "price": self.prices[self.t-1]}
        return obs, float(reward), self.done, info

    def render(self):
        print(f"t={self.t} pos={self.pos:+d} equity={self.equity:+.6f} price={self.prices[self.t-1]:.5f}")

# --- 2.3 สร้าง env สำหรับแต่ละ split + smoke test ---
cfg = TradingEnvConfig(spread_pips=1.5, pip_size=1e-4, reward_clip=None, seed=123)

env_train = TradingEnv(X_train, P_train, R_train, cfg)
env_valid = TradingEnv(X_valid, P_valid, R_valid, cfg)
env_test  = TradingEnv(X_test,  P_test,  R_test,  cfg)

# Quick smoke test: random policy 200 steps
obs = env_train.reset()
total_r = 0.0
for _ in range(200):
    a = np.random.randint(0, 3)  # 0=short,1=flat,2=long
    obs, r, done, info = env_train.step(a)
    total_r += r
    if done: break

print("Random policy 200 steps → reward:", total_r, "| t:", info["t"], "| equity:", info["equity"])
env_train.render()

# TDD checks: consistency
assert env_train.n_features == len(STATE_FEATS)
assert hasattr(env_train, "step") and hasattr(env_train, "reset")


Shapes -> train (18058, 8) | valid (3870, 8) | test (3870, 8)
Random policy 200 steps → reward: -0.0044266552898062665 | t: 200 | equity: -0.0044266552898062665
t=200 pos=+0 equity=-0.004427 price=1.50134


In [None]:
# ===== STEP 3: DQN Agent (PyTorch) + Training + Validation Metrics =====
import math, random, copy, os
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# ----- 3.1 Utils & Repro -----
def seed_everything(seed=123):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
seed_everything(123)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ----- 3.2 Q-Network -----
class QNetwork(nn.Module):
    def __init__(self, in_dim: int, n_actions: int = 3, hidden=(128, 128)):
        super().__init__()
        layers = []
        last = in_dim
        for h in hidden:
            layers += [nn.Linear(last, h), nn.ReLU()]
            last = h
        layers += [nn.Linear(last, n_actions)]
        self.net = nn.Sequential(*layers)
        self.apply(self._init)

    @staticmethod
    def _init(m):
        if isinstance(m, nn.Linear):
            nn.init.orthogonal_(m.weight, gain=math.sqrt(2))
            nn.init.constant_(m.bias, 0.0)

    def forward(self, x):
        return self.net(x)

# ----- 3.3 Replay Buffer -----
class ReplayBuffer:
    def __init__(self, capacity: int, obs_dim: int):
        self.capacity = capacity
        self.obs_dim = obs_dim
        self.ptr = 0
        self.full = False
        self.s = np.zeros((capacity, obs_dim), dtype=np.float32)
        self.a = np.zeros((capacity, 1), dtype=np.int64)
        self.r = np.zeros((capacity, 1), dtype=np.float32)
        self.ns = np.zeros((capacity, obs_dim), dtype=np.float32)
        self.d = np.zeros((capacity, 1), dtype=np.float32)

    def add(self, s, a, r, ns, d):
        i = self.ptr
        self.s[i] = s
        self.a[i] = a
        self.r[i] = r
        self.ns[i] = ns
        self.d[i] = d
        self.ptr = (self.ptr + 1) % self.capacity
        if self.ptr == 0:
            self.full = True

    def __len__(self):
        return self.capacity if self.full else self.ptr

    def sample(self, batch_size: int):
        idx = np.random.randint(0, len(self), size=batch_size)
        s  = torch.as_tensor(self.s[idx],  device=device)
        a  = torch.as_tensor(self.a[idx],  device=device)
        r  = torch.as_tensor(self.r[idx],  device=device)
        ns = torch.as_tensor(self.ns[idx], device=device)
        d  = torch.as_tensor(self.d[idx],  device=device)
        return s, a, r, ns, d

# ----- 3.4 Config -----
@dataclass
class DQNConfig:
    gamma: float = 0.99
    lr: float = 1e-3
    batch_size: int = 128
    buffer_capacity: int = 200_000
    start_learning: int = 2_000
    target_sync: int = 1_000
    eps_start: float = 1.0
    eps_end: float = 0.05
    eps_decay_steps: int = 50_000
    double_dqn: bool = True
    grad_clip: float = 1.0
    episodes: int = 6          # วิ่งหลายรอบบนชุด train เดิม (POC)
    patience: int = 3          # early stopping บน valid Sharpe
    model_path: str = "/content/drive/MyDrive/IS_Safem0de/dqn_fx.pt"

cfg_dqn = DQNConfig()

# ----- 3.5 Agent -----
class DQNAgent:
    def __init__(self, obs_dim, n_actions=3, cfg: DQNConfig = cfg_dqn):
        self.cfg = cfg
        self.n_actions = n_actions
        self.q = QNetwork(obs_dim, n_actions=n_actions).to(device)
        self.targ = copy.deepcopy(self.q).to(device)
        self.opt = optim.Adam(self.q.parameters(), lr=cfg.lr)
        self.buf = ReplayBuffer(cfg.buffer_capacity, obs_dim)
        self.step_count = 0

    def epsilon(self):
        # Linear decay
        frac = min(1.0, self.step_count / max(1, self.cfg.eps_decay_steps))
        return max(self.cfg.eps_end, self.cfg.eps_start * (1.0 - frac))

    @torch.no_grad()
    def act(self, obs: np.ndarray, greedy: bool = False) -> int:
        if (not greedy) and (np.random.rand() < self.epsilon()):
            return np.random.randint(0, self.n_actions)
        x = torch.from_numpy(obs).float().unsqueeze(0).to(device)
        q = self.q(x)
        return int(torch.argmax(q, dim=1).item())

    def update(self):
        if len(self.buf) < self.cfg.start_learning:
            return None

        s, a, r, ns, d = self.buf.sample(self.cfg.batch_size)

        q_pred = self.q(s).gather(1, a)  # (B,1)

        with torch.no_grad():
            if self.cfg.double_dqn:
                # Double DQN: argmax by online, value by target
                next_a = torch.argmax(self.q(ns), dim=1, keepdim=True)
                next_q = self.targ(ns).gather(1, next_a).squeeze(1)
            else:
                next_q = torch.max(self.targ(ns), dim=1).values
            target = r.squeeze(1) + self.cfg.gamma * (1 - d.squeeze(1)) * next_q

        loss = nn.SmoothL1Loss()(q_pred.squeeze(1), target)

        self.opt.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.q.parameters(), self.cfg.grad_clip)
        self.opt.step()

        if self.step_count % self.cfg.target_sync == 0:
            self.targ.load_state_dict(self.q.state_dict())

        return float(loss.item())

# ----- 3.6 Training loop -----
def steps_per_year_h4():
    return int((24/4) * 252)  # ≈ 1512

def eval_agent(env, agent: DQNAgent):
    obs = env.reset()
    done = False
    rewards = []
    actions = []
    while not done:
        a = agent.act(obs, greedy=True)
        obs, r, done, info = env.step(a)
        rewards.append(r)
        actions.append(a)

    r = np.array(rewards, dtype=np.float64)
    eps = 1e-12
    mean_r = float(np.mean(r))
    std_r = float(np.std(r) + eps)
    ann_factor = math.sqrt(steps_per_year_h4())
    sharpe = float((mean_r / std_r) * ann_factor) if std_r > 0 else 0.0
    total_log = float(np.sum(r))
    total_return = float(np.exp(total_log) - 1.0)

    cum = np.cumsum(r)
    eq = np.exp(cum)  # เริ่มต้น 1.0
    peak = np.maximum.accumulate(eq)
    dd = float(np.min(eq/peak - 1.0))
    hit = float(np.mean(r > 0.0))
    trades = int(np.sum(np.array(actions[1:]) != np.array(actions[:-1])))

    return {
        "steps": len(r),
        "mean_reward": mean_r,
        "sharpe": sharpe,
        "total_return_pct": 100*total_return,
        "max_drawdown_pct": 100*dd,
        "hit_rate": hit,
        "trades": trades
    }

def train_dqn(env_train, env_valid, agent: DQNAgent, cfg: DQNConfig):
    best_sharpe = -1e9
    best_state = None
    patience_left = cfg.patience

    for ep in range(1, cfg.episodes+1):
        obs = env_train.reset()
        done = False
        ep_loss = []
        ep_reward = 0.0

        while not done:
            a = agent.act(obs, greedy=False)
            obs2, r, done, info = env_train.step(a)

            agent.buf.add(obs.astype(np.float32), a, float(r), obs2.astype(np.float32), float(done))
            agent.step_count += 1

            loss = agent.update()
            if loss is not None and not math.isnan(loss):
                ep_loss.append(loss)

            obs = obs2
            ep_reward += r

        # === End of episode: evaluate on VALID ===
        met = eval_agent(env_valid, agent)
        print(f"[Ep {ep}] train_reward={ep_reward:.4f} | valid_sharpe={met['sharpe']:.3f} | "
              f"valid_ret={met['total_return_pct']:.2f}% | trades={met['trades']} | "
              f"mean_loss={(np.mean(ep_loss) if ep_loss else float('nan')):.5f}")

        # Early stopping on best Sharpe
        if met["sharpe"] > best_sharpe + 1e-6:
            best_sharpe = met["sharpe"]
            best_state = copy.deepcopy(agent.q.state_dict())
            patience_left = cfg.patience
        else:
            patience_left -= 1
            if patience_left <= 0:
                print("Early stopping (no better Sharpe on validation).")
                break

    if best_state is not None:
        agent.q.load_state_dict(best_state)
        agent.targ.load_state_dict(best_state)

    # Save model (optional)
    try:
        os.makedirs(os.path.dirname(cfg.model_path), exist_ok=True)
        torch.save(agent.q.state_dict(), cfg.model_path)
        print("Saved:", cfg.model_path)
    except Exception as e:
        print("Skip saving:", e)

    return agent

# ----- 3.7 Wire it up & GO -----
obs_dim = len(STATE_FEATS)
agent = DQNAgent(obs_dim, n_actions=3, cfg=cfg_dqn)

# Sanity TDD
assert env_train.n_features == obs_dim
assert hasattr(agent, "act") and hasattr(agent, "update")

agent = train_dqn(env_train, env_valid, agent, cfg_dqn)

# ----- 3.8 Final evaluation on VALID + TEST -----
val_metrics = eval_agent(env_valid, agent)
test_metrics = eval_agent(env_test, agent)

def pretty(m):
    return (
        f"steps={m['steps']}, sharpe={m['sharpe']:.3f}, "
        f"ret={m['total_return_pct']:.2f}%, mdd={m['max_drawdown_pct']:.2f}%, "
        f"hit_rate={100*m['hit_rate']:.1f}%, trades={m['trades']}"
    )

print("VALID  :", pretty(val_metrics))
print("TEST   :", pretty(test_metrics))

# TDD: ตัวเลขต้อง finite
for k,v in test_metrics.items():
    assert np.isfinite(v), f"Metric {k} is NaN/inf"


Device: cuda
[Ep 1] train_reward=-1.0059 | valid_sharpe=0.013 | valid_ret=0.25% | trades=1286 | mean_loss=0.00270
[Ep 2] train_reward=-0.8747 | valid_sharpe=-0.806 | valid_ret=-12.70% | trades=1155 | mean_loss=0.00018
[Ep 3] train_reward=-0.5153 | valid_sharpe=-0.043 | valid_ret=-0.77% | trades=728 | mean_loss=0.00012
[Ep 4] train_reward=-0.4404 | valid_sharpe=0.652 | valid_ret=14.22% | trades=311 | mean_loss=0.00004
[Ep 5] train_reward=-0.3689 | valid_sharpe=-0.184 | valid_ret=-3.19% | trades=831 | mean_loss=0.00002
[Ep 6] train_reward=-0.2904 | valid_sharpe=0.186 | valid_ret=3.78% | trades=999 | mean_loss=0.00001
Saved: /content/drive/MyDrive/IS_Safem0de/dqn_fx.pt
VALID  : steps=3869, sharpe=0.652, ret=14.22%, mdd=-14.03%, hit_rate=50.0%, trades=311
TEST   : steps=3869, sharpe=-0.264, ret=-4.56%, mdd=-14.24%, hit_rate=48.8%, trades=93


In [None]:
# ===== STEP 4: De-churn + Stability Patches =====
from dataclasses import dataclass
import copy, math, numpy as np, torch

# --- 4.1: ปรับ Environment ให้ลดการสลับถี่ ---
@dataclass
class TradingEnvConfig:
    spread_pips: float = 1.5      # เดิม
    pip_size: float = 1e-4
    reward_clip: float = 0.01     # กัน outlier นิดๆ
    min_hold: int = 2             # ต้องถืออย่างน้อย 2 แท่งก่อนเปลี่ยน
    churn_penalty_mult: float = 1.0 # เพิ่มโทษต่อการเปลี่ยน = 1*half-spread ต่อครั้ง
    seed: int = 42
    max_steps: int = None

class TradingEnv:
    def __init__(self, X, prices, logrets, cfg: TradingEnvConfig):
        assert len(X) == len(prices) == len(logrets)
        self.X, self.prices, self.rets, self.cfg = X, prices, logrets, cfg
        self.n, self.n_features = len(X), X.shape[1]
        self.reset()

    def _half_spread_frac(self, price: float) -> float:
        return 0.5 * (self.cfg.spread_pips * self.cfg.pip_size) / max(price, 1e-12)

    def _trade_cost_frac(self, old_pos: int, new_pos: int, price: float) -> float:
        if new_pos == old_pos:
            return 0.0
        half = self._half_spread_frac(price)
        base = half if (old_pos == 0 or new_pos == 0) else 2*half
        extra = self.cfg.churn_penalty_mult * half  # โทษเพิ่มเมื่อเปลี่ยน
        return base + extra

    def reset(self):
        self.t = 0
        self.pos = 0
        self.equity = 0.0
        self.hold_left = 0
        self.done = False
        return self.X[self.t].astype(np.float32)

    def step(self, action: int):
        if self.done:
            raise RuntimeError("Call reset() first")
        assert action in (0,1,2)
        want_pos = action - 1

        # บังคับ min_hold: ถ้ายังไม่ครบห้ามเปลี่ยน
        if self.hold_left > 0 and want_pos != self.pos:
            want_pos = self.pos

        price_t = self.prices[self.t]
        cost_frac = self._trade_cost_frac(self.pos, want_pos, price_t)

        # ใช้ผลตอบแทนของแท่งปัจจุบันกับสถานะก่อนหน้า (action มีผลแท่งถัดไป)
        reward = (self.pos * self.rets[self.t]) - cost_frac

        # clip
        if self.cfg.reward_clip is not None:
            reward = float(np.clip(reward, -self.cfg.reward_clip, self.cfg.reward_clip))

        # อัปเดตสถานะ
        changed = (want_pos != self.pos)
        self.pos = want_pos
        self.equity += reward
        self.t += 1

        # นับ hold ต่อเมื่อมีการเปลี่ยนจริง
        if changed:
            self.hold_left = self.cfg.min_hold
        else:
            self.hold_left = max(0, self.hold_left - 1)

        if self.t >= self.n - 1 or (self.cfg.max_steps and self.t >= self.cfg.max_steps):
            self.done = True

        obs = self.X[self.t].astype(np.float32) if not self.done else self.X[-1].astype(np.float32)
        info = {"t": self.t, "pos": self.pos, "equity": self.equity, "price": price_t}
        return obs, float(reward), self.done, info

# --- 4.2: re-instantiate envs with new config ---
cfg = TradingEnvConfig(spread_pips=1.5, reward_clip=0.01, min_hold=2, churn_penalty_mult=1.0, seed=123)
env_train = TradingEnv(X_train, P_train, R_train, cfg)
env_valid = TradingEnv(X_valid, P_valid, R_valid, cfg)
env_test  = TradingEnv(X_test,  P_test,  R_test,  cfg)

# --- 4.3: ปรับ Hyperparams DQN ให้เสถียรกว่า ---
from dataclasses import dataclass
import torch.nn as nn, torch.optim as optim

@dataclass
class DQNConfig:
    gamma: float = 0.99
    lr: float = 5e-4                 # ↓ ลง
    batch_size: int = 256            # ↑ ขึ้นเล็กน้อย
    buffer_capacity: int = 400_000   # ↑
    start_learning: int = 10_000     # ↑ เริ่มเรียนช้าลง
    target_sync: int = 2_000         # ↓ sync ถี่ขึ้นนิด
    eps_start: float = 1.0
    eps_end: float = 0.05
    eps_decay_steps: int = 150_000   # ↑ ยืดเวลา explore
    double_dqn: bool = True
    grad_clip: float = 1.0
    episodes: int = 12               # ↑
    patience: int = 4
    model_path: str = "/content/drive/MyDrive/IS_Safem0de/dqn_fx_v2.pt"

# ใช้ QNetwork/ReplayBuffer/Agent เดิม แต่เปลี่ยน optimizer ให้มี weight_decay
class DQNAgent:
    def __init__(self, obs_dim, n_actions=3, cfg: DQNConfig = DQNConfig()):
        self.cfg = cfg
        self.n_actions = n_actions
        self.q = QNetwork(obs_dim, n_actions=n_actions).to(device)
        self.targ = copy.deepcopy(self.q).to(device)
        self.opt = optim.Adam(self.q.parameters(), lr=cfg.lr, weight_decay=1e-5)  # + L2
        self.buf = ReplayBuffer(cfg.buffer_capacity, obs_dim)
        self.step_count = 0
    # ... (methods epsilon/act/update เหมือนเดิม) ...
    def epsilon(self):
        # Linear decay
        frac = min(1.0, self.step_count / max(1, self.cfg.eps_decay_steps))
        return max(self.cfg.eps_end, self.cfg.eps_start * (1.0 - frac))

    @torch.no_grad()
    def act(self, obs: np.ndarray, greedy: bool = False) -> int:
        if (not greedy) and (np.random.rand() < self.epsilon()):
            return np.random.randint(0, self.n_actions)
        x = torch.from_numpy(obs).float().unsqueeze(0).to(device)
        q = self.q(x)
        return int(torch.argmax(q, dim=1).item())

    def update(self):
        if len(self.buf) < self.cfg.start_learning:
            return None

        s, a, r, ns, d = self.buf.sample(self.cfg.batch_size)

        q_pred = self.q(s).gather(1, a)  # (B,1)

        with torch.no_grad():
            if self.cfg.double_dqn:
                # Double DQN: argmax by online, value by target
                next_a = torch.argmax(self.q(ns), dim=1, keepdim=True)
                next_q = self.targ(ns).gather(1, next_a).squeeze(1)
            else:
                next_q = torch.max(self.targ(ns), dim=1).values
            target = r.squeeze(1) + self.cfg.gamma * (1 - d.squeeze(1)) * next_q

        loss = nn.SmoothL1Loss()(q_pred.squeeze(1), target)

        self.opt.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.q.parameters(), self.cfg.grad_clip)
        self.opt.step()

        if self.step_count % self.cfg.target_sync == 0:
            self.targ.load_state_dict(self.q.state_dict())

        return float(loss.item())

# สร้าง/ฝึกใหม่
cfg_dqn = DQNConfig()
agent = DQNAgent(len(STATE_FEATS), n_actions=3, cfg=cfg_dqn)
agent = train_dqn(env_train, env_valid, agent, cfg_dqn)

# ประเมินผล
val_metrics = eval_agent(env_valid, agent)
test_metrics = eval_agent(env_test, agent)
print("VALID:", val_metrics)
print("TEST :", test_metrics)


[Ep 1] train_reward=-0.4734 | valid_sharpe=-1.090 | valid_ret=-17.24% | trades=1853 | mean_loss=0.00510
[Ep 2] train_reward=-0.7182 | valid_sharpe=-0.808 | valid_ret=-10.38% | trades=617 | mean_loss=0.00011
[Ep 3] train_reward=-1.1922 | valid_sharpe=-0.209 | valid_ret=-4.09% | trades=296 | mean_loss=0.00012
[Ep 4] train_reward=-0.6779 | valid_sharpe=1.464 | valid_ret=8.18% | trades=56 | mean_loss=0.00007
[Ep 5] train_reward=-0.0320 | valid_sharpe=0.000 | valid_ret=0.00% | trades=0 | mean_loss=0.00003
[Ep 6] train_reward=-0.9239 | valid_sharpe=-0.692 | valid_ret=-12.89% | trades=0 | mean_loss=0.00001
[Ep 7] train_reward=-0.5985 | valid_sharpe=0.691 | valid_ret=14.77% | trades=0 | mean_loss=0.00001
[Ep 8] train_reward=-0.0385 | valid_sharpe=0.691 | valid_ret=14.77% | trades=0 | mean_loss=0.00000
Early stopping (no better Sharpe on validation).
Saved: /content/drive/MyDrive/IS_Safem0de/dqn_fx_v2.pt
VALID: {'steps': 3869, 'mean_reward': 2.0319532908376244e-05, 'sharpe': 1.4644611788011885,

In [None]:
# --- Patch ENV: ส่งค่า last_cost ออกมาใน info ---
# (ถ้าคุณใช้ TradingEnv รุ่นล่าสุดอยู่แล้ว ให้แทนที่ method step() เฉพาะส่วนด้านล่างนี้)
def _env_step_with_cost(self, action: int):
    if self.done:
        raise RuntimeError("Call reset() first")
    assert action in (0,1,2)
    want_pos = action - 1
    if self.hold_left > 0 and want_pos != self.pos:
        want_pos = self.pos

    price_t = self.prices[self.t]
    cost_frac = self._trade_cost_frac(self.pos, want_pos, price_t)
    reward = (self.pos * self.rets[self.t]) - cost_frac

    if self.cfg.reward_clip is not None:
        reward = float(np.clip(reward, -self.cfg.reward_clip, self.cfg.reward_clip))

    changed = (want_pos != self.pos)
    self.pos = want_pos
    self.equity += reward
    self.t += 1

    if changed:
        self.hold_left = self.cfg.min_hold
    else:
        self.hold_left = max(0, self.hold_left - 1)

    if self.t >= self.n - 1 or (self.cfg.max_steps and self.t >= self.cfg.max_steps):
        self.done = True

    obs = self.X[self.t].astype(np.float32) if not self.done else self.X[-1].astype(np.float32)
    info = {"t": self.t, "pos": self.pos, "equity": self.equity,
            "price": price_t, "last_cost": cost_frac}
    return obs, float(reward), self.done, info

# bind method (ถ้า class เดิมชื่อ TradingEnv)
TradingEnv.step = _env_step_with_cost

# --- Eval ใหม่: นับ "เปลี่ยนสถานะจริง", ถือเฉลี่ยกี่บาร์, exposure, total_cost ---
import math, numpy as np

def steps_per_year_h4():
    return int((24/4) * 252)  # ≈ 1512

def _ann_sharpe_from_rewards(r):
    r = np.asarray(r, dtype=np.float64)
    mu, sd = r.mean(), r.std() + 1e-12
    return float((mu/sd) * math.sqrt(steps_per_year_h4())) if sd > 0 else 0.0

def _avg_hold_bars(pos_arr):
    runs = []
    run = 0
    prev = 0
    for p in pos_arr:
        if p != 0:
            if p == prev:
                run += 1
            else:
                if run > 0: runs.append(run)
                run = 1
        else:
            if run > 0: runs.append(run); run = 0
        prev = p
    if run > 0: runs.append(run)
    return float(np.mean(runs)) if runs else 0.0

def eval_agent(env, agent):
    obs = env.reset()
    done = False
    rewards, positions, costs = [], [], []
    while not done:
        a = agent.act(obs, greedy=True)
        obs, r, done, info = env.step(a)
        rewards.append(r)
        positions.append(info["pos"])
        costs.append(info.get("last_cost", 0.0))

    r = np.array(rewards, dtype=np.float64)
    eq = np.exp(np.cumsum(r))
    peak = np.maximum.accumulate(eq)
    dd = float(np.min(eq/peak - 1.0))
    total_return = float(np.exp(np.sum(r)) - 1.0)

    pos_arr = np.array(positions, dtype=int)
    pos_changes = (pos_arr[1:] != pos_arr[:-1])
    true_trades = int((1 if pos_arr[0] != 0 else 0) + np.sum(pos_changes))
    flips = int(np.sum((pos_arr[1:] * pos_arr[:-1]) == -1))  # short<->long
    exposure = float(np.mean(np.abs(pos_arr)))               # 0..1
    avg_hold = _avg_hold_bars(pos_arr)
    sharpe = _ann_sharpe_from_rewards(r)

    return {
        "steps": len(r),
        "mean_reward": float(r.mean()),
        "sharpe": sharpe,
        "total_return_pct": 100*total_return,
        "max_drawdown_pct": 100*dd,
        "hit_rate": float(np.mean(r > 0.0)),
        "trades": true_trades,
        "flips": flips,
        "avg_hold_bars": avg_hold,
        "exposure": exposure,
        "total_cost_bps": float(1e4*np.sum(costs))  # ประมาณค่าธรรมเนียมรวม (bp)
    }


In [None]:
# ประเมินใหม่ด้วย eval_agent ที่แพตช์แล้ว (นับ trades จริง + costs)
val_m  = eval_agent(env_valid, agent)
test_m = eval_agent(env_test,  agent)

def brief(m, name):
    print(f"{name}: sharpe={m['sharpe']:.2f}, ret={m['total_return_pct']:.2f}%"
          f", mdd={m['max_drawdown_pct']:.2f}%"
          f", trades={m['trades']}, flips={m['flips']}"
          f", avg_hold={m['avg_hold_bars']:.1f} bars"
          f", exposure={m['exposure']:.2f}"
          f", total_cost≈{m['total_cost_bps']:.0f} bp")
brief(val_m,  "VALID")
brief(test_m, "TEST")


VALID: sharpe=1.46, ret=8.18%, mdd=-2.00%, trades=42, flips=0, avg_hold=7.9 bars, exposure=0.04, total_cost≈61 bp
TEST: sharpe=0.46, ret=0.82%, mdd=-0.47%, trades=10, flips=0, avg_hold=4.2 bars, exposure=0.01, total_cost≈14 bp


In [None]:
# ===== PATCH D: Holding cost ต่อบาร์ =====
from dataclasses import dataclass
import numpy as np

@dataclass
class TradingEnvConfig:
    spread_pips: float = 1.5
    pip_size: float = 1e-4
    reward_clip: float = 0.01
    min_hold: int = 2
    churn_penalty_mult: float = 1.0
    seed: int = 123
    max_steps: int = None
    # ---- เพิ่มเติม ----
    obs_add_pos: bool = True
    obs_add_hold: bool = True
    hold_norm: int = 8
    hold_cost_bps_per_bar: float = 0.25   # ← เริ่มที่ 0.25 bp ต่อบาร์ (0.000025)

class TradingEnv:
    def __init__(self, X, prices, logrets, cfg: TradingEnvConfig):
        self.X, self.prices, self.rets, self.cfg = X, prices, logrets, cfg
        self.base_features = X.shape[1]
        extra = (1 if cfg.obs_add_pos else 0) + (1 if cfg.obs_add_hold else 0)
        self.n_features = self.base_features + extra
        self.n = len(X)
        self.reset()

    def _half_spread_frac(self, price: float) -> float:
        return 0.5 * (self.cfg.spread_pips * self.cfg.pip_size) / max(price, 1e-12)

    def _trade_cost_frac(self, old_pos: int, new_pos: int, price: float) -> float:
        if new_pos == old_pos:
            return 0.0
        half = self._half_spread_frac(price)
        base = half if (old_pos == 0 or new_pos == 0) else 2*half
        extra = self.cfg.churn_penalty_mult * half
        return base + extra

    def _obs(self):
        extras = []
        if self.cfg.obs_add_pos:
            extras.append(float(self.pos))
        if self.cfg.obs_add_hold:
            extras.append(float(self.hold_left)/self.cfg.hold_norm)
        if extras:
            return np.concatenate([self.X[self.t], np.array(extras, dtype=np.float32)])
        return self.X[self.t]

    def reset(self):
        self.t = 0
        self.pos = 0
        self.equity = 0.0
        self.hold_left = 0
        self.done = False
        return self._obs().astype(np.float32)

    def step(self, action: int):
        assert action in (0,1,2)
        if self.done:
            raise RuntimeError("Call reset() first")
        want_pos = action - 1
        if self.hold_left > 0 and want_pos != self.pos:
            want_pos = self.pos

        price_t = self.prices[self.t]
        # ค่าธรรมเนียมเปลี่ยนสถานะ (สเปรด + โทษ churn)
        switch_cost = self._trade_cost_frac(self.pos, want_pos, price_t)
        # ค่า "เช่าการถือสถานะ" ต่อบาร์ (คงที่ ไม่ขึ้นกับราคา)
        hold_cost = (abs(self.pos) > 0) * (self.cfg.hold_cost_bps_per_bar * 1e-4)

        # reward = กำไรจากตำแหน่งก่อนหน้า - ค่าธรรมเนียม
        reward = (self.pos * self.rets[self.t]) - switch_cost - hold_cost
        if self.cfg.reward_clip is not None:
            reward = float(np.clip(reward, -self.cfg.reward_clip, self.cfg.reward_clip))

        changed = (want_pos != self.pos)
        self.pos = want_pos
        self.equity += reward
        self.t += 1

        if changed: self.hold_left = self.cfg.min_hold
        else:       self.hold_left = max(0, self.hold_left - 1)

        if self.t >= self.n - 1 or (self.cfg.max_steps and self.t >= self.cfg.max_steps):
            self.done = True

        obs = self._obs().astype(np.float32)
        info = {"t": self.t, "pos": self.pos, "equity": self.equity,
                "price": price_t, "last_cost": switch_cost + hold_cost}
        return obs, float(reward), self.done, info


In [None]:
# สร้าง env ใหม่ (ใช้ X_*/P_*/R_* และ scaler เดิม)
cfg = TradingEnvConfig(
    spread_pips=1.5, reward_clip=0.01,
    min_hold=2, churn_penalty_mult=1.0,
    obs_add_pos=True, obs_add_hold=True, hold_norm=8,
    hold_cost_bps_per_bar=0.25   # ลอง 0.25 bp/bar ก่อน
)
env_train = TradingEnv(X_train, P_train, R_train, cfg)
env_valid = TradingEnv(X_valid, P_valid, R_valid, cfg)
env_test  = TradingEnv(X_test,  P_test,  R_test,  cfg)

# เทรน (ใช้ DQN v2 config เดิม)
cfg_dqn.episodes = 10
agent = DQNAgent(env_train.n_features, n_actions=3, cfg=cfg_dqn)
agent = train_dqn(env_train, env_valid, agent, cfg_dqn)

# ประเมิน
val_m  = eval_agent(env_valid, agent)
test_m = eval_agent(env_test,  agent)
brief(val_m,  "VALID (hold cost)")
brief(test_m, "TEST  (hold cost)")


[Ep 1] train_reward=-1.1119 | valid_sharpe=-1.605 | valid_ret=-21.42% | trades=754 | mean_loss=0.00872
[Ep 2] train_reward=-0.8671 | valid_sharpe=-1.181 | valid_ret=-18.10% | trades=673 | mean_loss=0.00038
[Ep 3] train_reward=-1.4546 | valid_sharpe=-0.922 | valid_ret=-11.11% | trades=462 | mean_loss=0.00024
[Ep 4] train_reward=-1.1564 | valid_sharpe=-1.175 | valid_ret=-20.90% | trades=1 | mean_loss=0.00009
[Ep 5] train_reward=-0.9505 | valid_sharpe=-0.005 | valid_ret=-0.08% | trades=336 | mean_loss=0.00003
[Ep 6] train_reward=-0.9574 | valid_sharpe=0.208 | valid_ret=4.23% | trades=1 | mean_loss=0.00001
[Ep 7] train_reward=-1.1723 | valid_sharpe=0.000 | valid_ret=0.00% | trades=0 | mean_loss=0.00000
[Ep 8] train_reward=-0.6662 | valid_sharpe=-1.175 | valid_ret=-20.90% | trades=1 | mean_loss=0.00000
[Ep 9] train_reward=-0.4933 | valid_sharpe=0.208 | valid_ret=4.23% | trades=1 | mean_loss=0.00000
[Ep 10] train_reward=-1.0592 | valid_sharpe=-0.570 | valid_ret=-10.62% | trades=661 | mean_lo

In [None]:
# ===== Semi-auto tuning: hold_cost_bps_per_bar =====
def run_trial(hold_cost_bp, episodes=6, seed=123):
    # 1) build envs with given holding cost
    cfg = TradingEnvConfig(
        spread_pips=1.5, reward_clip=0.01,
        min_hold=2, churn_penalty_mult=1.0,
        obs_add_pos=True, obs_add_hold=True, hold_norm=8,
        hold_cost_bps_per_bar=hold_cost_bp,
        seed=seed
    )
    e_tr = TradingEnv(X_train, P_train, R_train, cfg)
    e_va = TradingEnv(X_valid, P_valid, R_valid, cfg)
    e_te = TradingEnv(X_test,  P_test,  R_test,  cfg)

    # 2) agent & train (เบาๆให้ไว)
    cfg_dqn_local = copy.deepcopy(cfg_dqn)
    cfg_dqn_local.episodes = episodes
    agent = DQNAgent(e_tr.n_features, n_actions=3, cfg=cfg_dqn_local)
    agent = train_dqn(e_tr, e_va, agent, cfg_dqn_local)

    # 3) evaluate with patched metrics (จริงกว่า)
    vm = eval_agent(e_va, agent)
    tm = eval_agent(e_te, agent)
    return vm, tm

candidates = [0.15, 0.25, 0.35, 0.50, 0.75, 1.00]
results = []
for hp in candidates:
    vm, tm = run_trial(hp, episodes=6, seed=123)
    results.append({
        "hold_cost_bp": hp,
        "val_sharpe": vm["sharpe"], "test_sharpe": tm["sharpe"],
        "val_expo": vm["exposure"], "test_expo": tm["exposure"],
        "val_trades": vm["trades"], "test_trades": tm["trades"]
    })

# ดูผลแบบเรียงตาม test_sharpe
results_sorted = sorted(results, key=lambda x: x["test_sharpe"], reverse=True)
for r in results_sorted:
    print(
        f"hold_cost={r['hold_cost_bp']:.2f} bp/bar | "
        f"VAL S={r['val_sharpe']:.2f} expo={r['val_expo']:.2f} trades={r['val_trades']} || "
        f"TEST S={r['test_sharpe']:.2f} expo={r['test_expo']:.2f} trades={r['test_trades']}"
    )


[Ep 1] train_reward=-1.0713 | valid_sharpe=-0.592 | valid_ret=-9.81% | trades=752 | mean_loss=0.01106
[Ep 2] train_reward=-0.8650 | valid_sharpe=-1.861 | valid_ret=-30.11% | trades=746 | mean_loss=0.00056
[Ep 3] train_reward=-0.9533 | valid_sharpe=-0.952 | valid_ret=-10.59% | trades=632 | mean_loss=0.00053
[Ep 4] train_reward=-0.7629 | valid_sharpe=-1.346 | valid_ret=-23.14% | trades=233 | mean_loss=0.00024
[Ep 5] train_reward=-0.6626 | valid_sharpe=0.401 | valid_ret=8.32% | trades=1 | mean_loss=0.00008
[Ep 6] train_reward=-0.5973 | valid_sharpe=-1.176 | valid_ret=-20.95% | trades=209 | mean_loss=0.00003
Saved: /content/drive/MyDrive/IS_Safem0de/dqn_fx_v2.pt
[Ep 1] train_reward=-1.4161 | valid_sharpe=-1.656 | valid_ret=-26.79% | trades=216 | mean_loss=0.02194
[Ep 2] train_reward=-1.3681 | valid_sharpe=-1.175 | valid_ret=-20.90% | trades=1 | mean_loss=0.00004
[Ep 3] train_reward=-1.1502 | valid_sharpe=0.000 | valid_ret=0.00% | trades=0 | mean_loss=0.00001
[Ep 4] train_reward=-0.9571 | v

In [None]:
# (ทางเลือก) grid เล็กๆ: hold_cost x min_hold
c_hold = [0.25, 0.50, 0.75, 1.00]
c_holdmin = [1, 2, 3]
grid = []
for hp in c_hold:
    for mh in c_holdmin:
        cfg = TradingEnvConfig(
            spread_pips=1.5, reward_clip=0.01,
            min_hold=mh, churn_penalty_mult=1.0,
            obs_add_pos=True, obs_add_hold=True, hold_norm=8,
            hold_cost_bps_per_bar=hp, seed=123
        )
        e_tr = TradingEnv(X_train, P_train, R_train, cfg)
        e_va = TradingEnv(X_valid, P_valid, R_valid, cfg)
        e_te = TradingEnv(X_test,  P_test,  R_test,  cfg)

        cfg_dqn_local = copy.deepcopy(cfg_dqn); cfg_dqn_local.episodes = 5
        agent = DQNAgent(e_tr.n_features, n_actions=3, cfg=cfg_dqn_local)
        agent = train_dqn(e_tr, e_va, agent, cfg_dqn_local)

        vm = eval_agent(e_va, agent); tm = eval_agent(e_te, agent)
        grid.append((hp, mh, vm["sharpe"], tm["sharpe"], vm["exposure"], tm["exposure"]))

grid = sorted(grid, key=lambda x: x[3], reverse=True)
for hp, mh, vs, ts, ve, te in grid:
    print(f"hold_cost={hp:.2f}, min_hold={mh} | VAL S={vs:.2f}, TEST S={ts:.2f} | expo V/T={ve:.2f}/{te:.2f}")


[Ep 1] train_reward=-1.8132 | valid_sharpe=-2.144 | valid_ret=-27.06% | trades=1322 | mean_loss=0.01240
[Ep 2] train_reward=-1.4721 | valid_sharpe=-0.995 | valid_ret=-14.96% | trades=1070 | mean_loss=0.00042
[Ep 3] train_reward=-0.7987 | valid_sharpe=0.558 | valid_ret=10.54% | trades=155 | mean_loss=0.00030
[Ep 4] train_reward=-1.2461 | valid_sharpe=0.024 | valid_ret=0.36% | trades=367 | mean_loss=0.00013
[Ep 5] train_reward=-1.5936 | valid_sharpe=0.084 | valid_ret=0.72% | trades=220 | mean_loss=0.00006
Saved: /content/drive/MyDrive/IS_Safem0de/dqn_fx_v2.pt
[Ep 1] train_reward=-0.5375 | valid_sharpe=-2.233 | valid_ret=-33.86% | trades=415 | mean_loss=0.01057
[Ep 2] train_reward=-1.2398 | valid_sharpe=-1.661 | valid_ret=-19.88% | trades=327 | mean_loss=0.00041
[Ep 3] train_reward=-0.5328 | valid_sharpe=-1.232 | valid_ret=-21.34% | trades=57 | mean_loss=0.00016
[Ep 4] train_reward=-0.7371 | valid_sharpe=0.208 | valid_ret=4.23% | trades=1 | mean_loss=0.00008
[Ep 5] train_reward=-1.0961 | 