In [None]:
!pip install sb3-contrib


In [None]:
import os
import sys

import pandas as pd

from datetime import datetime, timedelta
from pathlib import Path
import torch as th
import numpy as np
import random
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat_simple import ForexTradingEnv
# from trading.environments.forex_env2_flat_simple import ForexTradingEnv2 as ForexTradingEnv
# from trading.environments.forex_env_flat_multi_pair import MultipairForexTradingEnv

from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


# Run-wide configuration for this training cell.
th.set_num_threads(3)  # limit PyTorch to 3 CPU threads
N_ENVS = 3  # Number of parallel environments
EVAL_FREUQENCY = 500_000
# The evaluation callback compares eval_freq against its call counter, so the
# frequency is divided by the number of environments (presumably because every
# callback call advances N_ENVS timesteps -- confirm against SB3 docs).
EVAL_FREQ_ADJUSTED = int(EVAL_FREUQENCY / N_ENVS)

hourly_dir = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/"
source_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/df_with_all_indics_unbiased/not_norm/train2/'
# os.listdir() returns entries in arbitrary order; sorting keeps the list of
# training files (and anything that depends on its order) stable between runs.
source_dfs = [
    os.path.join(hourly_dir, f)
    for f in sorted(os.listdir(hourly_dir))
    if f.endswith('.parquet') and not f.startswith('.') and 'validate' not in f
]

eval_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/EUR_GBP_validate.parquet'
sequence = 5
saving_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/results/'
os.makedirs(saving_path, exist_ok=True)

def set_all_seeds(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed handed to every random number generator.
    """
    th.backends.cudnn.deterministic = True
    for seed_fn in (np.random.seed, random.seed, th.manual_seed):
        seed_fn(seed)


set_all_seeds(42)

class ForexTensorboardCallback(BaseCallback):
    """Custom callback for logging Forex trading metrics to tensorboard.

    After every environment step the selected entries of the per-environment
    ``info`` dicts are averaged over all parallel environments and recorded
    under ``metrics/<key>``.
    """

    # Info entries that get logged. Further keys (e.g. 'total_pnl',
    # 'total_trades', 'win_rate') can be added here if the environment
    # provides them.
    _INFO_KEYS = ("balance", "transaction_costs", "position_size_pct")

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Kept for backward compatibility; this callback never writes to it.
        self.episode_returns = []

    def _on_step(self) -> bool:
        """Record the mean of each tracked metric across all environments.

        Recording inside a per-environment loop would overwrite the same
        logger key once per environment, leaving only the last environment's
        value; averaging first makes the logged number represent every env.

        Returns:
            Always ``True`` so training continues.
        """
        # infos is a list of dictionaries, one from each parallel environment
        infos = [info for info in self.locals['infos'] if info is not None]
        for key in self._INFO_KEYS:
            # Environments that do not report a key are skipped instead of
            # aborting training with a KeyError.
            values = [info[key] for info in infos if key in info]
            if values:
                self.logger.record(f"metrics/{key}", float(np.mean(values)))
        return True

    def _on_rollout_end(self) -> None:
        """Called at the end of a rollout; nothing extra is logged here."""
        pass

class DetailedEvalCallback(EvalCallback):
    """EvalCallback variant that logs trading metrics on every evaluation step.

    Every ``eval_freq`` calls the current policy is run for
    ``n_eval_episodes`` episodes on ``eval_env``. The ``balance``,
    ``total_pnl`` and ``transaction_costs`` info entries are dumped after each
    evaluation step; mean episode reward and length are recorded at the end.
    """

    def _on_step(self) -> bool:
        """Run the periodic evaluation and save the model when it improves.

        Returns:
            Always ``True``; this callback never stops training.
        """
        if self.eval_freq <= 0 or self.n_calls % self.eval_freq != 0:
            return True

        # Copy the running normalization statistics from the training env into
        # the evaluation env. The eval env is a separate VecNormalize instance,
        # so without this step the policy is evaluated on observations scaled
        # differently from the ones it was trained on.
        if self.model.get_vec_normalize_env() is not None:
            from stable_baselines3.common.vec_env import sync_envs_normalization

            sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards = []
        episode_lengths = []
        n_envs = self.eval_env.num_envs

        for _ in range(self.n_eval_episodes):
            episode_reward = 0.0
            episode_length = 0
            # VecEnv reset returns just the obs
            obs = self.eval_env.reset()
            # Recurrent policies (e.g. RecurrentPPO) need their hidden state
            # carried from one predict() call to the next; feed-forward
            # policies simply pass these arguments through.
            states = None
            episode_starts = np.ones((n_envs,), dtype=bool)
            done = False

            while not done:
                action, states = self.model.predict(
                    obs,
                    state=states,
                    episode_start=episode_starts,
                    deterministic=self.deterministic,
                )
                # VecEnv step returns (obs, rewards, dones, infos)
                obs, rewards, dones, infos = self.eval_env.step(action)
                episode_starts = dones
                # Only the first sub-environment is tracked.
                done = bool(dones[0])
                episode_reward += rewards[0]
                episode_length += 1

                step_info = infos[0]
                if step_info is not None:
                    self.logger.record("eval/balance", step_info.get('balance', 0))
                    self.logger.record("eval/total_pnl", step_info.get('total_pnl', 0))
                    self.logger.record("eval/transaction_costs", step_info.get('transaction_costs', 0))
                    # NOTE(review): every dump in this evaluation uses the same
                    # step value (self.n_calls), so the per-step points share
                    # one x-coordinate in the log.
                    self.logger.dump(self.n_calls)

            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)

        # Calculate mean metrics across episodes
        mean_reward = float(np.mean(episode_rewards))
        mean_length = float(np.mean(episode_lengths))
        # Keep the attribute of the stock EvalCallback up to date for any code
        # that inspects it after training.
        self.last_mean_reward = mean_reward

        self.logger.record("eval/mean_reward", mean_reward)
        self.logger.record("eval/mean_episode_length", mean_length)

        # Update best model if needed
        if self.best_model_save_path is not None:
            if self.verbose >= 1:
                print(f"Evaluating the current model: {mean_reward:.2f}")

            if mean_reward > self.best_mean_reward:
                if self.verbose >= 1:
                    print(f"New best mean reward: {mean_reward:.2f} "
                          f"(previous: {self.best_mean_reward:.2f})")
                self.best_mean_reward = mean_reward
                self.model.save(self.best_model_save_path)

        return True

    def _get_eval_info(self):
        """Return the eval environment's ``get_info()`` result, or ``None``.

        Tries ``eval_env.get_info()`` first, then ``eval_env.envs[0].get_info()``.
        Any exception is reported with a printed warning and turned into
        ``None`` because this helper is best-effort only.
        """
        try:
            # Try to get info directly from environment
            if hasattr(self.eval_env, 'get_info'):
                return self.eval_env.get_info()
            # If that's not available, try to get it from the unwrapped env
            elif hasattr(self.eval_env, 'envs'):
                return self.eval_env.envs[0].get_info()
            return None
        except Exception as e:
            print(f"Warning: Could not get eval info: {e}")
            return None


def make_train_env(rank):
    """Build a zero-argument factory for one monitored training environment.

    Args:
        rank: Index of the worker the factory is created for. It is accepted
            but not used when the environment is built.

    Returns:
        A callable that creates a ``ForexTradingEnv`` in training mode over
        ``source_dfs`` and wraps it in ``Monitor``.
    """
    def _init():
        env_kwargs = dict(
            df_paths=source_dfs,
            eval_mode=False,
            sequence_length=sequence,
        )
        return Monitor(ForexTradingEnv(**env_kwargs))

    return _init


# One subprocess-backed environment per worker index, with observations and
# rewards normalized by the VecNormalize wrapper.
train_env = VecNormalize(
    SubprocVecEnv([make_train_env(worker_idx) for worker_idx in range(N_ENVS)]),
    norm_obs=True,
    norm_reward=True,
)



def make_eval_env():
    """Create the single-environment evaluation VecEnv.

    Returns:
        A ``VecNormalize``-wrapped ``DummyVecEnv`` holding one monitored
        ``ForexTradingEnv`` in eval mode for the EUR_GBP pair. Observations
        are normalized, rewards are not, and ``training`` is switched off.
    """
    monitored_env = Monitor(
        ForexTradingEnv(
            df_paths=source_dfs,
            eval_path=eval_path,
            eval_mode=True,
            pair='EUR_GBP',
            sequence_length=sequence,
        )
    )
    vec_env = VecNormalize(
        DummyVecEnv([lambda: monitored_env]),
        norm_obs=True,
        norm_reward=False,
    )
    vec_env.training = False
    return vec_env


# Evaluation environment and the callback that periodically scores the policy
# on it (5 episodes every EVAL_FREQ_ADJUSTED callback calls).
eval_env = make_eval_env()

eval_callback = DetailedEvalCallback(
    eval_env,
    n_eval_episodes=5,
    eval_freq=EVAL_FREQ_ADJUSTED,
    deterministic=True,
    render=False,
    log_path=saving_path,
    best_model_save_path=f'{saving_path}eval_best_model_new_reward/',
)

# eval_callback = EvalCallback(
#     eval_env,
#     best_model_save_path=saving_path,
#     log_path=saving_path,
#     eval_freq=EVAL_FREQ_ADJUSTED,  # Adjust as needed
#     n_eval_episodes=5,
#     deterministic=True,
#     render=False
# )

# model = PPO(
#     'MlpPolicy',
#     train_env,
#     verbose=0,
#     tensorboard_log=f'{saving_path}sequence_{sequence}__PPO_1h_no_costs_50k_balance_reduced_LSTM',
# )
# Define policy kwargs for the LSTM configuration
# policy_kwargs = dict(
#     # Network Architecture
#     net_arch=dict(
#         # Actor (policy) network
#         pi=[256, 128],  # Larger first layer to process high-dimensional input
#         # Critic (value) network
#         vf=[256, 128]   # Match actor architecture for balanced learning
#     ),
    
#     # LSTM Configuration
#     lstm_hidden_size=256,      # Larger hidden size to capture complex patterns
#     n_lstm_layers=2,           # Multiple layers for hierarchical feature learning
#     enable_critic_lstm=True,   # Share temporal understanding between actor and critic
    
#     # LSTM specific parameters
#     lstm_kwargs=dict(
#         dropout=0.2            # Slightly higher dropout for regularization
#     )
# )

# Larger recurrent configuration: three stacked LSTM layers with dropout
# between them and a separate LSTM for the critic.
policy_kwargs_complex = dict(
    net_arch=dict(
        pi=[512, 256, 128],
        vf=[512, 256, 128],
    ),
    lstm_hidden_size=512,
    n_lstm_layers=3,
    enable_critic_lstm=True,
    lstm_kwargs=dict(
        dropout=0.25,
    ),
)

# Smaller recurrent configuration with a single LSTM layer.
policy_kwargs_memory_efficient = dict(
    net_arch=dict(
        pi=[256, 128],
        vf=[256, 128],
    ),
    lstm_hidden_size=256,
    n_lstm_layers=1,
    lstm_kwargs=dict(
        # torch.nn.LSTM applies dropout only between stacked layers, so with a
        # single layer a non-zero value has no effect and only triggers a
        # UserWarning; 0.0 gives the same network without the warning.
        dropout=0.0,
    ),
)

# Recurrent PPO agent built with the single-layer LSTM configuration.
model = RecurrentPPO(
    policy='MlpLstmPolicy',
    env=train_env,
    policy_kwargs=policy_kwargs_memory_efficient,
    tensorboard_log=f'{saving_path}sequence_{sequence}__PPO_1h_no_costs_50k_balance_reduced_LSTM2/',
    seed=42,
    verbose=0,
)

# Metric logging plus periodic evaluation during training.
callbacks = [ForexTensorboardCallback(), eval_callback]

model.learn(
    callback=callbacks,
    total_timesteps=10_000_000,  # Adjust as needed
)

# Persist the final policy together with the training env's normalization
# statistics.
model.save(f'{saving_path}{sequence}_best_model_core.zip')
train_env.save(f'{saving_path}{sequence}_vec_normalize_core.pkl')


In [None]:
import os
import sys

import pandas as pd

from datetime import datetime, timedelta
from pathlib import Path
import torch as th
import numpy as np

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat_simple import ForexTradingEnv
# from trading.environments.forex_env2_flat_simple import ForexTradingEnv2 as ForexTradingEnv
# from trading.environments.forex_env_flat_multi_pair import MultipairForexTradingEnv

from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


# Run-wide configuration for this training cell.
th.set_num_threads(1)  # limit PyTorch to a single CPU thread
N_ENVS = 1  # Number of parallel environments
EVAL_FREUQENCY = 200_000
# The evaluation callback compares eval_freq against its call counter, so the
# frequency is divided by the number of environments (presumably because every
# callback call advances N_ENVS timesteps -- confirm against SB3 docs).
EVAL_FREQ_ADJUSTED = int(EVAL_FREUQENCY / N_ENVS)

hourly_dir = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/"
source_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/df_with_all_indics_unbiased/not_norm/train2/'
# os.listdir() returns entries in arbitrary order; sorting keeps the list of
# training files (and anything that depends on its order) stable between runs.
source_dfs = [
    os.path.join(hourly_dir, f)
    for f in sorted(os.listdir(hourly_dir))
    if f.endswith('.parquet') and not f.startswith('.') and 'validate' not in f
]

eval_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/EUR_GBP_validate.parquet'
sequence = 5
# The trailing slash is required: every output path in this cell is built by
# plain string concatenation (f'{saving_path}<name>'), so without it files end
# up next to the results directory as '...resultsXYZ' instead of inside it.
saving_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/results/'
os.makedirs(saving_path, exist_ok=True)


class ForexTensorboardCallback(BaseCallback):
    """Custom callback for logging Forex trading metrics to tensorboard.

    After every environment step the selected entries of the per-environment
    ``info`` dicts are averaged over all parallel environments and recorded
    under ``metrics/<key>``.
    """

    # Info entries that get logged. Further keys (e.g. 'total_pnl',
    # 'win_rate') can be added here if the environment provides them.
    _INFO_KEYS = ("balance", "total_trades", "transaction_costs", "position_size_pct")

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Kept for backward compatibility; this callback never writes to it.
        self.episode_returns = []

    def _on_step(self) -> bool:
        """Record the mean of each tracked metric across all environments.

        Recording inside a per-environment loop would overwrite the same
        logger key once per environment, leaving only the last environment's
        value; averaging first makes the logged number represent every env.

        Returns:
            Always ``True`` so training continues.
        """
        # infos is a list of dictionaries, one from each parallel environment
        infos = [info for info in self.locals['infos'] if info is not None]
        for key in self._INFO_KEYS:
            # Environments that do not report a key are skipped instead of
            # aborting training with a KeyError.
            values = [info[key] for info in infos if key in info]
            if values:
                self.logger.record(f"metrics/{key}", float(np.mean(values)))
        return True

    def _on_rollout_end(self) -> None:
        """Called at the end of a rollout; nothing extra is logged here."""
        pass

class DetailedEvalCallback(EvalCallback):
    """EvalCallback variant that logs trading metrics on every evaluation step.

    Every ``eval_freq`` calls the current policy is run for
    ``n_eval_episodes`` episodes on ``eval_env``. The ``balance``,
    ``total_pnl``, ``total_trades`` and ``transaction_costs`` info entries are
    dumped after each evaluation step; mean episode reward and length are
    recorded at the end.
    """

    def _on_step(self) -> bool:
        """Run the periodic evaluation and save the model when it improves.

        Returns:
            Always ``True``; this callback never stops training.
        """
        if self.eval_freq <= 0 or self.n_calls % self.eval_freq != 0:
            return True

        # Copy the running normalization statistics from the training env into
        # the evaluation env. The eval env is a separate VecNormalize instance,
        # so without this step the policy is evaluated on observations scaled
        # differently from the ones it was trained on.
        if self.model.get_vec_normalize_env() is not None:
            from stable_baselines3.common.vec_env import sync_envs_normalization

            sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards = []
        episode_lengths = []
        n_envs = self.eval_env.num_envs

        for _ in range(self.n_eval_episodes):
            episode_reward = 0.0
            episode_length = 0
            # VecEnv reset returns just the obs
            obs = self.eval_env.reset()
            # Recurrent policies need their hidden state carried from one
            # predict() call to the next; feed-forward policies simply pass
            # these arguments through.
            states = None
            episode_starts = np.ones((n_envs,), dtype=bool)
            done = False

            while not done:
                action, states = self.model.predict(
                    obs,
                    state=states,
                    episode_start=episode_starts,
                    deterministic=self.deterministic,
                )
                # VecEnv step returns (obs, rewards, dones, infos)
                obs, rewards, dones, infos = self.eval_env.step(action)
                episode_starts = dones
                # Only the first sub-environment is tracked.
                done = bool(dones[0])
                episode_reward += rewards[0]
                episode_length += 1

                step_info = infos[0]
                if step_info is not None:
                    self.logger.record("eval/balance", step_info.get('balance', 0))
                    self.logger.record("eval/total_pnl", step_info.get('total_pnl', 0))
                    self.logger.record("eval/total_trades", step_info.get('total_trades', 0))
                    self.logger.record("eval/transaction_costs", step_info.get('transaction_costs', 0))
                    # NOTE(review): every dump in this evaluation uses the same
                    # step value (self.n_calls), so the per-step points share
                    # one x-coordinate in the log.
                    self.logger.dump(self.n_calls)

            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)

        # Calculate mean metrics across episodes
        mean_reward = float(np.mean(episode_rewards))
        mean_length = float(np.mean(episode_lengths))
        # Keep the attribute of the stock EvalCallback up to date for any code
        # that inspects it after training.
        self.last_mean_reward = mean_reward

        self.logger.record("eval/mean_reward", mean_reward)
        self.logger.record("eval/mean_episode_length", mean_length)

        # Update best model if needed
        if self.best_model_save_path is not None:
            if self.verbose >= 1:
                print(f"Evaluating the current model: {mean_reward:.2f}")

            if mean_reward > self.best_mean_reward:
                if self.verbose >= 1:
                    print(f"New best mean reward: {mean_reward:.2f} "
                          f"(previous: {self.best_mean_reward:.2f})")
                self.best_mean_reward = mean_reward
                self.model.save(self.best_model_save_path)

        return True

    def _get_eval_info(self):
        """Return the eval environment's ``get_info()`` result, or ``None``.

        Tries ``eval_env.get_info()`` first, then ``eval_env.envs[0].get_info()``.
        Any exception is reported with a printed warning and turned into
        ``None`` because this helper is best-effort only.
        """
        try:
            # Try to get info directly from environment
            if hasattr(self.eval_env, 'get_info'):
                return self.eval_env.get_info()
            # If that's not available, try to get it from the unwrapped env
            elif hasattr(self.eval_env, 'envs'):
                return self.eval_env.envs[0].get_info()
            return None
        except Exception as e:
            print(f"Warning: Could not get eval info: {e}")
            return None


def make_train_env(rank):
    """Build a zero-argument factory for one monitored training environment.

    Args:
        rank: Index of the worker the factory is created for. It is accepted
            but not used when the environment is built.

    Returns:
        A callable that creates a ``ForexTradingEnv`` in training mode over
        ``source_dfs`` and wraps it in ``Monitor``.
    """
    def _init():
        env_kwargs = dict(
            df_paths=source_dfs,
            eval_mode=False,
            sequence_length=sequence,
        )
        return Monitor(ForexTradingEnv(**env_kwargs))

    return _init


# One subprocess-backed environment per worker index, with observations and
# rewards normalized by the VecNormalize wrapper.
train_env = VecNormalize(
    SubprocVecEnv([make_train_env(worker_idx) for worker_idx in range(N_ENVS)]),
    norm_obs=True,
    norm_reward=True,
)



def make_eval_env():
    """Create the single-environment evaluation VecEnv.

    Returns:
        A ``VecNormalize``-wrapped ``DummyVecEnv`` holding one monitored
        ``ForexTradingEnv`` in eval mode for the EUR_GBP pair. Observations
        are normalized, rewards are not, and ``training`` is switched off.
    """
    monitored_env = Monitor(
        ForexTradingEnv(
            df_paths=source_dfs,
            eval_path=eval_path,
            eval_mode=True,
            pair='EUR_GBP',
            sequence_length=sequence,
        )
    )
    vec_env = VecNormalize(
        DummyVecEnv([lambda: monitored_env]),
        norm_obs=True,
        norm_reward=False,
    )
    vec_env.training = False
    return vec_env


# Evaluation environment and the callback that periodically scores the policy
# on it (5 episodes every EVAL_FREQ_ADJUSTED callback calls).
eval_env = make_eval_env()

eval_callback = DetailedEvalCallback(
    eval_env,
    n_eval_episodes=5,
    eval_freq=EVAL_FREQ_ADJUSTED,
    deterministic=True,
    render=False,
    log_path=saving_path,
    best_model_save_path=f'{saving_path}eval_best_model_new_reward/',
)

# eval_callback = EvalCallback(
#     eval_env,
#     best_model_save_path=saving_path,
#     log_path=saving_path,
#     eval_freq=EVAL_FREQ_ADJUSTED,  # Adjust as needed
#     n_eval_episodes=5,
#     deterministic=True,
#     render=False
# )

# model = PPO(
#     'MlpPolicy',
#     train_env,
#     verbose=0,
#     tensorboard_log=f'{saving_path}sequence_{sequence}__PPO_1h_no_costs_50k_balance_reduced_indics2',
# )

# Feed-forward PPO agent with hand-tuned hyperparameters.
model = PPO(
    policy='MlpPolicy',
    env=train_env,
    verbose=0,
    tensorboard_log=f'{saving_path}sequence_{sequence}__PPO_1h_no_costs_50k_balance_reduced_indics_claude_params/',
    # Optimisation schedule
    learning_rate=5e-5,  # Reduced from 3e-4
    n_steps=4096,        # Increased from 2048
    batch_size=256,      # Increased from 64
    n_epochs=20,         # Increased from 10
    # Exploration bonus
    ent_coef=0.01,
    # Tighter clipping for stability (policy default is 0.2), plus value
    # function clipping
    clip_range=0.1,
    clip_range_vf=0.1,
)
# Define policy kwargs for the LSTM configuration
# policy_kwargs = dict(
#     # Network Architecture
#     net_arch=dict(
#         # Actor (policy) network
#         pi=[256, 128],  # Larger first layer to process high-dimensional input
#         # Critic (value) network
#         vf=[256, 128]   # Match actor architecture for balanced learning
#     ),
    
#     # LSTM Configuration
#     lstm_hidden_size=256,      # Larger hidden size to capture complex patterns
#     n_lstm_layers=2,           # Multiple layers for hierarchical feature learning
#     enable_critic_lstm=True,   # Share temporal understanding between actor and critic
    
#     # LSTM specific parameters
#     lstm_kwargs=dict(
#         dropout=0.2            # Slightly higher dropout for regularization
#     )
# )

# Larger recurrent configuration: three stacked LSTM layers with dropout
# between them and a separate LSTM for the critic.
policy_kwargs_complex = dict(
    net_arch=dict(
        pi=[512, 256, 128],
        vf=[512, 256, 128],
    ),
    lstm_hidden_size=512,
    n_lstm_layers=3,
    enable_critic_lstm=True,
    lstm_kwargs=dict(
        dropout=0.25,
    ),
)

# Smaller recurrent configuration with a single LSTM layer.
policy_kwargs_memory_efficient = dict(
    net_arch=dict(
        pi=[256, 128],
        vf=[256, 128],
    ),
    lstm_hidden_size=256,
    n_lstm_layers=1,
    lstm_kwargs=dict(
        # torch.nn.LSTM applies dropout only between stacked layers, so with a
        # single layer a non-zero value has no effect and only triggers a
        # UserWarning; 0.0 gives the same network without the warning.
        dropout=0.0,
    ),
)

# model = RecurrentPPO(
#     'MlpLstmPolicy',
#     train_env,
#     verbose=0,
#     tensorboard_log=f'{saving_path}sequence_{sequence}_RecurrentPPO_memory efficient/',
#     policy_kwargs=policy_kwargs_memory_efficient,
# )
# Metric logging plus periodic evaluation during training.
callbacks = [ForexTensorboardCallback(), eval_callback]

model.learn(
    callback=callbacks,
    total_timesteps=10_000_000,  # Adjust as needed
)

# Persist the final policy together with the training env's normalization
# statistics.
model.save(f'{saving_path}{sequence}_best_model_core.zip')
train_env.save(f'{saving_path}{sequence}_vec_normalize_core.pkl')


In [None]:
import os
import sys

import pandas as pd

from datetime import datetime, timedelta
from pathlib import Path
import torch as th
import numpy as np

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat_simple import ForexTradingEnv
# from trading.environments.forex_env2_flat_simple import ForexTradingEnv2 as ForexTradingEnv
# from trading.environments.forex_env_flat_multi_pair import MultipairForexTradingEnv

from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


# Run-wide configuration for this training cell.
th.set_num_threads(3)  # limit PyTorch to 3 CPU threads
N_ENVS = 3  # Number of parallel environments
EVAL_FREUQENCY = 500_000
# The evaluation callback compares eval_freq against its call counter, so the
# frequency is divided by the number of environments (presumably because every
# callback call advances N_ENVS timesteps -- confirm against SB3 docs).
EVAL_FREQ_ADJUSTED = int(EVAL_FREUQENCY / N_ENVS)

hourly_dir = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/"
source_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/df_with_all_indics_unbiased/not_norm/train2/'
# os.listdir() returns entries in arbitrary order; sorting keeps the list of
# training files (and anything that depends on its order) stable between runs.
source_dfs = [
    os.path.join(hourly_dir, f)
    for f in sorted(os.listdir(hourly_dir))
    if f.endswith('.parquet') and not f.startswith('.') and 'validate' not in f
]

eval_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/EUR_GBP_validate.parquet'
sequence = 5
saving_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/results/'
os.makedirs(saving_path, exist_ok=True)


class ForexTensorboardCallback(BaseCallback):
    """Custom callback for logging Forex trading metrics to tensorboard.

    After every environment step the selected entries of the per-environment
    ``info`` dicts are averaged over all parallel environments and recorded
    under ``metrics/<key>``.
    """

    # Info entries that get logged. Further keys (e.g. 'total_pnl',
    # 'win_rate') can be added here if the environment provides them.
    _INFO_KEYS = ("balance", "total_trades", "transaction_costs", "position_size_pct")

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Kept for backward compatibility; this callback never writes to it.
        self.episode_returns = []

    def _on_step(self) -> bool:
        """Record the mean of each tracked metric across all environments.

        Recording inside a per-environment loop would overwrite the same
        logger key once per environment, leaving only the last environment's
        value; averaging first makes the logged number represent every env.

        Returns:
            Always ``True`` so training continues.
        """
        # infos is a list of dictionaries, one from each parallel environment
        infos = [info for info in self.locals['infos'] if info is not None]
        for key in self._INFO_KEYS:
            # Environments that do not report a key are skipped instead of
            # aborting training with a KeyError.
            values = [info[key] for info in infos if key in info]
            if values:
                self.logger.record(f"metrics/{key}", float(np.mean(values)))
        return True

    def _on_rollout_end(self) -> None:
        """Called at the end of a rollout; nothing extra is logged here."""
        pass

class DetailedEvalCallback(EvalCallback):
    """EvalCallback variant that logs trading metrics on every evaluation step.

    Every ``eval_freq`` calls the current policy is run for
    ``n_eval_episodes`` episodes on ``eval_env``. The ``balance``,
    ``total_pnl``, ``total_trades`` and ``transaction_costs`` info entries are
    dumped after each evaluation step; mean episode reward and length are
    recorded at the end.
    """

    def _on_step(self) -> bool:
        """Run the periodic evaluation and save the model when it improves.

        Returns:
            Always ``True``; this callback never stops training.
        """
        if self.eval_freq <= 0 or self.n_calls % self.eval_freq != 0:
            return True

        # Copy the running normalization statistics from the training env into
        # the evaluation env. The eval env is a separate VecNormalize instance,
        # so without this step the policy is evaluated on observations scaled
        # differently from the ones it was trained on.
        if self.model.get_vec_normalize_env() is not None:
            from stable_baselines3.common.vec_env import sync_envs_normalization

            sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards = []
        episode_lengths = []
        n_envs = self.eval_env.num_envs

        for _ in range(self.n_eval_episodes):
            episode_reward = 0.0
            episode_length = 0
            # VecEnv reset returns just the obs
            obs = self.eval_env.reset()
            # Recurrent policies need their hidden state carried from one
            # predict() call to the next; feed-forward policies simply pass
            # these arguments through.
            states = None
            episode_starts = np.ones((n_envs,), dtype=bool)
            done = False

            while not done:
                action, states = self.model.predict(
                    obs,
                    state=states,
                    episode_start=episode_starts,
                    deterministic=self.deterministic,
                )
                # VecEnv step returns (obs, rewards, dones, infos)
                obs, rewards, dones, infos = self.eval_env.step(action)
                episode_starts = dones
                # Only the first sub-environment is tracked.
                done = bool(dones[0])
                episode_reward += rewards[0]
                episode_length += 1

                step_info = infos[0]
                if step_info is not None:
                    self.logger.record("eval/balance", step_info.get('balance', 0))
                    self.logger.record("eval/total_pnl", step_info.get('total_pnl', 0))
                    self.logger.record("eval/total_trades", step_info.get('total_trades', 0))
                    self.logger.record("eval/transaction_costs", step_info.get('transaction_costs', 0))
                    # NOTE(review): every dump in this evaluation uses the same
                    # step value (self.n_calls), so the per-step points share
                    # one x-coordinate in the log.
                    self.logger.dump(self.n_calls)

            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)

        # Calculate mean metrics across episodes
        mean_reward = float(np.mean(episode_rewards))
        mean_length = float(np.mean(episode_lengths))
        # Keep the attribute of the stock EvalCallback up to date for any code
        # that inspects it after training.
        self.last_mean_reward = mean_reward

        self.logger.record("eval/mean_reward", mean_reward)
        self.logger.record("eval/mean_episode_length", mean_length)

        # Update best model if needed
        if self.best_model_save_path is not None:
            if self.verbose >= 1:
                print(f"Evaluating the current model: {mean_reward:.2f}")

            if mean_reward > self.best_mean_reward:
                if self.verbose >= 1:
                    print(f"New best mean reward: {mean_reward:.2f} "
                          f"(previous: {self.best_mean_reward:.2f})")
                self.best_mean_reward = mean_reward
                self.model.save(self.best_model_save_path)

        return True

    def _get_eval_info(self):
        """Return the eval environment's ``get_info()`` result, or ``None``.

        Tries ``eval_env.get_info()`` first, then ``eval_env.envs[0].get_info()``.
        Any exception is reported with a printed warning and turned into
        ``None`` because this helper is best-effort only.
        """
        try:
            # Try to get info directly from environment
            if hasattr(self.eval_env, 'get_info'):
                return self.eval_env.get_info()
            # If that's not available, try to get it from the unwrapped env
            elif hasattr(self.eval_env, 'envs'):
                return self.eval_env.envs[0].get_info()
            return None
        except Exception as e:
            print(f"Warning: Could not get eval info: {e}")
            return None


def make_train_env(rank):
    """Build a zero-argument factory for one monitored training environment.

    Args:
        rank: Index of the worker the factory is created for. It is accepted
            but not used when the environment is built.

    Returns:
        A callable that creates a ``ForexTradingEnv`` in training mode over
        ``source_dfs`` and wraps it in ``Monitor``.
    """
    def _init():
        env_kwargs = dict(
            df_paths=source_dfs,
            eval_mode=False,
            sequence_length=sequence,
        )
        return Monitor(ForexTradingEnv(**env_kwargs))

    return _init


# One subprocess-backed environment per worker index, with observations and
# rewards normalized by the VecNormalize wrapper.
train_env = VecNormalize(
    SubprocVecEnv([make_train_env(worker_idx) for worker_idx in range(N_ENVS)]),
    norm_obs=True,
    norm_reward=True,
)



def make_eval_env():
    """Create the single-environment evaluation VecEnv.

    Returns:
        A ``VecNormalize``-wrapped ``DummyVecEnv`` holding one monitored
        ``ForexTradingEnv`` in eval mode for the EUR_GBP pair. Observations
        are normalized, rewards are not, and ``training`` is switched off.
    """
    monitored_env = Monitor(
        ForexTradingEnv(
            df_paths=source_dfs,
            eval_path=eval_path,
            eval_mode=True,
            pair='EUR_GBP',
            sequence_length=sequence,
        )
    )
    vec_env = VecNormalize(
        DummyVecEnv([lambda: monitored_env]),
        norm_obs=True,
        norm_reward=False,
    )
    vec_env.training = False
    return vec_env


# Evaluation environment and the callback that periodically scores the policy
# on it (5 episodes every EVAL_FREQ_ADJUSTED callback calls).
eval_env = make_eval_env()

eval_callback = DetailedEvalCallback(
    eval_env,
    n_eval_episodes=5,
    eval_freq=EVAL_FREQ_ADJUSTED,
    deterministic=True,
    render=False,
    log_path=saving_path,
    best_model_save_path=f'{saving_path}eval_best_model_new_reward/',
)

# eval_callback = EvalCallback(
#     eval_env,
#     best_model_save_path=saving_path,
#     log_path=saving_path,
#     eval_freq=EVAL_FREQ_ADJUSTED,  # Adjust as needed
#     n_eval_episodes=5,
#     deterministic=True,
#     render=False
# )

# model = PPO(
#     'MlpPolicy',
#     train_env,
#     verbose=0,
#     tensorboard_log=f'{saving_path}sequence_{sequence}__PPO_1h_no_costs_50k_balance_reduced_indics2',
# )

# Feed-forward PPO agent with hand-tuned hyperparameters.
model = PPO(
    policy='MlpPolicy',
    env=train_env,
    verbose=0,
    tensorboard_log=f'{saving_path}sequence_{sequence}__PPO_1h_no_costs_50k_balance_reduced_indics_claude_params/',
    # Optimisation schedule
    learning_rate=5e-5,  # Reduced from 3e-4
    n_steps=4096,        # Increased from 2048
    batch_size=256,      # Increased from 64
    n_epochs=20,         # Increased from 10
    # Exploration bonus
    ent_coef=0.01,
    # Tighter clipping for stability (policy default is 0.2), plus value
    # function clipping
    clip_range=0.1,
    clip_range_vf=0.1,
)
# Define policy kwargs for the LSTM configuration
# policy_kwargs = dict(
#     # Network Architecture
#     net_arch=dict(
#         # Actor (policy) network
#         pi=[256, 128],  # Larger first layer to process high-dimensional input
#         # Critic (value) network
#         vf=[256, 128]   # Match actor architecture for balanced learning
#     ),
    
#     # LSTM Configuration
#     lstm_hidden_size=256,      # Larger hidden size to capture complex patterns
#     n_lstm_layers=2,           # Multiple layers for hierarchical feature learning
#     enable_critic_lstm=True,   # Share temporal understanding between actor and critic
    
#     # LSTM specific parameters
#     lstm_kwargs=dict(
#         dropout=0.2            # Slightly higher dropout for regularization
#     )
# )

# Larger recurrent configuration: three stacked LSTM layers with dropout
# between them and a separate LSTM for the critic.
policy_kwargs_complex = dict(
    net_arch=dict(
        pi=[512, 256, 128],
        vf=[512, 256, 128],
    ),
    lstm_hidden_size=512,
    n_lstm_layers=3,
    enable_critic_lstm=True,
    lstm_kwargs=dict(
        dropout=0.25,
    ),
)

# Smaller recurrent configuration with a single LSTM layer.
policy_kwargs_memory_efficient = dict(
    net_arch=dict(
        pi=[256, 128],
        vf=[256, 128],
    ),
    lstm_hidden_size=256,
    n_lstm_layers=1,
    lstm_kwargs=dict(
        # torch.nn.LSTM applies dropout only between stacked layers, so with a
        # single layer a non-zero value has no effect and only triggers a
        # UserWarning; 0.0 gives the same network without the warning.
        dropout=0.0,
    ),
)

# model = RecurrentPPO(
#     'MlpLstmPolicy',
#     train_env,
#     verbose=0,
#     tensorboard_log=f'{saving_path}sequence_{sequence}_RecurrentPPO_memory efficient/',
#     policy_kwargs=policy_kwargs_memory_efficient,
# )
# Metric logging plus periodic evaluation during training.
callbacks = [ForexTensorboardCallback(), eval_callback]

model.learn(
    callback=callbacks,
    total_timesteps=10_000_000,  # Adjust as needed
)

# Persist the final policy together with the training env's normalization
# statistics.
model.save(f'{saving_path}{sequence}_best_model_core.zip')
train_env.save(f'{saving_path}{sequence}_vec_normalize_core.pkl')


In [None]:
import os
import sys

import pandas as pd

from datetime import datetime, timedelta
from pathlib import Path
import torch as th
import numpy as np

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat_simple import ForexTradingEnv
# from trading.environments.forex_env2_flat_simple import ForexTradingEnv2 as ForexTradingEnv
# from trading.environments.forex_env_flat_multi_pair import MultipairForexTradingEnv

from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


# --- Run configuration for this cell -------------------------------------
th.set_num_threads(3)
N_ENVS = 3  # Number of parallel environments
EVAL_FREUQENCY = 500_000
# Evaluation frequency divided by the env count; passed as ``eval_freq`` to the
# eval callback (presumably because the callback counter advances once per
# vectorized step rather than once per env step -- confirm against SB3 docs).
EVAL_FREQ_ADJUSTED = EVAL_FREUQENCY // N_ENVS

# Directory holding the training parquet files.
hourly_dir = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train3_minimal_indic/"
# Not referenced by the code visible in this cell.
source_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/df_with_all_indics_unbiased/not_norm/train2/'
# Every non-hidden ``.parquet`` file in ``hourly_dir`` whose name does not
# contain "validate".
source_dfs = [
    os.path.join(hourly_dir, file_name)
    for file_name in os.listdir(hourly_dir)
    if file_name.endswith('.parquet')
    and not file_name.startswith('.')
    and 'validate' not in file_name
]

# NOTE(review): this validation file lives under the 5min dataset tree while
# the training files above come from the 1h ``train3_minimal_indic`` directory
# -- confirm the evaluation data matches the training timeframe/features.
eval_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/df_with_all_indics_unbiased/not_norm/train2/EUR_GBP_validate.parquet'
sequence = 5
# Output directory for models, normalization stats and tensorboard logs.
saving_path = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/unbiased/not_norm/train2/results/'
os.makedirs(saving_path, exist_ok=True)


class ForexTensorboardCallback(BaseCallback):
    """Custom callback for logging Forex trading metrics to tensorboard.

    On every training step the callback reads the ``infos`` list produced by
    the vectorized training env (one dict per parallel env) and records, for
    each tracked key, the mean of that value across the envs that reported it
    under ``metrics/<key>``.

    Recording the cross-env mean (rather than calling ``record`` once per env)
    matters because ``logger.record`` keeps only the latest value per key:
    looping over envs would silently drop every env except the last one.
    """

    # Info-dict keys that are logged as "metrics/<key>".
    # Currently disabled (re-add here to enable): "total_return_pct",
    # "net_profit", "total_pnl", "total_trades", "win_rate",
    # "transaction_costs_pct".
    _TRACKED_KEYS = ("balance", "transaction_costs", "position_size_pct")

    def __init__(self, verbose=0):
        """Create the callback.

        Args:
            verbose: Verbosity level forwarded to ``BaseCallback``.
        """
        super().__init__(verbose)
        # Kept for backward compatibility; nothing in this class fills it.
        self.episode_returns = []

    def _on_step(self) -> bool:
        """Record the tracked metrics for the current training step.

        Returns:
            Always ``True`` so that training continues.
        """
        # Drop missing/empty info entries instead of failing on them.
        infos = [info for info in (self.locals.get('infos') or ()) if info]

        for key in self._TRACKED_KEYS:
            # An env whose info lacks the key is skipped rather than raising
            # KeyError and aborting the whole training run.
            values = [info[key] for info in infos if key in info]
            if values:
                self.logger.record(f"metrics/{key}", float(np.mean(values)))

        return True

    def _on_rollout_end(self) -> None:
        """Called at the end of a rollout; intentionally does nothing."""
        # Episode metrics are handled automatically by stable-baselines3
        pass

class DetailedEvalCallback(EvalCallback):
    """Evaluation callback that also logs trading metrics from the eval env.

    Every ``eval_freq`` calls it plays ``n_eval_episodes`` episodes on
    ``eval_env`` and logs, under ``eval/``:

    * ``mean_reward`` and ``mean_episode_length`` over those episodes;
    * the mean over episodes of the final ``balance``, ``total_pnl`` and
      ``transaction_costs`` values found in the env's info dict.

    The model is saved whenever the mean reward beats the best one seen so
    far. Only the first sub-environment of ``eval_env`` is tracked, so the
    callback is meant for a single-env VecEnv. This override does not use
    ``log_path`` and does not trigger ``callback_on_new_best`` /
    ``callback_after_eval``.
    """

    # Info-dict keys logged as "eval/<key>" (mean of each episode's last value).
    _EVAL_INFO_KEYS = ("balance", "total_pnl", "transaction_costs")

    def _on_step(self) -> bool:
        """Run an evaluation round when the step counter hits ``eval_freq``.

        Returns:
            Always ``True`` so that training continues.
        """
        if self.eval_freq <= 0 or self.n_calls % self.eval_freq != 0:
            return True

        # The eval env carries its own VecNormalize wrapper; without copying
        # the running statistics from the training env the policy would be
        # evaluated on observations normalized differently from training.
        if self.model.get_vec_normalize_env() is not None:
            from stable_baselines3.common.vec_env import sync_envs_normalization

            sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards = []
        episode_lengths = []
        final_infos = []
        for _ in range(self.n_eval_episodes):
            episode_reward, episode_length, last_info = self._run_eval_episode()
            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)
            if last_info is not None:
                final_infos.append(last_info)

        # Calculate mean metrics across episodes
        mean_reward = float(np.mean(episode_rewards))
        mean_length = float(np.mean(episode_lengths))
        self.last_mean_reward = mean_reward

        self.logger.record("eval/mean_reward", mean_reward)
        self.logger.record("eval/mean_episode_length", mean_length)
        for key in self._EVAL_INFO_KEYS:
            values = [info.get(key, 0) for info in final_infos]
            if values:
                self.logger.record(f"eval/{key}", float(np.mean(values)))

        # Dump once per evaluation, keyed by the global timestep counter so the
        # eval curves share their x-axis with the training curves. (Dumping on
        # every eval step with ``n_calls`` wrote many points at one x value
        # and flushed pending training metrics at the wrong step.)
        self.logger.dump(self.num_timesteps)

        if self.verbose >= 1:
            print(f"Evaluating the current model: {mean_reward:.2f}")

        if mean_reward > self.best_mean_reward:
            if self.verbose >= 1:
                print(f"New best mean reward: {mean_reward:.2f} "
                      f"(previous: {self.best_mean_reward:.2f})")
            self.best_mean_reward = mean_reward
            if self.best_model_save_path is not None:
                # ``best_model_save_path`` is a folder: write the model inside
                # it as ``best_model`` instead of using the folder path itself
                # as the file name.
                os.makedirs(self.best_model_save_path, exist_ok=True)
                self.model.save(os.path.join(self.best_model_save_path, "best_model"))

        return True

    def _run_eval_episode(self):
        """Play one episode on the eval env using the current model.

        Returns:
            Tuple ``(episode_reward, episode_length, last_info)`` where
            ``last_info`` is the most recent non-``None`` info dict of the
            first sub-env, or ``None`` if there was none.
        """
        # VecEnv reset returns just the obs
        obs = self.eval_env.reset()
        # Recurrent state handling; ignored by non-recurrent policies.
        state = None
        episode_start = np.ones((self.eval_env.num_envs,), dtype=bool)

        episode_reward = 0.0
        episode_length = 0
        last_info = None
        done = False
        while not done:
            action, state = self.model.predict(
                obs,
                state=state,
                episode_start=episode_start,
                deterministic=self.deterministic,
            )
            # VecEnv step returns (obs, rewards, dones, infos)
            obs, rewards, dones, infos = self.eval_env.step(action)
            episode_reward += float(rewards[0])
            episode_length += 1
            episode_start = dones
            done = bool(dones[0])
            if infos[0] is not None:
                last_info = infos[0]

        return episode_reward, episode_length, last_info

    def _get_eval_info(self):
        """Helper method to get the last info dict from eval environment.

        Best-effort: tries ``eval_env.get_info()``, then
        ``eval_env.envs[0].get_info()``. Not called within this class.

        Returns:
            Whatever ``get_info()`` returns, or ``None`` when neither access
            path exists or the call raises.
        """
        try:
            # Try to get info directly from environment
            if hasattr(self.eval_env, 'get_info'):
                return self.eval_env.get_info()
            # If that's not available, try to get it from the unwrapped env
            elif hasattr(self.eval_env, 'envs'):
                return self.eval_env.envs[0].get_info()
            return None
        except Exception as e:
            # Deliberately broad: this helper must never interrupt training.
            print(f"Warning: Could not get eval info: {e}")
            return None


def make_train_env(rank):
    """Return a zero-argument factory that builds one monitored training env.

    Args:
        rank: Index of the env within the vectorized set. Accepted for the
            usual factory signature; the factory itself does not read it.

    Returns:
        A callable that, when invoked, constructs ``ForexTradingEnv`` in
        training mode (``eval_mode=False``) over ``source_dfs`` with
        ``sequence_length=sequence`` and wraps it in ``Monitor``.
    """
    def _build_env():
        # Construction is deferred to call time; nothing is built here.
        return Monitor(
            ForexTradingEnv(
                sequence_length=sequence,
                eval_mode=False,
                df_paths=source_dfs,
            )
        )

    return _build_env


# Training envs: N_ENVS subprocess workers, wrapped so that both observations
# and rewards are normalized.
train_env = VecNormalize(
    SubprocVecEnv([make_train_env(rank) for rank in range(N_ENVS)]),
    norm_obs=True,
    norm_reward=True,
)



def make_eval_env():
    """Build the single-env evaluation VecEnv.

    Constructs ``ForexTradingEnv`` in eval mode for the ``EUR_GBP`` pair
    (reading ``eval_path``), wraps it in ``Monitor`` and a one-element
    ``DummyVecEnv``, then in ``VecNormalize`` with observation normalization
    on and reward normalization off.

    Returns:
        The ``VecNormalize``-wrapped env with ``training`` set to ``False``.
    """
    monitored_env = Monitor(
        ForexTradingEnv(
            df_paths=source_dfs,
            eval_path=eval_path,
            eval_mode=True,
            pair='EUR_GBP',
            sequence_length=sequence,
        )
    )
    # The factory hands back the env instance built above.
    vec_env = DummyVecEnv([lambda: monitored_env])
    normalized_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False)
    # Mark the wrapper as not training (in SB3 this stops it from updating
    # its running normalization statistics).
    normalized_env.training = False
    return normalized_env


# Evaluation env and the callback that periodically evaluates on it.
eval_env = make_eval_env()

eval_callback = DetailedEvalCallback(
    eval_env,
    n_eval_episodes=5,
    eval_freq=EVAL_FREQ_ADJUSTED,
    deterministic=True,
    render=False,
    log_path=saving_path,
    best_model_save_path=f'{saving_path}eval_best_model_new_reward/',
)

# eval_callback = EvalCallback(
#     eval_env,
#     best_model_save_path=saving_path,
#     log_path=saving_path,
#     eval_freq=EVAL_FREQ_ADJUSTED,  # Adjust as needed
#     n_eval_episodes=5,
#     deterministic=True,
#     render=False
# )

# PPO with the default MLP policy on the normalized training envs; training
# curves go to a run-specific tensorboard directory under ``saving_path``.
model = PPO(
    'MlpPolicy',
    train_env,
    tensorboard_log=f'{saving_path}sequence_{sequence}__PPO_1h_no_costs_50k_balance_minimal_indic',
    verbose=0,
)
# Define policy kwargs for the LSTM configuration
# policy_kwargs = dict(
#     # Network Architecture
#     net_arch=dict(
#         # Actor (policy) network
#         pi=[256, 128],  # Larger first layer to process high-dimensional input
#         # Critic (value) network
#         vf=[256, 128]   # Match actor architecture for balanced learning
#     ),
    
#     # LSTM Configuration
#     lstm_hidden_size=256,      # Larger hidden size to capture complex patterns
#     n_lstm_layers=2,           # Multiple layers for hierarchical feature learning
#     enable_critic_lstm=True,   # Give the critic its own LSTM, separate from the actor's
    
#     # LSTM specific parameters
#     lstm_kwargs=dict(
#         dropout=0.2            # Slightly higher dropout for regularization
#     )
# )

# Larger recurrent-policy preset: three-layer actor/critic heads on top of a
# three-layer, 512-unit LSTM with dropout between the stacked LSTM layers.
# Not passed to the PPO model built in this cell.
policy_kwargs_complex = {
    "net_arch": {
        "pi": [512, 256, 128],  # actor (policy) head
        "vf": [512, 256, 128],  # critic (value) head
    },
    "lstm_hidden_size": 512,
    "n_lstm_layers": 3,
    "enable_critic_lstm": True,
    "lstm_kwargs": {"dropout": 0.25},
}

# Smaller recurrent-policy preset: two-layer actor/critic heads on top of a
# single-layer, 256-unit LSTM. Only referenced by the commented-out
# RecurrentPPO construction below.
policy_kwargs_memory_efficient = dict(
    net_arch=dict(
        pi=[256, 128],  # actor (policy) head
        vf=[256, 128]   # critic (value) head
    ),
    lstm_hidden_size=256,
    n_lstm_layers=1,
    lstm_kwargs=dict(
        # torch.nn.LSTM only applies dropout *between* stacked layers, so a
        # non-zero value with a single layer does nothing except emit a
        # UserWarning. Keep it at 0.0 here; raise it only together with
        # n_lstm_layers > 1.
        dropout=0.0
    )
)

# model = RecurrentPPO(
#     'MlpLstmPolicy',
#     train_env,
#     verbose=0,
#     tensorboard_log=f'{saving_path}sequence_{sequence}_RecurrentPPO_memory efficient/',
#     policy_kwargs=policy_kwargs_memory_efficient,
# )
# Callbacks run during training: per-step trading metrics plus periodic evaluation.
callbacks = [ForexTensorboardCallback(), eval_callback]

# Train the model; both callbacks are invoked by learn().
model.learn(
    callback=callbacks,
    total_timesteps=10_000_000,  # Adjust as needed
)

# Persist the model as it stands at the end of training, together with the
# training env's VecNormalize wrapper state.
model.save(f'{saving_path}{sequence}_best_model_core.zip')
train_env.save(f'{saving_path}{sequence}_vec_normalize_core.pkl')
