In [None]:
import os
import sys

import pandas as pd
import numpy as np
import pytz

from datetime import datetime, timedelta
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat import ForexTradingEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager
from data_management.indicator_manager import IndicatorManager
from visualization.chart_manager import ChartManager
from data_management.preprocessor import DataPreprocessor

# Shared project helpers for this cell.
processor = DataPreprocessor()
dataset_manager = DatasetManager()
chart_manager = ChartManager()
# Read as a module-level global by prepare_unbiased_dataset defined below.
indicator_manager = IndicatorManager()

# Pairs handled by the processing loop at the end of this cell.
currencies = [
            'EUR_USD'
        ]
# Wider instrument list kept for reference; uncomment to process all of them.
# currencies = [
#             'GBP_CHF', 'GBP_JPY', 'EUR_CHF', 'EUR_JPY', 'USD_CHF',
#             'EUR_CAD', 'EUR_USD', 'GBP_USD', 'EUR_GBP', 'USD_JPY',
#             'USD_CAD', 'AUD_USD', 'CHF_JPY', 'AUD_JPY', 'NZD_USD',
#             'NZD_JPY', 'XAU_USD', 'XAG_USD'
#         ]

def prepare_unbiased_dataset(df_5min):
    """
    Attach daily-candle indicators to 5-minute candles without look-ahead bias.

    For every 5-minute timestamp the daily candles are rebuilt from the rows
    whose timestamp is <= that timestamp (so the current day's candle is only
    partially formed), indicators are computed on those daily candles through
    the module-level ``indicator_manager``, and the most recent indicator row
    is written onto the 5-minute row.

    Args:
        df_5min: DataFrame with 5-minute OHLC data (``open``, ``high``, ``low``,
            ``close``) and a timezone-aware DatetimeIndex. A non-UTC index is
            converted to UTC on a new frame; the caller's frame is not modified.

    Returns:
        A UTC-indexed copy of the input with one extra column per indicator.
        Rows for which no indicator value could be computed keep NaN.

    Raises:
        ValueError: if the index is not a DatetimeIndex or is timezone-naive.
    """
    # NOTE(review): every timestamp triggers a resample plus an indicator pass
    # over the whole history so far, so run time grows quadratically with rows.
    if not isinstance(df_5min.index, pd.DatetimeIndex):
        raise ValueError("DataFrame index must be DatetimeIndex")
    if df_5min.index.tz is None:
        raise ValueError("DataFrame index must be timezone-aware (UTC)")

    ohlc_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last'
    }

    # Day boundaries below are UTC midnights, so the index must really be in UTC;
    # previously a tz-aware non-UTC index was bucketed by its local calendar date.
    df_5min = df_5min.tz_convert('UTC')
    result_df = df_5min.copy()

    # Run the indicators once on the first 288 rows (one full day of 5-minute
    # candles when there are no gaps) just to discover the output column names.
    first_day_daily = df_5min.iloc[:288].resample('D').agg(ohlc_rules).dropna()

    # NOTE(review): '1h' is passed although the candles are daily — confirm what
    # indicator_timeframe means to IndicatorManager.calculate_indicators.
    initial_indicators = indicator_manager.calculate_indicators(first_day_daily, indicator_timeframe='1h')
    indicator_columns = initial_indicators.columns.tolist()

    print(f"Available indicators: {indicator_columns}")

    # Pre-create the indicator columns so rows that fail below stay NaN.
    for col in indicator_columns:
        result_df[col] = np.nan

    # Unique UTC calendar days, each represented by its midnight timestamp.
    day_starts = df_5min.index.normalize().unique()
    total_dates = len(day_starts)

    for date_idx, day_start in enumerate(day_starts):
        print(f"Processing date: {day_start.date()} ({date_idx + 1}/{total_dates})")

        day_end = day_start + pd.Timedelta(days=1)
        day_mask = (df_5min.index >= day_start) & (df_5min.index < day_end)

        for timestamp in df_5min.index[day_mask]:
            # Only rows up to and including this candle are visible. The slice is
            # read-only here, so no defensive copy is made.
            available_data = df_5min[df_5min.index <= timestamp]
            available_daily = available_data.resample('D').agg(ohlc_rules).dropna()

            try:
                daily_indicators = indicator_manager.calculate_indicators(available_daily, indicator_timeframe='1h')

                if not daily_indicators.empty:
                    current_indicators = daily_indicators.iloc[-1]

                    # Restrict to the columns discovered up front.
                    available_columns = current_indicators.index.intersection(indicator_columns)
                    result_df.loc[timestamp, available_columns] = current_indicators[available_columns]

            except Exception as e:
                # Best effort: a failing timestamp keeps NaN and the run continues.
                print(f"Error calculating indicators for {timestamp}: {str(e)}")
                continue

            # Progress line once per hour of data.
            if timestamp.minute == 0:
                print(f"Processing: {timestamp}")

    # Report (but do not drop) indicator columns that still contain NaN.
    nan_counts = result_df[indicator_columns].isna().sum()
    if nan_counts.any():
        print("Warning: NaN values found in indicators:")
        print(nan_counts[nan_counts > 0])

    return result_df

# Batch driver: for each pair, load 1-minute candles, build 5-minute bars,
# attach unbiased indicators, normalise, and write the result to parquet.
for ccy in currencies:
    print(f"\nProcessing {ccy}...")
    source = f'/Volumes/ssd_fat2/ai6_trading_bot/datasets/1min/{ccy}.parquet'

    try:
        df = pd.read_parquet(source)

        # Bring the index to UTC: localise naive data, convert anything else.
        if df.index.tz is not None:
            if df.index.tz != pytz.UTC:
                df.index = df.index.tz_convert('UTC')
        else:
            df.index = df.index.tz_localize('UTC')

        # 5-minute OHLC bars; buckets without data are dropped.
        df_5min = (
            df.resample('5min')
            .agg(dict(open='first', high='max', low='min', close='last'))
            .dropna()
        )

        print("Calculating unbiased indicators...")
        df_with_indicators = prepare_unbiased_dataset(df_5min)

        print("Normalizing data...")
        df_norm = processor.normalize_simple(df=df_with_indicators)

        # The 'ubiased' spelling is part of the file name other cells read back.
        output_path = f'/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/unbiased/{ccy}_5T_indics_1D_norm_ubiased.parquet'
        print(f"Saving to {output_path}")
        df_norm.to_parquet(output_path)

        print(f"Completed processing {ccy}")

    except Exception as e:
        # A failing pair is reported and the loop moves on to the next one.
        print(f"Error processing {ccy}: {str(e)}")


In [None]:
import os
import sys

import pandas as pd
import numpy as np
import pytz

from datetime import datetime, timedelta
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat import ForexTradingEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager
from data_management.indicator_manager import IndicatorManager
from visualization.chart_manager import ChartManager
from data_management.preprocessor import DataPreprocessor

# Project helpers, created in the same order as before.
processor = DataPreprocessor()
dataset_manager = DatasetManager()
chart_manager = ChartManager()
indicator_manager = IndicatorManager()

# 1-minute EUR/USD candles.
source = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1min/EUR_USD.parquet'
df = pd.read_parquet(source)

# 5-minute OHLC bars; buckets without data are dropped.
minute_5 = (
    df.resample('5min')
    .agg(dict(open='first', high='max', low='min', close='last'))
    .dropna()
)

# Indicators computed in one pass over the whole 5-minute history, then normalised.
df_with_indicators = indicator_manager.calculate_indicators(minute_5, indicator_timeframe='1h')
df_norm = processor.normalize_simple(df=df_with_indicators)
df_norm.columns
df_norm


In [None]:
import os, sys
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
import pandas as pd
import numpy as np
from pathlib import Path
import pytz
from typing import List, Optional


from data_management.indicator_manager import IndicatorManager

from data_management.preprocessor import DataPreprocessor

# Normaliser used by process_currency_pairs below.
processor = DataPreprocessor()

# Indicator calculator handed to prepare_unbiased_dataset by process_currency_pairs.
indicator_manager = IndicatorManager()

def prepare_unbiased_dataset(
    df: pd.DataFrame, 
    indicator_manager,
    indicator_timeframe: str = '1H',
    verbose: bool = True
) -> pd.DataFrame:
    """
    Prepare dataset with technical indicators calculated without look-ahead bias,
    using a trailing (rolling) window of 5-minute candles.

    Each 5-minute row gets a synthetic candle built from the last
    ``indicator_timeframe / 5min`` rows ending at that row, so only past data
    contributes to the indicators.

    Args:
        df: DataFrame with 1-minute OHLC data (``open``, ``high``, ``low``,
            ``close``) and a DatetimeIndex. A naive index is interpreted as UTC,
            any other timezone is converted to UTC. The caller's frame is left
            untouched.
        indicator_manager: object exposing
            ``calculate_indicators(data, indicator_timeframe=...)`` and returning
            a DataFrame indexed like ``data``.
        indicator_timeframe: Length of the trailing window (e.g., '1H', '4H', '1D')
        verbose: Whether to print progress information

    Returns:
        DataFrame with the 5-minute candles inner-joined to the indicator columns.

    Raises:
        ValueError: if ``indicator_timeframe`` is shorter than 5 minutes.
    """
    if verbose:
        print("Starting data preparation...")

    # Work on a UTC-indexed frame without writing to the caller's index.
    df_utc = df.tz_localize('UTC') if df.index.tz is None else df.tz_convert('UTC')

    # Create 5-minute OHLC data ('5min' is the non-deprecated spelling of '5T').
    df_5min = df_utc.resample('5min').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last'
    }).dropna()

    if verbose:
        print(f"Resampled to 5-minute candles. Shape: {df_5min.shape}")

    # Number of 5-minute rows in one indicator window. Lower-casing lets the
    # legacy upper-case units ('1H', '1D') parse without deprecation warnings.
    timeframe_minutes = pd.Timedelta(indicator_timeframe.lower()).total_seconds() / 60
    periods = int(timeframe_minutes / 5)

    if periods < 1:
        raise ValueError("Indicator timeframe is shorter than the data frequency.")

    if verbose:
        print(f"Timeframe minutes: {timeframe_minutes}")
        print(f"Periods: {periods}")

    # Trailing-window candle per row. Rolling.apply only ever sees one column and
    # must return a scalar, so the OHLC fields are built column by column:
    #   open  = open of the oldest row in the window (shift by periods - 1)
    #   high  = rolling max, low = rolling min
    #   close = the current row's close
    # The window counts rows, not clock time, so it spans gaps in the data.
    period_data = pd.DataFrame({
        'open': df_5min['open'].shift(periods - 1),
        'high': df_5min['high'].rolling(window=periods, min_periods=periods).max(),
        'low': df_5min['low'].rolling(window=periods, min_periods=periods).min(),
        'close': df_5min['close']
    }).dropna()  # drops the first periods - 1 rows, which lack a full window

    if verbose:
        print(f"Aggregated period data. Shape: {period_data.shape}")

    # Calculate indicators on the rolling aggregated data
    indicators_df = indicator_manager.calculate_indicators(
        period_data,
        indicator_timeframe=None  # Since data is already aggregated
    )

    # join() raises on shared column names, so if the manager echoes the OHLC
    # columns back they are dropped in favour of the real 5-minute candles.
    overlap = indicators_df.columns.intersection(df_5min.columns)
    if len(overlap) > 0:
        indicators_df = indicators_df.drop(columns=overlap)

    # Combine indicators with the 5-minute data
    result_df = df_5min.join(indicators_df, how='inner')

    if verbose:
        print(f"Final dataset prepared. Shape: {result_df.shape}")
        # Guard the index lookups: an empty result has no first/last row.
        if not result_df.empty:
            print(f"Date range: {result_df.index[0]} to {result_df.index[-1]}")

    return result_df

# Main processing loop
def process_currency_pairs(
    currencies: List[str],
    base_path: str = '/Volumes/ssd_fat2/ai6_trading_bot/datasets',
    indicator_timeframe: str = '1H'
) -> Optional[pd.DataFrame]:
    """
    Process multiple currency pairs with unbiased indicator calculation.

    For each pair the 1-minute parquet file is read, indicators are attached via
    ``prepare_unbiased_dataset``, the frame is normalised with the module-level
    ``processor`` and written under ``{base_path}/5min/unbiased/``.

    Args:
        currencies: List of currency pairs to process
        base_path: Base path for data storage
        indicator_timeframe: Timeframe for indicator calculation

    Returns:
        The normalised DataFrame of the first pair that was processed
        successfully (for inspection), or ``None`` if none succeeded.
    """
    first_result = None

    for ccy in currencies:
        print(f"\nProcessing {ccy}...")
        source = f'{base_path}/1min/{ccy}.parquet'

        try:
            # Read source data
            df = pd.read_parquet(source)
            print(df.head())

            # Prepare dataset with unbiased indicators
            df_with_indicators = prepare_unbiased_dataset(
                df=df,
                indicator_manager=indicator_manager,
                indicator_timeframe=indicator_timeframe,
                verbose=True
            )

            # Normalize the data
            print("\nNormalizing data...")
            df_norm = processor.normalize_simple(df=df_with_indicators)

            # Save results
            output_path = f'{base_path}/5min/unbiased/{ccy}_5T_indics_{indicator_timeframe}_norm_unbiased.parquet'
            print(f"Saving to {output_path}")
            df_norm.to_parquet(output_path)

            print(f"Completed processing {ccy}")
            print(df_norm)

        except Exception as e:
            # Best-effort batch: report the failing pair and move on.
            print(f"Error processing {ccy}: {str(e)}")
            continue

        # Returning from inside the loop used to stop after the first pair.
        # Remember that frame as the return value and keep processing the rest.
        if first_result is None:
            first_result = df_norm

    return first_result

# Run the pipeline for the selected pairs; extend the list to process more.
currencies = ['EUR_USD']

# Pass indicator_timeframe='1D' instead to build daily-window indicators.
df = process_currency_pairs(currencies=currencies, indicator_timeframe='1H')

# Shown as cell output for inspection.
df


Row-by-row approach

In [None]:


import pandas as pd
import numpy as np
from tqdm import tqdm
import pytz

def prepare_unbiased_dataset_row_by_row(
    df: pd.DataFrame, 
    indicator_manager,
    indicator_timeframe: str = '1h',
    verbose: bool = True,
    max_indicator_periods: int = 100_000
) -> pd.DataFrame:
    """
    Prepare dataset with technical indicators calculated without look-ahead bias,
    processing data row by row.

    For every 5-minute candle, the candles inside a trailing look-back window
    ending at that candle are aggregated to ``indicator_timeframe`` and the
    indicators are recomputed; the last indicator row is attached to the candle.

    Args:
        df: DataFrame with 1-minute OHLC data (``open``, ``high``, ``low``,
            ``close``) and a DatetimeIndex. A naive index is interpreted as UTC,
            any other timezone is converted to UTC. The caller's frame is left
            untouched.
        indicator_manager: object exposing
            ``calculate_indicators(data, indicator_timeframe=...)``.
        indicator_timeframe: Timeframe to aggregate data for indicator calculation (e.g., '1h', '4h', '1d')
        verbose: Whether to print progress information
        max_indicator_periods: Number of ``indicator_timeframe`` periods kept in
            the look-back window, and also the minimum number of aggregated
            periods required before a row is emitted. The default keeps the
            previous hard-coded value.

    Returns:
        DataFrame indexed by the 5-minute timestamps with the candle columns
        followed by the indicator columns; rows containing NaN are dropped.
        Empty if no row had enough history.
    """
    if verbose:
        print("Starting data preparation using row-by-row method...")

    # Work on a UTC-indexed frame without writing to the caller's index.
    df_utc = df.tz_localize('UTC') if df.index.tz is None else df.tz_convert('UTC')

    ohlc_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last'
    }

    # Create 5-minute OHLC data
    df_5min = df_utc.resample('5min').agg(ohlc_rules).dropna()

    if verbose:
        print(f"Resampled to 5-minute candles. Shape: {df_5min.shape}")

    # Length of the trailing look-back window.
    timeframe_minutes = int(pd.Timedelta(indicator_timeframe).total_seconds() / 60)
    lookback = pd.Timedelta(minutes=timeframe_minutes * max_indicator_periods)

    if verbose:
        print(f"Maximum indicator periods required: {max_indicator_periods}")

    results = []

    row_iter = df_5min.iterrows()
    if verbose:
        row_iter = tqdm(row_iter, total=len(df_5min), desc='Processing rows')

    for idx, row in row_iter:
        # df_5min is sorted and unique after resampling, so a label slice gives
        # the same rows the incrementally grown cache used to hold, without
        # appending to a DataFrame on every iteration.
        data_up_to_now = df_5min.loc[idx - lookback:idx]

        # Aggregate data to the indicator timeframe up to the current time
        period_data = data_up_to_now.resample(
            indicator_timeframe, closed='right', label='right'
        ).agg(ohlc_rules).dropna()

        # Not enough aggregated periods yet for the indicators.
        if len(period_data) < max_indicator_periods:
            continue

        indicators_df = indicator_manager.calculate_indicators(
            period_data,
            indicator_timeframe=None  # Data is already aggregated
        )

        if indicators_df.empty:
            continue

        # Combine the current 5-minute data with the latest indicator values.
        combined_row = pd.concat([row, indicators_df.iloc[-1]])
        # concat drops the name when the two inputs are named differently, which
        # would cost the final frame its timestamp index; restore it explicitly.
        combined_row.name = idx

        results.append(combined_row)

    # Create final DataFrame and drop rows with NaN indicators (e.g., warm-up).
    result_df = pd.DataFrame(results).dropna()

    if verbose and not result_df.empty:
        print(f"\nFinal dataset prepared. Shape: {result_df.shape}")
        print(f"Date range: {result_df.index[0]} to {result_df.index[-1]}")
    elif verbose:
        print("\nNo data was processed. Please check if there is sufficient data for indicator calculation.")

    return result_df


def process_currency_pairs(
    currencies: List[str],
    base_path: str = '/Volumes/ssd_fat2/ai6_trading_bot/datasets',
    indicator_timeframe: str = '1h'
) -> Optional[pd.DataFrame]:
    """
    Process multiple currency pairs with unbiased indicator calculation using row-by-row method.

    For each pair the 1-minute parquet file is read, indicators are attached via
    ``prepare_unbiased_dataset_row_by_row``, the frame is normalised with the
    module-level ``processor`` and written under ``{base_path}/5min/unbiased/``.

    Args:
        currencies: List of currency pairs to process
        base_path: Base path for data storage
        indicator_timeframe: Timeframe for indicator calculation

    Returns:
        The normalised DataFrame of the first pair that was processed
        successfully (for inspection), or ``None`` if none succeeded.
    """
    first_result = None

    for ccy in currencies:
        print(f"\nProcessing {ccy}...")
        source = f'{base_path}/1min/{ccy}.parquet'

        try:
            # Read source data
            df = pd.read_parquet(source)

            # Prepare dataset with unbiased indicators
            df_with_indicators = prepare_unbiased_dataset_row_by_row(
                df=df,
                indicator_manager=indicator_manager,
                indicator_timeframe=indicator_timeframe,
                verbose=True
            )

            if df_with_indicators.empty:
                print(f"No data processed for {ccy}. Skipping.")
                continue

            # Normalize the data
            print("\nNormalizing data...")
            df_norm = processor.normalize_simple(df=df_with_indicators)

            # Save results
            output_path = f'{base_path}/5min/unbiased/{ccy}_5min_indics_{indicator_timeframe}_norm_unbiased.parquet'
            print(f"Saving to {output_path}")
            df_norm.to_parquet(output_path)

            print(f"Completed processing {ccy}")

        except Exception as e:
            # Best-effort batch: report the failing pair and move on.
            print(f"Error processing {ccy}: {str(e)}")
            continue

        # Returning from inside the loop used to stop after the first pair.
        # Remember that frame as the return value and keep processing the rest.
        if first_result is None:
            first_result = df_norm

    return first_result

# Run the row-by-row pipeline for the selected pairs.
currencies = ['EUR_USD']
df = process_currency_pairs(currencies=currencies, indicator_timeframe='1h')

# Shown as cell output for inspection.
df



In [None]:
def get_max_indicator_periods(indicator_params):
    """
    Calculate the maximum number of periods required by all indicators.

    Args:
        indicator_params: Mapping of indicator name to its parameter dict.
            Recognised names are 'sma' (reads the 'periods' list), 'rsi',
            'atr', 'adx', 'dmi' (read 'period'), 'macd' (reads 'slowperiod'),
            'bollinger' (reads 'timeperiod') and 'ichimoku' (fixed at 52).
            Unrecognised names are ignored.

    Returns:
        The largest period found, or 0 when nothing recognised is configured.
    """
    # Parameter key that holds the look-back length for single-period indicators.
    period_key_by_indicator = {
        'rsi': 'period',
        'macd': 'slowperiod',
        'bollinger': 'timeperiod',
        'atr': 'period',
        'adx': 'period',
        'dmi': 'period',
    }

    max_period = 0
    for indicator, params in indicator_params.items():
        if indicator == 'sma':
            # default=0 keeps an empty 'periods' list from raising ValueError.
            required = max(params.get('periods', [0]), default=0)
        elif indicator == 'ichimoku':
            # Ichimoku uses standard periods of 9, 26, and 52
            required = 52
        elif indicator in period_key_by_indicator:
            required = params.get(period_key_by_indicator[indicator], 0)
        else:
            continue
        max_period = max(max_period, required)
    return max_period

# Largest look-back implied by the manager's configured indicator parameters.
# Relies on `indicator_manager` having been created by an earlier cell.
max_indicator_periods = get_max_indicator_periods(indicator_manager.indicator_params)

# Shown as cell output.
max_indicator_periods

In [None]:
# Displays whichever `df` an earlier cell left in the kernel namespace.
df

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import pytz

from datetime import datetime, timedelta
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat import ForexTradingEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager
from data_management.indicator_manager import IndicatorManager
from visualization.chart_manager import ChartManager
from data_management.preprocessor import DataPreprocessor

# Shared project helpers for this cell.
processor = DataPreprocessor()
dataset_manager = DatasetManager()
chart_manager = ChartManager()
# Used below to compute indicators on the hourly candles.
indicator_manager = IndicatorManager()


import pandas as pd
import numpy as np

# Read the 1-minute data and resample to 5-minute candles
source = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1min/EUR_USD.parquet'
df = pd.read_parquet(source)

# A naive index is interpreted as UTC.
if df.index.tz is None:
    df.index = df.index.tz_localize('UTC')

minute_5 = df.resample('5min').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last'
}).dropna()

# Hourly candles labelled by their right edge, so the candle stamped t only
# contains 5-minute rows with timestamps <= t. '1h' replaces the deprecated '1H'.
hourly_data = minute_5.resample('1h', closed='right', label='right').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last'
}).dropna()

# Calculate technical indicators on the hourly data
# (Assuming `indicator_manager.calculate_indicators` returns a DataFrame indexed like its input)
indicators_df = indicator_manager.calculate_indicators(hourly_data, indicator_timeframe='1h')

# Map the indicator values onto the 5-minute rows from time t onwards.
# The frame defined in this cell is `minute_5`; the earlier reference to
# `df_5min` only worked when another cell had left that name in the kernel.
result_df = minute_5.copy()

for column in indicators_df.columns:
    # Forward fill never looks ahead: 5-minute rows earlier than the first
    # indicator timestamp have nothing to fill from and stay NaN, so no
    # separate row-by-row mask is needed.
    result_df[column] = indicators_df[column].reindex(
        minute_5.index, method='ffill'
    )

result_df

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import pytz
from typing import Dict, List, Optional

def prepare_unbiased_dataset(
    df: pd.DataFrame, 
    indicator_manager,
    timeframe: str = '1H',
    verbose: bool = True
) -> pd.DataFrame:
    """
    Prepare dataset with technical indicators calculated without look-ahead bias.

    The 1-minute data is resampled to 5-minute candles, those are aggregated to
    right-labelled ``timeframe`` candles, indicators are computed on the
    aggregated candles, and each indicator value is forward-filled onto the
    5-minute rows at or after its timestamp.

    Args:
        df: DataFrame with 1-minute OHLC data (``open``, ``high``, ``low``,
            ``close``) and a DatetimeIndex. A naive index is interpreted as UTC,
            any other timezone is converted to UTC. The caller's frame is left
            untouched.
        indicator_manager: object exposing
            ``calculate_indicators(data, indicator_timeframe=...)`` and returning
            a DataFrame with a sorted DatetimeIndex.
        timeframe: Timeframe for indicator calculation (e.g., '1H', '4H', '1D')
        verbose: Whether to print progress information

    Returns:
        DataFrame with 5-minute candles and indicators calculated at specified
        timeframe. Rows before the first indicator timestamp hold NaN in the
        indicator columns.
    """
    ohlc_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last'
    }

    if verbose:
        print("Starting data preparation...")

    # Work on a UTC-indexed frame without writing to the caller's index.
    df_utc = df.tz_localize('UTC') if df.index.tz is None else df.tz_convert('UTC')

    # Create 5-minute OHLC data
    df_5min = df_utc.resample('5min').agg(ohlc_rules).dropna()

    if verbose:
        print(f"Resampled to 5-minute candles. Shape: {df_5min.shape}")

    # Right-closed, right-labelled bins: the candle stamped t only contains
    # 5-minute rows with timestamps <= t.
    period_data = df_5min.resample(timeframe, closed='right', label='right').agg(ohlc_rules).dropna()

    if verbose:
        print(f"Resampled to {timeframe} for indicator calculation. Shape: {period_data.shape}")

    # Calculate indicators
    indicators_df = indicator_manager.calculate_indicators(
        period_data, 
        indicator_timeframe=timeframe
    )

    if verbose:
        print(f"Calculated indicators. Available columns: {indicators_df.columns.tolist()}")

    # Align all indicator columns to the 5-minute index in one pass. Forward
    # fill leaves rows earlier than the first indicator timestamp as NaN, which
    # is what the former explicit per-column mask enforced.
    aligned = indicators_df.reindex(df_5min.index, method='ffill')

    result_df = df_5min.copy()
    for column in indicators_df.columns:
        result_df[column] = aligned[column]

    # Validate results
    if verbose:
        nan_counts = result_df[indicators_df.columns].isna().sum()
        if nan_counts.any():
            print("\nNaN values in indicators:")
            print(nan_counts[nan_counts > 0])

        print(f"\nFinal shape: {result_df.shape}")
        # Guard the index lookups: an empty result has no first/last row.
        if not result_df.empty:
            print(f"Date range: {result_df.index[0]} to {result_df.index[-1]}")

    return result_df

# Main processing loop
def process_currency_pairs(
    currencies: List[str],
    base_path: str = '/Volumes/ssd_fat2/ai6_trading_bot/datasets',
    indicator_timeframe: str = '1H'
) -> Optional[pd.DataFrame]:
    """
    Process multiple currency pairs with unbiased indicator calculation.

    For each pair the 1-minute parquet file is read, indicators are attached via
    ``prepare_unbiased_dataset``, the frame is normalised with the module-level
    ``processor`` and written under ``{base_path}/5min/unbiased/``.

    Args:
        currencies: List of currency pairs to process
        base_path: Base path for data storage
        indicator_timeframe: Timeframe for indicator calculation

    Returns:
        The normalised DataFrame of the first pair that was processed
        successfully (for inspection), or ``None`` if none succeeded.
    """
    first_result = None

    for ccy in currencies:
        print(f"\nProcessing {ccy}...")
        source = f'{base_path}/1min/{ccy}.parquet'

        try:
            # Read source data
            df = pd.read_parquet(source)

            # Prepare dataset with unbiased indicators
            df_with_indicators = prepare_unbiased_dataset(
                df=df,
                indicator_manager=indicator_manager,
                timeframe=indicator_timeframe,
                verbose=True
            )

            # Normalize the data
            print("\nNormalizing data...")
            df_norm = processor.normalize_simple(df=df_with_indicators)

            # Save results
            output_path = f'{base_path}/5min/unbiased/{ccy}_5T_indics_{indicator_timeframe}_norm_unbiased.parquet'
            print(f"Saving to {output_path}")
            df_norm.to_parquet(output_path)

            print(f"Completed processing {ccy}")

        except Exception as e:
            # Best-effort batch: report the failing pair and move on.
            print(f"Error processing {ccy}: {str(e)}")
            continue

        # Returning from inside the loop used to stop after the first pair.
        # Remember that frame as the return value and keep processing the rest.
        if first_result is None:
            first_result = df_norm

    return first_result

# Run the pipeline for the selected pairs; extend the list to process more.
currencies = ['EUR_USD']

# Pass indicator_timeframe='1D' instead to build daily indicators.
df = process_currency_pairs(currencies=currencies, indicator_timeframe='1H')

df

In [None]:
import pandas as pd

# Load a previously written output file for inspection. The 'ubiased' spelling
# matches the file name used where this dataset is saved.
df = pd.read_parquet('/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/unbiased/EUR_USD_5T_indics_1D_norm_ubiased.parquet')
df

In [None]:
import pandas as pd

# Load a CHF/JPY frame from the best_dataframes folder and list its columns.
df = pd.read_parquet('/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/best_dataframes/CHF_JPY_5T_indics_1H_norm.parquet')
df.columns

In [None]:
import os
import sys

import pandas as pd
import numpy as np

from datetime import datetime, timedelta
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat import ForexTradingEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager
from data_management.indicator_manager import IndicatorManager
from visualization.chart_manager import ChartManager
from data_management.preprocessor import DataPreprocessor

# Shared project helpers for this cell.
processor = DataPreprocessor()
dataset_manager = DatasetManager()
chart_manager = ChartManager()
indicator_manager = IndicatorManager()

# Instruments handled by the loop below.
currencies = [
            'GBP_CHF', 'GBP_JPY', 'EUR_CHF', 'EUR_JPY', 'USD_CHF',
            'EUR_CAD', 'EUR_USD', 'GBP_USD', 'EUR_GBP', 'USD_JPY',
            'USD_CAD', 'AUD_USD', 'CHF_JPY', 'AUD_JPY', 'NZD_USD',
            'NZD_JPY', 'XAU_USD', 'XAG_USD'
        ]
# Build the 5-minute dataset with 1D-timeframe indicators for every pair.
for ccy in currencies:
    source = f'/Volumes/ssd_fat2/ai6_trading_bot/datasets/1min/{ccy}.parquet'
    df = pd.read_parquet(source)
    resampled_df = df.resample('5min').agg({
                        'open': 'first',
                        'high': 'max',
                        'low': 'min',
                        'close': 'last',
                    }).dropna()

    # The per-timestamp draft that used to sit here was removed. It could not
    # run (`index.date` is a numpy array without `.unique()`, and it aggregated
    # a 'volume' column that is not part of the resampled frame), and its
    # results were never stored.
    # NOTE(review): this pass computes indicators over the full history at once.
    # Confirm that is intended for this dataset.
    df_with_indicators = indicator_manager.calculate_indicators(resampled_df, indicator_timeframe='1D')

    # Normalised copy kept in the namespace for inspection. As before, the frame
    # written to disk is the un-normalised one.
    df_norm = processor.normalize_simple(df=df_with_indicators)

    # One output file per pair. The previous fixed EUR_USD file name was
    # overwritten on every iteration.
    df_with_indicators.to_parquet(f'/Volumes/ssd_fat2/ai6_trading_bot/datasets/5min/{ccy}_5T_indics_1D.parquet')

In [None]:
import os
import sys

import pandas as pd

from datetime import datetime, timedelta
from pathlib import Path


# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat import ForexTradingEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager

# Parquet file this run trains on (path under datasets/1h).
df_not_norm_dual_indic = '/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/EUR_USD_1H_indics_1D_and_1h.parquet'

df = pd.read_parquet(df_not_norm_dual_indic)

# 70 % train / 15 % validation / 15 % test.
dataset_manager = DatasetManager()
train_df, val_df, test_df = dataset_manager.split_dataset(
    df,
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
)

# Every artefact of this run (checkpoints, eval logs, tensorboard) goes here.
saving_path = './logs/29nov/no_norm_1H_and_1D_dual_indic/'
os.makedirs(saving_path, exist_ok=True)

def make_train_env():
    """Build the training environment.

    A ``ForexTradingEnv`` over ``train_df`` is wrapped in ``Monitor``, placed
    in a single-environment ``DummyVecEnv`` and finally in ``VecNormalize``
    with both observation and reward normalisation switched on.

    Returns:
        The ``VecNormalize``-wrapped vectorised environment.
    """
    monitored = Monitor(ForexTradingEnv(df=train_df, pair='EUR_USD'))
    vec_env = DummyVecEnv([lambda: monitored])
    return VecNormalize(vec_env, norm_obs=True, norm_reward=True)

def make_eval_env():
    """Build the evaluation environment.

    A ``ForexTradingEnv`` over ``val_df`` is wrapped in ``Monitor``, placed in
    a single-environment ``DummyVecEnv`` and then in ``VecNormalize`` with
    observation normalisation on and reward normalisation off. The wrapper's
    ``training`` flag is cleared before it is returned.

    Returns:
        The ``VecNormalize``-wrapped vectorised environment.
    """
    monitored = Monitor(ForexTradingEnv(df=val_df, pair='EUR_USD'))
    vec_env = DummyVecEnv([lambda: monitored])
    normalized = VecNormalize(vec_env, norm_obs=True, norm_reward=False)
    # Evaluation must not update the running normalisation statistics.
    normalized.training = False
    return normalized

train_env = make_train_env()
eval_env = make_eval_env()

# Periodic evaluation on the validation environment. EvalCallback stores the
# best-scoring checkpoint as `best_model.zip` inside `best_model_save_path`.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=saving_path,
    log_path=saving_path,
    eval_freq=50_000,  # Adjust as needed
    n_eval_episodes=5,
    deterministic=True,
    render=False
)

model = PPO(
    'MlpPolicy',
    train_env,
    verbose=0,
    tensorboard_log=f'{saving_path}tensorboard/',
)

model.learn(
    total_timesteps=3_000_000,  # Adjust as needed
    callback=eval_callback
)

# Save the last-iteration policy under its own name. Writing it to
# `best_model.zip` would overwrite the checkpoint EvalCallback selected
# during training with whatever the policy looks like at the final step.
model.save(f'{saving_path}final_model.zip')
# Normalisation statistics as of the end of training (they belong to
# `final_model.zip`, not necessarily to the best checkpoint).
train_env.save(f'{saving_path}vec_normalize.pkl')

EUR norm Hstack

In [None]:
import os
import sys

import pandas as pd

from datetime import datetime, timedelta
from pathlib import Path


# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat import ForexTradingEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager

pair = "EUR_USD"

# Parquet file this run trains on (path under datasets/1h/normalized).
eur_norm = Path('/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/normalized/EUR_USD.parquet')
df = pd.read_parquet(eur_norm)

# 70 % train / 15 % validation / 15 % test.
dataset_manager = DatasetManager()
train_df, val_df, test_df = dataset_manager.split_dataset(
    df,
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
)

# Every artefact of this run (checkpoints, eval logs, tensorboard) goes here.
saving_path = './logs/27nov/norm_hstack/'
os.makedirs(saving_path, exist_ok=True)

def make_train_env():
    """Build the training environment.

    A ``ForexTradingEnv`` over ``train_df`` is wrapped in ``Monitor``, placed
    in a single-environment ``DummyVecEnv`` and finally in ``VecNormalize``
    with both observation and reward normalisation switched on.

    Returns:
        The ``VecNormalize``-wrapped vectorised environment.
    """
    monitored = Monitor(ForexTradingEnv(df=train_df, pair='EUR_USD'))
    vec_env = DummyVecEnv([lambda: monitored])
    return VecNormalize(vec_env, norm_obs=True, norm_reward=True)

def make_eval_env():
    """Build the evaluation environment.

    A ``ForexTradingEnv`` over ``val_df`` is wrapped in ``Monitor``, placed in
    a single-environment ``DummyVecEnv`` and then in ``VecNormalize`` with
    observation normalisation on and reward normalisation off. The wrapper's
    ``training`` flag is cleared before it is returned.

    Returns:
        The ``VecNormalize``-wrapped vectorised environment.
    """
    monitored = Monitor(ForexTradingEnv(df=val_df, pair='EUR_USD'))
    vec_env = DummyVecEnv([lambda: monitored])
    normalized = VecNormalize(vec_env, norm_obs=True, norm_reward=False)
    # Evaluation must not update the running normalisation statistics.
    normalized.training = False
    return normalized

train_env = make_train_env()
eval_env = make_eval_env()

# Periodic evaluation on the validation environment. EvalCallback stores the
# best-scoring checkpoint as `best_model.zip` inside `best_model_save_path`.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=saving_path,
    log_path=saving_path,
    eval_freq=100_000,  # Adjust as needed
    n_eval_episodes=5,
    deterministic=True,
    render=False
)

model = PPO(
    'MlpPolicy',
    train_env,
    verbose=0,
    tensorboard_log=f'{saving_path}tensorboard/',
)

model.learn(
    total_timesteps=5_000_000,  # Adjust as needed
    callback=eval_callback
)

# Save the last-iteration policy under its own name. Writing it to
# `best_model.zip` would overwrite the checkpoint EvalCallback selected
# during training with whatever the policy looks like at the final step.
model.save(f'{saving_path}final_model.zip')
# Normalisation statistics as of the end of training (they belong to
# `final_model.zip`, not necessarily to the best checkpoint).
train_env.save(f'{saving_path}vec_normalize.pkl')

In [None]:
import os
import sys

import pandas as pd

from datetime import datetime, timedelta
from pathlib import Path


# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from trading.environments.forex_env2_flat import ForexTradingEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from data_management.dataset_manager import DatasetManager

pair = "EUR_USD"

# Load the parquet file under datasets/1h/normalized and list its columns.
eur_norm = Path('/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/normalized/EUR_USD.parquet')
df = pd.read_parquet(path=eur_norm)
df.columns

In [None]:

import os
import sys
from pathlib import Path

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

# Echo every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from data_management.preprocessor import DataPreprocessor

# Directory that receives the normalized frame.
output_dir = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/normalized/"

processor = DataPreprocessor()

eur = pd.read_parquet("/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/EUR_USD.parquet")

eur_norm = processor.normalize_simple(df=eur)

# Create the destination only after the source has been read successfully,
# then write through `output_dir` so the path is defined in exactly one place.
Path(output_dir).mkdir(parents=True, exist_ok=True)
eur_norm.to_parquet(Path(output_dir) / 'EUR_USD.parquet')
eur_norm


In [None]:
# Persist the `eur_norm` frame currently in the kernel to the normalized 1h dataset file.
eur_norm.to_parquet(path='/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h/normalized/EUR_USD.parquet')