In [None]:
import os
import sys

import pandas as pd
import numpy as np
import pytz
from typing import List, Optional

from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Add the project root (the parent of the current working directory) to the
# Python path, presumably so the local `data_management` package below can be
# imported when the notebook runs from a sub-directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
# NOTE(review): PPO is not referenced anywhere in the visible cells.
from stable_baselines3 import PPO

from data_management.indicator_manager import IndicatorManager

from data_management.preprocessor import DataPreprocessor

# Shared preprocessor; process_currency_pairs() uses it to normalize the data.
processor = DataPreprocessor()


# Shared indicator calculator; process_currency_pairs() passes it to
# prepare_unbiased_dataset_row_by_row().
indicator_manager = IndicatorManager()
import logging

from tqdm import tqdm

# Setup logging: INFO and above goes both to 'dataset_prep.log' and to the
# default stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('dataset_prep.log'),
        logging.StreamHandler()
    ]
)
# Module-wide logger used by the dataset preparation functions below.
logger = logging.getLogger('dataset_prep')


def prepare_unbiased_dataset_row_by_row(
    df_5min: pd.DataFrame,
    indicator_manager,
    indicator_timeframe: str = '1h',
    verbose: bool = True,
    max_indicator_periods: int = 60,
) -> pd.DataFrame:
    """
    Attach higher-timeframe indicators to 5-minute candles without look-ahead bias.

    The frame is walked one row at a time. For each row, only the candles seen
    so far (limited to a trailing window of ``max_indicator_periods`` indicator
    periods) are aggregated to ``indicator_timeframe``. The indicators of the
    most recent aggregated period are then appended to that row.

    Args:
        df_5min: Candles with 'open', 'high', 'low' and 'close' columns and a
            DatetimeIndex. A naive index is treated as UTC; a tz-aware index is
            converted to UTC. The caller's frame is not modified.
        indicator_manager: Object exposing
            ``calculate_indicators(df, indicator_timeframe=None)`` that returns
            a DataFrame of indicators for the aggregated candles.
        indicator_timeframe: Timeframe the candles are aggregated to before the
            indicators are calculated (e.g., '1h', '4h', '1d').
        verbose: Whether to log progress and show a progress bar.
        max_indicator_periods: Number of aggregated periods that must be
            available before indicators are calculated; it also sizes the
            trailing window of cached candles.

    Returns:
        DataFrame indexed by the (UTC) timestamps of the input rows that had
        enough history, holding the input columns followed by the indicator
        columns. Rows containing NaN values are dropped. Empty when no row
        qualified.

    Raises:
        ValueError: If ``max_indicator_periods`` is smaller than 1.
    """
    if max_indicator_periods < 1:
        raise ValueError("max_indicator_periods must be a positive integer")

    if verbose:
        logger.info("Starting data preparation using row-by-row method...")

    # Ensure a UTC index. tz_convert is a no-op for an index that is already
    # UTC, so no explicit timezone comparison is needed.
    if df_5min.index.tz is None:
        utc_index = df_5min.index.tz_localize('UTC')
    else:
        utc_index = df_5min.index.tz_convert('UTC')

    # Re-index a shallow copy so the caller's DataFrame keeps its own index.
    df_5min = df_5min.copy(deep=False)
    df_5min.index = utc_index

    if verbose:
        logger.info(f"Using 5-minute candles as provided. Shape: {df_5min.shape}")
        logger.info(f"Maximum indicator periods required: {max_indicator_periods}")

    # Length of the trailing window of candles kept for aggregation.
    lookback = pd.Timedelta(indicator_timeframe) * max_indicator_periods

    ohlc_agg = {'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last'}

    # Rolling cache of the OHLC candles seen so far (never any future candle).
    data_cache = pd.DataFrame(columns=list(ohlc_agg))
    results = []

    rows = df_5min.iterrows()
    if verbose:
        rows = tqdm(rows, total=len(df_5min), desc='Processing rows')

    for idx, row in rows:
        # Append the current candle, then drop everything older than the window.
        data_cache.loc[idx] = row
        data_cache = data_cache.loc[data_cache.index >= idx - lookback]

        # Aggregate what is known up to (and including) the current candle.
        period_data = (
            data_cache.loc[:idx]
            .resample(indicator_timeframe, closed='right', label='right')
            .agg(ohlc_agg)
            .dropna()
        )

        # Not enough aggregated periods yet to calculate indicators.
        if len(period_data) < max_indicator_periods:
            continue

        indicators_df = indicator_manager.calculate_indicators(
            period_data,
            indicator_timeframe=None  # Data is already aggregated
        )
        if indicators_df.empty:
            continue

        indicators = indicators_df.iloc[-1]
        # Avoid duplicate labels in the combined row in case the indicator
        # output also carries columns already present in the input candle.
        indicators = indicators[~indicators.index.isin(row.index)]

        combined_row = pd.concat([row, indicators])
        # pd.concat drops the Series name when the candle timestamp and the
        # indicator period label differ; restore it so the final frame stays
        # indexed by the candle timestamp.
        combined_row.name = idx
        results.append(combined_row)

    # Drop any rows with NaN values in the indicators (e.g., initial periods).
    result_df = pd.DataFrame(results).dropna()

    if verbose and not result_df.empty:
        logger.info(f"\nFinal dataset prepared. Shape: {result_df.shape}")
        logger.info(f"Date range: {result_df.index[0]} to {result_df.index[-1]}")
    elif verbose:
        logger.info("\nNo data was processed. Please check if there is sufficient data for indicator calculation.")

    return result_df


def process_currency_pairs(
    currencies: List[str],
    base_path: str = './',
    indicator_timeframe: str = '1h',
    source_dir: str = '../deployment/raw_data',
) -> Optional[pd.DataFrame]:
    """
    Process multiple currency pairs with unbiased indicator calculation using row-by-row method.

    For every pair the raw 5-minute parquet file is read from ``source_dir``,
    indicators are attached with ``prepare_unbiased_dataset_row_by_row``, and
    two parquet files are written to ``base_path``: the un-normalized dataset
    and the dataset normalized by ``processor.normalize_simple``. A failure on
    one pair is logged and does not stop the remaining pairs.

    Args:
        currencies: List of currency pairs to process
        base_path: Directory the output parquet files are written to
        indicator_timeframe: Timeframe for indicator calculation
        source_dir: Directory holding the ``<pair>_raw_5min.parquet`` inputs

    Returns:
        The normalized DataFrame of the first pair that was processed
        successfully (for inspection), or None if no pair produced data.
    """
    first_processed = None

    for ccy in currencies:
        logger.info(f"\nProcessing {ccy}...")
        source_5min = os.path.join(source_dir, f'{ccy}_raw_5min.parquet')

        try:
            # Read source data
            df = pd.read_parquet(source_5min)

            # Prepare dataset with unbiased indicators
            df_with_indicators = prepare_unbiased_dataset_row_by_row(
                df_5min=df,
                indicator_manager=indicator_manager,
                indicator_timeframe=indicator_timeframe,
                verbose=True
            )

            if df_with_indicators.empty:
                logger.info(f"No data processed for {ccy}. Skipping.")
                continue

            file_stem = f'{ccy}_5min_indics_{indicator_timeframe}'

            # Keep the un-normalized dataset as well.
            df_with_indicators.to_parquet(
                os.path.join(base_path, f'{file_stem}_not_norm_unbiased.parquet')
            )

            # Normalize the data
            logger.info("\nNormalizing data...")
            df_norm = processor.normalize_simple(df=df_with_indicators)

            # Save results
            df_norm.to_parquet(
                os.path.join(base_path, f'{file_stem}_norm_unbiased.parquet')
            )
        except Exception as e:
            # Best-effort batch: record the failure with its traceback and
            # move on to the next pair.
            logger.exception(f"Error processing {ccy}: {str(e)}")
            continue

        logger.info(f"Completed processing {ccy}")

        # Returning inside the loop would skip the remaining pairs, so only
        # remember the first result and keep going.
        if first_processed is None:
            first_processed = df_norm

    return first_processed


# Currency pairs grouped into batches of three; only the batch passed to
# process_currency_pairs() below is processed when this cell runs.
currencies_1 = ['GBP_CHF', 'GBP_JPY', 'EUR_CHF']
currencies_2 = ['EUR_CAD', 'EUR_USD', 'GBP_USD']
currencies_3 = ['USD_CAD', 'AUD_USD', 'CHF_JPY']
currencies_4 = ['NZD_JPY', 'XAU_USD', 'XAG_USD']
currencies_5 = ['USD_CHF', 'USD_JPY', 'AUD_JPY']
currencies_6 = ['EUR_JPY', 'EUR_GBP', 'NZD_USD']

# Run the selected batch with 1-hour indicators and keep the returned frame
# for inspection.
df = process_currency_pairs(currencies=currencies_2, indicator_timeframe='1h')

# df  # For inspection



In [None]:
# def get_max_indicator_periods(indicator_params):
#     """Calculate the maximum number of periods required by all indicators."""
#     max_period = 0
#     for indicator, params in indicator_params.items():
#         if indicator == 'sma':
#             max_period = max(max_period, max(params.get('periods', [0])))
#         elif indicator == 'rsi':
#             max_period = max(max_period, params.get('period', 0))
#         elif indicator == 'macd':
#             max_period = max(max_period, params.get('slowperiod', 0))
#         elif indicator == 'bollinger':
#             max_period = max(max_period, params.get('timeperiod', 0))
#         elif indicator == 'atr':
#             max_period = max(max_period, params.get('period', 0))
#         elif indicator == 'adx':
#             max_period = max(max_period, params.get('period', 0))
#         elif indicator == 'dmi':
#             max_period = max(max_period, params.get('period', 0))
#         elif indicator == 'ichimoku':
#             # Ichimoku uses standard periods of 9, 26, and 52
#             max_period = max(max_period, 52)
#     return max_period

# max_indicator_periods = get_max_indicator_periods(indicator_manager.indicator_params)

# max_indicator_periods

50