In [1]:

import os
import sys
import pandas as pd
import numpy as np
import pytz
from typing import List, Optional
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: echo the value of every top-level expression in a
# cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Treat the parent of the current working directory as the project root and
# make sure it is on sys.path (appended once, never duplicated).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from unbiased_data import process_currency_pairs, prepare_unbiased_dataset_row_by_row

from data_management.indicator_manager import IndicatorManager
from data_management.preprocessor import DataPreprocessor

# Shared helper instances used by the processing loop further down:
# - indicator_manager is passed to prepare_unbiased_dataset_row_by_row
# - processor provides normalize_simple for the normalized output
indicator_manager = IndicatorManager()
processor = DataPreprocessor()

import logging


# Logging setup: INFO and above, written both to 'dataset_prep.log' and to the
# default stream handler.
# NOTE: logging.basicConfig is a no-op when the root logger already has
# handlers attached.
_log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler('dataset_prep.log'),
    logging.StreamHandler(),
]
logging.basicConfig(level=logging.INFO, format=_log_format, handlers=_log_handlers)

# Named logger used by this notebook for progress messages.
logger = logging.getLogger('dataset_prep')



# Instruments split into six batches of three; the processing loop below
# iterates over one batch per run.
currencies_1 = ['GBP_CHF', 'GBP_JPY', 'EUR_CHF']
currencies_2 = ['EUR_CAD', 'EUR_USD', 'GBP_USD']
currencies_3 = ['USD_CAD', 'AUD_USD', 'CHF_JPY']
currencies_4 = ['NZD_JPY', 'XAU_USD', 'XAG_USD']
currencies_5 = ['USD_CHF', 'USD_JPY', 'AUD_JPY']
currencies_6 = ['EUR_JPY', 'EUR_GBP', 'NZD_USD']

# Build the datasets for one batch of instruments.
#
# For every pair in the batch:
#   1. load the pair's parquet file from the 1min dataset directory,
#   2. run prepare_unbiased_dataset_row_by_row with 1h indicators,
#   3. drop rows containing NaN values,
#   4. write the un-normalized frame, then the normalize_simple() result,
#      as parquet files in the current directory.
#
# Any failure is logged with the pair name and full traceback (so it also
# lands in the log file) and then re-raised, keeping the run fail-fast.
for ccy in currencies_5:

    logger.info(f'Starting processing for {ccy} at {pd.Timestamp.now()}')
    try:
        df = pd.read_parquet(f'/Volumes/ssd_fat2/ai6_trading_bot/datasets/1min/{ccy}.parquet')
        df_with_indicators = prepare_unbiased_dataset_row_by_row(
            df=df,
            indicator_manager=indicator_manager,
            indicator_timeframe='1h',
            verbose=True,
        )
        df_with_indicators = df_with_indicators.dropna()

        # dropna() with default arguments removes every row that has at least
        # one NaN, so a single all-NaN column leaves the frame empty. Skip the
        # pair instead of writing empty files and normalizing an empty frame.
        if df_with_indicators.empty:
            logger.error(f'No rows left for {ccy} after dropping NaN values; skipping')
            continue

        output_path_not_norm = f'./{ccy}_5min_1H_indic_not_norm_unbiased.parquet'
        df_with_indicators.to_parquet(output_path_not_norm)

        df_norm = processor.normalize_simple(df=df_with_indicators)

        output_path = f'./{ccy}_5min_1H_norm_unbiased.parquet'
        df_norm.to_parquet(output_path)
    except Exception:
        # Record which pair failed, with traceback, before propagating.
        logger.exception(f'Processing failed for {ccy}')
        raise

    logger.info(f'Finished processing for {ccy} at {pd.Timestamp.now()}')
 

2024-12-06 16:18:59,828 - dataset_prep - INFO - Starting processing for USD_CHF at 2024-12-06 16:18:59.828006
2024-12-06 16:19:00,315 - dataset_prep - INFO - Starting data preparation using row-by-row method...
2024-12-06 16:19:00,956 - dataset_prep - INFO - Resampled to 5-minute candles. Shape: (1753577, 4)
Processing rows: 100%|██████████| 1753577/1753577 [2:58:10<00:00, 164.03it/s]  
2024-12-06 19:26:46,996 - dataset_prep - INFO - 
Final dataset prepared. Shape: (1753577, 23)
2024-12-06 19:26:47,073 - dataset_prep - INFO - Date range: 2001-01-02 23:00:00+00:00 to 2024-11-28 10:15:00+00:00
2024-12-06 19:26:47,443 - dataset_prep - INFO - 
Percentage of NaN values in indicator columns:
2024-12-06 19:26:47,444 - dataset_prep - INFO - sma_20: 0.01%
2024-12-06 19:26:47,444 - dataset_prep - INFO - sma_50: 0.03%
2024-12-06 19:26:47,445 - dataset_prep - INFO - rsi: 0.01%
2024-12-06 19:26:47,494 - dataset_prep - INFO - macd: 0.02%
2024-12-06 19:26:47,495 - dataset_prep - INFO - macd_signal: 0

IndexError: single positional indexer is out-of-bounds