In [1]:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from pathlib import Path
# Directory that the later cells in this notebook write their parquet datasets to.
output_dir = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h"
Path(output_dir)

PosixPath('/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h')

In [8]:
#!/usr/bin/env python3
# Add the project root to the Python path so we can import our modules
import os
# `sys` has to be imported before the path manipulation below; relying on an
# import that appears later in the cell raises NameError on a fresh kernel.
import sys

# Parent of the current working directory, resolved to an absolute path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Append only once so re-running the cell does not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import logging
from pathlib import Path
from datetime import datetime, timedelta
import pytz
import pandas as pd
from tqdm import tqdm
import sys

from data_management.dataset_manager import DatasetManager
from data_management.indicator_manager import IndicatorManager

# Module-level DatasetManager instance; load_and_process_data() reads this
# name as a global rather than receiving it as an argument.
dataset_manager = DatasetManager()

def load_and_process_data(
    ticker: str = "EUR_USD",
    timeframe: str = "1h",
    days_back: int = 30,
    output_dir: str = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h",
) -> pd.DataFrame:
    """
    Load 1-minute candles for a ticker, aggregate them to hourly OHLCV bars
    and write the result to ``<output_dir>/<ticker>.parquet``.

    Args:
        ticker: Currency pair to process
        timeframe: Label echoed in the progress output.
            NOTE(review): the aggregation below is always hourly; this value
            is not used to pick the resampling rule.
        days_back: Number of days of data to request, counted back from the
            current UTC time
        output_dir: Directory the parquet file is written to. It is created
            (including parents) when it does not exist yet.

    Returns:
        The hourly frame that was written. When no rows are available an
        empty frame is returned and nothing is written.
    """
    end_time = datetime.now(pytz.UTC)
    start_time = end_time - timedelta(days=days_back)

    print(f"Loading and updating data for {ticker}...")
    print(f"Timeframe: {timeframe}")
    print(f"Date range: {start_time} to {end_time}")

    # How each 1-minute column collapses into one hourly bar.
    ohlcv_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }

    df = dataset_manager.load_and_update_dataset(
        currency_pair=ticker,
        timeframe="1min",  # Always load 1-min data first
        start_time=start_time,
        end_time=end_time,
        normalize=False,
    )

    # Without rows there is nothing to resample, summarise or save; indexing
    # into an empty frame further down would raise IndexError.
    if df is None or df.empty:
        print(f"No data available for {ticker}")
        return pd.DataFrame(columns=list(ohlcv_rules))

    print("no fetching, move to calculate indicators for:", ticker)
    print(f"Move to normalize for: {ticker}")

    # Lower-case 'h' is the non-deprecated hourly alias ('H' emits a
    # FutureWarning on recent pandas). Hours without any candle come out as
    # NaN prices and are dropped instead of being forward filled.
    hourly_df = df.resample('1h').agg(ohlcv_rules).dropna()

    if hourly_df.empty:
        print(f"No data available for {ticker}")
        return hourly_df

    print("\nData Summary:")
    print(f"Data range: {hourly_df.index[0]} to {hourly_df.index[-1]}")
    print(f"Total rows: {len(hourly_df)}")
    print("\nColumns available:")
    print(hourly_df.columns.tolist())

    output_path = Path(output_dir) / f"{ticker}.parquet"
    # to_parquet does not create missing directories.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    hourly_df.to_parquet(output_path)

    return hourly_df


# In Jupyter notebook, you would use it like this:

# Instruments processed by the loop below.
ticker_list = ['USD_JPY', 'XAU_USD', 'XAG_USD']

# days_back=100_000 puts the requested start centuries in the past;
# presumably this is meant as "load all available history" - confirm.
for ticker in ticker_list:
    load_and_process_data(ticker=ticker, timeframe="1h", days_back=100_000)


Loading and updating data for USD_JPY...
Timeframe: 1h
Date range: 1751-02-03 22:07:41.258872+00:00 to 2024-11-18 22:07:41.258872+00:00
end_time is 2024-11-18 22:07:41.258872+00:00 
{'instrument': 'USD_JPY', 'granularity': 'M1', 'candles': [{'complete': True, 'volume': 136, 'time': '2024-10-30T23:59:00.000000000Z', 'mid': {'o': '153.244', 'h': '153.250', 'l': '153.235', 'c': '153.240'}}, {'complete': True, 'volume': 497, 'time': '2024-10-31T00:00:00.000000000Z', 'mid': {'o': '153.240', 'h': '153.240', 'l': '153.193', 'c': '153.218'}}, {'complete': True, 'volume': 331, 'time': '2024-10-31T00:01:00.000000000Z', 'mid': {'o': '153.217', 'h': '153.257', 'l': '153.210', 'c': '153.211'}}, {'complete': True, 'volume': 276, 'time': '2024-10-31T00:02:00.000000000Z', 'mid': {'o': '153.210', 'h': '153.222', 'l': '153.194', 'c': '153.196'}}, {'complete': True, 'volume': 290, 'time': '2024-10-31T00:03:00.000000000Z', 'mid': {'o': '153.194', 'h': '153.200', 'l': '153.170', 'c': '153.174'}}, {'complet

  hourly_df = df.resample('1H').agg({



Data Summary:
Data range: 2001-01-02 23:00:00+00:00 to 2024-11-18 22:00:00+00:00
Total rows: 146860

Columns available:
['open', 'high', 'low', 'close', 'volume']


Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-01-02 23:00:00+00:00,114.430,114.450,114.370,114.430,0.0
2001-01-03 00:00:00+00:00,114.420,114.470,114.360,114.440,0.0
2001-01-03 01:00:00+00:00,114.440,114.450,114.390,114.440,0.0
2001-01-03 02:00:00+00:00,114.440,114.580,114.440,114.560,0.0
2001-01-03 03:00:00+00:00,114.560,114.580,114.520,114.580,0.0
...,...,...,...,...,...
2024-11-18 18:00:00+00:00,154.704,154.792,154.593,154.631,11303.0
2024-11-18 19:00:00+00:00,154.630,154.758,154.574,154.740,11535.0
2024-11-18 20:00:00+00:00,154.742,154.743,154.592,154.594,7447.0
2024-11-18 21:00:00+00:00,154.594,154.688,154.569,154.685,6427.0


Loading and updating data for XAU_USD...
Timeframe: 1h
Date range: 1751-02-03 22:08:13.013799+00:00 to 2024-11-18 22:08:13.013799+00:00
end_time is 2024-11-18 22:08:13.013799+00:00 
{'instrument': 'XAU_USD', 'granularity': 'M1', 'candles': [{'complete': True, 'volume': 172, 'time': '2024-10-30T23:59:00.000000000Z', 'mid': {'o': '2785.615', 'h': '2785.630', 'l': '2785.285', 'c': '2785.510'}}, {'complete': True, 'volume': 159, 'time': '2024-10-31T00:00:00.000000000Z', 'mid': {'o': '2785.540', 'h': '2785.795', 'l': '2785.510', 'c': '2785.630'}}, {'complete': True, 'volume': 118, 'time': '2024-10-31T00:01:00.000000000Z', 'mid': {'o': '2785.580', 'h': '2785.650', 'l': '2785.255', 'c': '2785.305'}}, {'complete': True, 'volume': 146, 'time': '2024-10-31T00:02:00.000000000Z', 'mid': {'o': '2785.300', 'h': '2785.415', 'l': '2785.000', 'c': '2785.090'}}, {'complete': True, 'volume': 188, 'time': '2024-10-31T00:03:00.000000000Z', 'mid': {'o': '2785.095', 'h': '2785.290', 'l': '2784.905', 'c': '27

  hourly_df = df.resample('1H').agg({



Data Summary:
Data range: 2001-01-02 23:00:00+00:00 to 2024-11-18 21:00:00+00:00
Total rows: 146152

Columns available:
['open', 'high', 'low', 'close', 'volume']


Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-01-02 23:00:00+00:00,268.800,268.900,268.600,268.60,0.0
2001-01-03 00:00:00+00:00,268.900,269.300,268.900,269.10,0.0
2001-01-03 01:00:00+00:00,269.100,269.400,269.100,269.40,0.0
2001-01-03 02:00:00+00:00,269.300,269.500,269.300,269.50,0.0
2001-01-03 03:00:00+00:00,269.500,269.700,269.500,269.60,0.0
...,...,...,...,...,...
2024-11-18 17:00:00+00:00,2611.610,2611.995,2607.380,2610.46,15382.0
2024-11-18 18:00:00+00:00,2610.470,2612.150,2607.785,2611.21,12816.0
2024-11-18 19:00:00+00:00,2611.200,2612.690,2608.435,2609.88,10697.0
2024-11-18 20:00:00+00:00,2609.885,2611.640,2608.315,2610.40,9957.0


Loading and updating data for XAG_USD...
Timeframe: 1h
Date range: 1751-02-03 22:08:40.788714+00:00 to 2024-11-18 22:08:40.788714+00:00
end_time is 2024-11-18 22:08:40.788714+00:00 
{'instrument': 'XAG_USD', 'granularity': 'M1', 'candles': [{'complete': True, 'volume': 10, 'time': '2024-10-30T23:59:00.000000000Z', 'mid': {'o': '33.74500', 'h': '33.74500', 'l': '33.73900', 'c': '33.74500'}}, {'complete': True, 'volume': 43, 'time': '2024-10-31T00:00:00.000000000Z', 'mid': {'o': '33.74250', 'h': '33.74850', 'l': '33.73600', 'c': '33.74850'}}, {'complete': True, 'volume': 24, 'time': '2024-10-31T00:01:00.000000000Z', 'mid': {'o': '33.74500', 'h': '33.74550', 'l': '33.74100', 'c': '33.74150'}}, {'complete': True, 'volume': 21, 'time': '2024-10-31T00:02:00.000000000Z', 'mid': {'o': '33.74400', 'h': '33.74400', 'l': '33.73500', 'c': '33.73850'}}, {'complete': True, 'volume': 20, 'time': '2024-10-31T00:03:00.000000000Z', 'mid': {'o': '33.74100', 'h': '33.74600', 'l': '33.73900', 'c': '33.7390

  hourly_df = df.resample('1H').agg({



Data Summary:
Data range: 2001-01-02 23:00:00+00:00 to 2024-11-18 21:00:00+00:00
Total rows: 145584

Columns available:
['open', 'high', 'low', 'close', 'volume']


Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-01-02 23:00:00+00:00,4.5800,4.5800,4.5500,4.5500,0.0
2001-01-03 00:00:00+00:00,4.5500,4.5500,4.5500,4.5500,0.0
2001-01-03 01:00:00+00:00,4.5500,4.5500,4.5500,4.5500,0.0
2001-01-03 02:00:00+00:00,4.5500,4.5500,4.5500,4.5500,0.0
2001-01-03 03:00:00+00:00,4.5500,4.5500,4.5500,4.5500,0.0
...,...,...,...,...,...
2024-11-18 17:00:00+00:00,31.2445,31.2475,31.1320,31.1675,3267.0
2024-11-18 18:00:00+00:00,31.1670,31.1785,31.0145,31.0740,2573.0
2024-11-18 19:00:00+00:00,31.0735,31.1065,31.0505,31.0875,2192.0
2024-11-18 20:00:00+00:00,31.0885,31.1760,31.0845,31.1305,1633.0


In [7]:
# NOTE(review): no visible cell assigns a global `df` (it only exists as a
# local inside functions), so this check presumably relies on leftover kernel
# state and would fail after Restart & Run All - confirm or load a frame here.
df.isna().any()

open      False
high      False
low       False
close     False
volume    False
dtype: bool

In [None]:
#!/usr/bin/env python3
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import logging
from pathlib import Path
from datetime import datetime, timedelta
import pytz
import pandas as pd
from tqdm import tqdm
import sys

from data_management.dataset_manager import DatasetManager
from data_management.indicator_manager import IndicatorManager


def setup_logging() -> logging.Logger:
    """
    Return the 'dataset_preparation' logger, configuring it on first use.

    The logger is set to INFO and gets two handlers sharing one format: a
    console stream handler and a timestamped file under ``logs/`` (relative
    to the current working directory; the directory is created if missing).

    Calling this again returns the already configured logger unchanged, so
    re-running a cell does not duplicate every log line or open another file.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger('dataset_preparation')
    logger.setLevel(logging.INFO)

    # logging.getLogger hands back the same object for the same name, so any
    # handlers attached by an earlier call are still present.
    if logger.handlers:
        return logger

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)

    # File handler
    log_dir = Path('logs')
    log_dir.mkdir(exist_ok=True)
    file_handler = logging.FileHandler(
        log_dir / f'dataset_preparation_{datetime.now():%Y%m%d_%H%M%S}.log'
    )
    file_handler.setFormatter(formatter)

    # Attach both only after the file handler was created successfully, so a
    # failure above cannot leave the logger half configured.
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger


def prepare_datasets(
    output_dir: str = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h",
    start_date: datetime = None,
    end_date: datetime = None
) -> None:
    """
    Prepare and save hourly datasets for all currency pairs.

    For every pair reported by the dataset manager the 1-minute data is
    loaded, aggregated to hourly OHLCV bars, gaps are forward filled,
    indicators are calculated and the result is written to
    ``<output_dir>/<pair>.parquet``. A failure for one pair is logged with
    its traceback and the remaining pairs are still processed.

    Args:
        output_dir: Directory to save processed datasets (created if missing)
        start_date: Start date for data (default: 365 * 30 days before end_date)
        end_date: End date for data (default: now, UTC)
    """
    logger = setup_logging()

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Initialize managers
    dataset_manager = DatasetManager()

    # Set date range
    if end_date is None:
        end_date = datetime.now(pytz.UTC)
    if start_date is None:
        start_date = end_date - timedelta(days=365 * 30)  # roughly 30 years

    # Get list of currency pairs
    pairs = dataset_manager.get_currency_pairs()
    logger.info(f"Processing {len(pairs)} currency pairs from {start_date} to {end_date}")

    # Process each pair
    for pair in tqdm(pairs, desc="Processing pairs"):
        try:
            logger.info(f"\nProcessing {pair}")

            # Load and update data
            logger.info("Loading and updating data...")
            df = dataset_manager.load_and_update_dataset(
                currency_pair=pair,
                timeframe="1min",  # Load 1-minute data first
                start_time=start_date,
                end_time=end_date,
                normalize=False  # No normalization at this stage
            )

            if df is None or df.empty:
                logger.warning(f"No data available for {pair}")
                continue

            logger.info(f"Loaded {len(df)} rows of 1-minute data")

            # Resample to 1-hour timeframe. Lower-case 'h' is the
            # non-deprecated alias; 'H' emits a FutureWarning on recent pandas.
            logger.info("Resampling to 1-hour timeframe...")
            hourly_df = df.resample('1h').agg({
                'open': 'first',
                'high': 'max',
                'low': 'min',
                'close': 'last',
                'volume': 'sum'
            })

            # Forward fill any missing values. DataFrame.ffill() replaces the
            # deprecated fillna(method='ffill') spelling with the same result.
            # NOTE(review): hours without candles therefore repeat the
            # previous bar's prices - confirm this is wanted.
            hourly_df = hourly_df.ffill()

            # Calculate indicators for hourly data
            logger.info("Calculating technical indicators...")
            indicator_manager = IndicatorManager(
                config_path="config",
                cache_dir="cache"
            )
            hourly_df = indicator_manager.calculate_indicators(hourly_df)

            # Save processed dataset
            output_file = output_path / f"{pair}.parquet"
            hourly_df.to_parquet(output_file)

            logger.info(f"Saved processed dataset to {output_file}")
            logger.info(f"Final shape: {hourly_df.shape}")
            logger.info(f"Date range: {hourly_df.index[0]} to {hourly_df.index[-1]}")
            logger.info("Columns:")
            for col in hourly_df.columns:
                logger.info(f"  - {col}")

        except Exception as e:
            # Keep going with the next pair, but record the traceback so the
            # failure can be diagnosed from the log alone.
            logger.error(f"Error processing {pair}: {str(e)}", exc_info=True)
            continue

    logger.info("\nDataset preparation completed!")


def verify_datasets(output_dir: str) -> None:
    """
    Log a short report for every ``*.parquet`` file found in ``output_dir``.

    Per file the row count, first/last index value and memory usage are
    logged at INFO level; columns containing nulls are listed at WARNING
    level. A file that cannot be read or summarised is logged as an error and
    the scan moves on to the next file.

    Args:
        output_dir: Directory that is scanned (non-recursively) for parquet files
    """
    log = logging.getLogger('dataset_preparation')
    log.info("\nVerifying prepared datasets:")

    for parquet_file in Path(output_dir).glob("*.parquet"):
        try:
            frame = pd.read_parquet(parquet_file)

            log.info(f"\n{parquet_file.name}:")
            log.info(f"  Rows: {len(frame)}")
            first_stamp, last_stamp = frame.index[0], frame.index[-1]
            log.info(f"  Date range: {first_stamp} to {last_stamp}")
            size_mb = frame.memory_usage().sum() / 1024**2
            log.info(f"  Memory usage: {size_mb:.2f} MB")

            # Per-column null counts, reduced to the columns that have any.
            null_counts = frame.isnull().sum()
            affected = null_counts[null_counts > 0]
            if not affected.empty:
                log.warning(f"  Missing values found in {parquet_file.name}:")
                for column, n_missing in affected.items():
                    log.warning(f"    {column}: {n_missing} missing values")

        except Exception as exc:
            log.error(f"Error verifying {parquet_file}: {str(exc)}")


# Script-style entry point: runs the full preparation followed by verification.
if __name__ == "__main__":
    # No command line parsing is done; the destination is fixed here.
    output_dir = "/Volumes/ssd_fat2/ai6_trading_bot/datasets/1h"
    
    # Prepare datasets (writes one parquet file per currency pair)
    prepare_datasets(output_dir)
    
    # Verify prepared datasets by reading the written parquet files back
    verify_datasets(output_dir)