In [None]:

import os
import sys
import pandas as pd
import numpy as np
import pytz
from typing import List, Optional
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: with "all", IPython shows the value of every
# top-level expression statement in a cell rather than only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Make the parent of the current working directory importable, so the
# project-local modules imported further down this cell can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Re-running the cell must not append the same entry again.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from unbiased_data import process_currency_pairs, prepare_unbiased_dataset_row_by_row

from data_management.indicator_manager import IndicatorManager
from data_management.preprocessor import DataPreprocessor

# Helper objects used further down in this cell: `indicator_manager` is passed
# to prepare_unbiased_dataset_row_by_row(), and `processor` provides
# normalize_simple() for the normalised output.
indicator_manager = IndicatorManager()
processor = DataPreprocessor()

import logging
import pandas as pd

# Setup logging
# Send INFO-and-above records to both dataset_prep.log and the notebook output.
#
# logging.basicConfig() silently does nothing once the root logger already has
# handlers, but its arguments are still evaluated. Without the guard, every
# re-run of this cell would open dataset_prep.log again through a FileHandler
# that is never attached and never closed. Only build the handlers when they
# will actually be installed.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('dataset_prep.log'),
            logging.StreamHandler()
        ]
    )
# Named logger used for the progress messages emitted by this notebook.
logger = logging.getLogger('dataset_prep')



# Build the indicator dataset for a single instrument and persist it twice:
# once as computed (rows containing NaN dropped) and once after
# processor.normalize_simple().
ccy = 'EUR_USD'

# Both output locations depend only on the instrument name, so resolve them up front.
output_path_not_norm = f'./{ccy}_5min_1D_all_indic_not_norm_unbiased.parquet'
output_path = f'./{ccy}_5min_1D_all_indic_norm_unbiased.parquet'

logger.info(f'Starting processing for {ccy} at {pd.Timestamp.now()}')

# Source candles for the instrument (the directory name suggests 1-minute data).
df = pd.read_parquet(f'/Users/floriankockler/Library/CloudStorage/OneDrive-kockler/usb_stick_6dec/1min_source/{ccy}.parquet')
# df = df.head(1_000_000)

# Compute the indicators, then discard every row that still contains a NaN.
# NOTE(review): indicator_timeframe='D' presumably selects daily indicators - confirm
# against prepare_unbiased_dataset_row_by_row().
df_with_indicators = prepare_unbiased_dataset_row_by_row(
    df=df,
    indicator_manager=indicator_manager,
    indicator_timeframe='D',
    verbose=True,
).dropna()
df_with_indicators.to_parquet(output_path_not_norm)

# Normalised variant of the same frame.
df_norm = processor.normalize_simple(df=df_with_indicators)
df_norm.to_parquet(output_path)

logger.info(f'Finished processing for {ccy} at {pd.Timestamp.now()}')

# Trailing expression: show the not-normalised frame as the cell output.
df_with_indicators
 

2024-12-08 15:11:27,954 - dataset_prep - INFO - Starting processing for EUR_USD at 2024-12-08 15:11:27.950971


2024-12-08 15:11:39,048 - dataset_prep - INFO - Starting data preparation using row-by-row method...
2024-12-08 15:11:40,124 - dataset_prep - INFO - Resampled to 5-minute candles. Shape: (1753845, 4)
Processing rows:  13%|█▎        | 233472/1753845 [30:16<3:07:41, 135.00it/s]

Index(['open', 'high', 'low', 'close', 'sma_20', 'sma_50', 'rsi', 'macd',
       'macd_signal', 'macd_hist', 'roc_10', 'stoch_rsi', 'stoch_k', 'stoch_d',
       'bb_upper', 'bb_middle', 'bb_lower', 'bb_bandwidth', 'bb_percent',
       'atr', 'plus_di', 'minus_di', 'adx', 'senkou_span_a', 'senkou_span_b',
       'tenkan_sen', 'kijun_sen'],
      dtype='object')

In [None]:


from unbiased_data import process_currency_pairs, prepare_unbiased_dataset_row_by_row

from data_management.indicator_manager import IndicatorManager
from data_management.preprocessor import DataPreprocessor

# Helper objects used by the loop at the end of this cell: `indicator_manager`
# is passed to prepare_unbiased_dataset_row_by_row(), and `processor` provides
# normalize_simple(). These rebind the same names the first cell defines.
indicator_manager = IndicatorManager()
processor = DataPreprocessor()

import logging


# Setup logging
# Send INFO-and-above records to both dataset_prep.log and the notebook output.
#
# logging.basicConfig() silently does nothing once the root logger already has
# handlers (for example after the first cell of this notebook has run), but its
# arguments are still evaluated. Without the guard, this cell would open
# dataset_prep.log again through a FileHandler that is never attached and never
# closed. Only build the handlers when they will actually be installed.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('dataset_prep.log'),
            logging.StreamHandler()
        ]
    )
# Named logger used for the progress messages emitted by this notebook.
logger = logging.getLogger('dataset_prep')




# Instrument names grouped into batches of three.
# Only `eur_only` is iterated by the loop at the end of this cell; the numbered
# batches are not referenced anywhere in the visible code.
currencies_1 = ['GBP_CHF', 'GBP_JPY', 'EUR_CHF']
currencies_2 = ['EUR_CAD', 'EUR_USD', 'GBP_USD']
currencies_3 = ['USD_CAD', 'AUD_USD', 'CHF_JPY']
currencies_4 = ['NZD_JPY', 'XAU_USD', 'XAG_USD']
currencies_5 = ['USD_CHF', 'USD_JPY', 'AUD_JPY']
currencies_6 = ['EUR_JPY', 'EUR_GBP', 'NZD_USD']

# Single-instrument selection used for the run below.
eur_only = ['EUR_USD']

# For every selected instrument: load its source candles, compute the
# indicators, and persist both the not-normalised and the normalised frame.
for ccy in eur_only:
    # Both output locations depend only on the instrument name, so resolve them first.
    output_path_not_norm = f'./{ccy}_5min_1D_indic_not_norm_unbiased_full.parquet'
    output_path = f'./{ccy}_5min_1D_norm_unbiased_full.parquet'

    logger.info(f'Starting processing for {ccy} at {pd.Timestamp.now()}')

    # Source candles for the instrument (the directory name suggests 1-minute data).
    df = pd.read_parquet(f'/Users/floriankockler/Library/CloudStorage/OneDrive-kockler/usb_stick_6dec/1min_source/{ccy}.parquet')
    # df = df.head(3_000_000)

    # Compute the indicators, then discard every row that still contains a NaN.
    df_with_indicators = prepare_unbiased_dataset_row_by_row(
        df=df,
        indicator_manager=indicator_manager,
        indicator_timeframe='D',
        verbose=True,
    ).dropna()
    df_with_indicators.to_parquet(output_path_not_norm)

    # Normalised variant of the same frame.
    df_norm = processor.normalize_simple(df=df_with_indicators)
    df_norm.to_parquet(output_path)

    logger.info(f'Finished processing for {ccy} at {pd.Timestamp.now()}')
 

2024-12-07 18:55:20,580 - dataset_prep - INFO - Starting processing for EUR_USD at 2024-12-07 18:55:20.580834
2024-12-07 18:55:24,713 - dataset_prep - INFO - Starting data preparation using row-by-row method...
2024-12-07 18:55:25,365 - dataset_prep - INFO - Resampled to 5-minute candles. Shape: (1753845, 4)
Processing rows: 100%|██████████| 1753845/1753845 [3:27:14<00:00, 141.05it/s]  
2024-12-07 22:28:18,506 - dataset_prep - INFO - 
Final dataset prepared. Shape: (1753845, 23)
2024-12-07 22:28:18,514 - dataset_prep - INFO - Date range: 2001-01-02 23:00:00+00:00 to 2024-11-28 10:15:00+00:00
2024-12-07 22:28:18,763 - dataset_prep - INFO - 
Percentage of NaN values in indicator columns:
2024-12-07 22:28:18,764 - dataset_prep - INFO - sma_20: 0.24%
2024-12-07 22:28:18,764 - dataset_prep - INFO - sma_50: 0.64%
2024-12-07 22:28:18,765 - dataset_prep - INFO - rsi: 0.18%
2024-12-07 22:28:18,765 - dataset_prep - INFO - macd: 0.44%
2024-12-07 22:28:18,766 - dataset_prep - INFO - macd_signal: 0

In [None]:
# Patch the stored EUR_USD 1H normalised set: swap the infinities in
# `bb_percent` for finite values and save the result under a new file name.
train_set = '/Users/floriankockler/Code/GitHub.nosync/ai6-gcp-bot/forex_trading_system/notebooks/EUR_USD_5min_1H_norm_unbiased.parquet'
patched_set = "/Users/floriankockler/Code/GitHub.nosync/ai6-gcp-bot/forex_trading_system/notebooks/EUR_USD_5min_1H_norm_unbiased1.parquet"

df = pd.read_parquet(train_set)
# df.isna().any()
# np.isinf(df).any()

# +inf becomes 1 and -inf becomes 0; all other values are left untouched.
df['bb_percent'] = df['bb_percent'].replace({np.inf: 1, -np.inf: 0})
df.to_parquet(patched_set)

In [None]:
# Inspect the source parquet for the current instrument.
# `ccy` is not defined in this cell; it must have been set by an earlier cell.
not_norm_path = f'/Volumes/ssd_fat2/ai6_trading_bot/datasets/1min/{ccy}.parquet'
not_norm = pd.read_parquet(not_norm_path)
not_norm


In [None]:
# Inspect the normalised 1H dataset stored next to the notebook.
# `ccy` is not defined in this cell; it must have been set by an earlier cell.
norm_path = f'./{ccy}_5min_1H_norm_unbiased.parquet'
norm = pd.read_parquet(norm_path)
norm

In [None]:
# import pandas as pd
# df = pd.read_parquet(f'/Volumes/ssd_fat2/ai6_trading_bot/datasets/1min/EUR_USD.parquet')
# df_test = df.head(5000)
# df_test.to_parquet(f'/Volumes/ssd_fat2/ai6_trading_bot/datasets/1min/EUR_USD_test.parquet')