In [1]:
# B3_Configurable_Minute_PerSymbol.py
# Predictor with Sliding Window, Per-Step HParam Tuning, and PTT
# Adapted for minute-level data from ohlcv.csv, processing per symbol.

import pandas as pd
import numpy as np
import time
import os
import warnings
import traceback
from datetime import datetime
import xgboost as xgb
import matplotlib.pyplot as plt

# Modeling Imports
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, ParameterGrid
from sklearn.exceptions import UndefinedMetricWarning

# --- Suppress Warnings ---
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore')

# ==============================================================================
# --- Configuration ---
# ==============================================================================

# --- Data ---
CSV_FILE_PATH = 'ohlcv.csv'
# N_ROWS_INPUT: Removed, processing all rows per symbol after filtering.

# --- Target Definition ---
PREDICTION_WINDOW_MINUTES = 4 * 60 # How many MINUTES ahead to predict (e.g., 24 hours)
TARGET_THRESHOLD_PCT = 3.5         # Target threshold for positive class (>= this value is 1)

# --- Backtesting Windowing (Defined in MINUTES) ---
# Example: 8 weeks training = 8 * 7 * 24 * 60 minutes
TRAIN_WINDOW_MINUTES = 48 * 60
STEP_MINUTES = 12 * 60             # Retrain/predict step size (e.g., daily = 24*60 minutes)
# Test window size as a fraction of the training window
TEST_WINDOW_FRACTION = 0.2         # e.g., 0.2 means test window is 20% of train window

# --- Model & Tuning ---
# Fixed XGBoost parameters (not tuned in grid search)
XGB_FIXED_PARAMS = {
    'learning_rate': 0.086,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False, # Deprecated, use False
    'random_state': 42,
    'reg_lambda': 4.0,         # L2 Regularization (Example value)
    'n_jobs': -1               # Use all available CPU cores
}

# Parameter grid for GridSearchCV (Keep combinations reasonable)
XGB_PARAM_GRID_TUNE = {
    'max_depth': [6, 7, 9],          # 3 options
    'n_estimators': [112, 147],       # 2 options
    'subsample': [0.83, 0.92],         # 2 options
    'colsample_bytree': [0.68, 0.72],  # 2 options
    'reg_alpha': [0.14, 0.26]         # L1 Regularization (2 options)
} # Total combinations: 3 * 2 * 2 * 2 * 2 = 48

# Probability Threshold Tuning Range
PROBABILITY_THRESHOLD_RANGE = (0.10, 0.90) # Start to end (end exclusive)
PROBABILITY_THRESHOLD_STEP = 0.05

# ==============================================================================
# --- Derived Variables (Do not change these directly) ---
# ==============================================================================
# Window sizes are now directly the number of rows (minutes)
TEST_WINDOW_MINUTES = max(1, int(TEST_WINDOW_FRACTION * TRAIN_WINDOW_MINUTES)) # Ensure at least 1 row
THRESHOLD_SEARCH_RANGE = np.arange(
    PROBABILITY_THRESHOLD_RANGE[0],
    PROBABILITY_THRESHOLD_RANGE[1],
    PROBABILITY_THRESHOLD_STEP
)
try:
    grid_combinations = len(list(ParameterGrid(XGB_PARAM_GRID_TUNE))) # Calculate grid size
except TypeError: # Handle case where grid might be None or empty
    grid_combinations = 1

# ==============================================================================
# --- Feature Engineering Functions (Minute Based) ---
# ==============================================================================
def garman_klass_volatility_min(o, h, l, c, window_min):
    with np.errstate(divide='ignore', invalid='ignore'):
        log_hl = np.log(h / l.replace(0, np.nan))
        log_co = np.log(c / o.replace(0, np.nan))
    gk = 0.5 * (log_hl**2) - (2 * np.log(2) - 1) * (log_co**2)
    gk = gk.fillna(0)
    # Use min_periods appropriate for potentially large windows
    min_p = max(1, window_min // 4) # Adjust min_periods heuristic if needed
    rm = gk.rolling(window_min, min_periods=min_p).mean()
    rm = rm.clip(lower=0)
    return np.sqrt(rm)

def parkinson_volatility_min(h, l, window_min):
    with np.errstate(divide='ignore', invalid='ignore'):
        log_hl_sq = np.log(h / l.replace(0, np.nan))**2
    log_hl_sq = log_hl_sq.fillna(0)
    min_p = max(1, window_min // 4) # Adjust min_periods heuristic if needed
    rs = log_hl_sq.rolling(window_min, min_periods=min_p).sum()
    f = 1 / (4 * np.log(2) * window_min) if window_min > 0 else 0
    return np.sqrt(f * rs)

def calculate_features_min(df_input):
    """Calculates features for minute-level data."""
    df = df_input.copy()
    base_cols_numeric = ['open', 'high', 'low', 'close', 'volumefrom', 'volumeto'] # Updated column names
    for col in base_cols_numeric:
        if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce')
        else: print(f"Warning: Missing base column '{col}'"); df[col] = 0

    if df[['open', 'high', 'low', 'close']].isnull().any().any():
        print("Warning: OHLC NaNs found. Dropping rows.")
        df = df.dropna(subset=['open', 'high', 'low', 'close'])
    if df.empty:
        print("Error: Empty DF after initial OHLC NaN drop.")
        return df # Return empty df

    # --- Feature Calculations (MINUTE BASED) ---
    df['price_change_1m_temp'] = df['close'].pct_change(periods=1) # 1-minute change

    with np.errstate(divide='ignore', invalid='ignore'):
        df['price_range_pct'] = (df['high'] - df['low']) / df['close'].replace(0, np.nan) * 100
        df['oc_change_pct'] = (df['close'] - df['open']) / df['open'].replace(0, np.nan) * 100

    # --- Volatility (Windows in Minutes) ---
    # Example: 12 hours = 720 minutes, 3 hours = 180 minutes
    df['garman_klass_720m'] = garman_klass_volatility_min(df['open'], df['high'], df['low'], df['close'], 12 * 60)
    df['parkinson_180m'] = parkinson_volatility_min(df['high'], df['low'], 3 * 60)

    min_periods_rolling = 2 # Minimum periods for short rolling windows

    # --- MAs / STDs (Windows in Minutes) ---
    # Short term
    df['ma_180m'] = df['close'].rolling(3 * 60, min_periods=max(min_periods_rolling, (3*60)//4)).mean()
    df['rolling_std_180m'] = df['close'].rolling(3 * 60, min_periods=max(min_periods_rolling, (3*60)//4)).std()

    # Lag Features (Periods in Minutes)
    # Example lags: 3h, 6h, 12h, 24h, 2d, 3d, 7d
    lag_periods_price_min = [3*60, 6*60, 12*60, 24*60, 48*60, 72*60, 168*60]
    lag_periods_volume_min = [3*60, 6*60, 12*60, 24*60]

    for lag in lag_periods_price_min:
        df[f'lag_{lag}m_price_return'] = df['price_change_1m_temp'].shift(lag) * 100

    df['volume_return_1m'] = df['volumefrom'].pct_change(periods=1) * 100 # Use 'volumefrom'
    for lag in lag_periods_volume_min:
        df[f'lag_{lag}m_volume_return'] = df['volume_return_1m'].shift(lag)

    # Longer term MAs / STDs (Windows in Minutes)
    # Example periods: 6h, 12h, 24h, 2d, 3d, 7d
    ma_periods_min = [6*60, 12*60, 24*60, 48*60, 72*60, 168*60]
    std_periods_min = [6*60, 12*60, 24*60, 48*60, 72*60, 168*60]
    min_p_long = 50 # Example: require at least 50 points for longer averages

    for p in ma_periods_min:
        df[f'ma_{p}m'] = df['close'].rolling(p, min_periods=max(min_p_long, p//4)).mean()
    for p in std_periods_min:
        # Std of minute returns
        df[f'rolling_std_{p}m'] = df['price_change_1m_temp'].rolling(p, min_periods=max(min_p_long, p//4)).std() * 100 # Scaled

    # --- ATR (Windows in Minutes) ---
    df['prev_close'] = df['close'].shift(1)
    df['hml'] = df['high'] - df['low']
    df['hmpc'] = np.abs(df['high'] - df['prev_close'])
    df['lmpc'] = np.abs(df['low'] - df['prev_close'])
    df['tr'] = df[['hml', 'hmpc', 'lmpc']].max(axis=1)
    atr_periods_min = [14 * 60, 24 * 60, 48 * 60] # Example: 14h, 24h, 48h
    min_p_atr = 20 # Min periods for ATR

    for p in atr_periods_min:
        df[f'atr_{p}m'] = df['tr'].rolling(p, min_periods=max(min_p_atr, p//4)).mean()
    df = df.drop(columns=['prev_close', 'hml', 'hmpc', 'lmpc', 'tr'])

    # --- Ratio Features (using minute-based MAs/STDs) ---
    epsilon = 1e-9
    # Example: Close / MA_24h (1440 min), Close / MA_7d (10080 min)
    for p in [24*60, 168*60]:
        mc = f'ma_{p}m'
        df[f'close_div_ma_{p}m'] = df['close'] / (df[mc] + epsilon) if mc in df else np.nan

    # Example: MA_12h / MA_48h
    if 'ma_720m' in df and 'ma_2880m' in df: # 12h = 720m, 48h = 2880m
        df['ma720_div_ma2880'] = df['ma_720m'] / (df['ma_2880m'] + epsilon)
    else: df['ma720_div_ma2880'] = np.nan

    # Example: MA_24h / MA_7d
    if 'ma_1440m' in df and 'ma_10080m' in df: # 24h = 1440m, 7d = 10080m
        df['ma1440_div_ma10080'] = df['ma_1440m'] / (df['ma_10080m'] + epsilon)
    else: df['ma1440_div_ma10080'] = np.nan

    # Example: Std_12h / Std_72h
    if 'rolling_std_720m' in df and 'rolling_std_4320m' in df: # 12h=720m, 72h=4320m
        df['std720_div_std4320'] = df['rolling_std_720m'] / (df['rolling_std_4320m'] + epsilon)
    else: df['std720_div_std4320'] = np.nan

    # --- Interaction Features ---
    if 'price_range_pct' in df:
        df['volumefrom_x_range'] = df['volumefrom'] * df['price_range_pct'] # Use volumefrom
    else: df['volumefrom_x_range'] = np.nan

    # --- Transformation Features ---
    if 'rolling_std_180m' in df: # Use 3h = 180m STD
        df['rolling_std_180m_sq'] = df['rolling_std_180m']**2
    else: df['rolling_std_180m_sq'] = np.nan

    if 'price_change_1m_temp' in df:
        df['price_return_1m_sq'] = df['price_change_1m_temp']**2 * 10000 # Scale sq return
    else: df['price_return_1m_sq'] = np.nan

    if 'rolling_std_720m' in df: # Use 12h = 720m STD
        df['rolling_std_720m_sqrt'] = np.sqrt(df['rolling_std_720m'].clip(lower=0) + epsilon)
    else: df['rolling_std_720m_sqrt'] = np.nan

    # --- Cleanup Intermediate ---
    cols_to_drop_intermediate = ['price_change_1m_temp', 'volume_return_1m']
    df = df.drop(columns=[col for col in cols_to_drop_intermediate if col in df.columns])

    return df

# ==============================================================================
# --- Main Script Logic ---
# ==============================================================================

overall_start_time = time.time()

# --- 1. Load Data ---
print("--- Data Loading ---")
print(f"Loading data from: {CSV_FILE_PATH}")
try:
    # Read directly, parse timestamp
    df_full = pd.read_csv(CSV_FILE_PATH, parse_dates=['timestamp'])
    # Sort globally first
    df_full = df_full.sort_values(by=['symbol', 'timestamp'], ascending=True).reset_index(drop=True)
    print(f"Loaded {len(df_full)} rows for symbols: {df_full['symbol'].unique()}")
except FileNotFoundError: print(f"Error: {CSV_FILE_PATH} not found."); exit()
except Exception as e: print(f"Error loading data: {e}"); exit()

# --- Get unique symbols ---
unique_symbols = df_full['symbol'].unique()
all_symbol_results = {} # Store results per symbol

# --- Loop through each symbol ---
for symbol in unique_symbols:
    print(f"\n{'='*30} Processing Symbol: {symbol} {'='*30}")
    symbol_start_time = time.time()

    # Filter data for the current symbol
    df_symbol = df_full[df_full['symbol'] == symbol].copy()
    # Ensure it's sorted by time for this symbol (should be already, but double-check)
    df_symbol = df_symbol.sort_values(by='timestamp').reset_index(drop=True)
    print(f"Processing {len(df_symbol)} rows for {symbol}")

    if len(df_symbol) < TRAIN_WINDOW_MINUTES + STEP_MINUTES:
        print(f"Warning: Insufficient data for {symbol} ({len(df_symbol)} rows) for the initial training window ({TRAIN_WINDOW_MINUTES}) + step ({STEP_MINUTES}). Skipping symbol.")
        continue

    # --- 2. Feature Engineering (for this symbol) ---
    print(f"\n--- Feature Engineering for {symbol} ---")
    start_fe = time.time()
    df_symbol = calculate_features_min(df_symbol)
    if df_symbol.empty:
        print(f"Error: Feature calculation resulted in empty DataFrame for {symbol}. Skipping symbol.")
        continue
    print(f"Feature engineering for {symbol} complete. Took {time.time() - start_fe:.2f} seconds.")
    print(f"Columns after features: {df_symbol.shape[1]}")


    # --- 3. Define Target Variable (for this symbol) ---
    print("\n--- Target Definition ---")
    print(f"Defining target as {PREDICTION_WINDOW_MINUTES}m future return >= {TARGET_THRESHOLD_PCT}%...")
    target_col = f'target_return_{PREDICTION_WINDOW_MINUTES}m' # Dynamic target column name
    # Shift within the symbol's data
    df_symbol[target_col] = df_symbol['close'].shift(-PREDICTION_WINDOW_MINUTES).sub(df_symbol['close']).div(df_symbol['close'].replace(0, np.nan)).mul(100)

    # --- 4. Prepare Data for Modeling (for this symbol) ---
    print("\n--- Data Preparation ---")
    # Define base columns to exclude from features (using new names)
    base_cols_ohlcv = ['open', 'high', 'low', 'close', 'volumefrom', 'volumeto']
    # Columns to keep for indexing/reference, excluding target initially
    cols_to_keep_final = ['timestamp', 'symbol', target_col] # Use 'timestamp'
    potential_feature_cols = [col for col in df_symbol.columns if col not in cols_to_keep_final and col not in base_cols_ohlcv]

    # Select only numeric features among potential ones
    numeric_feature_cols = df_symbol[potential_feature_cols].select_dtypes(include=np.number).columns.tolist()
    final_feature_cols = numeric_feature_cols # Use all derived numeric features

    # Select final columns including features, target, and identifiers
    cols_to_select = final_feature_cols + [col for col in cols_to_keep_final if col in df_symbol.columns]
    df_model_ready = df_symbol[cols_to_select].copy()

    # --- NaN / Inf Handling ---
    # Drop columns with too many NaNs instead of dropping rows
    initial_cols = len(final_feature_cols)
    nan_threshold = 0.3  # Drop columns with more than 30% NaN values
    
    # Calculate NaN percentage for each feature column
    nan_percentages = df_model_ready[final_feature_cols].isna().mean()
    cols_to_drop = nan_percentages[nan_percentages > nan_threshold].index.tolist()
    
    if cols_to_drop:
        print(f"Dropping {len(cols_to_drop)} columns with >30% NaN values: {cols_to_drop}")
        df_model_ready = df_model_ready.drop(columns=cols_to_drop)
        final_feature_cols = [col for col in final_feature_cols if col not in cols_to_drop]
    
    # Now drop rows with NaNs in the remaining features or target
    initial_rows = len(df_model_ready)
    df_model_ready = df_model_ready.dropna(subset=final_feature_cols + [target_col])
    final_rows = len(df_model_ready)
    print(f"NaN Handling: Dropped {len(cols_to_drop)} columns with high NaN rate and {initial_rows - final_rows} rows with remaining NaNs.")

    # Check for infinites in numeric feature columns only
    numeric_cols_final_check = df_model_ready[final_feature_cols].select_dtypes(include=np.number).columns.tolist()
    if not numeric_cols_final_check:
         print(f"Error: No numeric feature columns found for {symbol} after NaN drop. Skipping.")
         continue

    inf_mask = np.isinf(df_model_ready[numeric_cols_final_check])
    inf_count = inf_mask.sum().sum()
    if inf_count > 0:
        print(f"Replacing {inf_count} infinites with NaN in features...")
        df_model_ready.replace([np.inf, -np.inf], np.nan, inplace=True)
        rows_b4 = len(df_model_ready)
        # Re-drop NaNs only if infinites were introduced in feature columns
        df_model_ready = df_model_ready.dropna(subset=final_feature_cols)
        print(f"Dropped {rows_b4 - len(df_model_ready)} more rows after Inf handling.")

    if df_model_ready.empty:
        print(f"Error: DataFrame empty after NaN/Inf handling for {symbol}. Skipping symbol.")
        continue

    # --- Final X, y, timestamps ---
    X = df_model_ready[final_feature_cols]
    y_binary = (df_model_ready[target_col] >= TARGET_THRESHOLD_PCT).astype(int)
    timestamps = df_model_ready['timestamp'] # Use the 'timestamp' column

    print(f"Final feature matrix shape for {symbol}: {X.shape}")
    print(f"Target vector shape for {symbol}: {y_binary.shape}")
    print(f"Using {len(final_feature_cols)} features.")
    # print(f"Feature list: {final_feature_cols}") # Uncomment to debug features

    # --- 5. SLIDING Window Backtesting (for this symbol) ---
    print(f"\n--- Starting SLIDING Window Backtest for {symbol} ---")
    print("!!! WARNING: This will be significantly slower due to GridSearchCV in each step !!!")
    if len(X) < TRAIN_WINDOW_MINUTES + STEP_MINUTES:
        print(f"Error: Not enough data for {symbol} ({len(X)}) for train window ({TRAIN_WINDOW_MINUTES}) + step ({STEP_MINUTES}). Skipping backtest for symbol.")
        continue

    # Reset storage for this symbol
    symbol_predictions_proba = []; symbol_actual = []; symbol_timestamps = []
    symbol_best_params = []
    num_steps = 0
    # Adjust end index for test window size
    start_index_loop = TRAIN_WINDOW_MINUTES
    end_index_loop = len(X) - TEST_WINDOW_MINUTES + 1 # Ensure test window fits

    print(f"Train Window: {TRAIN_WINDOW_MINUTES} mins, Step: {STEP_MINUTES} mins, Test Window: {TEST_WINDOW_MINUTES} mins, Tuning Grid Size: {grid_combinations}")
    loop_start_time = time.time()

    for i in range(start_index_loop, end_index_loop, STEP_MINUTES):
        step_start_time = time.time()
        train_idx_start = i - TRAIN_WINDOW_MINUTES
        train_idx_end = i
        test_idx_start = i
        test_idx_end = min(i + TEST_WINDOW_MINUTES, len(X)) # Ensure test end index is within bounds

        if test_idx_start >= test_idx_end: # Should not happen with len check above, but safety
            print(f"Stopping loop: Test window invalid ({test_idx_start} >= {test_idx_end}).")
            break

        # Get train and test sets for this step
        X_train_roll = X.iloc[train_idx_start : train_idx_end]
        y_train_roll = y_binary.iloc[train_idx_start : train_idx_end]
        X_test_roll = X.iloc[test_idx_start : test_idx_end]
        y_test_roll_actual_series = y_binary.iloc[test_idx_start : test_idx_end]
        step_timestamps = timestamps.iloc[test_idx_start : test_idx_end] # Timestamps for the test window
        current_timestamp = step_timestamps.iloc[0] # Timestamp for the start of the test window

        if X_train_roll.empty or len(np.unique(y_train_roll)) < 2:
            print(f"Warning: Skipping step starting at index {i} for {symbol}. Invalid training data (empty or single class).")
            continue

        print(f"\n--- {symbol} - Step {num_steps + 1} (Predicting window starting {current_timestamp}) ---")
        print(f"  Training indices: [{train_idx_start}:{train_idx_end-1}]; Testing indices: [{test_idx_start}:{test_idx_end-1}]")

        # --- Hyperparameter Tuning ---
        print(f"  Running GridSearchCV (cv=3, scoring='f1')...")
        grid_search_start_time = time.time()
        try:
            estimator = xgb.XGBClassifier(**XGB_FIXED_PARAMS)
            # Use StratifiedKFold with shuffle, random_state tied to step for reproducibility if desired
            cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)
            grid_search = GridSearchCV(
                estimator=estimator, param_grid=XGB_PARAM_GRID_TUNE, scoring='f1',
                cv=cv_strategy, n_jobs=-1, verbose=0
            )
            grid_search.fit(X_train_roll, y_train_roll) # Fit GridSearch
            best_params_step = grid_search.best_params_
            best_score_step = grid_search.best_score_
            print(f"  GridSearchCV finished in {time.time() - grid_search_start_time:.2f}s.")
            print(f"  Best Params: {best_params_step}, Best CV F1: {best_score_step:.4f}")
            symbol_best_params.append({'step': num_steps + 1, 'params': best_params_step, 'cv_f1': best_score_step})

            # --- Fit final model for the step ---
            final_model_params = {**XGB_FIXED_PARAMS, **best_params_step}
            model_roll = xgb.XGBClassifier(**final_model_params)
            model_roll.fit(X_train_roll, y_train_roll, verbose=False) # Fit final model

            # --- Predict probabilities for the test window ---
            prob_roll_window = model_roll.predict_proba(X_test_roll)[:, 1] # Prob of positive class

            # --- Store results for this symbol ---
            symbol_predictions_proba.extend(prob_roll_window)
            symbol_actual.extend(y_test_roll_actual_series.tolist())
            symbol_timestamps.extend(step_timestamps.tolist())
            num_steps += 1

        except Exception as e_step:
            print(f"!! Error during GridSearch/Fit/Predict at step starting {i} for {symbol}: {e_step}")
            # traceback.print_exc() # Uncomment for detailed error
            continue # Skip to next step

        step_end_time = time.time()
        print(f"  Step {num_steps} finished in {step_end_time - step_start_time:.2f}s total.")

    loop_end_time = time.time()
    print(f"\nBacktesting loop for {symbol} finished. Completed {num_steps} steps (each predicting up to {TEST_WINDOW_MINUTES} points) in {(loop_end_time - loop_start_time)/60:.2f} minutes.")

    # --- 6. Evaluate Backtesting Results with PTT (for this symbol) ---
    if num_steps > 0 and len(symbol_predictions_proba) == len(symbol_actual):
        print(f"\n--- Evaluating Results for {symbol} with Probability Threshold Tuning ---")
        print(f"Threshold search range: {THRESHOLD_SEARCH_RANGE}")
        best_threshold = 0.5; best_f1_thresh = -1.0
        results_per_threshold = {}
        probabilities_np = np.array(symbol_predictions_proba)
        actual_np = np.array(symbol_actual)

        for t in THRESHOLD_SEARCH_RANGE:
            predictions_thresh = (probabilities_np >= t).astype(int)
            # Use same robust metric calculation as before
            if np.sum(actual_np) == 0 and np.sum(predictions_thresh) == 0: acc_t, pre_t, rec_t, f1_t = 1.0, 1.0, 1.0, 1.0
            elif np.sum(actual_np) > 0 and np.sum(predictions_thresh) == 0: acc_t = accuracy_score(actual_np, predictions_thresh); pre_t, rec_t, f1_t = 0.0, 0.0, 0.0
            elif np.sum(actual_np) == 0 and np.sum(predictions_thresh) > 0: acc_t = accuracy_score(actual_np, predictions_thresh); pre_t, rec_t, f1_t = 0.0, 0.0, 0.0
            else:
                 acc_t = accuracy_score(actual_np, predictions_thresh)
                 pre_t = precision_score(actual_np, predictions_thresh, zero_division=0)
                 rec_t = recall_score(actual_np, predictions_thresh, zero_division=0)
                 f1_t = f1_score(actual_np, predictions_thresh, zero_division=0)

            results_per_threshold[round(t, 2)] = {'f1': f1_t, 'acc': acc_t, 'pre': pre_t, 'rec': rec_t}
            # Select best threshold based on F1 score
            if f1_t >= best_f1_thresh:
                 # Tie-breaking: prefer threshold closer to 0.5 if F1 is equal
                 if f1_t > best_f1_thresh or abs(t - 0.5) < abs(best_threshold - 0.5):
                      best_f1_thresh = f1_t
                      best_threshold = t

        print(f"\nBest Threshold for {symbol} found: {best_threshold:.2f} (Yielding F1 Score: {best_f1_thresh:.4f})")

        # Calculate final metrics using the best threshold for this symbol
        final_predictions_optimized = (probabilities_np >= best_threshold).astype(int)
        final_accuracy = accuracy_score(actual_np, final_predictions_optimized)
        final_precision = precision_score(actual_np, final_predictions_optimized, zero_division=0)
        final_recall = recall_score(actual_np, final_predictions_optimized, zero_division=0)
        final_f1 = f1_score(actual_np, final_predictions_optimized, zero_division=0)

        print(f"\n--- Final Performance Metrics for {symbol} (Optimized Threshold) ---")
        print(f"Target: {PREDICTION_WINDOW_MINUTES}m return >= {TARGET_THRESHOLD_PCT}%")
        print(f"Windowing: Train={TRAIN_WINDOW_MINUTES}m, Step={STEP_MINUTES}m, Test Window={TEST_WINDOW_MINUTES}m")
        print(f"Total Individual Predictions Evaluated: {len(actual_np)}")
        print(f"Overall Accuracy:  {final_accuracy:.4f}")
        print(f"Overall Precision: {final_precision:.4f}")
        print(f"Overall Recall:    {final_recall:.4f}")
        print(f"Overall F1 Score:  {final_f1:.4f}")

        if 0.5 in results_per_threshold:
            res_def = results_per_threshold[0.5]
            print(f"(Compare: Default 0.5 Thresh -> F1:{res_def['f1']:.4f}, Acc:{res_def['acc']:.4f}, Pre:{res_def['pre']:.4f}, Rec:{res_def['rec']:.4f})")

        # Store results
        all_symbol_results[symbol] = {
            'probabilities': probabilities_np,
            'actuals': actual_np,
            'timestamps': symbol_timestamps,
            'best_threshold': best_threshold,
            'metrics_optimized': {'acc': final_accuracy, 'pre': final_precision, 'rec': final_recall, 'f1': final_f1},
            'metrics_default_0.5': results_per_threshold.get(0.5, {}),
            'best_params_per_step': symbol_best_params,
            'results_per_threshold': results_per_threshold
        }

        # --- 7. Plot Cumulative Accuracy (for this symbol) ---
        print(f"\nPlotting cumulative accuracy for {symbol} (optimized threshold)...")
        if len(symbol_timestamps) != len(actual_np):
             print(f"Warning: Timestamp length mismatch for {symbol}. Skipping plot.")
        else:
            try:
                cumulative_accuracy_list_optimized = (np.cumsum(final_predictions_optimized == actual_np) / np.arange(1, len(actual_np) + 1))
                plt.figure(figsize=(14, 7))
                plt.plot(symbol_timestamps, cumulative_accuracy_list_optimized, marker='.', linestyle='-', markersize=1, alpha=0.7, label=f'Cumulative Accuracy ({symbol})') # Very small markers

                # Rolling accuracy can be very noisy on minute data - use a large window or skip
                rolling_window_plot_size = max(TEST_WINDOW_MINUTES * 5, 60 * 24) # e.g., 5 test windows or 1 day
                if len(actual_np) > rolling_window_plot_size:
                     results_df = pd.DataFrame({
                         'correct': (final_predictions_optimized == actual_np).astype(int)
                     }, index=pd.to_datetime(symbol_timestamps))
                     # Use a time-based window if timestamps are regular, otherwise integer window
                     try:
                         rolling_acc = results_df['correct'].rolling(window=f'{rolling_window_plot_size}Min').mean() # Try time window first
                     except:
                         rolling_acc = results_df['correct'].rolling(window=rolling_window_plot_size).mean() # Fallback to row window

                     plt.plot(rolling_acc.index, rolling_acc, linestyle='--', color='red', label=f'Rolling Acc ({rolling_window_plot_size} min window)')


                plt.title(f'{symbol} Backtest (Train:{TRAIN_WINDOW_MINUTES}m, Step:{STEP_MINUTES}m, Test:{TEST_WINDOW_MINUTES}m) - Best Thresh: {best_threshold:.2f}')
                plt.xlabel('Timestamp'); plt.ylabel('Accuracy')
                min_y_plot = max(0.0, np.min(cumulative_accuracy_list_optimized) - 0.05 if len(cumulative_accuracy_list_optimized)>0 else 0.4)
                max_y_plot = min(1.0, np.max(cumulative_accuracy_list_optimized) + 0.05 if len(cumulative_accuracy_list_optimized)>0 else 0.8)
                # Avoid overly tight y-limits if accuracy is stable
                if max_y_plot - min_y_plot < 0.1:
                    mid_point = (max_y_plot + min_y_plot) / 2
                    min_y_plot = max(0.0, mid_point - 0.05)
                    max_y_plot = min(1.0, mid_point + 0.05)
                plt.ylim(min_y_plot, max_y_plot)

                plt.grid(True, linestyle='--', alpha=0.6); plt.legend(); plt.xticks(rotation=30, ha='right'); plt.tight_layout()
                # Save plot instead of showing directly in a loop
                plot_filename = f"backtest_accuracy_{symbol}.png"
                plt.savefig(plot_filename)
                print(f"Saved accuracy plot to {plot_filename}")
                plt.close() # Close the figure to avoid displaying multiple plots at the end
                # plt.show() # Uncomment if you want interactive plots (will pause script)

            except Exception as e_plot: print(f"Error plotting for {symbol}: {e_plot}")

    else:
        print(f"No predictions were made/stored for {symbol}, cannot evaluate or plot.")
        all_symbol_results[symbol] = None # Mark as failed/skipped

    symbol_end_time = time.time()
    print(f"\nFinished processing {symbol} in {(symbol_end_time - symbol_start_time)/60:.2f} minutes.")


# --- End of Symbol Loop ---

print(f"\n{'='*30} Overall Script Finished {'='*30}")
overall_end_time = time.time()
print(f"Total execution time: {(overall_end_time - overall_start_time)/60:.2f} minutes.")

# You can access results like this:
# print(all_symbol_results['BTC']['metrics_optimized'])
# print(all_symbol_results['SOL']['best_threshold'])

--- Data Loading ---
Loading data from: ohlcv.csv
Loaded 45164 rows for symbols: ['BTC' 'SOL']

Processing 22582 rows for BTC

--- Feature Engineering for BTC ---
Feature engineering for BTC complete. Took 0.03 seconds.
Columns after features: 49

--- Target Definition ---
Defining target as 720m future return >= 2.5%...

--- Data Preparation ---
NaN Handling: Dropped 10801 rows with NaNs in features or target.
Final feature matrix shape for BTC: (11781, 41)
Target vector shape for BTC: (11781,)
Using 41 features.

--- Starting SLIDING Window Backtest for BTC ---
Train Window: 1440 mins, Step: 60 mins, Test Window: 288 mins, Tuning Grid Size: 48

--- BTC - Step 1 (Predicting window starting 2025-04-11 01:01:00) ---
  Training indices: [0:1439]; Testing indices: [1440:1727]
  Running GridSearchCV (cv=3, scoring='f1')...
  GridSearchCV finished in 4.46s.
  Best Params: {'colsample_bytree': 0.72, 'max_depth': 6, 'n_estimators': 112, 'reg_alpha': 0.14, 'subsample': 0.92}, Best CV F1: 0.983

In [4]:
X.columns

Index(['open', 'high', 'low', 'close', 'Volume BTC', 'Volume USD',
       'price_range_pct', 'oc_change_pct', 'garman_klass_12h', 'parkinson_3h',
       'ma_3h', 'rolling_std_3h', 'lag_3h_price_return', 'lag_6h_price_return',
       'lag_12h_price_return', 'lag_24h_price_return', 'lag_48h_price_return',
       'lag_72h_price_return', 'lag_168h_price_return', 'volume_return_1h',
       'lag_3h_volume_return', 'lag_6h_volume_return', 'lag_12h_volume_return',
       'lag_24h_volume_return', 'ma_6h', 'ma_12h', 'ma_24h', 'ma_48h',
       'ma_72h', 'ma_168h', 'rolling_std_6h', 'rolling_std_12h',
       'rolling_std_24h', 'rolling_std_48h', 'rolling_std_72h',
       'rolling_std_168h', 'atr_14h', 'atr_24h', 'atr_48h', 'close_div_ma_24h',
       'close_div_ma_48h', 'close_div_ma_168h', 'ma12_div_ma48',
       'ma24_div_ma168', 'std12_div_std72', 'volume_btc_x_range',
       'rolling_std_3h_sq', 'price_return_1h_sq', 'rolling_std_12h_sqrt'],
      dtype='object')

What Now?

This is great progress! It tells you that predicting 12-hour direction is a much more promising path with your data and feature types.

Stick with the Simpler Structure (for now): Keep the single model (XGBoost) and the expanding window backtest for now.

Optimize This Setup:

Apply VIF: Now that you have a working model structure and a seemingly viable target, apply VIF filtering (e.g., threshold 5 or even your strict 1.69) to the features generated in this simpler script. Does reducing collinearity now improve the already decent results?

Tune Hyperparameters: Tune the XGBoost parameters (n_estimators, max_depth, learning_rate, reg_alpha, reg_lambda, subsample, colsample_bytree, min_child_weight) using a method like Optuna or RandomizedSearchCV within the rolling backtest loop (similar to how the meta-learner was tuned, but now for the single main model).

Experiment with Target Horizon: Is 12 hours optimal for the >0% target? Try 8 hours, 24 hours.

Experiment with Training Window: Does the expanding window work best, or would a large sliding window perform better for this target?

You've found a much better baseline. Now optimize it systematically!