# In [None]:
import pandas as pd
import numpy as np
import time
import os
import warnings
import traceback
from datetime import datetime

# Feature Engineering Imports
import pandas_ta as ta  # Technical indicators

# Modeling Imports
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ParameterGrid
from sklearn.exceptions import UndefinedMetricWarning

# --- Suppress Warnings ---
# The category-specific filters are registered first, in a fixed order; the
# final category-less call then silences every remaining warning as well.
for _noisy_category in (
    UndefinedMetricWarning,
    FutureWarning,
    UserWarning,
    pd.errors.PerformanceWarning,
):
    warnings.filterwarnings('ignore', category=_noisy_category)
del _noisy_category  # keep the module namespace free of the loop variable
warnings.filterwarnings('ignore')  # General suppression

# --- Configuration ---

# Data Loading
CSV_FILE_PATH = r'C:\Users\mason\AVP\BTCUSD.csv'  # raw string keeps the Windows backslashes literal
SYMBOL_NAME = 'BTCUSD'  # symbol the CSV rows describe

# Feature Selection
# The 58 model inputs, written out by family. sorted() arranges them in plain
# case-sensitive lexicographic order ('Volume ...' first, 'hour_10' before
# 'hour_2'), which is the column order handed to the model.
SELECTED_FEATURE_NAMES = sorted(
    ['Volume BTC', 'Volume USD']
    + ['bband_width_20h', 'cci_20h', 'cmf_20h', 'close_pos_in_range']
    + [f'day_{d}' for d in (0, 1, 2, 4, 5, 6)]             # day 3 is not selected
    + [f'hour_{h}' for h in range(24) if h != 7]           # hour 7 is not selected
    + [f'lag_{h}h_price_return' for h in (12, 24, 48, 72, 168)]
    + [f'lag_{h}h_volume_return' for h in (3, 6, 12, 24)]
    + ['macd_hist', 'macd_signal']
    + ['rolling_kurt_24h', 'rolling_skew_24h']
    + ['rolling_std_3h_sq', 'rolling_std_6h', 'rolling_std_48h', 'rolling_std_168h']
    + ['std12_div_std72']
    + ['volume_btc_x_range', 'volume_div_ma_24h', 'volume_ma_12h',
       'volume_ma_168h', 'volume_return_1h']
)

# Modeling & Walk-Forward
# Compared against a return already multiplied by 100, i.e. 0.1 means 0.1 %.
TARGET_THRESHOLD_PCT = 0.1

# Walk-forward window sizes, in hours
TRAIN_WINDOW_HOURS = int(24 * 30 * 1.5)  # 1080 hours, roughly 1.5 months
TEST_WINDOW_HOURS = 24 * 3               # 72 hours
STEP_HOURS = 24                          # advance one day per fold

# Same sizes expressed in rows (assumes one row per hour)
TRAIN_WINDOW_ROWS, TEST_WINDOW_ROWS, STEP_ROWS = (
    TRAIN_WINDOW_HOURS,
    TEST_WINDOW_HOURS,
    STEP_HOURS,
)

# Inner validation grid search (deliberately small: 2 x 2 x 2 combinations).
# Further tunables such as 'subsample' or 'colsample_bytree' could be added.
INNER_CV_PARAM_GRID = dict(
    max_depth=[3, 4],          # shallow trees
    n_estimators=[100, 150],   # moderate ensemble size
    eta=[0.05, 0.1],           # learning rate
)
INNER_CV_VALIDATION_PCT = 0.20  # trailing share of each training window held out for validation

# XGBoost parameters shared by every fit
XGB_FIXED_PARAMS = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'colsample_bytree': 0.7,     # fraction of features sampled per tree
    'subsample': 0.8,            # fraction of rows sampled per tree
    'min_child_weight': 3,       # regularization
    'gamma': 0.1,                # regularization
    'lambda': 1.5,               # L2 regularization
    'alpha': 0.1,                # L1 regularization
    'random_state': 42,
    'n_jobs': -1,                # all available CPU cores
    'tree_method': 'hist',       # histogram-based tree construction
    'use_label_encoder': False,
    # Not set here: 'enable_categorical' (no categorical inputs are used) and
    # the GPU variants ('tree_method': 'gpu_hist', 'predictor': 'gpu_predictor').
}

# Probability Threshold Tuning Configuration
# Candidate cut-offs 0.10, 0.15, ..., 0.85 (the stop value 0.90 is excluded).
THRESHOLD_SEARCH_RANGE = np.arange(0.10, 0.90, 0.05)

# --- Feature Engineering Function (Optimized for 58 Features) ---

def calculate_selected_features(df, symbol):
    """Build the pre-selected model features from OHLCV rows.

    Args:
        df: DataFrame with a 'timestamp' column and 'open', 'high', 'low',
            'close' columns. 'Volume BTC' / 'Volume USD' are used when present
            and treated as 0 otherwise. The input is not modified.
        symbol: Value written to the 'symbol' column of the result.

    Returns:
        DataFrame holding 'timestamp', 'symbol', the four OHLC columns and
        every name from SELECTED_FEATURE_NAMES that was computed. Rows are
        sorted by timestamp, the index is a fresh RangeIndex and +/-inf values
        are replaced by NaN. An empty DataFrame is returned (after printing
        the reason) when the input cannot be used.
    """
    print("Starting calculation of selected 58 features...")
    if df is None or len(df) < 3:
        print("Feature calculation aborted: fewer than 3 input rows.")
        return pd.DataFrame()
    df = df.copy()
    df['symbol'] = symbol

    # --- Timestamp and Index ---
    if 'timestamp' not in df.columns:
        print("Feature calculation aborted: no 'timestamp' column.")
        return pd.DataFrame()
    try:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
    except Exception as exc:  # any parse failure means the frame is unusable
        print(f"Feature calculation aborted: could not parse 'timestamp' ({exc}).")
        return pd.DataFrame()
    df = df.sort_values('timestamp').dropna(subset=['timestamp'])
    # Keep the column as well as the index: the column is returned to the caller,
    # the index drives the hour/day-of-week features.
    df = df.set_index('timestamp', drop=False)

    # --- Volume Columns ---
    df['volume_btc'] = df['Volume BTC'] if 'Volume BTC' in df.columns else 0
    df['volume_usd'] = df['Volume USD'] if 'Volume USD' in df.columns else 0

    # --- Basic Checks ---
    required_ohlc = ['open', 'high', 'low', 'close']
    missing_ohlc = [col for col in required_ohlc if col not in df.columns]
    if missing_ohlc:
        print(f"Feature calculation aborted: missing columns {missing_ohlc}.")
        return pd.DataFrame()
    for col in required_ohlc:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.dropna(subset=required_ohlc)
    if df.empty:
        print("Feature calculation aborted: no rows with complete numeric OHLC values.")
        return pd.DataFrame()

    n_rows = len(df)
    close = df['close']
    volume = df['volume_btc']

    def _rolling_stat(series, window, stat, min_periods=2):
        """Rolling `stat` of `series`; NaN everywhere if the frame has fewer than `window` rows."""
        if n_rows < window:
            return np.nan
        return getattr(series.rolling(window=window, min_periods=min_periods), stat)()

    # --- Intermediate series needed by several features ---
    close_ma_24h = _rolling_stat(close, 24, 'mean')
    close_std_12h = _rolling_stat(close, 12, 'std')
    close_std_72h = _rolling_stat(close, 72, 'std')
    price_range_pct = (df['high'] - df['low']) / df['low']
    hourly_return = close.pct_change()

    # --- Time Features (one-hot; hour 7 and day 3 are not among the selected names) ---
    hour_of_day = df.index.hour
    day_of_week = df.index.dayofweek
    selected_hours = [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
    selected_days = [0, 1, 2, 4, 5, 6]
    for hour in selected_hours:
        df[f'hour_{hour}'] = (hour_of_day == hour).astype(int)
    for day in selected_days:
        df[f'day_{day}'] = (day_of_week == day).astype(int)

    # --- Volume Features ---
    df['Volume BTC'] = df['volume_btc']
    df['Volume USD'] = df['volume_usd']
    df['volume_return_1h'] = volume.pct_change()
    for hours in [12, 168]:
        df[f'volume_ma_{hours}h'] = _rolling_stat(volume, hours, 'mean')
    with np.errstate(divide='ignore', invalid='ignore'):
        # NOTE(review): the divisor is the 24h moving average of *close*, not of
        # volume. Kept as is because the selected feature may have been defined
        # this way -- confirm the intended definition.
        df['volume_div_ma_24h'] = volume / close_ma_24h
    df['volume_btc_x_range'] = volume * price_range_pct

    # --- Lagged Returns ---
    for hours in [12, 24, 48, 72, 168]:
        df[f'lag_{hours}h_price_return'] = close.pct_change(periods=hours)
    for hours in [3, 6, 12, 24]:
        df[f'lag_{hours}h_volume_return'] = volume.pct_change(periods=hours)

    # --- Rolling Stats ---
    for hours in [6, 48, 168]:
        df[f'rolling_std_{hours}h'] = _rolling_stat(close, hours, 'std')
    df['rolling_std_3h_sq'] = _rolling_stat(close, 3, 'std') ** 2
    # Skew/kurtosis require a full 24-row window (min_periods=24).
    df['rolling_skew_24h'] = _rolling_stat(hourly_return, 24, 'skew', min_periods=24)
    df['rolling_kurt_24h'] = _rolling_stat(hourly_return, 24, 'kurt', min_periods=24)

    # --- MACD Features ---
    if n_rows >= 35:  # 26 rows for the slow EMA plus 9 for the signal line
        ema_12 = close.ewm(span=12, adjust=False, min_periods=12).mean()
        ema_26 = close.ewm(span=26, adjust=False, min_periods=26).mean()
        macd_line = ema_12 - ema_26
        df['macd_signal'] = macd_line.ewm(span=9, adjust=False, min_periods=9).mean()
        df['macd_hist'] = macd_line - df['macd_signal']
    else:
        df['macd_signal'], df['macd_hist'] = np.nan, np.nan

    # --- Ratio Feature ---
    with np.errstate(divide='ignore', invalid='ignore'):
        df['std12_div_std72'] = close_std_12h / close_std_72h

    # --- pandas_ta Features ---
    # Each indicator is best-effort: on failure the feature becomes NaN, but the
    # reason is reported instead of being swallowed silently.
    ta_df = df.rename(columns={'volume_btc': 'volume'}, errors='ignore')

    # Bollinger Bands (width only)
    try:
        bbands_df = ta_df.ta.bbands(length=20, std=2)
        width_col = None
        if bbands_df is not None:
            if 'BBB_20_2.0' in bbands_df.columns:
                width_col = 'BBB_20_2.0'
            else:
                # NOTE(review): fall back to any 'BBB_'-prefixed column in case the
                # installed pandas_ta names the bandwidth column differently.
                width_col = next(
                    (c for c in bbands_df.columns if str(c).startswith('BBB_')), None
                )
        if width_col is None:
            print("Warning: Bollinger bandwidth column not available; 'bband_width_20h' set to NaN.")
            df['bband_width_20h'] = np.nan
        else:
            df['bband_width_20h'] = bbands_df[width_col]
    except Exception as exc:
        print(f"Warning: could not compute 'bband_width_20h' ({exc}); set to NaN.")
        df['bband_width_20h'] = np.nan

    # CCI
    try:
        df['cci_20h'] = ta_df.ta.cci(length=20)
    except Exception as exc:
        print(f"Warning: could not compute 'cci_20h' ({exc}); set to NaN.")
        df['cci_20h'] = np.nan

    # CMF
    try:
        df['cmf_20h'] = ta_df.ta.cmf(length=20)
    except Exception as exc:
        print(f"Warning: could not compute 'cmf_20h' ({exc}); set to NaN.")
        df['cmf_20h'] = np.nan

    # --- Position in Range ---
    # A zero high-low range yields NaN (0/0) or +/-inf; both are mapped to the midpoint 0.5.
    range_hl = df['high'] - df['low']
    with np.errstate(divide='ignore', invalid='ignore'):
        df['close_pos_in_range'] = ((df['close'] - df['low']) / range_hl).fillna(0.5).replace([np.inf, -np.inf], 0.5)

    # --- Final Selection & Cleanup ---
    # Keep only the listed features plus timestamp/symbol and OHLC (OHLC is needed
    # later for target creation).
    final_feature_cols_to_keep = [f for f in SELECTED_FEATURE_NAMES if f in df.columns]
    essential_cols = ['timestamp', 'symbol', 'open', 'high', 'low', 'close']
    df_final = df[essential_cols + final_feature_cols_to_keep].copy()

    df_final = df_final.reset_index(drop=True)
    df_final = df_final.replace([np.inf, -np.inf], np.nan)

    print(f"Selected feature calculation finished. Returning {len(df_final)} rows with {len(df_final.columns)} columns.")
    return df_final


# --- Main Execution Block ---
if __name__ == "__main__":

    print("--- 1. Data Loading & Initial Prep ---")
    try:
        print(f"Loading data from: {CSV_FILE_PATH}")
        # header=0 discards the file's own header row; names= supplies ours.
        col_names = ['unix', 'date', 'symbol_csv', 'open', 'high', 'low', 'close', 'Volume BTC', 'Volume USD']
        df_raw = pd.read_csv(CSV_FILE_PATH, header=0, names=col_names)
        print(f"Raw data loaded. Shape: {df_raw.shape}")

        # Only the parsed 'date' column is kept as the time axis.
        df_raw['timestamp'] = pd.to_datetime(df_raw['date'])
        df_raw = df_raw.drop(['unix', 'date', 'symbol_csv'], axis=1)
        df_raw = df_raw.sort_values('timestamp').reset_index(drop=True)
    except Exception as e:
        print(f"Error loading or processing CSV: {e}")
        traceback.print_exc()
        # Leave with a failure status. The site-provided exit() with no argument
        # reports success (status 0) and is meant for interactive sessions only.
        raise SystemExit(1)

    if df_raw.empty:
        raise SystemExit("DataFrame empty after loading. Exiting.")
    print(f"Initial data prep done. Shape: {df_raw.shape}")

    print("\n--- 2. Feature Engineering (Selected Features) ---")
    feature_calc_start = time.time()
    df_features = calculate_selected_features(df_raw, symbol=SYMBOL_NAME)
    feature_calc_end = time.time()
    if df_features.empty:
        # raise SystemExit instead of the site-provided exit(), which is meant
        # for interactive sessions; message and exit status are the same.
        raise SystemExit("Feature calculation failed. Exiting.")
    print(f"Feature calculation completed in {feature_calc_end - feature_calc_start:.2f} seconds.")

    # Model only on the selected features that were actually produced. This is
    # always a new list, so later changes cannot touch SELECTED_FEATURE_NAMES.
    CURRENT_FEATURE_COLS = [f for f in SELECTED_FEATURE_NAMES if f in df_features.columns]
    missing_features = [f for f in SELECTED_FEATURE_NAMES if f not in df_features.columns]
    if missing_features:
        print(f"WARNING: The following expected features were NOT generated: {missing_features}")
        print(f"Using {len(CURRENT_FEATURE_COLS)} available features for modeling.")
    else:
        print(f"All {len(CURRENT_FEATURE_COLS)} selected features generated successfully.")


    print("\n--- 3. Data Cleaning (Post-Features) ---")
    # Safety net: map any remaining +/-inf to NaN (the feature function already does this).
    df_features = df_features.replace(to_replace=[np.inf, -np.inf], value=np.nan)

    # No rows are dropped and nothing is imputed here: NaNs in the feature columns
    # are deliberately left for XGBoost to handle. The count below is informational.
    nan_check = df_features[CURRENT_FEATURE_COLS].isnull().sum()  # NaN count per feature
    total_nans = nan_check.sum()
    if total_nans <= 0:
        print("No NaNs found in feature columns.")
    else:
        print(f"Total NaNs found in feature columns: {total_nans} (XGBoost 'hist' method will handle them).")


    print("\n--- 4. Modeling Target & Final Prep ---")
    TARGET_COLUMN = 'target'
    df = df_features.copy()  # 'df' is the modeling frame from here on

    # Binary target: 1 when the close TEST_WINDOW_ROWS rows ahead is at least
    # TARGET_THRESHOLD_PCT percent above the current close.
    print(f"Creating binary target '{TARGET_COLUMN}' ({TEST_WINDOW_HOURS}-hour return >= {TARGET_THRESHOLD_PCT}%)...")
    df = df.sort_values('timestamp')  # shift() below relies on chronological order
    df['future_price'] = df['close'].shift(-TEST_WINDOW_ROWS)
    df['price_return_future'] = (df['future_price'] - df['close']) / df['close'] * 100

    # Drop rows without a known future return BEFORE labelling. `NaN >= x` is
    # False, so labelling first would silently mark the trailing rows as class 0
    # and the NaN drop would never remove them.
    target_nan_count = df['price_return_future'].isna().sum()
    print(f"NaN values in target before drop: {target_nan_count} (Expected: ~{TEST_WINDOW_ROWS})")
    df = df.dropna(subset=['price_return_future']).copy()
    print(f"Rows after removing NaN targets: {len(df)}")
    if df.empty:
        raise SystemExit("DataFrame empty after target creation/NaN drop. Exiting.")

    df[TARGET_COLUMN] = (df['price_return_future'] >= TARGET_THRESHOLD_PCT).astype(int)
    df = df.drop(['future_price', 'price_return_future'], axis=1)

    target_counts = df[TARGET_COLUMN].value_counts(normalize=True) * 100
    print("\nTarget variable distribution:")
    print(f"  0 (< {TARGET_THRESHOLD_PCT}% return): {target_counts.get(0, 0):.2f}%")
    print(f"  1 (>= {TARGET_THRESHOLD_PCT}% return): {target_counts.get(1, 0):.2f}%")

    df = df.sort_values('timestamp').reset_index(drop=True)
    print(f"Final DataFrame shape for backtesting: {df.shape}")
    print(f"Number of features for modeling: {len(CURRENT_FEATURE_COLS)}")


    # --- 5. Walk-Forward Validation with Threshold Tuning ---
    # For each window: pick hyperparameters on a trailing validation slice, tune
    # the probability threshold on that slice, refit on the training window, then
    # score the following TEST_WINDOW_ROWS rows.
    print("\n--- 5. Starting Walk-Forward Validation ---")
    all_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    all_best_thresholds = []
    feature_importances_gain = {feature: [] for feature in CURRENT_FEATURE_COLS}
    iteration_count = 0  # number of folds whose final fit and evaluation succeeded

    # Each row's target is built from the close TEST_WINDOW_ROWS rows ahead, so
    # the last TEST_WINDOW_ROWS rows before a held-out window carry labels that
    # were computed from prices inside that window. Those rows are purged from
    # the fitting data to avoid look-ahead leakage.
    purge_rows = TEST_WINDOW_ROWS
    early_stopping_rounds = 10
    param_candidates = list(ParameterGrid(INNER_CV_PARAM_GRID))
    default_params = param_candidates[0]  # fallback when the inner search cannot run

    n_rows_total = len(df)
    current_train_start_idx = 0
    total_iterations_estimate = max(0, (n_rows_total - TRAIN_WINDOW_ROWS - TEST_WINDOW_ROWS) // STEP_ROWS + 1)
    print(f"Total rows: {n_rows_total}, Train Window: {TRAIN_WINDOW_ROWS} ({TRAIN_WINDOW_HOURS}h), Test Window: {TEST_WINDOW_ROWS} ({TEST_WINDOW_HOURS}h), Step: {STEP_ROWS} ({STEP_HOURS}h)")
    print(f"Estimated iterations: {total_iterations_estimate}")
    print(f"Inner CV Grid: {INNER_CV_PARAM_GRID}")
    print(f"Threshold Search Range: {THRESHOLD_SEARCH_RANGE}")
    print("-" * 30)
    start_loop_time = time.time()

    while True:
        # Define Window Boundaries
        train_end_idx = current_train_start_idx + TRAIN_WINDOW_ROWS
        test_start_idx = train_end_idx
        test_end_idx = test_start_idx + TEST_WINDOW_ROWS

        # Stop once the test window no longer fits into the data.
        if test_end_idx > n_rows_total:
            print(f"\nStopping: Test window end index ({test_end_idx}) exceeds total rows ({n_rows_total}).")
            break

        # Data Slicing (training rows end purge_rows before the test window starts)
        purged_train_end_idx = max(current_train_start_idx, train_end_idx - purge_rows)
        train_df = df.iloc[current_train_start_idx:purged_train_end_idx]
        test_df = df.iloc[test_start_idx:test_end_idx]

        # Data Validity Checks
        min_train_samples = max(50, int(0.1 * TRAIN_WINDOW_ROWS))
        min_test_samples = 5
        if len(train_df) < min_train_samples or len(test_df) < min_test_samples:
            print(f"Skipping iter {iteration_count + 1}: Insufficient data train ({len(train_df)}/{min_train_samples}) or test ({len(test_df)}/{min_test_samples}).")
            current_train_start_idx += STEP_ROWS
            continue

        X_train_full = train_df[CURRENT_FEATURE_COLS]
        y_train_full = train_df[TARGET_COLUMN]
        X_test = test_df[CURRENT_FEATURE_COLS]
        y_test = test_df[TARGET_COLUMN]

        # A single-class training window cannot be fitted; a single-class test
        # window is tolerated (metrics use zero_division=0).
        train_counts = y_train_full.value_counts()
        if len(train_counts) < 2:
            print(f"Skipping iter {iteration_count + 1}: Full training data has only one class ({train_counts.index.tolist()}).")
            current_train_start_idx += STEP_ROWS
            continue

        # Calculate scale_pos_weight for class imbalance
        neg_count = train_counts.get(0, 0)
        pos_count = train_counts.get(1, 0)
        scale_pos_weight_val = neg_count / pos_count if pos_count > 0 else 1.0

        # --- Inner Loop: Hyperparameter Tuning ---
        best_inner_cv_score = -np.inf  # validation F1 at the default 0.5 threshold
        best_params_iter = None
        best_model_for_thresholding = None  # only set when the inner search actually ran

        # Chronological split: [ inner train | purged gap | validation ]
        val_size = int(len(X_train_full) * INNER_CV_VALIDATION_PCT)
        sub_size = len(X_train_full) - val_size - purge_rows
        if val_size < min_test_samples or sub_size < min_test_samples:
            print(f"Warning iter {iteration_count + 1}: Train subset or Val set too small for Inner CV. Using first param combo.")
        else:
            X_train_sub = X_train_full.iloc[:sub_size]
            y_train_sub = y_train_full.iloc[:sub_size]
            X_val = X_train_full.iloc[-val_size:]
            y_val = y_train_full.iloc[-val_size:]

            if y_val.nunique() < 2 or y_train_sub.nunique() < 2:
                print(f"Warning iter {iteration_count + 1}: Inner train ({len(y_train_sub.unique())} classes) or val ({len(y_val.unique())} classes) has only one class. Skipping Inner CV params search.")
            else:
                for params_cv in param_candidates:
                    try:
                        current_cv_params = {**XGB_FIXED_PARAMS, **params_cv}
                        # Early stopping is configured on the estimator (XGBoost >= 1.6).
                        # fit() no longer accepts early_stopping_rounds in XGBoost 2.x, and
                        # the except below would turn that TypeError into a silent skip of
                        # every candidate.
                        model_cv = XGBClassifier(**current_cv_params,
                                                 scale_pos_weight=scale_pos_weight_val,
                                                 early_stopping_rounds=early_stopping_rounds)
                        model_cv.fit(X_train_sub, y_train_sub,
                                     eval_set=[(X_val, y_val)],  # monitored for early stopping
                                     verbose=False)

                        # Compare candidates by validation F1 at the default 0.5 threshold.
                        y_pred_val_cv = model_cv.predict(X_val)
                        val_score = f1_score(y_val, y_pred_val_cv, average='binary', pos_label=1, zero_division=0)

                        if val_score > best_inner_cv_score:
                            best_inner_cv_score = val_score
                            best_params_iter = params_cv
                            best_model_for_thresholding = model_cv

                    except Exception as e_cv:
                        print(f"Error during inner CV iter {iteration_count + 1} with params {params_cv}: {e_cv}")

        # Inner search skipped or failed for every candidate: use the first combination.
        if best_params_iter is None:
            best_params_iter = default_params

        # --- Probability Threshold Tuning ---
        best_threshold_iter = 0.5  # default threshold
        best_thresh_f1_score = -np.inf

        # A stored model implies the validation slice exists and contains both classes.
        if best_model_for_thresholding is not None:
            try:
                y_pred_proba_val = best_model_for_thresholding.predict_proba(X_val)[:, 1]

                # Keep the first threshold that reaches the highest validation F1.
                for t in THRESHOLD_SEARCH_RANGE:
                    y_pred_val_t = (y_pred_proba_val >= t).astype(int)
                    current_f1 = f1_score(y_val, y_pred_val_t, average='binary', pos_label=1, zero_division=0)

                    if current_f1 > best_thresh_f1_score:
                        best_thresh_f1_score = current_f1
                        best_threshold_iter = t

            except Exception as e_thresh:
                print(f"Error during threshold tuning iter {iteration_count + 1}: {e_thresh}. Using default threshold 0.5.")
                best_threshold_iter = 0.5
        else:
            print(f"Skipping threshold tuning iter {iteration_count + 1} (validation set issue or model unavailable). Using default 0.5.")

        # Recorded for every attempted fold, including ones whose final fit fails below.
        all_best_thresholds.append(best_threshold_iter)

        # --- Train Final Model for this Iteration ---
        final_iter_params = {**XGB_FIXED_PARAMS, **best_params_iter}

        # Print progress for the first fold, every 20th fold and the last fold.
        if (iteration_count == 0) or ((iteration_count + 1) % 20 == 0) or (test_end_idx >= n_rows_total):
            print(f"\nIter {iteration_count + 1}/{total_iterations_estimate}: "
                  f"Train Indices [{current_train_start_idx}:{purged_train_end_idx-1}], "
                  f"Test Indices [{test_start_idx}:{test_end_idx-1}]")
            print(f"  Best Params: {best_params_iter}")
            print(f"  Best Threshold (F1 on Val): {best_threshold_iter:.2f} (Val F1: {best_thresh_f1_score:.4f})")

        # No early stopping here: the final fit has no evaluation set.
        current_model = XGBClassifier(**final_iter_params,
                                      scale_pos_weight=scale_pos_weight_val)

        try:
            # Train on the full (purged) training window.
            current_model.fit(X_train_full, y_train_full, verbose=False)

            # --- Prediction and Evaluation using Tuned Threshold ---
            y_pred_proba_test = current_model.predict_proba(X_test)[:, 1]
            y_pred = (y_pred_proba_test >= best_threshold_iter).astype(int)

            accuracy = accuracy_score(y_test, y_pred)
            # zero_division=0 scores a fold with no positive predictions/labels as 0.
            precision = precision_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)
            recall = recall_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)
            f1 = f1_score(y_test, y_pred, average='binary', pos_label=1, zero_division=0)

            all_metrics['accuracy'].append(accuracy)
            all_metrics['precision'].append(precision)
            all_metrics['recall'].append(recall)
            all_metrics['f1'].append(f1)

            # --- Store Feature Importances ---
            try:
                fold_importances = current_model.get_booster().get_score(importance_type='gain')
                for feature in CURRENT_FEATURE_COLS:
                    # Features missing from the score dict are recorded as 0.
                    feature_importances_gain[feature].append(fold_importances.get(feature, 0))
            except Exception as e_imp:
                print(f"Warning: Could not get feature importance for iter {iteration_count + 1}: {e_imp}")
                for feature in CURRENT_FEATURE_COLS:
                    feature_importances_gain[feature].append(np.nan)

            iteration_count += 1

        except Exception as model_err:
            print(f"ERROR during model training or prediction in Iter {iteration_count + 1}: {model_err}")
            print(f"  Train shape: {X_train_full.shape}, Test shape: {X_test.shape}")
            print(f"  Train classes: {y_train_full.value_counts().to_dict()}, Test classes: {y_test.value_counts().to_dict()}")
            # Keep the per-fold lists aligned by recording NaN for the failed fold.
            for key in all_metrics:
                all_metrics[key].append(np.nan)
            for feature in CURRENT_FEATURE_COLS:
                feature_importances_gain[feature].append(np.nan)

        # --- Move to Next Window ---
        current_train_start_idx += STEP_ROWS

    end_loop_time = time.time()
    print("-" * 30)
    loop_duration_minutes = (end_loop_time - start_loop_time) / 60
    print(f"Walk-Forward Validation finished in {end_loop_time - start_loop_time:.2f} seconds ({loop_duration_minutes:.2f} minutes).")

    # --- 6. Aggregate and Display Results ---
    print("\n--- 6. Final Results ---")
    # Failed folds are stored as NaN; they are ignored via nanmean/nanstd or
    # filtered out explicitly.
    folds_recorded = iteration_count > 0 and len(all_metrics['f1']) > 0
    valid_f1 = [score for score in all_metrics['f1'] if not pd.isna(score)] if folds_recorded else []

    if not folds_recorded:
        print("\nNo iterations were successfully completed or no metrics were recorded.")
    elif not valid_f1:
        print("\nNo valid metrics recorded (all iterations might have failed).")
    else:
        # (padded label, per-fold values) in display order; F1 uses the NaN-free list.
        metric_rows = [
            ("Accuracy:  ", all_metrics['accuracy']),
            ("Precision: ", all_metrics['precision']),
            ("Recall:    ", all_metrics['recall']),
            ("F1-Score:  ", valid_f1),
        ]

        print("\n--- Average Walk-Forward Validation Results (with Threshold Tuning) ---")
        print(f"Total Folds Evaluated (Successful Iterations): {iteration_count}")
        print(f"Target Threshold: {TARGET_THRESHOLD_PCT}% increase over {TEST_WINDOW_HOURS} hours")
        print(f"Train Window: {TRAIN_WINDOW_HOURS}h, Test Window: {TEST_WINDOW_HOURS}h, Step: {STEP_HOURS}h")
        for label, fold_values in metric_rows:
            print(f"Average {label}{np.nanmean(fold_values):.4f}")

        # Mean and spread of the per-fold tuned probability thresholds
        avg_threshold = np.nanmean(all_best_thresholds)
        std_threshold = np.nanstd(all_best_thresholds)
        print(f"\nAverage Best Threshold Found (on validation set): {avg_threshold:.3f} (StdDev: {std_threshold:.3f})")

        print("\n--- Standard Deviation of Metrics Across Folds ---")
        for label, fold_values in metric_rows:
            print(f"Std Dev {label}{np.nanstd(fold_values):.4f}")

        # --- Display Feature Importances ---
        print("\n--- Average Feature Importances (Gain) ---")
        avg_importances = {}
        for name, gains in feature_importances_gain.items():
            usable_gains = [g for g in gains if not pd.isna(g)]
            # A feature with no usable entries is reported with importance 0.
            avg_importances[name] = np.mean(usable_gains) if usable_gains else 0

        sorted_importances = sorted(avg_importances.items(), key=lambda pair: pair[1], reverse=True)

        print("Top 20 Features:")
        for rank, (feature, importance) in enumerate(sorted_importances[:20], start=1):
            print(f"  {rank}. {feature}: {importance:.4f}")

        # Only the count is printed; the names stay in zero_importance_features.
        zero_importance_features = [name for name, gain in avg_importances.items() if gain == 0]
        print(f"\nFeatures with Zero Average Importance ({len(zero_importance_features)}):")

    print("\nScript finished.")