In [5]:
# B3_Configurable_Minute_SOL_Optuna_V4_TA_DebugNaN.py # <-- Renamed
# Adds TA features, adjusts target, includes NaN diagnostics.

import pandas as pd
import numpy as np
import time
import os
import warnings
import traceback
from datetime import datetime
import xgboost as xgb
import matplotlib.pyplot as plt
import optuna
import pandas_ta as ta

# Modeling Imports
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.exceptions import UndefinedMetricWarning

# --- Suppress Warnings ---
# Silence library warnings so the (already verbose) backtest log stays readable.
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
# NOTE: this category-less filter ignores warnings of every category, so the
# four category-specific filters above are effectively redundant -- and any
# other warning (e.g. deprecations) is hidden as well.
warnings.filterwarnings('ignore')
# Set Optuna's logger verbosity to WARNING (per-trial progress is not printed).
optuna.logging.set_verbosity(optuna.logging.WARNING)

# ==============================================================================
# --- Configuration ---
# ==============================================================================
# Input data: CSV path and the symbol label used in log lines / plot titles.
CSV_FILE_PATH = 'SOL_minagg.csv'
SYMBOL_NAME = 'SOL'

# Target: forward return (in %) over PREDICTION_WINDOW_MINUTES rows; a row is
# labelled positive when that return is >= TARGET_THRESHOLD_PCT.
PREDICTION_WINDOW_MINUTES = int(0.5 * 60)
TARGET_THRESHOLD_PCT = -0.1

# Walk-forward windowing (all sizes are used as row counts by the backtest).
TRAIN_WINDOW_MINUTES = 6 * 60
STEP_MINUTES = 3 * 60
TEST_WINDOW_FRACTION = 0.2

# XGBoost parameters that are NOT tuned by Optuna; tuned ones are merged on top.
XGB_FIXED_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "use_label_encoder": False,
    "random_state": 42,
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "gpu_id": 0,
    "n_jobs": -1,
    "n_estimators": 150,
}

# Optuna search budget and the probability cut-off used to score CV folds.
N_OPTUNA_TRIALS = 50
OPTUNA_CV_SPLITS = 3
OPTUNA_EVAL_THRESHOLD = 0.50

# Grid of probability thresholds scanned after the backtest.
PROBABILITY_THRESHOLD_RANGE = (0.05, 0.95)
PROBABILITY_THRESHOLD_STEP = 0.05

# ==============================================================================
# --- Derived Variables ---
# ==============================================================================
# Test window is a fraction of the train window, but never shorter than one row.
TEST_WINDOW_MINUTES = max(1, int(TEST_WINDOW_FRACTION * TRAIN_WINDOW_MINUTES))
# np.arange semantics: start, stop, step taken from the two settings above.
THRESHOLD_SEARCH_RANGE = np.arange(*PROBABILITY_THRESHOLD_RANGE, PROBABILITY_THRESHOLD_STEP)
# Small constant reused when guarding divisions.
epsilon = 1e-9

# ==============================================================================
# --- Feature Engineering Functions (with initial NaN check) ---
# ==============================================================================
def garman_klass_volatility_min(o, h, l, c, window_min):
    """Rolling Garman-Klass volatility from open/high/low/close Series.

    NOTE: an identical definition of this function appears again further
    down in this file and replaces this one at import time.

    Zero lows/opens are treated as missing; bars whose per-bar term is NaN
    contribute 0. The rolling mean needs at least ``max(1, window_min // 4)``
    bars and is clipped at 0 before the square root is taken.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        hl_log = np.log(h / l.replace(0, np.nan))
        co_log = np.log(c / o.replace(0, np.nan))
    per_bar = (0.5 * (hl_log ** 2) - (2 * np.log(2) - 1) * (co_log ** 2)).fillna(0)
    needed = max(1, window_min // 4)
    windowed_mean = per_bar.rolling(window_min, min_periods=needed).mean()
    return np.sqrt(windowed_mean.clip(lower=0))
def parkinson_volatility_min(h, l, window_min):
    """Rolling Parkinson (high/low range) volatility from high and low Series.

    NOTE: an identical definition of this function appears again further
    down in this file and replaces this one at import time.

    Zero lows are treated as missing and such bars contribute 0 to the
    rolling sum, which needs at least ``max(1, window_min // 4)`` bars.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        sq_log_range = np.log(h / l.replace(0, np.nan)) ** 2
    needed = max(1, window_min // 4)
    window_sum = sq_log_range.fillna(0).rolling(window_min, min_periods=needed).sum()
    if window_min > 0:
        scale = 1 / (4 * np.log(2) * window_min)
    else:
        scale = 0
    return np.sqrt(scale * window_sum)

# ==============================================================================
# --- Feature Engineering Functions (with TA + NaN Fix + Diagnostics) ---
# ==============================================================================
# Small positive constant added to denominators to avoid division by zero
# (used by the close / 24h-MA ratio in calculate_features_min_with_ta).
# NOTE: this repeats the identical module-level assignment made earlier in the
# "Derived Variables" section; the value is the same, so it is harmless.
epsilon = 1e-9

def garman_klass_volatility_min(o, h, l, c, window_min):
    """Return the rolling Garman-Klass volatility of an OHLC bar series.

    Per bar: ``0.5 * ln(h/l)^2 - (2*ln(2) - 1) * ln(c/o)^2``, with zero
    lows/opens treated as missing and NaN terms replaced by 0. The terms are
    averaged over ``window_min`` rows (at least ``max(1, window_min // 4)``
    rows required), clipped at 0, and square-rooted.
    """
    safe_low = l.replace(0, np.nan)
    safe_open = o.replace(0, np.nan)
    with np.errstate(divide='ignore', invalid='ignore'):
        log_range = np.log(h / safe_low)
        log_body = np.log(c / safe_open)
    bar_term = 0.5 * (log_range ** 2) - (2 * np.log(2) - 1) * (log_body ** 2)
    bar_term = bar_term.fillna(0)
    roll = bar_term.rolling(window_min, min_periods=max(1, window_min // 4))
    return np.sqrt(roll.mean().clip(lower=0))

def parkinson_volatility_min(h, l, window_min):
    """Return the rolling Parkinson volatility of a high/low bar series.

    Squared log high/low ranges (zero lows treated as missing, NaN terms set
    to 0) are summed over ``window_min`` rows (at least
    ``max(1, window_min // 4)`` rows required), scaled by
    ``1 / (4 * ln(2) * window_min)`` and square-rooted.
    """
    safe_low = l.replace(0, np.nan)
    with np.errstate(divide='ignore', invalid='ignore'):
        squared = np.log(h / safe_low) ** 2
    squared = squared.fillna(0)
    roll = squared.rolling(window_min, min_periods=max(1, window_min // 4))
    total = roll.sum()
    factor = 1 / (4 * np.log(2) * window_min) if window_min > 0 else 0
    return np.sqrt(factor * total)

def calculate_features_min_with_ta(df_input):
    """
    Calculates minute-bar features including TA indicators (RSI, Stoch, MACD).

    Works on a copy of ``df_input``; the caller's frame is not modified.
    Rows whose essential columns are missing or non-numeric are dropped.
    NaNs produced by rolling/lag warm-up are left in place for the caller.

    Args:
        df_input: DataFrame that must contain 'open', 'high', 'low', 'close'
            and 'volumefrom'. 'volumeto' is optional and is filled with 0
            (with a warning) when absent.

    Returns:
        DataFrame with the input columns plus the engineered features, or an
        EMPTY DataFrame when an essential column is missing or no usable row
        remains (callers are expected to check ``.empty``).
    """
    df = df_input.copy()
    print(f"  Feature Eng Start: Initial rows = {len(df)}")

    essential_cols = ['open', 'high', 'low', 'close', 'volumefrom']

    # Guard: without this, selecting df[essential_cols] below raises a bare
    # KeyError before any of this function's own error reporting can run.
    missing_essential = [col for col in essential_cols if col not in df.columns]
    if missing_essential:
        print(f"  Error: Missing essential column(s): {missing_essential}")
        return df.iloc[0:0]

    # --- Robust Initial NaN Check ---
    initial_nan_check = df[essential_cols].isnull().sum()
    print(f"  Initial NaNs check (essential cols):\n{initial_nan_check[initial_nan_check > 0]}")
    df = df.dropna(subset=essential_cols)
    print(f"  Rows after initial essential NaN drop: {len(df)}")
    if df.empty:
        print("  Error: Empty DF after essential NaN drop.")
        return df
    # --- End Initial Check ---

    # Ensure base columns are numeric (only 'volumeto' can still be missing here).
    base_cols_numeric = ['open', 'high', 'low', 'close', 'volumefrom', 'volumeto']
    for col in base_cols_numeric:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else:
            print(f"  Warning: Missing base column '{col}'")
            df[col] = 0
    # Coercion may have turned unparseable values into NaN; drop those rows too.
    df = df.dropna(subset=essential_cols)
    if df.empty:
        print("  Error: Empty DF after numeric conversion NaN drop.")
        return df

    # --- Feature Calculations ---
    # 1-row return; temporary input for the std/lag features, dropped at the end.
    df['price_change_1m_temp'] = df['close'].pct_change(periods=1)
    with np.errstate(divide='ignore', invalid='ignore'):
        df['price_range_pct'] = (df['high'] - df['low']) / df['close'].replace(0, np.nan) * 100
        df['oc_change_pct'] = (df['close'] - df['open']) / df['open'].replace(0, np.nan) * 100
    df['garman_klass_720m'] = garman_klass_volatility_min(df['open'], df['high'], df['low'], df['close'], 12 * 60)
    df['parkinson_180m'] = parkinson_volatility_min(df['high'], df['low'], 3 * 60)

    min_periods_rolling = 2
    min_periods_180 = max(min_periods_rolling, (3 * 60) // 4)
    df['ma_180m'] = df['close'].rolling(3 * 60, min_periods=min_periods_180).mean()
    df['rolling_std_180m'] = df['price_change_1m_temp'].rolling(3 * 60, min_periods=min_periods_180).std() * 100

    lag_periods_price_min = [5, 15, 30, 60, 120, 240]
    lag_periods_volume_min = [5, 15, 30, 60]
    for lag in lag_periods_price_min:
        df[f'lag_{lag}m_price_return'] = df['price_change_1m_temp'].shift(lag) * 100
    # A previous volume of 0 yields +/-inf; those are mapped to 0 here.
    df['volume_return_1m'] = df['volumefrom'].pct_change(periods=1).replace([np.inf, -np.inf], 0) * 100
    for lag in lag_periods_volume_min:
        df[f'lag_{lag}m_volume_return'] = df['volume_return_1m'].shift(lag)

    # MAs (Reduced longest to 24h = 1440m)
    ma_periods_min = [6 * 60, 12 * 60, 24 * 60]
    min_p_long = 50
    for p in ma_periods_min:
        df[f'ma_{p}m'] = df['close'].rolling(p, min_periods=max(min_p_long, p // 4)).mean()

    # close / 24h MA ratio ('ma_1440m' is always created by the loop above;
    # the former ma4h_div_ma24h ratio stays removed).
    df['close_div_ma24h'] = df['close'] / (df['ma_1440m'].replace(0, epsilon) + epsilon)

    # ATR (Reduced periods): true range from temporary columns, then rolling mean.
    df['prev_close'] = df['close'].shift(1)
    df['hml'] = df['high'] - df['low']
    df['hmpc'] = np.abs(df['high'] - df['prev_close'])
    df['lmpc'] = np.abs(df['low'] - df['prev_close'])
    df['tr'] = df[['hml', 'hmpc', 'lmpc']].max(axis=1)
    atr_periods_min = [14 * 60]
    min_p_atr = 20
    for p in atr_periods_min:
        df[f'atr_{p}m'] = df['tr'].rolling(p, min_periods=max(min_p_atr, p // 4)).mean()
    df = df.drop(columns=['prev_close', 'hml', 'hmpc', 'lmpc', 'tr'], errors='ignore')

    # --- ADDING TA FEATURES ---
    print("  Calculating TA features (RSI, Stoch, MACD)...")
    try:
        min_ta_warmup = 30
        if len(df) < min_ta_warmup:
            print(f"  Warning: Insufficient data ({len(df)} rows) for TA warmup ({min_ta_warmup}). Skipping TA features.")
        else:
            # The .ta accessor is provided by pandas_ta; append=True adds the
            # indicator columns to df in place.
            df.ta.rsi(length=14, append=True)
            df.ta.stoch(k=14, d=3, smooth_k=3, append=True)
            df.ta.macd(fast=12, slow=26, signal=9, append=True)

            # Derived 0/1 flags, created only when the base indicator exists.
            if 'RSI_14' in df.columns:
                df['rsi_14_oversold'] = (df['RSI_14'] < 30).astype(int)
                df['rsi_14_overbought'] = (df['RSI_14'] > 70).astype(int)
            if 'STOCHk_14_3_3' in df.columns:
                df['stoch_k_oversold'] = (df['STOCHk_14_3_3'] < 20).astype(int)
                df['stoch_k_overbought'] = (df['STOCHk_14_3_3'] > 80).astype(int)
            if 'MACDh_12_26_9' in df.columns:
                df['macd_hist_positive'] = (df['MACDh_12_26_9'] > 0).astype(int)

            # Simple RSI divergence proxy: price sets a new low versus the
            # previous window's minimum while RSI stays above its own minimum.
            if 'RSI_14' in df.columns:
                for n_div in [30, 60]:
                    if len(df) > n_div:
                        min_price_n = df['low'].rolling(window=n_div, min_periods=n_div // 2).min()
                        min_rsi_n = df['RSI_14'].rolling(window=n_div, min_periods=n_div // 2).min()
                        price_lower_low = df['low'] < min_price_n.shift(1)
                        rsi_higher_low = df['RSI_14'] > min_rsi_n.shift(1)
                        df[f'rsi_bull_div_{n_div}m'] = (price_lower_low & rsi_higher_low).astype(int)
                    else:
                        df[f'rsi_bull_div_{n_div}m'] = 0

    except Exception as e_ta:
        # Deliberately best-effort: TA features are optional. A failure part-way
        # through can leave only some of the TA columns present.
        print(f"!! Error calculating TA features: {e_ta}")

    # --- Cleanup Intermediate ---
    cols_to_drop_intermediate = ['price_change_1m_temp', 'volume_return_1m']
    df = df.drop(columns=[col for col in cols_to_drop_intermediate if col in df.columns], errors='ignore')

    # Final check for infinities (e.g. from returns computed against a 0 close).
    numeric_cols = df.select_dtypes(include=np.number).columns
    if df[numeric_cols].isin([np.inf, -np.inf]).any().any():
        print("  Warning: Infinities detected after feature calculation. Replacing with NaN.")
        df = df.replace([np.inf, -np.inf], np.nan)

    print(f"  Feature Eng End: Total columns = {df.shape[1]}, Rows = {len(df)}")
    return df

# ==============================================================================
# --- Optuna Objective Function (Unchanged)---
# ==============================================================================
def objective(trial, X, y, fixed_params, cv_strategy):
    """Optuna objective: mean cross-validated F1 of an XGBoost classifier.

    Args:
        trial: Optuna trial used to sample hyperparameters, report per-fold
            scores and query pruning.
        X: Feature matrix (DataFrame or array-like).
        y: Binary target (Series or array-like), aligned with ``X``.
        fixed_params: XGBoost parameters that are not tuned; sampled
            parameters are merged on top of them.
        cv_strategy: Splitter exposing ``split(X, y)`` that yields
            (train_idx, val_idx) index arrays.

    Returns:
        Mean F1 over the folds, scored at ``OPTUNA_EVAL_THRESHOLD``. A fold
        whose training part has a single class scores 0.0. Any non-pruning
        error makes the whole trial return 0.0.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner asks to stop the trial.
    """
    param = {
        "max_depth":        trial.suggest_int("max_depth", 5, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
        "reg_alpha":        trial.suggest_float("reg_alpha", 1e-3, 0.5, log=True),
        "reg_lambda":       trial.suggest_float("reg_lambda", 1.0, 10.0, log=True),
        "gamma":            trial.suggest_float("gamma", 0, 0.5),
        "subsample":        trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "learning_rate":    trial.suggest_float("learning_rate", 0.03, 0.15, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 5.0),
    }
    xgb_params = {**fixed_params, **param}
    cv_scores = []
    # Pre-bind so the error message below cannot itself fail with an unbound
    # name when the exception happens before the first fold starts.
    fold = None
    try:
        y_np = y.to_numpy() if isinstance(y, pd.Series) else np.array(y)
        X_np = X.to_numpy() if isinstance(X, pd.DataFrame) else np.array(X)
        for fold, (train_idx, val_idx) in enumerate(cv_strategy.split(X_np, y_np)):
            X_train_fold, X_val_fold = X_np[train_idx], X_np[val_idx]
            y_train_fold, y_val_fold = y_np[train_idx], y_np[val_idx]
            # A single-class training fold cannot fit a classifier: score it 0.
            if len(np.unique(y_train_fold)) < 2:
                cv_scores.append(0.0)
                continue
            model = xgb.XGBClassifier(**xgb_params)
            model.fit(X_train_fold, y_train_fold, verbose=False)
            preds_proba = model.predict_proba(X_val_fold)[:, 1]
            preds_binary = (preds_proba >= OPTUNA_EVAL_THRESHOLD).astype(int)
            f1 = f1_score(y_val_fold, preds_binary, zero_division=0)
            cv_scores.append(f1)
            # Report the fold score so the study's pruner can stop weak trials.
            trial.report(f1, fold)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
        average_f1 = np.mean(cv_scores) if cv_scores else 0.0
    except optuna.exceptions.TrialPruned:
        raise
    except Exception as e:
        # Best-effort: a failed trial is scored 0.0 instead of aborting the study.
        print(f"Error in Optuna trial {trial.number}, fold {fold}: {e}")
        return 0.0
    return average_f1 if not np.isnan(average_f1) else 0.0

# ==============================================================================
# --- Main Script Logic ---
# ==============================================================================

overall_start_time = time.time()

# --- 1. Load Data ---
# Read the CSV, parse 'timestamp' as datetime and order rows chronologically.
print("--- Data Loading ---")
print(f"Loading data from: {CSV_FILE_PATH}...")
try:
    df_data = (
        pd.read_csv(CSV_FILE_PATH, parse_dates=['timestamp'])
        .sort_values(by='timestamp', ascending=True)
        .reset_index(drop=True)
    )
    print(f"Loaded {len(df_data)} rows for {SYMBOL_NAME}.")
except Exception as e:
    print(f"Error loading data: {e}")
    exit()

# Heuristic size check (warning only).
if len(df_data) < 3000:
    print(f"Warning: Initial data size ({len(df_data)}) might be small for long features/windows.")

# Hard check: rough minimum number of rows needed for one backtest step.
min_data_needed = TRAIN_WINDOW_MINUTES + TEST_WINDOW_MINUTES + STEP_MINUTES
if len(df_data) < min_data_needed:
    print(f"Error: Insufficient initial data ({len(df_data)} < {min_data_needed} needed).")
    exit()


# --- 2. Feature Engineering ---
# Replace the raw frame with the feature-augmented one; abort if nothing is left.
print(f"\n--- Feature Engineering for {SYMBOL_NAME} ---")
start_fe = time.time()
df_data = calculate_features_min_with_ta(df_data)
if df_data.empty:
    print(f"Error: Feature calculation resulted in empty DataFrame.")
    exit()
print(f"Feature engineering complete. Took {time.time() - start_fe:.2f}s.")

# --- 3. Define Target Variable ---
# Target column = percentage change of 'close' PREDICTION_WINDOW_MINUTES rows
# ahead; the last rows have no future close and therefore hold NaN.
print("\n--- Target Definition ---")
print(f"Defining target: {PREDICTION_WINDOW_MINUTES}m return >= {TARGET_THRESHOLD_PCT}%...")
target_col = f'target_return_{PREDICTION_WINDOW_MINUTES}m'
df_data[target_col] = (
    (df_data['close'].shift(-PREDICTION_WINDOW_MINUTES) - df_data['close'])
    / df_data['close'].replace(0, np.nan)
    * 100
)
# Share of rows that would be labelled positive (NaN targets count as negative).
target_occurrence = df_data[target_col].notna() & (df_data[target_col] >= TARGET_THRESHOLD_PCT)
print(f"  Raw positive target occurrence (before NaN drop): {target_occurrence.mean()*100:.2f}%")

# --- 4. Prepare Data for Modeling ---
print("\n--- Data Preparation ---")
base_cols_ohlcv = ['open', 'high', 'low', 'close', 'volumefrom', 'volumeto']
cols_to_keep_final = ['timestamp', target_col]
# Feature candidates: every column except timestamp/target, the raw OHLCV
# columns, and the STOCHd*/MACDs* TA columns.
potential_feature_cols = [
    col for col in df_data.columns
    if (col not in cols_to_keep_final + base_cols_ohlcv)
    and not col.startswith(('STOCHd', 'MACDs'))
]
numeric_feature_cols = df_data[potential_feature_cols].select_dtypes(include=np.number).columns.tolist()
final_feature_cols = numeric_feature_cols
if not final_feature_cols:
    print("Error: No numeric features found after selection.")
    exit()

cols_to_select = final_feature_cols + [col for col in cols_to_keep_final if col in df_data.columns]
df_model_ready = df_data[cols_to_select].copy()

# --- DIAGNOSE NANS ---
# Show which columns still contain NaNs (and how many) before rows are dropped.
print("\n--- NaN Counts Before Final Drop ---")
nan_counts = df_model_ready[final_feature_cols + [target_col]].isna().sum()
print(nan_counts.loc[nan_counts > 0].sort_values(ascending=False))
print(f"Total rows before final drop: {len(df_model_ready)}")
# --- END DIAGNOSIS ---

print("\nApplying final NaN/Inf Handling...")
initial_rows = len(df_model_ready)
# Keep only rows that are complete in every feature AND in the target.
df_model_ready = df_model_ready.dropna(subset=final_feature_cols + [target_col])
final_rows = len(df_model_ready)
print(f"NaN Handling: Dropped {initial_rows - final_rows} rows with remaining NaNs.")

# Final Inf check: convert any remaining +/-inf to NaN and drop those rows too.
if df_model_ready[final_feature_cols].isin([np.inf, -np.inf]).any().any():
    inf_count = df_model_ready[final_feature_cols].isin([np.inf, -np.inf]).sum().sum()
    print(f"Replacing {inf_count} final infinites...")
    df_model_ready = df_model_ready.replace([np.inf, -np.inf], np.nan)
    rows_b4 = len(df_model_ready)
    df_model_ready = df_model_ready.dropna(subset=final_feature_cols)
    print(f"Dropped {rows_b4 - len(df_model_ready)} more rows after Inf handling.")

if df_model_ready.empty:
    print(f"Error: DataFrame empty after final NaN/Inf handling.")
    exit()

# Model inputs: feature matrix, binary label and the matching timestamps.
X = df_model_ready[final_feature_cols]
y_binary = (df_model_ready[target_col] >= TARGET_THRESHOLD_PCT).astype(int)
timestamps = df_model_ready['timestamp']

print(f"\nFinal feature matrix shape: {X.shape}, Target shape: {y_binary.shape}")
print(f"Using {len(final_feature_cols)} features.")
# print(f"Final Feature List: {final_feature_cols}") # Uncomment to verify features

# --- 5. SLIDING Window Backtesting ---
# Walk-forward evaluation over X / y_binary / timestamps: at every step the
# hyperparameters are tuned with Optuna on the training window, a final model
# is fitted on that window, and probabilities are predicted for the test
# window that starts right after it.

print(f"\n--- Starting SLIDING Window Backtest for {SYMBOL_NAME} ---")
print(f"!!! Using Optuna (TimeSeriesSplit CV) + TA Features !!!")

if len(X) < TRAIN_WINDOW_MINUTES + STEP_MINUTES:
    print(f"Error: Not enough data after pre-processing ({len(X)} rows) for train window ({TRAIN_WINDOW_MINUTES}) + step ({STEP_MINUTES}).")
    exit()

all_predictions_proba = []
all_actual = []
backtest_timestamps = []
all_best_params = []
num_steps = 0
start_index_loop = TRAIN_WINDOW_MINUTES  # Start after the first full training window
end_index_loop = len(X) - TEST_WINDOW_MINUTES + 1  # Ensure space for the last test window

print(f"Train Window: {TRAIN_WINDOW_MINUTES}m, Step: {STEP_MINUTES}m, Test Window: {TEST_WINDOW_MINUTES}m, Optuna Trials: {N_OPTUNA_TRIALS}, CV Splits: {OPTUNA_CV_SPLITS}")
loop_start_time = time.time()

for i in range(start_index_loop, end_index_loop, STEP_MINUTES):
    step_start_time = time.time()
    train_idx_start = i - TRAIN_WINDOW_MINUTES
    # Purge look-ahead leakage: the label of a row is built from the close
    # PREDICTION_WINDOW_MINUTES rows later, so the last PREDICTION_WINDOW_MINUTES
    # rows before i carry labels derived from prices inside the test window.
    # Those labels are not knowable at prediction time and must not be trained on.
    # (Assumes consecutive rows are consecutive bars -- TODO confirm for gappy data.)
    train_idx_end = max(train_idx_start, i - PREDICTION_WINDOW_MINUTES)
    test_idx_start = i
    test_idx_end = min(i + TEST_WINDOW_MINUTES, len(X))

    if test_idx_start >= test_idx_end:
        print(f"Stopping loop: Test window invalid.")
        break

    X_train_roll = X.iloc[train_idx_start:train_idx_end]
    y_train_roll = y_binary.iloc[train_idx_start:train_idx_end]
    X_test_roll = X.iloc[test_idx_start:test_idx_end]
    y_test_roll_actual_series = y_binary.iloc[test_idx_start:test_idx_end]
    step_timestamps = timestamps.iloc[test_idx_start:test_idx_end]

    if y_test_roll_actual_series.empty:
        print(f"Warning: Step {i}, empty test actuals.")
        continue
    current_timestamp = step_timestamps.iloc[0]
    # Also covers a training window emptied by the purge above.
    if X_train_roll.empty or len(np.unique(y_train_roll)) < 2:
        print(f"Warning: Step {i}, invalid training data.")
        continue

    print(f"\n--- Step {num_steps + 1} ({current_timestamp}) ---")
    print(f"  Train: [{train_idx_start}:{train_idx_end-1}]; Test: [{test_idx_start}:{test_idx_end-1}]")

    # --- Hyperparameter Tuning with Optuna ---
    print(f"  Running Optuna ({N_OPTUNA_TRIALS} trials, cv={OPTUNA_CV_SPLITS} TimeSeriesSplit, scoring F1@{OPTUNA_EVAL_THRESHOLD})...")
    optuna_start_time = time.time()
    try:
        # NOTE(review): the inner CV folds are adjacent (no gap), so fold labels
        # can overlap the next fold's prices; consider TimeSeriesSplit's `gap`
        # argument if the installed scikit-learn supports it -- confirm.
        cv_strategy = TimeSeriesSplit(n_splits=OPTUNA_CV_SPLITS)
        # NOTE(review): objective() reports one value per CV fold (steps
        # 0..OPTUNA_CV_SPLITS-1); with n_warmup_steps=5 the pruner presumably
        # never fires -- confirm against Optuna's MedianPruner documentation.
        pruner = optuna.pruners.MedianPruner(n_warmup_steps=5, n_min_trials=10)
        sampler = optuna.samplers.TPESampler(seed=i)  # Seed sampler based on step index
        study = optuna.create_study(direction='maximize', pruner=pruner, sampler=sampler)
        obj_func = lambda trial: objective(trial, X_train_roll, y_train_roll, XGB_FIXED_PARAMS, cv_strategy)

        study.optimize(obj_func, n_trials=N_OPTUNA_TRIALS, n_jobs=1, show_progress_bar=False)

        best_params_step = study.best_params
        best_score_step = study.best_value

        print(f"  Optuna finished in {time.time() - optuna_start_time:.2f}s.")
        print(f"  Best Params: {best_params_step}, Best CV F1(@{OPTUNA_EVAL_THRESHOLD}): {best_score_step:.4f}")
        all_best_params.append({'step': num_steps + 1, 'params': best_params_step, 'cv_f1': best_score_step, 'timestamp': current_timestamp})

        # --- Fit final model for the step (tuned params override fixed ones) ---
        final_model_params = {**XGB_FIXED_PARAMS, **best_params_step}
        model_roll = xgb.XGBClassifier(**final_model_params)
        model_roll.fit(X_train_roll, y_train_roll, verbose=False)

        # --- Predict probabilities of the positive class for the test window ---
        prob_roll_window = model_roll.predict_proba(X_test_roll)[:, 1]

        # --- Store results (the three lists stay aligned element by element) ---
        all_predictions_proba.extend(prob_roll_window)
        all_actual.extend(y_test_roll_actual_series.tolist())
        backtest_timestamps.extend(step_timestamps.tolist())
        num_steps += 1

    except ValueError as ve:
        print(f"!! Value Error at step {i}: {ve}")
        continue
    except Exception as e_step:
        # Best-effort: one failing step must not abort the whole backtest.
        print(f"!! Error at step {i}: {e_step}")
        traceback.print_exc()
        continue

    step_end_time = time.time()
    print(f"  Step {num_steps} finished in {step_end_time - step_start_time:.2f}s total.")

loop_end_time = time.time()
print(f"\nBacktesting loop finished. Completed {num_steps} steps in {(loop_end_time - loop_start_time)/60:.2f} minutes.")

# --- 6. Evaluate Backtesting Results with PTT ---
# Scan the probability-threshold grid over all collected predictions, keep the
# threshold with the best F1 (ties go to the one closest to 0.5), report the
# metrics and plot cumulative / rolling accuracy.
# NOTE(review): the threshold is selected on the same predictions it is then
# scored on, so the "optimized" metrics are optimistic.
if num_steps > 0 and len(all_predictions_proba) == len(all_actual) and len(all_predictions_proba) == len(backtest_timestamps):
    print(f"\n--- Evaluating Results for {SYMBOL_NAME} with Probability Threshold Tuning ---")
    print(f"Threshold search range: {THRESHOLD_SEARCH_RANGE}")
    best_threshold = 0.5
    best_f1_thresh = -1.0
    results_per_threshold = {}
    probabilities_np = np.array(all_predictions_proba)
    actual_np = np.array(all_actual)

    for t in THRESHOLD_SEARCH_RANGE:
        predictions_thresh = (probabilities_np >= t).astype(int)
        # Degenerate cases are scored explicitly instead of via sklearn.
        if np.sum(actual_np) == 0 and np.sum(predictions_thresh) == 0:
            acc_t, pre_t, rec_t, f1_t = 1.0, 1.0, 1.0, 1.0
        elif np.sum(actual_np) > 0 and np.sum(predictions_thresh) == 0:
            acc_t = accuracy_score(actual_np, predictions_thresh)
            pre_t, rec_t, f1_t = 0.0, 0.0, 0.0
        elif np.sum(actual_np) == 0 and np.sum(predictions_thresh) > 0:
            acc_t = accuracy_score(actual_np, predictions_thresh)
            pre_t, rec_t, f1_t = 0.0, 0.0, 0.0
        else:
            acc_t = accuracy_score(actual_np, predictions_thresh)
            pre_t = precision_score(actual_np, predictions_thresh, zero_division=0)
            rec_t = recall_score(actual_np, predictions_thresh, zero_division=0)
            f1_t = f1_score(actual_np, predictions_thresh, zero_division=0)
        # Keys are rounded so that lookups such as 0.5 hit despite float steps.
        results_per_threshold[round(t, 2)] = {'f1': f1_t, 'acc': acc_t, 'pre': pre_t, 'rec': rec_t}
        # Strictly better F1 wins; on an exact tie prefer the threshold nearer 0.5.
        if f1_t >= best_f1_thresh:
            if f1_t > best_f1_thresh or abs(t - 0.5) < abs(best_threshold - 0.5):
                best_f1_thresh = f1_t
                best_threshold = t

    print(f"\nBest Threshold for {SYMBOL_NAME} found: {best_threshold:.2f} (Yielding F1 Score: {best_f1_thresh:.4f})")
    final_predictions_optimized = (probabilities_np >= best_threshold).astype(int)
    final_accuracy = accuracy_score(actual_np, final_predictions_optimized)
    final_precision = precision_score(actual_np, final_predictions_optimized, zero_division=0)
    final_recall = recall_score(actual_np, final_predictions_optimized, zero_division=0)
    final_f1 = f1_score(actual_np, final_predictions_optimized, zero_division=0)

    print(f"\n--- Final Performance Metrics for {SYMBOL_NAME} (Optimized Threshold) ---")
    print(f"Target: {PREDICTION_WINDOW_MINUTES}m return >= {TARGET_THRESHOLD_PCT}%")
    print(f"Windowing: Train={TRAIN_WINDOW_MINUTES}m, Step={STEP_MINUTES}m, Test Window={TEST_WINDOW_MINUTES}m")
    print(f"Hyperparameter Tuning: Optuna ({N_OPTUNA_TRIALS} trials/step, TimeSeriesSplit CV, F1@{OPTUNA_EVAL_THRESHOLD})")
    print(f"Total Individual Predictions Evaluated: {len(actual_np)}")
    print(f"Positive Target Occurrence (final eval set): {actual_np.mean()*100:.2f}%")
    print(f"Overall Accuracy:  {final_accuracy:.4f}")
    print(f"Overall Precision: {final_precision:.4f}")
    print(f"Overall Recall:    {final_recall:.4f}")
    print(f"Overall F1 Score:  {final_f1:.4f}")
    # Reference line: metrics at the threshold Optuna scored with (else at 0.5).
    if OPTUNA_EVAL_THRESHOLD in results_per_threshold:
        res_eval = results_per_threshold[OPTUNA_EVAL_THRESHOLD]
        print(f"(Compare: Optuna Eval {OPTUNA_EVAL_THRESHOLD} Thresh -> F1:{res_eval['f1']:.4f}, Acc:{res_eval['acc']:.4f}, Pre:{res_eval['pre']:.4f}, Rec:{res_eval['rec']:.4f})")
    elif 0.5 in results_per_threshold:
        res_def = results_per_threshold[0.5]
        print(f"(Compare: Default 0.5 Thresh -> F1:{res_def['f1']:.4f}, Acc:{res_def['acc']:.4f}, Pre:{res_def['pre']:.4f}, Rec:{res_def['rec']:.4f})")

    # Everything needed for later inspection, kept in one dict.
    results_summary = {
        'symbol': SYMBOL_NAME,
        'probabilities': probabilities_np,
        'actuals': actual_np,
        'timestamps': backtest_timestamps,
        'best_threshold': best_threshold,
        'metrics_optimized': {'acc': final_accuracy, 'pre': final_precision, 'rec': final_recall, 'f1': final_f1},
        'metrics_default_0.5': results_per_threshold.get(0.5, {}),
        'best_params_per_step': all_best_params,
        'results_per_threshold': results_per_threshold,
    }

    # --- 7. Plot Cumulative Accuracy ---
    print(f"\nPlotting cumulative accuracy for {SYMBOL_NAME} (optimized threshold)...")
    try:
        cumulative_accuracy_list_optimized = (np.cumsum(final_predictions_optimized == actual_np) / np.arange(1, len(actual_np) + 1))
        plt.figure(figsize=(14, 7))
        plt.plot(backtest_timestamps, cumulative_accuracy_list_optimized, marker='.', linestyle='-', markersize=1, alpha=0.7, label=f'Cumulative Accuracy ({SYMBOL_NAME})')
        # Rolling accuracy is only drawn when there are more predictions than the window.
        rolling_window_plot_size = max(TEST_WINDOW_MINUTES * 5, 60 * 12)
        if len(actual_np) > rolling_window_plot_size:
            results_df = pd.DataFrame({'correct': (final_predictions_optimized == actual_np).astype(int)}, index=pd.to_datetime(backtest_timestamps))
            try:
                rolling_acc = results_df['correct'].rolling(window=rolling_window_plot_size, min_periods=rolling_window_plot_size//2).mean()
            except Exception as e_roll:
                print(f"Could not calculate rolling accuracy: {e_roll}")
                rolling_acc = None
            if rolling_acc is not None:
                plt.plot(rolling_acc.index, rolling_acc, linestyle='--', color='red', label=f'Rolling Acc ({rolling_window_plot_size} min window)')
        plt.title(f'{SYMBOL_NAME} Backtest (Optuna TSS CV + TA Features, Train:{TRAIN_WINDOW_MINUTES}m) - Best Thresh: {best_threshold:.2f}')
        plt.xlabel('Timestamp')
        plt.ylabel('Accuracy')
        # Y-limits: data range padded by 0.05, clamped to [0, 1], at least 0.1 tall.
        min_y_plot = max(0.0, np.min(cumulative_accuracy_list_optimized)-0.05 if len(cumulative_accuracy_list_optimized)>0 else 0.4)
        max_y_plot = min(1.0, np.max(cumulative_accuracy_list_optimized)+0.05 if len(cumulative_accuracy_list_optimized)>0 else 0.8)
        if max_y_plot - min_y_plot < 0.1:
            mid_point = (max_y_plot+min_y_plot)/2
            min_y_plot = max(0.0, mid_point-0.05)
            max_y_plot = min(1.0, mid_point+0.05)
        plt.ylim(min_y_plot, max_y_plot)
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.legend()
        plt.xticks(rotation=30, ha='right')
        plt.tight_layout()
        # Filename is derived from the configured horizon; it used to hard-code
        # "2h_target", which no longer matched PREDICTION_WINDOW_MINUTES.
        plot_filename = f"backtest_accuracy_{SYMBOL_NAME}_{PREDICTION_WINDOW_MINUTES}m_target_Optuna_TSS_TA.png"
        plt.savefig(plot_filename)
        print(f"Saved accuracy plot to {plot_filename}")
        plt.close()
    except Exception as e_plot:
        # Plotting is optional; report the problem and carry on.
        print(f"Error plotting: {e_plot}")

elif num_steps == 0:
    print(f"No backtesting steps completed.")
else:
    print(f"Error: Length mismatch in results arrays. Cannot evaluate.")

# --- End of Script ---
# Closing banner followed by the total wall-clock runtime in minutes.
print("\n" + "=" * 30 + f" Overall Script Finished for {SYMBOL_NAME} " + "=" * 30)
overall_end_time = time.time()
print(f"Total execution time: {(overall_end_time - overall_start_time) / 60:.2f} minutes.")

--- Data Loading ---
Loading data from: SOL_minagg.csv...
Loaded 22582 rows for SOL.

--- Feature Engineering for SOL ---
  Feature Eng Start: Initial rows = 22582
  Initial NaNs check (essential cols):
Series([], dtype: int64)
  Rows after initial essential NaN drop: 22582
  Calculating TA features (RSI, Stoch, MACD)...
  Feature Eng End: Total columns = 41, Rows = 22582
Feature engineering complete. Took 0.06s.

--- Target Definition ---
Defining target: 30m return >= -0.1%...
  Raw positive target occurrence (before NaN drop): 57.66%

--- Data Preparation ---

--- NaN Counts Before Final Drop ---
close_div_ma24h          359
ma_1440m                 359
lag_240m_price_return    241
atr_840m                 209
garman_klass_720m        179
ma_720m                  179
lag_120m_price_return    121
ma_360m                   89
lag_60m_price_return      61
lag_60m_volume_return     61
rolling_std_180m          45
ma_180m                   44
parkinson_180m            44
MACDh_12_26_9   

[W 2025-04-18 15:48:43,508] Trial 46 failed with parameters: {'max_depth': 9, 'min_child_weight': 3, 'reg_alpha': 0.030494571003714478, 'reg_lambda': 2.633715754574309, 'gamma': 0.19958191759985042, 'subsample': 0.8072583968814686, 'colsample_bytree': 0.7781758716355935, 'learning_rate': 0.03526870305067796, 'scale_pos_weight': 1.6029533048512334} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\mason\AppData\Roaming\Python\Python312\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\mason\AppData\Local\Temp\ipykernel_2496\48757139.py", line 349, in <lambda>
    obj_func = lambda trial: objective(trial, X_train_roll, y_train_roll, XGB_FIXED_PARAMS, cv_strategy)
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mason\AppData\Local\Temp\ipykernel_2496\48757139.py"

KeyboardInterrupt: 

In [4]:
# Notebook cell: display the column labels of whatever feature matrix `X` is
# currently held by the kernel.
X.columns

Index(['open', 'high', 'low', 'close', 'Volume BTC', 'Volume USD',
       'price_range_pct', 'oc_change_pct', 'garman_klass_12h', 'parkinson_3h',
       'ma_3h', 'rolling_std_3h', 'lag_3h_price_return', 'lag_6h_price_return',
       'lag_12h_price_return', 'lag_24h_price_return', 'lag_48h_price_return',
       'lag_72h_price_return', 'lag_168h_price_return', 'volume_return_1h',
       'lag_3h_volume_return', 'lag_6h_volume_return', 'lag_12h_volume_return',
       'lag_24h_volume_return', 'ma_6h', 'ma_12h', 'ma_24h', 'ma_48h',
       'ma_72h', 'ma_168h', 'rolling_std_6h', 'rolling_std_12h',
       'rolling_std_24h', 'rolling_std_48h', 'rolling_std_72h',
       'rolling_std_168h', 'atr_14h', 'atr_24h', 'atr_48h', 'close_div_ma_24h',
       'close_div_ma_48h', 'close_div_ma_168h', 'ma12_div_ma48',
       'ma24_div_ma168', 'std12_div_std72', 'volume_btc_x_range',
       'rolling_std_3h_sq', 'price_return_1h_sq', 'rolling_std_12h_sqrt'],
      dtype='object')