In [1]:
import pandas as pd
import numpy as np
import time
import os
import warnings
from datetime import datetime
import pandas_ta as ta # Make sure this is imported if needed by calculate_selected_features
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# --- Suppress Warnings ---
# The three targeted filters are registered first and the catch-all last,
# which is the same registration order as before.
for _warning_category in (FutureWarning, UserWarning, pd.errors.PerformanceWarning):
    warnings.filterwarnings('ignore', category=_warning_category)
del _warning_category  # keep the loop variable out of the module namespace
warnings.filterwarnings('ignore')  # General suppression

# --- Configuration ---

# Data Loading
# Raw string so the backslashes of the Windows path are not read as escapes.
CSV_FILE_PATH = r'C:\Users\mason\AVP\BTCUSDrec.csv'
# Symbol represented in the CSV; written to the 'symbol' column of the output.
SYMBOL_NAME = 'BTCUSD'

# Feature Selection List (the 123 candidate features the VIF filter starts FROM).
# This list should ideally be identical to the one used in the main script's
# config. Entries are kept in their original order, one per line, grouped by
# the family of their leading term.
SELECTED_FEATURE_NAMES_INPUT = [
    # Raw volume
    'Volume BTC_x_rolling_std_168h',
    'Volume BTC_x_std12_div_std72',
    'Volume USD',
    # CCI (20h)
    'cci_20h_sq',
    'cci_20h_x_Volume BTC',
    'cci_20h_x_cmf_20h',
    'cci_20h_x_lag_24h_volume_return',
    'cci_20h_x_lag_3h_volume_return',
    'cci_20h_x_rolling_kurt_24h',
    'cci_20h_x_rolling_std_168h',
    'cci_20h_x_rolling_std_48h',
    'cci_20h_x_std12_div_std72',
    'cci_20h_x_volume_div_ma_24h',
    'cci_20h_x_volume_ma_168h',
    # Close position within the bar range
    'close_pos_in_range',
    # CMF (20h)
    'cmf_20h',
    'cmf_20h_x_bband_width_20h',
    'cmf_20h_x_rolling_kurt_24h',
    'cmf_20h_x_rolling_std_168h',
    'cmf_20h_x_rolling_std_6h',
    'cmf_20h_x_std12_div_std72',
    # Day-of-week indicators
    'day_0',
    'day_1',
    'day_2',
    'day_4',
    'day_5',
    'day_6',
    # Hour-of-day indicators
    'hour_0',
    'hour_1',
    'hour_10',
    'hour_11',
    'hour_12',
    'hour_13',
    'hour_14',
    'hour_15',
    'hour_16',
    'hour_17',
    'hour_18',
    'hour_19',
    'hour_2',
    'hour_20',
    'hour_21',
    'hour_22',
    'hour_23',
    'hour_3',
    'hour_4',
    'hour_5',
    'hour_6',
    'hour_8',
    'hour_9',
    # 12h lags
    'lag_12h_price_return_sq',
    'lag_12h_price_return_x_cmf_20h',
    'lag_12h_price_return_x_rolling_kurt_24h',
    'lag_12h_price_return_x_rolling_std_168h',
    'lag_12h_price_return_x_volume_ma_168h',
    'lag_12h_volume_return',
    # 168h lags
    'lag_168h_price_return_sq',
    'lag_168h_price_return_x_Volume BTC',
    'lag_168h_price_return_x_bband_width_20h',
    'lag_168h_price_return_x_cmf_20h',
    'lag_168h_price_return_x_rolling_kurt_24h',
    'lag_168h_price_return_x_rolling_std_168h',
    'lag_168h_price_return_x_rolling_std_6h',
    'lag_168h_price_return_x_std12_div_std72',
    'lag_168h_price_return_x_volume_div_ma_24h',
    'lag_168h_price_return_x_volume_ma_168h',
    # 24h lags
    'lag_24h_price_return_sq',
    'lag_24h_price_return_x_lag_12h_volume_return',
    'lag_24h_price_return_x_lag_6h_volume_return',
    'lag_24h_price_return_x_rolling_std_168h',
    'lag_24h_price_return_x_volume_ma_168h',
    'lag_24h_volume_return_x_rolling_std_168h',
    'lag_24h_volume_return_x_std12_div_std72',
    # 3h lags
    'lag_3h_volume_return_x_rolling_kurt_24h',
    # 48h lags
    'lag_48h_price_return_sq',
    'lag_48h_price_return_x_lag_24h_volume_return',
    'lag_48h_price_return_x_rolling_std_168h',
    'lag_48h_price_return_x_volume_div_ma_24h',
    'lag_48h_price_return_x_volume_ma_168h',
    # 6h lags
    'lag_6h_volume_return_x_rolling_kurt_24h',
    # 72h lags
    'lag_72h_price_return_sq',
    'lag_72h_price_return_x_cmf_20h',
    'lag_72h_price_return_x_lag_12h_volume_return',
    'lag_72h_price_return_x_lag_3h_volume_return',
    'lag_72h_price_return_x_lag_6h_volume_return',
    'lag_72h_price_return_x_rolling_kurt_24h',
    'lag_72h_price_return_x_rolling_std_168h',
    'lag_72h_price_return_x_volume_ma_168h',
    # MACD histogram
    'macd_hist_sq',
    'macd_hist_x_Volume BTC',
    'macd_hist_x_cmf_20h',
    'macd_hist_x_rolling_kurt_24h',
    'macd_hist_x_rolling_std_6h',
    'macd_hist_x_volume_div_ma_24h',
    'macd_hist_x_volume_ma_12h',
    # MACD signal
    'macd_signal_sq',
    'macd_signal_x_cmf_20h',
    'macd_signal_x_lag_24h_volume_return',
    'macd_signal_x_rolling_kurt_24h',
    'macd_signal_x_rolling_std_48h',
    'macd_signal_x_rolling_std_6h',
    'macd_signal_x_std12_div_std72',
    'macd_signal_x_volume_div_ma_24h',
    'macd_signal_x_volume_ma_12h',
    # Rolling statistics
    'rolling_kurt_24h',
    'rolling_skew_24h',
    'rolling_std_168h',
    'rolling_std_3h_sq',
    'rolling_std_6h_div_rolling_std_48h',
    'std12_div_std72',
    # Volume-derived
    'volume_btc_x_range',
    'volume_btc_x_range_log1p',
    'volume_div_ma_24h_sq',
    'volume_div_ma_24h_x_rolling_kurt_24h',
    'volume_div_ma_24h_x_rolling_std_168h',
    'volume_ma_12h_x_bband_width_20h',
    'volume_ma_12h_x_rolling_kurt_24h',
    'volume_ma_12h_x_rolling_std_6h',
    'volume_ma_168h_x_rolling_kurt_24h',
    'volume_ma_168h_x_rolling_std_48h',
    'volume_ma_168h_x_std12_div_std72',
    'volume_return_1h',
    'volume_return_1h_x_rolling_kurt_24h',
]

# VIF Threshold: the filtering loop drops the highest-VIF feature until none
# exceeds this value.
VIF_THRESHOLD = 1.73

# --- Feature Engineering Function ---
# --- Must be defined before it is called in the __main__ block below ---
def calculate_selected_features(df, symbol, selected_features=None):
    """Compute the requested engineered features from OHLCV rows.

    Rolling windows and lags are counted in rows; the feature names label
    those row counts as hours.

    Args:
        df: DataFrame with a 'timestamp' column and 'open', 'high', 'low',
            'close' columns. 'Volume BTC' (or 'volume_btc') and 'Volume USD'
            (or 'volume_usd') are used when present and zero-filled otherwise.
            The input is copied, never modified.
        symbol: Value written to the 'symbol' column of the result.
        selected_features: Optional iterable of feature names to return.
            Defaults to the module-level SELECTED_FEATURE_NAMES_INPUT.

    Returns:
        DataFrame with a fresh 0..n-1 index holding the essential columns
        ('timestamp', 'symbol', 'open', 'high', 'low', 'close') followed by
        the requested features in request order. Requested names this function
        cannot build come back as all-NaN columns, and +/-inf values are
        replaced by NaN. An empty DataFrame is returned when the input is
        None, has fewer than 3 rows, lacks a required column, has an
        unparseable timestamp, or has no rows left after dropping NaN OHLC.
    """
    print("Starting calculation of selected 123 features...")
    start_time = time.time()

    # Resolve the default lazily so callers can pass their own list without
    # the module-level constant being consulted at all.
    if selected_features is None:
        selected_features = SELECTED_FEATURE_NAMES_INPUT
    requested_names = list(selected_features)
    requested = set(requested_names)  # O(1) membership for the many checks below

    if df is None or len(df) < 3:
        print("Error: Input DataFrame is None or too small.")
        return pd.DataFrame()
    df = df.copy()
    df['symbol'] = symbol

    # --- Timestamp and Index ---
    if 'timestamp' not in df.columns:
        print("Error: 'timestamp' column not found.")
        return pd.DataFrame()
    try:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
    except Exception as e:
        print(f"Error converting timestamp to datetime: {e}")
        return pd.DataFrame()
    df = df.sort_values('timestamp').dropna(subset=['timestamp'])
    # Keep the column as well: it is returned as one of the essential columns.
    df = df.set_index('timestamp', drop=False)

    # --- Volume Columns ---
    # 'volume_btc' is the internal working name; 'Volume BTC' / 'Volume USD'
    # are the names used in the feature list.
    if 'Volume BTC' in df.columns:
        df['volume_btc'] = df['Volume BTC']
    elif 'volume_btc' in df.columns:
        df['Volume BTC'] = df['volume_btc']
    else:
        df['volume_btc'] = 0
        df['Volume BTC'] = 0
    if 'Volume USD' not in df.columns:
        df['Volume USD'] = df['volume_usd'] if 'volume_usd' in df.columns else 0

    # --- Basic Checks (OHLC) ---
    required_ohlc = ['open', 'high', 'low', 'close']
    missing_ohlc = [col for col in required_ohlc if col not in df.columns]
    for col in missing_ohlc:
        print(f"Error: Required column '{col}' not found.")
    if missing_ohlc:
        return pd.DataFrame()
    for col in required_ohlc:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    if df[required_ohlc].isnull().any().any():
        print(f"Warning: NaNs found in required OHLC columns. Rows before drop: {len(df)}")
        df = df.dropna(subset=required_ohlc)
        print(f"Rows after dropping NaNs in OHLC: {len(df)}")
    if df.empty:
        print("Error: DataFrame empty after dropping OHLC NaNs.")
        return pd.DataFrame()

    # --- 1. Calculate Prerequisites ---
    print("  Calculating prerequisites...")
    min_periods_base = 2
    n_rows = len(df)
    close = df['close']
    volume = df['volume_btc']

    def rolling_stat(series, window, stat, min_periods=min_periods_base):
        """Return the rolling statistic, or NaN when there are fewer rows than the window."""
        if n_rows < window:
            return np.nan
        return getattr(series.rolling(window=window, min_periods=min_periods), stat)()

    # Basic returns / range
    df['price_return_1h_temp'] = close.pct_change()  # input to skew/kurtosis
    df['volume_return_1h'] = volume.pct_change()
    with np.errstate(divide='ignore', invalid='ignore'):
        # Zero lows become NaN so the ratio is NaN rather than +/-inf.
        df['price_range_pct_temp'] = (df['high'] - df['low']) / df['low'].replace(0, np.nan)

    # Lagged returns
    for hours in (12, 24, 48, 72, 168):
        df[f'lag_{hours}h_price_return'] = close.pct_change(periods=hours)
    for hours in (3, 6, 12, 24):
        df[f'lag_{hours}h_volume_return'] = volume.pct_change(periods=hours)

    # Rolling standard deviations of close; the *_temp ones only feed ratios/squares.
    for hours in (6, 48, 168):
        df[f'rolling_std_{hours}h'] = rolling_stat(close, hours, 'std')
    for hours in (3, 12, 72):
        df[f'rolling_std_{hours}h_temp'] = rolling_stat(close, hours, 'std')

    # Skew / kurtosis of 1-row returns need a full 24-row window.
    df['rolling_skew_24h'] = rolling_stat(df['price_return_1h_temp'], 24, 'skew', min_periods=24)
    df['rolling_kurt_24h'] = rolling_stat(df['price_return_1h_temp'], 24, 'kurt', min_periods=24)

    # Volume moving averages and the volume / MA24 ratio
    for hours in (12, 168):
        df[f'volume_ma_{hours}h'] = rolling_stat(volume, hours, 'mean')
    df['volume_ma_24h_temp'] = rolling_stat(volume, 24, 'mean')
    with np.errstate(divide='ignore', invalid='ignore'):
        df['volume_div_ma_24h'] = volume / df['volume_ma_24h_temp'].replace(0, np.nan)

    # MACD components (12/26 EMAs, 9-period signal)
    if n_rows >= 35:  # 26 rows for the slow EMA + 9 for the signal line
        ema_12 = close.ewm(span=12, adjust=False, min_periods=12).mean()
        ema_26 = close.ewm(span=26, adjust=False, min_periods=26).mean()
        df['macd_temp'] = ema_12 - ema_26
        df['macd_signal'] = df['macd_temp'].ewm(span=9, adjust=False, min_periods=9).mean()
        df['macd_hist'] = df['macd_temp'] - df['macd_signal']
    else:
        df['macd_temp'] = np.nan
        df['macd_signal'] = np.nan
        df['macd_hist'] = np.nan

    # pandas_ta indicators (CCI, CMF, Bollinger band width), called through the
    # DataFrame `.ta` accessor on a frame whose volume column is named 'volume'.
    ta_df = df.rename(columns={'volume_btc': 'volume'}, errors='ignore')

    def nan_if_none(result):
        # NOTE(review): pandas_ta presumably returns None when the input is
        # shorter than the indicator length -- confirm. A None assigned to a
        # column would make it object-typed and break the arithmetic below.
        return np.nan if result is None else result

    def pick_bband_width(bbands):
        """Return the band-width column of a bbands result, or NaN if absent."""
        if bbands is None:
            return np.nan
        if 'BBB_20_2.0' in bbands.columns:
            return bbands['BBB_20_2.0']
        # NOTE(review): the column label presumably differs between pandas_ta
        # releases -- confirm. Fall back to any 'BBB_'-prefixed column.
        candidates = [c for c in bbands.columns if str(c).startswith('BBB_')]
        return bbands[candidates[0]] if candidates else np.nan

    try:
        df['cci_20h'] = nan_if_none(ta_df.ta.cci(length=20))
    except Exception as e:
        print(f" Warning: CCI calc failed: {e}")
        df['cci_20h'] = np.nan
    try:
        df['cmf_20h'] = nan_if_none(ta_df.ta.cmf(length=20))
    except Exception as e:
        print(f" Warning: CMF calc failed: {e}")
        df['cmf_20h'] = np.nan
    try:
        df['bband_width_20h'] = pick_bband_width(ta_df.ta.bbands(length=20, std=2))
    except Exception as e:
        print(f" Warning: BBand Width calc failed: {e}")
        df['bband_width_20h'] = np.nan

    # Position of close inside the bar's high-low range; flat bars map to 0.5.
    range_hl = df['high'] - df['low']
    with np.errstate(divide='ignore', invalid='ignore'):
        df['close_pos_in_range'] = (
            ((close - df['low']) / range_hl.replace(0, np.nan))
            .fillna(0.5)
            .replace([np.inf, -np.inf], 0.5)
        )

    with np.errstate(divide='ignore', invalid='ignore'):
        df['std12_div_std72'] = df['rolling_std_12h_temp'] / df['rolling_std_72h_temp'].replace(0, np.nan)

    df['volume_btc_x_range'] = volume * df['price_range_pct_temp']

    # --- 2. Calculate Final Interaction and Transformation Features ---
    print("  Calculating final derived features...")
    features = {}

    def column_or_none(name):
        """Look up a prerequisite column; 'Volume BTC' maps to the internal 'volume_btc'."""
        actual = 'volume_btc' if name == 'Volume BTC' else name
        return df[actual] if actual in df.columns else None

    def nan_series():
        return pd.Series(np.nan, index=df.index)

    # Base columns that are themselves requestable features.
    for name in _DIRECT_FEATURES:
        if name in requested and name in df.columns:
            features[name] = df[name]

    # One-hot time features, built only for the requested hours / weekdays.
    hour_of_day = df.index.hour
    day_of_week = df.index.dayofweek
    for hour in range(24):
        name = f'hour_{hour}'
        if name in requested:
            features[name] = pd.Series((hour_of_day == hour).astype(int), index=df.index)
    for day in range(7):
        name = f'day_{day}'
        if name in requested:
            features[name] = pd.Series((day_of_week == day).astype(int), index=df.index)

    # Pairwise products, named '<left>_x_<right>'.
    for left, right in _PRODUCT_FEATURE_PAIRS:
        name = f'{left}_x_{right}'
        if name not in requested:
            continue
        lhs, rhs = column_or_none(left), column_or_none(right)
        features[name] = nan_series() if lhs is None or rhs is None else lhs * rhs

    # Squares.
    for name, source in _SQUARE_FEATURES:
        if name in requested:
            features[name] = df[source] ** 2 if source in df.columns else nan_series()

    # Ratios; a zero denominator yields NaN instead of +/-inf.
    for name, numerator, denominator in _RATIO_FEATURES:
        if name not in requested:
            continue
        if numerator in df.columns and denominator in df.columns:
            with np.errstate(divide='ignore', invalid='ignore'):
                features[name] = df[numerator] / df[denominator].replace(0, np.nan)
        else:
            features[name] = nan_series()

    # log1p transforms; negatives are clipped to 0 first.
    for name, source in _LOG1P_FEATURES:
        if name in requested:
            features[name] = np.log1p(df[source].clip(lower=0)) if source in df.columns else nan_series()

    # --- 3. Final Assembly and Cleanup ---
    print("  Assembling final dataframe...")
    df_final_features = pd.DataFrame(features, index=df.index)

    essential_cols = ['timestamp', 'symbol', 'open', 'high', 'low', 'close']
    essential_cols_present = [col for col in essential_cols if col in df.columns]
    df_combined = pd.concat([df[essential_cols_present], df_final_features], axis=1)

    # Essentials first, then the requested features in request order; repeated
    # names collapse to a single column.
    ordered_cols = list(dict.fromkeys(essential_cols_present + requested_names))
    final_columns = {}
    missing_final_cols = []
    for col in ordered_cols:
        if col in df_combined.columns:
            final_columns[col] = df_combined[col]
        else:
            # Requested but not produced above: keep the column so the output
            # shape matches the request, filled with NaN.
            missing_final_cols.append(col)
            final_columns[col] = pd.Series(np.nan, index=df_combined.index)
    final_df_structure = pd.DataFrame(final_columns, index=df_combined.index)

    if missing_final_cols:
        print(f"  Final Warning: {len(missing_final_cols)} columns from SELECTED_FEATURE_NAMES_INPUT "
              f"were missing in the combined df and added as NaN: {missing_final_cols}")

    # Final cleanup
    final_df_structure = final_df_structure.reset_index(drop=True)
    final_df_structure = final_df_structure.replace([np.inf, -np.inf], np.nan)

    end_time = time.time()
    actual_feature_count = len([col for col in final_df_structure.columns if col not in essential_cols_present])
    print(f"Selected feature calculation finished. Returning {len(final_df_structure)} rows, "
          f"{len(final_df_structure.columns)} total columns ({actual_feature_count} features). "
          f"Took {end_time - start_time:.2f}s.")

    # Verify final column count against the request
    expected_feature_count = len(requested_names)
    if actual_feature_count != expected_feature_count:
        print(f"  NOTE: Expected {expected_feature_count} features based on SELECTED_FEATURE_NAMES_INPUT, "
              f"but returning DataFrame with {actual_feature_count} non-essential features.")

    return final_df_structure


# --- Feature tables used by calculate_selected_features ---

# Prerequisite columns that can be requested directly as features.
_DIRECT_FEATURES = (
    'Volume USD', 'close_pos_in_range', 'cmf_20h', 'lag_12h_volume_return',
    'rolling_kurt_24h', 'rolling_skew_24h', 'rolling_std_168h', 'std12_div_std72',
    'volume_btc_x_range', 'volume_ma_12h', 'volume_ma_168h', 'volume_return_1h',
    'macd_signal', 'macd_hist',
)

# (left, right) operands of product features; the feature is '<left>_x_<right>'.
_PRODUCT_FEATURE_PAIRS = (
    ('Volume BTC', 'rolling_std_168h'),
    ('Volume BTC', 'std12_div_std72'),
    ('cci_20h', 'Volume BTC'),
    ('cci_20h', 'cmf_20h'),
    ('cci_20h', 'lag_24h_volume_return'),
    ('cci_20h', 'lag_3h_volume_return'),
    ('cci_20h', 'rolling_kurt_24h'),
    ('cci_20h', 'rolling_std_168h'),
    ('cci_20h', 'rolling_std_48h'),
    ('cci_20h', 'std12_div_std72'),
    ('cci_20h', 'volume_div_ma_24h'),
    ('cci_20h', 'volume_ma_168h'),
    ('cmf_20h', 'bband_width_20h'),
    ('cmf_20h', 'rolling_kurt_24h'),
    ('cmf_20h', 'rolling_std_168h'),
    ('cmf_20h', 'rolling_std_6h'),
    ('cmf_20h', 'std12_div_std72'),
    ('lag_12h_price_return', 'cmf_20h'),
    ('lag_12h_price_return', 'rolling_kurt_24h'),
    ('lag_12h_price_return', 'rolling_std_168h'),
    ('lag_12h_price_return', 'volume_ma_168h'),
    ('lag_168h_price_return', 'Volume BTC'),
    ('lag_168h_price_return', 'bband_width_20h'),
    ('lag_168h_price_return', 'cmf_20h'),
    ('lag_168h_price_return', 'rolling_kurt_24h'),
    ('lag_168h_price_return', 'rolling_std_168h'),
    ('lag_168h_price_return', 'rolling_std_6h'),
    ('lag_168h_price_return', 'std12_div_std72'),
    ('lag_168h_price_return', 'volume_div_ma_24h'),
    ('lag_168h_price_return', 'volume_ma_168h'),
    ('lag_24h_price_return', 'lag_12h_volume_return'),
    ('lag_24h_price_return', 'lag_6h_volume_return'),
    ('lag_24h_price_return', 'rolling_std_168h'),
    ('lag_24h_price_return', 'volume_ma_168h'),
    ('lag_24h_volume_return', 'rolling_std_168h'),
    ('lag_24h_volume_return', 'std12_div_std72'),
    ('lag_3h_volume_return', 'rolling_kurt_24h'),
    ('lag_48h_price_return', 'lag_24h_volume_return'),
    ('lag_48h_price_return', 'rolling_std_168h'),
    ('lag_48h_price_return', 'volume_div_ma_24h'),
    ('lag_48h_price_return', 'volume_ma_168h'),
    ('lag_6h_volume_return', 'rolling_kurt_24h'),
    ('lag_72h_price_return', 'cmf_20h'),
    ('lag_72h_price_return', 'lag_12h_volume_return'),
    ('lag_72h_price_return', 'lag_3h_volume_return'),
    ('lag_72h_price_return', 'lag_6h_volume_return'),
    ('lag_72h_price_return', 'rolling_kurt_24h'),
    ('lag_72h_price_return', 'rolling_std_168h'),
    ('lag_72h_price_return', 'volume_ma_168h'),
    ('macd_hist', 'Volume BTC'),
    ('macd_hist', 'cmf_20h'),
    ('macd_hist', 'rolling_kurt_24h'),
    ('macd_hist', 'rolling_std_6h'),
    ('macd_hist', 'volume_div_ma_24h'),
    ('macd_hist', 'volume_ma_12h'),
    ('macd_signal', 'cmf_20h'),
    ('macd_signal', 'lag_24h_volume_return'),
    ('macd_signal', 'rolling_kurt_24h'),
    ('macd_signal', 'rolling_std_48h'),
    ('macd_signal', 'rolling_std_6h'),
    ('macd_signal', 'std12_div_std72'),
    ('macd_signal', 'volume_div_ma_24h'),
    ('macd_signal', 'volume_ma_12h'),
    ('volume_div_ma_24h', 'rolling_kurt_24h'),
    ('volume_div_ma_24h', 'rolling_std_168h'),
    ('volume_ma_12h', 'bband_width_20h'),
    ('volume_ma_12h', 'rolling_kurt_24h'),
    ('volume_ma_12h', 'rolling_std_6h'),
    ('volume_ma_168h', 'rolling_kurt_24h'),
    ('volume_ma_168h', 'rolling_std_48h'),
    ('volume_ma_168h', 'std12_div_std72'),
    ('volume_return_1h', 'rolling_kurt_24h'),
)

# (feature name, source column) for squared features.
_SQUARE_FEATURES = (
    ('cci_20h_sq', 'cci_20h'),
    ('lag_12h_price_return_sq', 'lag_12h_price_return'),
    ('lag_168h_price_return_sq', 'lag_168h_price_return'),
    ('lag_24h_price_return_sq', 'lag_24h_price_return'),
    ('lag_48h_price_return_sq', 'lag_48h_price_return'),
    ('lag_72h_price_return_sq', 'lag_72h_price_return'),
    ('macd_hist_sq', 'macd_hist'),
    ('macd_signal_sq', 'macd_signal'),
    ('rolling_std_3h_sq', 'rolling_std_3h_temp'),
    ('volume_div_ma_24h_sq', 'volume_div_ma_24h'),
)

# (feature name, numerator column, denominator column) for ratio features.
_RATIO_FEATURES = (
    ('rolling_std_6h_div_rolling_std_48h', 'rolling_std_6h', 'rolling_std_48h'),
)

# (feature name, source column) for log1p-transformed features.
_LOG1P_FEATURES = (
    ('volume_btc_x_range_log1p', 'volume_btc_x_range'),
)


# --- Main VIF Calculation Block ---
if __name__ == "__main__":
    # Script entry point: load the CSV, build the candidate features, then
    # iteratively drop the feature with the highest VIF until every remaining
    # feature is at or below VIF_THRESHOLD, and print the surviving list.

    # The CSV error handler below calls traceback.print_exc(); the module is not
    # among the file-level imports, so bring it into scope here.
    import traceback

    print("--- VIF Feature Selection Script ---")

    print("\n--- 1. Data Loading & Initial Prep ---")
    try:
        print(f"Loading data from: {CSV_FILE_PATH}")
        # header=0 together with names= replaces the file's own header row
        # with these labels.
        col_names = ['unix', 'date', 'symbol_csv', 'open', 'high', 'low', 'close', 'Volume BTC', 'Volume USD']
        df_raw = pd.read_csv(CSV_FILE_PATH, header=0, names=col_names)
        print(f"Raw data loaded. Shape: {df_raw.shape}")
        df_raw['timestamp'] = pd.to_datetime(df_raw['date'])
        df_raw = df_raw.drop(['unix', 'date', 'symbol_csv'], axis=1)
        df_raw = df_raw.sort_values('timestamp').reset_index(drop=True)
    except Exception as e:
        print(f"Error loading or processing CSV: {e}")
        traceback.print_exc()
        # Non-zero status so a failed load is visible to the caller.
        raise SystemExit(1)
    if df_raw.empty:
        raise SystemExit("DataFrame empty after loading. Exiting.")
    print(f"Initial data prep done. Shape: {df_raw.shape}")

    print("\n--- 2. Feature Engineering (Generating 123 Features) ---")
    feature_calc_start = time.time()
    df_with_123_features = calculate_selected_features(df_raw, symbol=SYMBOL_NAME)
    feature_calc_end = time.time()
    if df_with_123_features.empty:
        raise SystemExit("Feature calculation failed. Exiting.")
    print(f"Feature calculation completed in {feature_calc_end - feature_calc_start:.2f} seconds.")

    # Identify the actual feature columns present from the input list
    actual_feature_cols = [col for col in SELECTED_FEATURE_NAMES_INPUT if col in df_with_123_features.columns]
    if len(actual_feature_cols) != len(SELECTED_FEATURE_NAMES_INPUT):
        print(f"Warning: Only found {len(actual_feature_cols)} out of {len(SELECTED_FEATURE_NAMES_INPUT)} requested features in the generated DataFrame.")
    if not actual_feature_cols:
        raise SystemExit("Error: No features from the input list found in the generated DataFrame.")

    print(f"\n--- 3. Preparing Data for VIF (Using {len(actual_feature_cols)} features) ---")
    X_features = df_with_123_features[actual_feature_cols].copy()

    # Coerce everything to numeric, then discard whatever is still non-numeric.
    for col in X_features.columns:
        X_features[col] = pd.to_numeric(X_features[col], errors='coerce')
    initial_cols = set(X_features.columns)
    X_features = X_features.select_dtypes(include=np.number)
    dropped_non_numeric = initial_cols - set(X_features.columns)
    if dropped_non_numeric:
        print(f"  Warning: Dropped non-numeric columns before VIF: {dropped_non_numeric}")

    # SimpleImputer discards columns that contain only missing values, which
    # would leave fewer imputed columns than labels when the DataFrame is
    # rebuilt below. calculate_selected_features emits such all-NaN columns for
    # features it could not build, so remove them explicitly first.
    all_nan_cols = X_features.columns[X_features.isna().all()].tolist()
    if all_nan_cols:
        print(f"  Warning: Dropped {len(all_nan_cols)} all-NaN columns before VIF: {all_nan_cols}")
        X_features = X_features.drop(columns=all_nan_cols)
    if X_features.empty:
        raise SystemExit("Error: No numeric feature columns left for VIF calculation.")

    # Impute missing values (necessary for VIF) - Using Median
    print(f"  Imputing NaNs using median strategy...")
    imputer = SimpleImputer(strategy='median')
    X_features_imputed_df = pd.DataFrame(
        imputer.fit_transform(X_features),
        columns=X_features.columns,
        index=X_features.index,
    )
    print(f"  NaN count after imputation: {X_features_imputed_df.isnull().sum().sum()}") # Should be 0

    # Add an intercept column ('const') so each VIF regression has one.
    print("  Adding constant for VIF calculation...")
    X_vif_ready = add_constant(X_features_imputed_df, has_constant='add')

    print("\n--- 4. Iterative VIF Calculation ---")
    print(f"Starting VIF filtering with threshold {VIF_THRESHOLD}...")

    features_to_keep = list(X_features_imputed_df.columns)
    # 'const' is placed last so positions 0..k-1 always map to features_to_keep.
    vif_data = X_vif_ready[features_to_keep + ['const']]

    dropped_features_count = 0
    while features_to_keep:
        vif_values = vif_data.values
        vif_results = pd.Series(
            [variance_inflation_factor(vif_values, i) for i in range(len(features_to_keep))],
            index=features_to_keep,
            dtype=float,
        )

        max_vif = vif_results.max()
        if not max_vif > VIF_THRESHOLD:
            print(f"  All remaining features have VIF <= {VIF_THRESHOLD}.")
            break

        # Drop only the single worst feature, then recompute: removing one
        # collinear feature changes the VIF of every feature that remains.
        feature_to_drop = vif_results.idxmax()
        print(f"  Dropping '{feature_to_drop}' with VIF: {max_vif:.4f}")
        features_to_keep.remove(feature_to_drop)
        vif_data = vif_data.drop(columns=[feature_to_drop])
        dropped_features_count += 1
    else:
        # Loop ended because nothing was left, not because the threshold was met.
        print("  Warning: All features dropped during VIF process!")

    print("\n--- 5. VIF Selection Results ---")
    print(f"VIF Threshold: {VIF_THRESHOLD}")
    print(f"Original number of features considered: {len(actual_feature_cols)}")
    print(f"Number of features dropped: {dropped_features_count}")
    print(f"Number of features remaining: {len(features_to_keep)}")
    # The loop keeps features whose VIF is at or below the threshold.
    print("\nFinal selected features (VIF <= " + str(VIF_THRESHOLD) + "):")
    # Print as a Python list literal that can be pasted into a config.
    print("SELECTED_FEATURE_NAMES_VIF_FILTERED = [")
    for feat in features_to_keep:
        print(f"    '{feat}',")
    print("]")

    print("\nScript finished.")

--- VIF Feature Selection Script ---

--- 1. Data Loading & Initial Prep ---
Loading data from: C:\Users\mason\AVP\BTCUSDrec.csv
Raw data loaded. Shape: (15177, 9)
Initial data prep done. Shape: (15177, 7)

--- 2. Feature Engineering (Generating 123 Features) ---
Starting calculation of selected 123 features...
  Calculating prerequisites...
  Calculating final derived features...
  Assembling final dataframe...
Selected feature calculation finished. Returning 15177 rows, 129 total columns (123 features). Took 0.14s.
Feature calculation completed in 0.15 seconds.

--- 3. Preparing Data for VIF (Using 123 features) ---
  Imputing NaNs using median strategy...
  NaN count after imputation: 0
  Adding constant for VIF calculation...

--- 4. Iterative VIF Calculation ---
Starting VIF filtering with threshold 1.73...
  Dropping 'lag_48h_price_return_x_lag_24h_volume_return' with VIF: 418.5294
  Dropping 'cci_20h_x_lag_24h_volume_return' with VIF: 61.6671
  Dropping 'lag_24h_volume_return_x_