In [None]:
import sys
from pathlib import Path
import pandas as pd
import os
from IPython.display import display, Markdown  # Assuming you use these for display


# Set pandas display options to show more columns and rows
pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', 10)       # Limit to 10 rows for readability
pd.set_option('display.width', 1500)        # Let the display adjust to the window


# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils




In [None]:
# Locate the OHLCV parquet file through utils.main_processor; only the first
# element of the returned 3-tuple (the file path) is used in this cell.
# NOTE(review): the per-argument notes below paraphrase the original author's
# comments -- confirm the exact semantics against utils.main_processor.
path_OHLCV, _, _ = utils.main_processor(
    data_dir=r'..\data',  # raw string: '\d' is an invalid escape in a normal literal
    # data_dir=r'output\selection_results',
    downloads_dir=None,  # None searches the Downloads dir, '' omits that search
    downloads_limit=60,  # limit on the number of Downloads files searched
    clean_name_override=None,  # override filename
    start_file_pattern='2025',  # search for files starting with '2025'
    contains_pattern='df_OHLCV'  # search for files containing 'df_OHLCV'
)

print(f'path_OHLCV: {path_OHLCV}')
df_OHLCV = pd.read_parquet(path_OHLCV)
print(f'df_OHLCV:\n{df_OHLCV.head()}\n')
# DataFrame.info() writes its report to stdout and returns None, so it must be
# called directly; embedding it in an f-string printed a stray "None".
print('df_OHLCV.info():')
df_OHLCV.info()

In [None]:
import pandas as pd

# Pivot the long OHLCV frame (df_OHLCV, loaded in the previous cell) into a
# wide table of adjusted closes with one column per ticker.
# Assumes df_OHLCV has a 2-level MultiIndex with Ticker at level 0 and Date at
# level 1 -- TODO confirm against the parquet schema.

# 1. Select the 'Adj Close' column (this returns a Series with the MultiIndex)
adj_close_series = df_OHLCV['Adj Close']

# 2. Unstack the Ticker level (level 0) of the MultiIndex to become columns
#    The Date level (level 1) will remain as the index.
df_adj_close = adj_close_series.unstack(level=0)

# 3. Optional: Sort the index (Dates) if it's not already sorted
df_adj_close = df_adj_close.sort_index()

# 4. Optional: Sort columns (Tickers) alphabetically if desired
df_adj_close = df_adj_close.sort_index(axis=1)

# Display the results
print("--- Resulting DataFrame for Backtesting (df_adj_close) ---")
# info() prints its own report and returns None; wrapping it in print() only
# added a trailing "None" line.
df_adj_close.info()
print("\n--- First 5 rows of df_adj_close: ---")
print(df_adj_close.head())
print("\n--- Last 5 rows of df_adj_close: ---")
print(df_adj_close.tail())

# Persist to the current working directory, then read the file back as a
# quick round-trip sanity check.
df_adj_close.to_parquet('df_adj_close.parquet', index=True)
print(f"\nSaved df_adj_close to 'df_adj_close.parquet' in {Path.cwd()}")
_df = pd.read_parquet('df_adj_close.parquet')
print(f"\nLoaded df_adj_close from 'df_adj_close.parquet':\n{_df.head()}\n")

In [None]:
# Collect the selection-result parquet files through utils.main_processor;
# only the third element of the returned 3-tuple is used in this cell.
# NOTE(review): the per-argument notes below paraphrase the original author's
# comments -- confirm the exact semantics against utils.main_processor.
_, _, selection_list = utils.main_processor(
    # raw string: '\s' is an invalid escape in a normal literal
    data_dir=r'output\selection_results',  # directory holding the selection results
    downloads_dir=None,  # None searches the Downloads dir, '' omits that search
    downloads_limit=60,  # limit on the number of Downloads files searched
    clean_name_override=None,  # override filename
    start_file_pattern='2025',  # search for files starting with '2025'
    contains_pattern='.parquet',  # search for files containing '.parquet'
)

print(f'selection_list: {selection_list}')

In [None]:
# Derive the companion "<stem>_params.json" name for every selected parquet
# file. os.path.splitext strips only the final extension, so filenames that
# contain additional dots keep the rest of their stem intact.
params_list = [
    f"{os.path.splitext(selected_file)[0]}_params.json"
    for selected_file in selection_list
]

print("--- Using os.path.splitext ---")
print(f'params_list: {params_list}')
print(f'selection_list: {selection_list}')

In [None]:
import pandas as pd
import io
import pprint # For cleaner dictionary printing
import numpy as np
import traceback # Added for detailed error logging
from typing import List, Dict, Any, Optional # Added Optional
import json
import os # For creating directories and checking file existence
import logging # For logging instead of print
import datetime # For timestamping runs

# --- Constants ---
# Daily risk-free rate used as the default in the Sharpe-ratio calculation:
# 0.04 divided by 365 (presumably a 4% annual rate spread over calendar days
# -- confirm; a trading-day basis would divide by ~252 instead).
RISK_FREE_RATE_DAILY = 0.04 / 365
# Directory that receives the per-run log files and the results CSV.
LOG_DIR = 'logs'
# CSV file to which backtest result records are appended.
RESULTS_CSV_PATH = os.path.join(LOG_DIR, 'backtest_parameter_performance.csv')
# Keys copied from a run's parameter dictionary into every result record;
# a key missing from the parameters is recorded as None.
PARAMS_TO_TRACK = [
    'n_select_requested',
    'inv_vol_col_name',
    'filter_min_price',
    'filter_min_avg_volume_m',
    'filter_min_roe_pct',
    'filter_max_debt_eq',
    'score_weight_rsi',
    'score_weight_change',
    'score_weight_rel_volume',
    'score_weight_volatility',
    # 'weight' (scheme) is handled separately
]

# --- 1. Setup Logging ---
def setup_logging(log_dir: str = LOG_DIR):
    """
    Configure root logging to write to a new timestamped file and the console.

    Any handlers already attached to the root logger are removed and closed
    first. Without this, `logging.basicConfig` silently does nothing when the
    root logger is already configured (e.g. when this notebook cell is run a
    second time), so the returned log file would never be created or written.

    Args:
        log_dir (str): Directory for the log file; created if it does not exist.

    Returns:
        str: Path of the log file that this call configured.
    """
    os.makedirs(log_dir, exist_ok=True)
    log_filename = datetime.datetime.now().strftime("backtest_run_%Y%m%d_%H%M%S.log")
    log_filepath = os.path.join(log_dir, log_filename)

    # Drop handlers left over from an earlier call so basicConfig takes effect
    # and the previous log file handle is released.
    root_logger = logging.getLogger()
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

    logging.basicConfig(
        level=logging.INFO, # Set minimum level to log (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filepath), # Log to file
            logging.StreamHandler() # Also log to console (optional, remove if only file needed)
        ]
    )
    logging.info(f"Logging initialized. Log file: {log_filepath}")
    return log_filepath # Return the path for potential reference

# --- 2. Refined Functions (using logging instead of print) ---

def extract_backtest_setups(
    dataframe: pd.DataFrame,
    weight_column_names: List[str],
    date_str: str,
    scheme_separator: str = '_'
    ) -> Dict[str, Dict[str, Dict[str, float]]]:
    """
    Build per-scheme {ticker: weight} mappings from weight columns of a DataFrame.

    Each requested column is cast to float and turned into a dictionary keyed
    by the DataFrame index; NaN weights are dropped. The scheme name is the
    text after the last `scheme_separator` in the column name, or the whole
    column name when the separator does not occur. All outcomes are logged.

    Args:
        dataframe (pd.DataFrame): Frame whose index labels become the keys of
                                  the weight dictionaries and which holds the
                                  weight columns.
        weight_column_names (List[str]): Columns to read, e.g.
                                         ['Weight_EW', 'Weight_IV'].
        date_str (str): Identifier (e.g. '20231231') used as the single
                        top-level key of the returned dictionary.
        scheme_separator (str): Separator between the column prefix and the
                                scheme name. Defaults to '_'.

    Returns:
        Dict[str, Dict[str, Dict[str, float]]]:
            `{date_str: {scheme_name: {ticker: weight}}}`.
            `{}` when `dataframe` is None or empty.
            `{date_str: {}}` when no column produced any non-NaN weight.

    Raises:
        ValueError: If `date_str` is None or an empty string.

    Example Output Structure:
        {
            '20231231': {
                'EW': {'AAPL': 0.5, 'MSFT': 0.5},
                'IV': {'AAPL': 0.6, 'MSFT': 0.4}
            }
        }
    """
    # --- Input validation ---
    if not date_str:
        logging.error("The 'date_str' argument cannot be None or empty.")
        raise ValueError("The 'date_str' argument cannot be None or empty.")

    if dataframe is None or dataframe.empty:
        logging.warning("Input DataFrame is None or empty. Cannot extract setups.")
        return {}

    setups_by_scheme: Dict[str, Dict[str, float]] = {}

    for column in weight_column_names:
        if column not in dataframe.columns:
            logging.warning(f"Column '{column}' not found in the DataFrame.")
            continue

        try:
            # Scheme name = last separator-delimited piece, or the full
            # column name when there is nothing to split.
            pieces = column.split(scheme_separator)
            scheme = column if len(pieces) <= 1 else pieces[-1]

            raw_weights = dataframe[column].astype(float).to_dict()
            usable_weights = {}
            for symbol, value in raw_weights.items():
                if pd.notna(value):  # NaN weights are discarded
                    usable_weights[symbol] = value

            if not usable_weights:
                logging.warning(f"No valid (non-NaN) weights found for column '{column}'. "
                                f"Skipping scheme '{scheme}' for date '{date_str}'.")
                continue

            if scheme in setups_by_scheme:
                logging.warning(f"Duplicate scheme name '{scheme}' derived. "
                                f"Weights from column '{column}' might overwrite previous ones.")
            setups_by_scheme[scheme] = usable_weights
            logging.info(f"Successfully extracted weights for scheme: {scheme} "
                         f"({len(usable_weights)} tickers) for date {date_str}")

        except Exception as e:
            # A failure in one column must not abort the remaining columns.
            logging.error(f"Error processing column '{column}': {e}", exc_info=True)

    if not setups_by_scheme:
        # The {date_str: {}} shape is still returned; callers may rely on it.
        logging.warning(f"No valid backtest setups generated for date {date_str}.")

    return {date_str: setups_by_scheme}


def run_single_backtest(
    selection_date: str,
    scheme_name: str,
    ticker_weights: Dict[str, float],
    df_adj_close: pd.DataFrame,
    risk_free_rate_daily: float = RISK_FREE_RATE_DAILY,
    ) -> Optional[Dict[str, Any]]:
    """
    Run a one-period backtest for a single selection date and weighting scheme.

    The strategy buys every selected ticker at the adjusted close of the first
    row after `selection_date` in the price index and sells at the adjusted
    close of the row after that. Per-ticker returns are computed and the
    portfolio return is the weight-weighted sum of the successful trades'
    returns (it is NOT re-normalised by the traded weight sum, which is
    reported separately as `total_weight_traded`). Outputs are logged.

    Args:
        selection_date (str): Date on which the weights were determined; must
                              parse with `pd.Timestamp` and be present in the
                              price index.
        scheme_name (str): Name of the weighting scheme (e.g. 'EW', 'IV').
        ticker_weights (Dict[str, float]): Mapping of ticker to portfolio weight.
        df_adj_close (pd.DataFrame): Adjusted close prices, indexed by date with
                                     one column per ticker. A non-datetime index
                                     is converted and an unsorted index is
                                     sorted on a copy; the caller's frame is
                                     not modified.
        risk_free_rate_daily (float): Rate subtracted from the average ticker
                                      return in the Sharpe calculation.
                                      Defaults to the module-level
                                      RISK_FREE_RATE_DAILY.

    Returns:
        Optional[Dict[str, Any]]: Dictionary with keys 'run_inputs', 'metrics'
                                  and 'trades', or None when the backtest cannot
                                  be run (index conversion failure, selection
                                  date missing, fewer than two rows after the
                                  selection date, or an unexpected error).

    Notes:
        'std_dev_return' and 'sharpe_ratio_period' are computed across the
        tickers' single-period returns (cross-sectional), not over time.
    """

    logging.info("-" * 30)
    logging.info(f"Initiating Backtest Run...")
    logging.info(f"  Date          : {selection_date}")
    logging.info(f"  Scheme        : {scheme_name}")
    logging.info(f"  Num Tickers   : {len(ticker_weights)}")
    # Log at most three sample weights, at DEBUG level, to keep the log short.
    sample_weights_str = io.StringIO()
    pprint.pprint(dict(list(ticker_weights.items())[:3]), stream=sample_weights_str)
    if len(ticker_weights) > 3:
        sample_weights_str.write("    ...\n")
    logging.debug(f"  Sample Weights:\n{sample_weights_str.getvalue()}")

    # --- Input Data Preparation ---
    try:
        # Ensure index is DatetimeIndex and sorted
        if not isinstance(df_adj_close.index, pd.DatetimeIndex):
            try:
                df_adj_close = df_adj_close.copy() # Avoid modifying original df
                df_adj_close.index = pd.to_datetime(df_adj_close.index)
                logging.info("  Info: Converted DataFrame index to DatetimeIndex.")
            except Exception as e:
                logging.error(f"  Error: Failed to convert DataFrame index to DatetimeIndex: {e}", exc_info=True)
                logging.info("-" * 30)
                return None

        if not df_adj_close.index.is_monotonic_increasing:
            logging.info("  Info: Sorting DataFrame index by date...")
            df_adj_close = df_adj_close.sort_index()  # returns a new frame
            logging.info("  Info: DataFrame index sorted.")

        all_trading_dates = df_adj_close.index
        selection_timestamp = pd.Timestamp(selection_date)

    except Exception as e:
        logging.error(f"  Error during initial data preparation: {e}", exc_info=True)
        logging.info("-" * 30)
        return None

    # --- Backtesting Logic ---
    try:
        # 1. Identify Trading Dates
        try:
            # get_indexer reports a missing label as -1 rather than raising.
            indexer = all_trading_dates.get_indexer([selection_timestamp])
            if indexer[0] == -1:
                logging.error(f"  Error: Selection date {selection_date} not found in price data index.")
                logging.info("-" * 30)
                return None
            selection_loc = indexer[0]

        except KeyError:
            logging.error(f"  Error: Selection date {selection_date} not found in price data index (KeyError).")
            logging.info("-" * 30)
            return None

        if selection_loc + 1 >= len(all_trading_dates):
            logging.error(f"  Error: No trading date found after selection date {selection_date}.")
            logging.info("-" * 30)
            return None
        buy_date = all_trading_dates[selection_loc + 1]
        buy_date_str = buy_date.strftime('%Y-%m-%d')

        if selection_loc + 2 >= len(all_trading_dates):
            logging.error(f"  Error: No trading date found after buy date {buy_date_str}.")
            logging.info("-" * 30)
            return None
        sell_date = all_trading_dates[selection_loc + 2]
        sell_date_str = sell_date.strftime('%Y-%m-%d')

        logging.info(f"  Selection Date: {selection_date}")
        logging.info(f"  Buy Date      : {buy_date_str}")
        logging.info(f"  Sell Date     : {sell_date_str}")

        # 2. Simulate Trades & Collect Results
        trades = []
        returns = []
        portfolio_return = 0.0
        total_weight_traded = 0.0
        valid_tickers_count = 0

        for ticker, current_weight in ticker_weights.items():
            if ticker not in df_adj_close.columns:
                logging.warning(f"    Warning: Ticker {ticker} not found in price data columns. Skipping.")
                continue

            valid_tickers_count += 1
            trade_info = { "ticker": ticker, "weight": current_weight,
                          "buy_date": buy_date_str, "sell_date": sell_date_str,
                          "buy_price": None, "sell_price": None, "return": None, "status": "Pending" }

            # Remember whatever prices were read so that a rejected trade can
            # still report them, without re-reading under a bare `except`.
            buy_price = None
            sell_price = None
            try:
                buy_price = df_adj_close.at[buy_date, ticker]
                sell_price = df_adj_close.at[sell_date, ticker]
                if pd.isna(buy_price) or buy_price <= 0:
                    raise ValueError(f"Invalid buy price ({buy_price})")
                if pd.isna(sell_price):
                    raise ValueError(f"Invalid sell price ({sell_price})")

                trade_return = (sell_price - buy_price) / buy_price
                trade_info.update({"buy_price": buy_price, "sell_price": sell_price, "return": trade_return, "status": "Success"})
                trades.append(trade_info)
                returns.append(trade_return)

                portfolio_return += trade_return * current_weight
                total_weight_traded += current_weight

            except KeyError as e:
                logging.warning(f"    Error accessing price for {ticker} on {e}. Skipping trade.")
                trade_info["status"] = f"Error: Price data missing ({e})"
                trades.append(trade_info)
            except ValueError as e:
                logging.warning(f"    Warning: Invalid price data for {ticker} between {buy_date_str} and {sell_date_str} ({e}). Skipping trade.")
                trade_info["status"] = f"Skipped: Invalid price ({e})"
                trade_info["buy_price"] = buy_price
                trade_info["sell_price"] = sell_price
                trades.append(trade_info)
            except Exception as e:
                logging.error(f"    Unexpected error processing trade for {ticker}: {e}", exc_info=True)
                trade_info["status"] = f"Error: Unexpected ({type(e).__name__})"
                trades.append(trade_info)

        # 3. Calculate Metrics
        num_attempted_trades = valid_tickers_count
        num_successful_trades = len(returns)
        metrics = {
            'num_selected_tickers': len(ticker_weights),
            'num_valid_tickers_in_data': valid_tickers_count,
            'num_attempted_trades': num_attempted_trades,
            'num_successful_trades': num_successful_trades,
            'num_failed_or_skipped_trades': num_attempted_trades - num_successful_trades,
            # Stays at its initial 0.0 when no trade succeeded.
            'portfolio_return': portfolio_return,
            'total_weight_traded': total_weight_traded,
            'win_rate': None, 'average_return': None, 'std_dev_return': None, 'sharpe_ratio_period': None,
        }

        if num_successful_trades > 0:
            returns_array = np.array(returns)
            average_return = np.mean(returns_array)
            # Sample std-dev needs at least two observations; use 0.0 for one.
            std_dev_return = np.std(returns_array, ddof=1) if num_successful_trades > 1 else 0.0
            excess_return = average_return - risk_free_rate_daily

            if std_dev_return > 1e-9:
                sharpe_ratio = excess_return / std_dev_return
            elif abs(excess_return) < 1e-9:
                # No dispersion and no excess return.
                sharpe_ratio = 0.0
            else:
                # No dispersion: signed infinity in the direction of the excess return.
                sharpe_ratio = np.inf * np.sign(excess_return)

            metrics['win_rate'] = np.sum(returns_array > 0) / num_successful_trades
            metrics['average_return'] = average_return
            metrics['std_dev_return'] = std_dev_return
            metrics['sharpe_ratio_period'] = sharpe_ratio

            logging.info(f"  Trades Executed: {num_successful_trades}/{num_attempted_trades}")
            logging.info(f"  Portfolio Return: {metrics['portfolio_return']:.4f} (based on traded weight sum: {metrics['total_weight_traded']:.4f})")
            logging.info(f"  Win Rate      : {metrics['win_rate']:.2%}")
            logging.info(f"  Avg Tkr Return: {metrics['average_return']:.4f}")
            logging.info(f"  Std Dev Return: {metrics['std_dev_return']:.4f}")
            logging.info(f"  Period Sharpe : {metrics['sharpe_ratio_period']:.4f}")
        else:
            logging.warning(f"  No successful trades executed out of {num_attempted_trades} attempted.")
            logging.info(f"  Portfolio Return: {metrics['portfolio_return']:.4f}")

        # 4. Store Results
        backtest_results = {
            "run_inputs": {
                "selection_date": selection_date,
                "scheme_name": scheme_name,
                "num_tickers_input": len(ticker_weights),
                "risk_free_rate_daily": risk_free_rate_daily,
                "buy_date": buy_date_str,
                "sell_date": sell_date_str,
            },
            "metrics": metrics,
            "trades": trades # Keep trade details if needed, otherwise remove for brevity
        }

        logging.info(f"Backtest simulation for '{scheme_name}' on {selection_date} completed.")
        logging.info("-" * 30)
        return backtest_results

    except Exception as e:
        logging.critical(f"  FATAL ERROR during backtest run for {selection_date}, {scheme_name}: {e}", exc_info=True)
        logging.info("-" * 30)
        return None


def process_all_backtests(
    nested_setups: Dict[str, Dict[str, Dict[str, float]]],
    df_adj_close: pd.DataFrame # Pass price data here
    ) -> Dict[str, Dict[str, Optional[Dict[str, Any]]]]:
    """
    Run `run_single_backtest` for every (date, scheme) pair in a setup dictionary.

    Progress and failures are logged. A scheme with no weights, or whose
    backtest raises, is recorded as None instead of aborting the batch.

    Args:
        nested_setups: Structure produced by extract_backtest_setups,
                       `{date_str: {scheme_name: {Ticker: Weight}}}`.
        df_adj_close: DataFrame of adjusted closing prices, indexed by date,
                      one column per ticker; forwarded unchanged to each run.

    Returns:
        `{date_str: {scheme_name: backtest_results_or_None}}`, mirroring the
        input. A date with no schemes maps to `{}`; an empty input yields `{}`.
    """
    results_by_date: Dict[str, Dict[str, Optional[Dict[str, Any]]]] = {}

    if not nested_setups:
        logging.warning("Received empty setup dictionary. No backtests to run.")
        return results_by_date

    logging.info("\n===== Starting Batch Backtest Processing =====")

    for date_str, schemes in nested_setups.items():
        logging.info(f"\nProcessing date: {date_str}")

        # Register the date up front; it stays an empty dict when there is
        # nothing to run for it.
        per_scheme: Dict[str, Optional[Dict[str, Any]]] = {}
        results_by_date[date_str] = per_scheme

        if not schemes:
            logging.warning(f"  No schemes found for this date. Skipping.")
            continue

        for scheme_name, weights in schemes.items():
            outcome = None  # stays None when the scheme is skipped or fails

            if not weights:
                logging.warning(f"  Skipping scheme '{scheme_name}': No ticker weights provided.")
            else:
                try:
                    outcome = run_single_backtest(
                        selection_date=date_str,
                        scheme_name=scheme_name,
                        ticker_weights=weights,
                        df_adj_close=df_adj_close,
                        risk_free_rate_daily=RISK_FREE_RATE_DAILY,
                    )
                except Exception as e:
                    logging.error(f"!! Error running backtest for {scheme_name} on {date_str}: {e}", exc_info=True)

            per_scheme[scheme_name] = outcome

    logging.info("\n===== Batch Backtest Processing Finished =====")
    return results_by_date

# --- 3. Function to Extract Parameters and Results for Storage ---
def extract_params_and_results(
    params: Dict[str, Any],
    backtest_results_summary: Dict[str, Dict[str, Optional[Dict[str, Any]]]],
    run_timestamp: str,
    log_filepath: str
    ) -> List[Dict[str, Any]]:
    """
    Flatten nested backtest results into one record per (date, scheme).

    Each record carries the run identifiers, the parameters listed in
    PARAMS_TO_TRACK (None when absent from `params`), and a fixed set of
    metrics. When a backtest failed or was skipped (falsy result or no
    'metrics' key) the metrics fall back to NaN / 0 defaults.

    Args:
        params (Dict[str, Any]): Parameters used for this run.
        backtest_results_summary (Dict): Nested results from process_all_backtests,
                                         `{date_str: {scheme_name: result_or_None}}`.
        run_timestamp (str): Timestamp of the overall script execution.
        log_filepath (str): Log file path; only its base name is stored.

    Returns:
        List[Dict[str, Any]]: One dictionary per date+scheme combination.
    """
    # (metric key, value used when the metric or the whole result is missing)
    metric_fallbacks = (
        ('portfolio_return', np.nan),
        ('num_successful_trades', 0),
        ('total_weight_traded', 0.0),
        ('win_rate', np.nan),
        ('average_return', np.nan),
    )
    log_name = os.path.basename(log_filepath)

    rows = []
    for date_str, per_scheme in backtest_results_summary.items():
        for scheme_name, outcome in per_scheme.items():
            row = {
                'run_timestamp': run_timestamp,
                'log_file': log_name,
                'selection_date': date_str,
                'scheme': scheme_name,
            }

            # Tracked input parameters; a missing one is recorded as None.
            row.update((tracked, params.get(tracked, None)) for tracked in PARAMS_TO_TRACK)

            # An empty mapping makes every lookup return its fallback, which
            # is exactly the failed/skipped case.
            available = outcome['metrics'] if (outcome and 'metrics' in outcome) else {}
            for metric_key, fallback in metric_fallbacks:
                row[metric_key] = available.get(metric_key, fallback)

            # Recorded regardless of whether the backtest succeeded.
            row['n_select_actual'] = params.get('n_select_actual', None)

            rows.append(row)
    return rows

# --- 4. Function to Append Results to CSV ---
def append_results_to_csv(records: List[Dict[str, Any]], filepath: str = RESULTS_CSV_PATH):
    """Append backtest result records to a CSV file, creating it if needed.

    Each record becomes one row. Rows are always written with the same fixed
    column layout so that successive runs append cleanly to the same file:
    columns missing from ``records`` are written as empty (NaN) cells and keys
    that are not part of the layout are dropped.

    Args:
        records: One dict per (selection date, scheme) backtest. An empty list
            is a no-op apart from an info log message.
        filepath: Destination CSV path. The header row is written only when
            the file does not exist yet or exists but is empty.

    Returns:
        None. I/O failures are logged (with traceback) rather than raised, so
        a failed write does not abort the calling run.
    """
    if not records:
        logging.info("No records to append to CSV.")
        return

    # Fixed column order for consistency across runs.
    # 'scheme' is the weighting method (EW, IV, SW).
    column_order = [
        'run_timestamp', 'log_file', 'selection_date', 'scheme',
        'n_select_requested', 'n_select_actual', # Keep actual close to requested
        'inv_vol_col_name', 'filter_min_price', 'filter_min_avg_volume_m',
        'filter_min_roe_pct', 'filter_max_debt_eq', 'score_weight_rsi',
        'score_weight_change', 'score_weight_rel_volume', 'score_weight_volatility',
        'portfolio_return', 'num_successful_trades', 'total_weight_traded',
        'win_rate', 'average_return'
    ]

    # reindex both reorders the columns and adds any missing ones filled with NaN.
    df_new = pd.DataFrame(records).reindex(columns=column_order)

    try:
        # Make sure the target directory exists; otherwise the write fails and
        # the results of the whole run would be lost.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # A pre-existing but empty file still needs a header row.
        write_header = (not os.path.exists(filepath)) or os.path.getsize(filepath) == 0
        df_new.to_csv(filepath, mode='a', header=write_header, index=False)
        logging.info(f"Appended {len(records)} records to {filepath}")
    except Exception as e:
        logging.error(f"Error writing to CSV file {filepath}: {e}", exc_info=True)


# === Main Execution Block ===
if __name__ == "__main__":
    # End-to-end driver: load the adjusted-close price table once, pair each
    # selection parquet file with its parameter JSON (matched on the date
    # extracted from the filename), run the backtests for every pair, and
    # append the collected performance records to the results CSV.

    # --- Setup ---
    log_filepath = setup_logging() # Initialize logging
    run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    all_performance_records = [] # Records collected across all file pairs

    # --- Load Price Data (loaded once, shared by every backtest) ---
    # !!! IMPORTANT: You need to load your adjusted close price data here !!!
    # The index must be Date/Timestamp and the columns must be Tickers.
    adj_close_path = 'df_adj_close.parquet' # <<< CHANGE THIS
    try:
        if not os.path.exists(adj_close_path):
            raise FileNotFoundError(f"Price data file not found: {adj_close_path}")
        df_adj_close = pd.read_parquet(adj_close_path)

        # Basic validation: datetime index, sorted ascending
        if not isinstance(df_adj_close.index, pd.DatetimeIndex):
            df_adj_close.index = pd.to_datetime(df_adj_close.index)
        if not df_adj_close.index.is_monotonic_increasing:
            df_adj_close = df_adj_close.sort_index()
        logging.info(f"Successfully loaded and prepared price data from {adj_close_path}")
        logging.info(f"Price data shape: {df_adj_close.shape}, Date range: {df_adj_close.index.min()} to {df_adj_close.index.max()}")
    except FileNotFoundError as e:
        logging.critical(f"CRITICAL ERROR: {e}. Cannot proceed without price data.")
        # Raise SystemExit instead of calling exit(): exit() is the interactive
        # helper injected by `site` (and overridden by IPython), so it is not a
        # reliable way to abort. A non-zero status also marks the run as failed.
        raise SystemExit(1) from e
    except Exception as e:
        logging.critical(f"CRITICAL ERROR loading price data: {e}", exc_info=True)
        raise SystemExit(1) from e


    # --- Find Selection/Parameter File Pairs (Example - adapt to your structure) ---
    data_dir = 'output/selection_results/'
    try:
        all_files = os.listdir(data_dir)
        selection_files = sorted([f for f in all_files if f.startswith('2025') and f.endswith('.parquet')])
        param_files = sorted([f for f in all_files if f.startswith('2025') and f.endswith('.json')])

        # Basic matching logic: a data file is paired with the param file
        # whose filename yields the same extracted date.
        file_pairs = []
        param_map = {utils.extract_date_from_string(pf): pf for pf in param_files} # Map date to param file

        for sf in selection_files:
            date_str = utils.extract_date_from_string(sf)
            if date_str and date_str in param_map:
                file_pairs.append((sf, param_map[date_str]))
            else:
                logging.warning(f"Could not find matching param file for data file: {sf} (extracted date: {date_str})")

    except FileNotFoundError:
        logging.critical(f"Data directory not found: {data_dir}. Cannot find files.")
        file_pairs = [] # Ensure it's empty
    except Exception as e:
        logging.critical(f"Error listing files in {data_dir}: {e}", exc_info=True)
        file_pairs = []


    logging.info(f"\n--- Found {len(file_pairs)} Paired Data and Parameter Files ---")

    # --- Iterate Through File Pairs ---
    for data_file, param_file in file_pairs:
        logging.info(f"\nProcessing Pair: Data='{data_file}', Params='{param_file}'")
        date_str = utils.extract_date_from_string(data_file)
        if not date_str:
            logging.error(f"Skipping pair due to invalid date extraction from {data_file}")
            continue

        logging.info(f"Extracted date: {date_str}")

        try:
            # 1. Read parameters
            param_path = os.path.join(data_dir, param_file)
            with open(param_path, 'r', encoding='utf-8') as f:
                params = json.load(f)
            logging.info(f"Parameters loaded from {param_file}:")
            logging.info(pprint.pformat(params))
            # Note: 'n_select_actual' (written by the selection step, not a
            # user input) is not added to PARAMS_TO_TRACK here; it is read
            # from `params` when the result records are built in step 5.

            # 2. Read Selection DataFrame
            selection_path = os.path.join(data_dir, data_file)
            selection_df = pd.read_parquet(selection_path)
            logging.debug(f'Loaded selection_df from {data_file}. Shape: {selection_df.shape}') # DEBUG level

            # 3. Extract Backtest Setups
            backtest_setups = extract_backtest_setups(
                dataframe=selection_df,
                weight_column_names=['Weight_EW', 'Weight_IV', 'Weight_SW'], # Schemes to test
                date_str=date_str,
            )
            # Setups are verbose, so they are only logged at DEBUG level
            logging.debug("Extracted Backtest Setups:\n" + pprint.pformat(backtest_setups, width=120))

            if not backtest_setups or not backtest_setups.get(date_str):
                logging.warning(f"No valid backtest setups extracted for {date_str}. Skipping backtest run.")
                continue

            # 4. Run Backtests for this date/parameter set
            logging.info(f"Running backtests for date: {date_str}")
            # Pass the globally loaded df_adj_close
            backtest_results_summary = process_all_backtests(backtest_setups, df_adj_close)

            # Full result summary is only logged at DEBUG level
            logging.debug(
                "\n--- Backtest Results Summary (for current date) ---\n"
                + pprint.pformat(backtest_results_summary, width=120)
            )

            # 5. Extract parameters and results for storage
            run_records = extract_params_and_results(
                params=params, # Pass the parameters loaded for this file pair
                backtest_results_summary=backtest_results_summary,
                run_timestamp=run_timestamp,
                log_filepath=log_filepath
            )
            all_performance_records.extend(run_records) # Collect records from all runs

            logging.info(f"Finished processing for {date_str}.")

        except FileNotFoundError as e:
            logging.error(f"Error finding file for pair ({data_file}, {param_file}): {e}")
        except Exception as e:
            # One bad pair must not abort the remaining pairs
            logging.error(f"Unhandled error processing pair ({data_file}, {param_file}): {e}", exc_info=True)

        logging.info(f"{'=' * 40}\n") # Separator

    # --- 6. Save all collected results to CSV ---
    logging.info(f"\n--- Attempting to Save {len(all_performance_records)} Performance Records ---")
    append_results_to_csv(all_performance_records, RESULTS_CSV_PATH)

    logging.info("=== Script Execution Finished ===")

In [None]:
# Flush and close every handler registered with the logging module so that
# buffered log output is written out before the session ends.
logging.shutdown()
print("Logging system shut down.")