In [None]:
import sys
from pathlib import Path

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# # --- Execute the processor ---
# import utils
# from config import date_str, DOWNLOAD_DIR, DEST_DIR

date_str = '2025-04-01'
print(f'\nSTOCK SELECTION DATE: {date_str}')



In [None]:
import pandas as pd
# Pandas display configuration for this notebook, applied in one pass.
# Alternatives tried earlier and left disabled: max_rows=10, width=120,
# max_colwidth=None.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,              # show every column
    'display.width': 1000,                    # wide enough to avoid wrapping
    'display.max_rows': 200,                  # cap on rows rendered per frame
    'display.float_format': '{:.4f}'.format,  # four decimals for floats
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [None]:
# Load the clustering outputs for the selection date from ../data.
# Forward slashes work on both Windows and POSIX, and avoid the invalid
# "\d" / "\{" escape sequences that backslashes produce inside a non-raw f-string.
zscore_df = pd.read_parquet(f'../data/{date_str}_zscore_df.parquet', engine='pyarrow')
cluster_stats_df = pd.read_parquet(f'../data/{date_str}_cluster_stats_df.parquet', engine='pyarrow')
detailed_clusters_df = pd.read_parquet(f'../data/{date_str}_detailed_clusters_df.parquet', engine='pyarrow')
# df_finviz_merged = pd.read_parquet(f'../data/{date_str}_df_finviz_merged.parquet', engine='pyarrow')

In [None]:
# Inspect the per-stock cluster table loaded above (rendered via the notebook's rich display).
detailed_clusters_df

In [None]:
# df_finviz_merged = pd.read_parquet(f'../data/{date_str}_df_finviz_merged.parquet', engine='pyarrow')
# Load the cleaned OHLCV price history. Forward slashes keep the path portable
# (the backslash form only resolves on Windows).
# NOTE(review): the file date (2025-04-17) is hard-coded and differs from date_str — confirm this is intentional.
df_OHLCV = pd.read_parquet('../data/2025-04-17_df_OHLCV_clean.parquet', engine='pyarrow')
df_OHLCV

In [None]:
import pandas as pd # Assuming pandas is used

def select_stocks_from_clusters(cluster_stats_df, detailed_clusters_df,
                                select_top_n_clusters=3, max_selection_per_cluster=5,
                                min_cluster_size=5, penalty_IntraCluster_Corr=0.3,
                                date_str=date_str,
                                min_raw_score=None,
                                min_risk_adj_score=None):
    """
    Pipeline to select stocks from better performing clusters, with optional score thresholds.

    Steps:
      1. Keep clusters with 'Size' >= min_cluster_size and rank them by a composite
         score: (1 - penalty) * Avg_Raw_Score + penalty * (1 - Avg_IntraCluster_Corr).
      2. From each of the top clusters, drop stocks below the optional score
         thresholds and keep the best `max_selection_per_cluster` by Risk_Adj_Score.
      3. Weight the selected stocks proportionally to Risk_Adj_Score (equal weights
         if the scores sum to zero) and build a per-cluster summary.

    Parameters:
    - cluster_stats_df: DataFrame with cluster statistics. Columns read here:
                        'Cluster_ID', 'Size', 'Avg_Raw_Score', 'Avg_IntraCluster_Corr'
                        (plus 'Avg_Volatility', 'Avg_Risk_Adj_Score' if present).
    - detailed_clusters_df: DataFrame with detailed cluster information including
                            'Ticker', 'Cluster_ID', 'Raw_Score', 'Risk_Adj_Score', etc.
    - select_top_n_clusters: int, Number of top clusters to select (default=3).
    - max_selection_per_cluster: int, Max number of stocks to select from each cluster (default=5).
    - min_cluster_size: int, Minimum size for a cluster to be considered (default=5).
    - penalty_IntraCluster_Corr: float, Penalty weight for intra-cluster correlation in
                                     composite score (default=0.3).
    - date_str: str, Date string for tracking/parameter storage. The default is the
                value the module-level `date_str` had when this function was defined.
    - min_raw_score: float, optional (default=None)
        Minimum Raw_Score required for a stock to be considered for selection.
        If None, no threshold is applied based on Raw_Score.
    - min_risk_adj_score: float, optional (default=None)
        Minimum Risk_Adj_Score required for a stock to be considered for selection.
        If None, no threshold is applied based on Risk_Adj_Score.

    Returns:
    - dict: A dictionary that always contains the same four keys:
        - 'selected_top_n_cluster_ids': List of top selected cluster IDs (empty if none).
        - 'selected_stocks': DataFrame of selected stocks (empty if none).
        - 'cluster_performance': DataFrame of selected cluster metrics.
        - 'parameters': Dictionary of the input parameters used.
    """

    # Store input parameters
    parameters = {
        'date_str': date_str,
        'select_top_n_clusters': select_top_n_clusters,
        'max_selection_per_cluster': max_selection_per_cluster,
        'min_cluster_size': min_cluster_size,
        'min_raw_score': min_raw_score,
        'min_risk_adj_score': min_risk_adj_score,
        'penalty_IntraCluster_Corr': penalty_IntraCluster_Corr,
    }

    def _empty_result(cluster_performance):
        # Early exits return the same keys as the normal path so callers can
        # index the result without checking which branch produced it.
        return {
            'selected_top_n_cluster_ids': [],
            'selected_stocks': pd.DataFrame(),
            'cluster_performance': cluster_performance,
            'parameters': parameters
        }

    # ===== 1. Filter and Rank Clusters =====
    qualified_clusters = cluster_stats_df[cluster_stats_df['Size'] >= min_cluster_size].copy()
    if qualified_clusters.empty:
        print(f"Warning: No clusters met the minimum size criteria ({min_cluster_size}).")
        return _empty_result(pd.DataFrame())

    # Higher average raw score and lower intra-cluster correlation both raise the score.
    qualified_clusters['Composite_Cluster_Score'] = (
        (1 - penalty_IntraCluster_Corr) * qualified_clusters['Avg_Raw_Score'] +
        penalty_IntraCluster_Corr * (1 - qualified_clusters['Avg_IntraCluster_Corr'])
    )
    ranked_clusters = qualified_clusters.sort_values('Composite_Cluster_Score', ascending=False)
    selected_clusters = ranked_clusters.head(select_top_n_clusters)
    cluster_ids = selected_clusters['Cluster_ID'].tolist()

    if not cluster_ids:
        print("Warning: No clusters were selected based on ranking.")
        return _empty_result(selected_clusters)  # empty selected-clusters frame

    # ===== 2. Select Stocks from Each Cluster =====
    cluster_metric_cols = ['Composite_Cluster_Score', 'Avg_IntraCluster_Corr', 'Avg_Volatility',
                           'Avg_Raw_Score', 'Avg_Risk_Adj_Score', 'Size']
    selected_stocks_list = []
    for cluster_id in cluster_ids:
        # Get all stocks for the current cluster
        cluster_stocks = detailed_clusters_df[detailed_clusters_df['Cluster_ID'] == cluster_id].copy()

        # Optional per-stock score thresholds
        if min_raw_score is not None:
            cluster_stocks = cluster_stocks[cluster_stocks['Raw_Score'] >= min_raw_score]
        if min_risk_adj_score is not None:
            cluster_stocks = cluster_stocks[cluster_stocks['Risk_Adj_Score'] >= min_risk_adj_score]

        # Nothing survived the thresholds for this cluster
        if cluster_stocks.empty:
            continue

        # Sort remaining stocks by Risk_Adj_Score and select top N
        top_stocks = cluster_stocks.sort_values('Risk_Adj_Score', ascending=False).head(max_selection_per_cluster)

        # Add cluster-level metrics to the selected stock rows
        cluster_metrics = selected_clusters[selected_clusters['Cluster_ID'] == cluster_id].iloc[0]
        for col in cluster_metric_cols:
            # .get() tolerates a metric column missing from cluster_stats_df
            top_stocks[f'Cluster_{col}'] = cluster_metrics.get(col, None)
        selected_stocks_list.append(top_stocks)

    # Consolidate selected stocks
    if selected_stocks_list:
        selected_stocks = pd.concat(selected_stocks_list)
        # Weights are relative to the final selection across all clusters
        total_risk_adj_score = selected_stocks['Risk_Adj_Score'].sum()
        if total_risk_adj_score != 0:
            selected_stocks['Weight'] = selected_stocks['Risk_Adj_Score'] / total_risk_adj_score
        else:
            # All selected scores sum to zero: fall back to equal weights
            selected_stocks['Weight'] = 1 / len(selected_stocks)

        selected_stocks = selected_stocks.sort_values(['Cluster_ID', 'Risk_Adj_Score'],
                                                      ascending=[True, False])
    else:
        selected_stocks = pd.DataFrame()
        print("Warning: No stocks met selection criteria (including score thresholds if applied).")

    # ===== 3. Prepare Enhanced Output Reports =====
    cluster_performance = selected_clusters.copy()
    # How many stocks were actually selected per cluster after filtering.
    # One value_counts pass replaces re-filtering selected_stocks for every cluster.
    if selected_stocks.empty:
        cluster_performance['Stocks_Selected'] = 0
    else:
        stocks_per_cluster = selected_stocks['Cluster_ID'].value_counts()
        cluster_performance['Stocks_Selected'] = (
            cluster_performance['Cluster_ID'].map(stocks_per_cluster).fillna(0).astype(int)
        )

    # Diversification is only reported when stocks were selected and the
    # correlation column is available; otherwise it is left missing.
    if not selected_stocks.empty and 'Avg_IntraCluster_Corr' in cluster_performance.columns:
        cluster_performance['Intra_Cluster_Diversification'] = 1 - cluster_performance['Avg_IntraCluster_Corr']
    else:
        cluster_performance['Intra_Cluster_Diversification'] = pd.NA

    # Package results and parameters
    results_bundle = {
        'selected_top_n_cluster_ids': cluster_ids,
        'selected_stocks': selected_stocks,
        'cluster_performance': cluster_performance,
        'parameters': parameters
    }

    return results_bundle


In [None]:
import pandas as pd
from typing import Dict, Any

def print_stock_selection_report(output: Dict[str, Any]) -> None:
    """
    Prints a detailed report summarizing the results of the stock selection process,
    extracting all necessary information from the output dictionary.

    Args:
        output (Dict[str, Any]): The dictionary returned by the
                                 select_stocks_from_clusters function. Keys read:
                                 - 'selected_stocks': DataFrame of selected stocks.
                                 - 'cluster_performance': DataFrame of selected cluster metrics.
                                 - 'parameters': Dictionary of the input parameters used.
                                 - 'cluster_stats_df' (optional): Original cluster stats
                                   DataFrame; when absent the total cluster count prints as 'N/A'.
                                 - 'detailed_clusters_df' (optional): Original detailed
                                   clusters DataFrame; when absent the per-cluster
                                   top-stocks section is skipped.
    Returns:
        None: This function prints output to the console.
    """
    # Extract data from the output dictionary using .get() for safety
    selected_stocks = output.get('selected_stocks', pd.DataFrame())
    cluster_performance = output.get('cluster_performance', pd.DataFrame())
    used_params = output.get('parameters', {})
    # Optional input frames; the caller has to add them to the dictionary explicitly
    cluster_stats_df = output.get('cluster_stats_df')  # Might be None
    detailed_clusters_df = output.get('detailed_clusters_df')  # Might be None

    print("\n=== CLUSTER SELECTION CRITERIA ===")
    print("* Using Composite_Cluster_Score (balancing Raw Score and diversification) for cluster ranking.")
    print("* Using Risk_Adj_Score for stock selection within clusters.")

    num_selected_clusters = len(cluster_performance) if not cluster_performance.empty else 0
    total_clusters = len(cluster_stats_df) if cluster_stats_df is not None and not cluster_stats_df.empty else 'N/A'

    print(f"* Selected top {num_selected_clusters} clusters from {total_clusters} total initial clusters.")
    print("* Selection Criteria:")
    if used_params:
        for key, value in used_params.items():
            # Avoid printing large DataFrames if any were stored among the parameters
            if not isinstance(value, pd.DataFrame):
                print(f"    {key}: {value}")
    else:
        print("    Parameters not available.")

    if not cluster_performance.empty:
        print("\n=== SELECTED CLUSTERS (RANKED BY COMPOSITE SCORE) ===")
        display_cols_exist = [col for col in [
                                'Cluster_ID', 'Size', 'Avg_Raw_Score', 'Avg_Risk_Adj_Score',
                                'Avg_IntraCluster_Corr', 'Avg_Volatility', 'Composite_Cluster_Score',
                                'Stocks_Selected', 'Intra_Cluster_Diversification']
                              if col in cluster_performance.columns]
        cluster_table = cluster_performance[display_cols_exist]
        # The column list above tolerates missing columns, so the sort must too.
        if 'Composite_Cluster_Score' in cluster_table.columns:
            cluster_table = cluster_table.sort_values('Composite_Cluster_Score', ascending=False)
        print(cluster_table.to_string(index=False))

        # Print top 8 stocks by Raw_Score for each selected cluster,
        # provided the detailed cluster frame was supplied.
        if detailed_clusters_df is not None and not detailed_clusters_df.empty:
            print("\n=== TOP STOCKS BY RAW SCORE PER SELECTED CLUSTER ===")
            print("""* Volatility is the standard deviation of daily returns over the past 250 trading days (example context).
* Note: The stocks below are shown ranked by Raw_Score for analysis,
*       but actual selection within the cluster was based on Risk_Adj_Score.""")

            for cluster_id in cluster_performance['Cluster_ID']:
                cluster_stocks = detailed_clusters_df[detailed_clusters_df['Cluster_ID'] == cluster_id]
                if not cluster_stocks.empty:
                    required_cols = ['Ticker', 'Raw_Score', 'Risk_Adj_Score', 'Volatility']
                    if all(col in cluster_stocks.columns for col in required_cols):
                        top_raw = cluster_stocks.nlargest(8, 'Raw_Score')[required_cols]

                        print(f"\nCluster {cluster_id} - Top 8 by Raw Score:")
                        print(top_raw.to_string(index=False))
                        cluster_avg_raw = cluster_performance.loc[cluster_performance['Cluster_ID'] == cluster_id, 'Avg_Raw_Score'].values
                        cluster_avg_risk = cluster_performance.loc[cluster_performance['Cluster_ID'] == cluster_id, 'Avg_Risk_Adj_Score'].values
                        if len(cluster_avg_raw) > 0:
                            print(f"Cluster Avg Raw Score: {cluster_avg_raw[0]:.2f}")
                        if len(cluster_avg_risk) > 0:
                            print(f"Cluster Avg Risk Adj Score: {cluster_avg_risk[0]:.2f}")
                    else:
                        print(f"\nCluster {cluster_id} - Missing required columns in detailed_clusters_df to show top stocks.")
                else:
                    print(f"\nCluster {cluster_id} - No stocks found in detailed_clusters_df for this cluster.")
        else:
            print("\n=== TOP STOCKS BY RAW SCORE PER SELECTED CLUSTER ===")
            # Name the key that is actually looked up, so the reader knows what to supply.
            print("Skipping - Detailed cluster information ('detailed_clusters_df') not found in the output dictionary.")

    else:
        print("\n=== SELECTED CLUSTERS ===")
        print("No clusters were selected based on the criteria.")

    print("\n=== FINAL SELECTED STOCKS (FILTERED & WEIGHTED) ===")
    if not selected_stocks.empty:
        print("* Stocks actually selected based on Risk_Adj_Score (and optional thresholds) within each cluster.")
        print("* Position weights assigned based on Risk_Adj_Score within the final selected portfolio.")

        desired_cols = ['Cluster_ID', 'Ticker', 'Raw_Score', 'Risk_Adj_Score',
                        'Volatility', 'Weight',
                        'Cluster_Avg_Raw_Score', 'Cluster_Avg_Risk_Adj_Score']
        available_cols = [col for col in desired_cols if col in selected_stocks.columns]
        print(selected_stocks[available_cols].sort_values(['Cluster_ID', 'Risk_Adj_Score'],
                                                        ascending=[True, False]).to_string(index=False))

        print("\n=== PORTFOLIO SUMMARY ===")
        print(f"Total Stocks Selected: {len(selected_stocks)}")
        print(f"Average Raw Score: {selected_stocks.get('Raw_Score', pd.Series(dtype=float)).mean():.2f}")
        print(f"Average Risk-Adjusted Score: {selected_stocks.get('Risk_Adj_Score', pd.Series(dtype=float)).mean():.2f}")
        print(f"Average Volatility: {selected_stocks.get('Volatility', pd.Series(dtype=float)).mean():.2f}")
        print(f"Total Weight (should be close to 1.0): {selected_stocks.get('Weight', pd.Series(dtype=float)).sum():.4f}")
        print("\nCluster Distribution:")
        print(selected_stocks['Cluster_ID'].value_counts().to_string())
    else:
        print("No stocks were selected after applying all filters and criteria.")


In [None]:
import pandas as pd
import numpy as np
import itertools # Import the itertools module

# --- Define Factor Ranges ---
# Generate the factors using numpy.arange for float steps
# Add a small epsilon to the end value to ensure the endpoint is included due to float precision
# (np.arange excludes the stop value, so 1.2 + 0.01 keeps 1.2 in the grid).
raw_score_factors = np.arange(0.5, 1.2 + 0.01, 0.1)
risk_adj_score_factors = np.arange(0.5, 1.2 + 0.01, 0.1)
penalty_factors = np.arange(0, 0.4 + 0.01, 0.1) # Intra-cluster correlation penalty weights to sweep

# Values are rounded for display only; the arrays themselves are left unrounded here.
print("--- Parameter Ranges ---")
print(f"Raw Score Factors: {np.round(raw_score_factors,1)}")
print(f"Risk Adj Score Factors: {np.round(risk_adj_score_factors,1)}")
print(f"Penalty Factors: {np.round(penalty_factors,1)}")


# --- Generate All Combinations ---
# Use itertools.product to create the full Cartesian grid of
# (raw factor, risk-adjusted factor, penalty) tuples, materialised as a list
# so its length can be reported and it can be iterated below.
parameter_combinations = list(itertools.product(raw_score_factors, risk_adj_score_factors, penalty_factors))
total_combinations = len(parameter_combinations)
print(f"\nTotal parameter combinations to iterate: {total_combinations}")


# --- Store results ---
all_portfolios = {} # Dictionary to store portfolios by name

# --- Fixed Parameters (that don't vary in this loop) ---
select_top_n_clusters = 60
max_selection_per_cluster = 2
min_cluster_size = 3  # prevent extreme high risk adj scores
# Date tag used for portfolio naming; taken from the notebook-level date_str,
# which must already be defined when this cell runs.
portf_date_base = date_str

# --- Iteration Loop (one pass over every parameter combination) ---
print("\nStarting portfolio generation loop...")
for i, (raw_scale, risk_adj_scale, penalty) in enumerate(parameter_combinations):

    # Snap the factors to one decimal so float noise from np.arange does not
    # leak into the thresholds or the portfolio names.
    raw_scale = round(raw_scale, 1)
    risk_adj_scale = round(risk_adj_scale, 1)
    penalty = round(penalty, 1)

    # Thresholds scale with the current factors; the penalty is passed through as-is.
    min_raw_score = 2.0 * raw_scale
    min_risk_adj_score = 100.0 * risk_adj_scale

    # Shared label for every message about this combination
    combo_label = f"raw_F={raw_scale:.1f}, riskAdj_F={risk_adj_scale:.1f}, penalty={penalty:.1f}"

    print(f"\nRunning combination {i+1}/{total_combinations}: " + combo_label)
    print(f"Resulting thresholds: min_raw_score={min_raw_score:.2f}, min_risk_adj_score={min_risk_adj_score:.1f}")

    try:
        # --- Run the selection pipeline ---
        output = select_stocks_from_clusters(
            cluster_stats_df=cluster_stats_df,
            detailed_clusters_df=detailed_clusters_df,
            select_top_n_clusters=select_top_n_clusters,
            max_selection_per_cluster=max_selection_per_cluster,
            min_cluster_size=min_cluster_size,
            penalty_IntraCluster_Corr=penalty,
            min_raw_score=min_raw_score,
            min_risk_adj_score=min_risk_adj_score,
            date_str=portf_date_base
        )

        # --- Read back the parameters the pipeline reports having used ---
        portf_date = output['parameters']['date_str']
        portf_raw_score_val = output['parameters']['min_raw_score']
        portf_risk_adj_score_val = output['parameters']['min_risk_adj_score']
        portf_penalty_val = output['parameters']['penalty_IntraCluster_Corr']
        _selected_stocks = output['selected_stocks']

        # Nothing selected for this combination: report it and move on
        if _selected_stocks is None or _selected_stocks.empty:
            print("No stocks selected for " + combo_label)
            continue

        # Keep only the weights, indexed by ticker
        portf_selected_stocks = _selected_stocks.set_index('Ticker')[['Weight']]

        # Name encodes the date and the three factors of this combination
        portf_name = f'{portf_date}_portf_rawF_{raw_scale:.1f}_riskAdjF_{risk_adj_scale:.1f}_pen_{penalty:.1f}'
        # Alternative using resulting thresholds and penalty:
        # portf_name = f'{portf_date}_portf_raw_{portf_raw_score_val:.2f}_riskadj_{portf_risk_adj_score_val:.1f}_pen_{portf_penalty_val:.1f}'

        print('Generated Portfolio:')
        print(f'  Name: {portf_name}')
        print(f'  Number of stocks: {len(portf_selected_stocks)}')

        # Store the results under the portfolio name
        all_portfolios[portf_name] = {
            'parameters': {
                'raw_score_scale_factor': raw_scale,
                'risk_adj_score_scale_factor': risk_adj_scale,
                'penalty_IntraCluster_Corr': penalty,
                'min_raw_score': portf_raw_score_val,
                'min_risk_adj_score': portf_risk_adj_score_val,
                'select_top_n_clusters': select_top_n_clusters,
                'max_selection_per_cluster': max_selection_per_cluster,
                'min_cluster_size': min_cluster_size,
                'date': portf_date
            },
            'selected_stocks': portf_selected_stocks
        }

    except Exception as e:
        # A failing combination is reported and the sweep carries on with the next one
        print("ERROR processing combination " + combo_label + f": {e}")

print("\n--- Portfolio Generation Complete ---")
print(f"Generated {len(all_portfolios)} portfolios out of {total_combinations} combinations attempted.")



In [None]:
# Inspect the generated portfolios (dict keyed by portfolio name, filled by the sweep loop above).
all_portfolios

### AI Code


In [None]:
import pandas as pd
import numpy as np
import sys # Needed for sys.exit
import traceback # Import traceback for detailed error printing
import os

# --- Parameters ---
# Price column that must be present in df_OHLCV (checked during index preparation below).
ADJ_CLOSE_COL = 'Adj Close'
# Candidate column names for the ticker, tried in this order when df_OHLCV
# does not already carry a MultiIndex.
POTENTIAL_TICKER_COLS = ['Symbol', 'Ticker']
# Column name expected to hold the date when the index has to be built from columns.
POTENTIAL_DATE_COL = 'Date'
# NOTE(review): presumably output file names written later in this cell;
# not referenced in the portion of the script visible here — confirm.
OUTPUT_RETURNS_CSV = 'portfolio_returns.csv'
OUTPUT_SUMMARY_CSV = 'portfolio_factor_performance.csv'

print("--- Portfolio Performance Calculation Script ---")

# =============================================================================
# 1. Data and Index Preparation
# =============================================================================
# Make sure df_OHLCV carries a sorted (ticker, date) MultiIndex with a datetime
# date level, and record which level names/positions identify ticker and date.
print("\nChecking and preparing df_OHLCV index structure...")
ticker_level_name = None
date_level_name = None
try:
    # Validate the input first, so a missing or mistyped df_OHLCV produces a
    # clear message instead of failing on the first attribute access below.
    if 'df_OHLCV' not in locals() or not isinstance(df_OHLCV, pd.DataFrame):
        print("ERROR: df_OHLCV is not defined or not a DataFrame.")
        sys.exit("Exiting due to missing data.")

    # Check if df_OHLCV already has a suitable MultiIndex
    if isinstance(df_OHLCV.index, pd.MultiIndex) and len(df_OHLCV.index.levels) >= 2:
        actual_names = list(df_OHLCV.index.names)
        print(f"Detected MultiIndex with names: {actual_names}")
        # NOTE(review): assumes level 0 is the ticker and level 1 the date — confirm against the data.
        # Unnamed levels are addressed by position instead.
        ticker_level_name = actual_names[0] if actual_names[0] is not None else 0
        date_level_name = actual_names[1] if actual_names[1] is not None else 1
        if isinstance(ticker_level_name, int) or isinstance(date_level_name, int):
            print(f"Warning: Using positional index levels ({ticker_level_name}, {date_level_name}) as names were missing.")
    else:
        print("Index is not a MultiIndex or has too few levels. Checking columns...")
        # First candidate ticker column that exists wins
        ticker_col_to_use = None
        for col in POTENTIAL_TICKER_COLS:
            if col in df_OHLCV.columns:
                ticker_col_to_use = col
                break
        date_col_to_use = POTENTIAL_DATE_COL if POTENTIAL_DATE_COL in df_OHLCV.columns else None
        if ticker_col_to_use and date_col_to_use:
            required_cols = [ticker_col_to_use, date_col_to_use]
            print(f"Attempting to set index using columns: {required_cols}...")
            df_OHLCV = df_OHLCV.set_index(required_cols)
            ticker_level_name = required_cols[0]
            date_level_name = required_cols[1]
            print("MultiIndex set successfully.")
        else:
            print(f"ERROR: Cannot set index. Required columns for Ticker ({POTENTIAL_TICKER_COLS}) or Date ({POTENTIAL_DATE_COL}) not found.")
            print("df_OHLCV columns:", df_OHLCV.columns)
            sys.exit("Exiting due to incorrect df_OHLCV structure.")

    print(f"Using '{ticker_level_name}' for ticker level and '{date_level_name}' for date level.")

    if ADJ_CLOSE_COL not in df_OHLCV.columns:
        print(f"ERROR: Required price column '{ADJ_CLOSE_COL}' not found in df_OHLCV columns: {df_OHLCV.columns}")
        sys.exit("Exiting due to missing price column.")

    # Convert date level to datetime if it's not already
    date_level_values = df_OHLCV.index.get_level_values(date_level_name)
    if not pd.api.types.is_datetime64_any_dtype(date_level_values):
        print(f"Converting date level '{date_level_name}' to datetime objects...")
        # Resolve the level position; date_level_name is a position already
        # when the level had no name.
        index_names = list(df_OHLCV.index.names)
        date_level_idx = index_names.index(date_level_name) if date_level_name in index_names else date_level_name
        # Convert the level array itself and keep the existing codes: each row
        # keeps pointing at the converted form of its own label. Rebuilding the
        # level from the unique values in order of appearance would not line up
        # with the codes, which index the stored level array.
        converted_dates = pd.to_datetime(df_OHLCV.index.levels[date_level_idx])
        df_OHLCV.index = df_OHLCV.index.set_levels(converted_dates, level=date_level_idx)
        print("Date level converted.")

    if not df_OHLCV.index.is_monotonic_increasing:
        print("Sorting df_OHLCV index...")
        df_OHLCV = df_OHLCV.sort_index()
        print("Index sorted.")

except Exception as e:
    # sys.exit raises SystemExit, which is not an Exception subclass, so the
    # deliberate exits above pass through this handler untouched.
    print(f"ERROR: An unexpected error occurred during index check/setup: {e}")
    traceback.print_exc()
    sys.exit("Exiting due to index setup error.")

# Collect every ticker present in df_OHLCV so ticker existence can be
# checked against this set later on.
all_known_tickers_set = set()
ohlcv_index_ready = (
    'df_OHLCV' in locals()
    and isinstance(df_OHLCV, pd.DataFrame)
    and ticker_level_name is not None
)
if not ohlcv_index_ready:
    print("Warning: df_OHLCV not available or ticker level name not set during initialization. Cannot pre-check ticker existence.")
else:
    try:
        ticker_level_values = df_OHLCV.index.get_level_values(ticker_level_name)
        all_known_tickers = ticker_level_values.unique().tolist()
        all_known_tickers_set = set(all_known_tickers)
        print(f"\nFound {len(all_known_tickers_set):,} unique tickers in df_OHLCV data.")
        if len(all_known_tickers_set) == 0:
            print("Warning: No tickers found in df_OHLCV index. Price fetching will likely fail.")
    except Exception as e:
        print(f"ERROR: Could not extract unique tickers from df_OHLCV index level '{ticker_level_name}': {e}")
        sys.exit("Exiting due to error extracting ticker list.")

# Build the sorted calendar of trading dates from the date level of df_OHLCV.
try:
    all_dates_in_data = df_OHLCV.index.get_level_values(date_level_name).unique()
    trading_dates = pd.DatetimeIndex(pd.to_datetime(all_dates_in_data)).sort_values()
    # Without a single date there is nothing to compute returns over
    if trading_dates.empty:
        print(f"ERROR: No dates found in the index level '{date_level_name}'. Cannot proceed.")
        sys.exit("Exiting - Cannot determine trading dates.")
    print(f"Trading dates extracted using level '{date_level_name}'. Found {len(trading_dates)} unique dates (e.g., {trading_dates[0].date()} to {trading_dates[-1].date()}).")
except (KeyError, IndexError) as e:
    # The date level could not be located in the index
    print(f"ERROR: Could not find or access level '{date_level_name}' to extract trading dates: {e}")
    sys.exit("Exiting - Cannot determine trading dates.")
except Exception as e:
    print(f"ERROR: An unexpected error occurred extracting trading dates: {e}")
    traceback.print_exc()
    sys.exit("Exiting - Cannot determine trading dates.")

# =============================================================================
# 2. Helper Function for Trading Dates
# =============================================================================
# get_next_trading_date: look up the first trading date strictly after a given date.
def get_next_trading_date(current_date, sorted_trading_dates):
    """Return the first trading date strictly after ``current_date``.

    Args:
        current_date: Anything ``pd.Timestamp`` accepts (str, datetime, Timestamp).
        sorted_trading_dates: Available trading dates. If this is not already a
            monotonically increasing ``pd.DatetimeIndex`` it is converted and
            sorted first.

    Returns:
        The next trading date as a ``pd.Timestamp``, or ``None`` when the dates
        cannot be converted, the calendar is empty, the lookup fails, or no
        date lies after ``current_date``.
    """
    already_usable = (
        isinstance(sorted_trading_dates, pd.DatetimeIndex)
        and sorted_trading_dates.is_monotonic_increasing
    )
    if not already_usable:
        try:
            sorted_trading_dates = pd.DatetimeIndex(sorted_trading_dates).sort_values()
        except Exception:
            return None

    if sorted_trading_dates.empty:
        return None

    current_date = pd.Timestamp(current_date)
    try:
        # side='right' lands just past any entry equal to current_date,
        # which is what makes the result strictly later.
        position = sorted_trading_dates.searchsorted(current_date, side='right')
        if position >= len(sorted_trading_dates):
            return None
        return sorted_trading_dates[position]
    except Exception:
        return None

# =============================================================================
# 3. Portfolio Return Calculation Loop
# =============================================================================
# For every portfolio in `all_portfolios`: buy at the first trading date after the
# portfolio's generation date, sell at the following trading date, and record the
# weight-normalized return of the holdings that have prices on both dates.
portfolio_returns_data = []  # One result record per successfully processed portfolio (this run only)
print("\nCalculating portfolio returns for the current run...")
index_mismatch_warning_shown = False  # The index-name mismatch warning is printed only once

if 'all_portfolios' not in locals() or not isinstance(all_portfolios, dict):
    print("ERROR: 'all_portfolios' dictionary is not defined.")
    all_portfolios = {}

# The ticker pre-check is optional: `all_known_tickers_set` only exists when the
# setup step could read the ticker level of df_OHLCV. Fall back to None (check
# disabled) instead of letting every portfolio fail on a NameError.
known_tickers_for_check = all_known_tickers_set if 'all_known_tickers_set' in locals() else None

if not all_portfolios:
    print("Warning: 'all_portfolios' dictionary is empty. No returns to calculate.")
else:
    processed_count = 0
    skipped_count = 0
    error_count = 0
    idx = pd.IndexSlice
    for portf_name, portf_data in list(all_portfolios.items()):
        try:
            # --- 3.1 Extract Parameters and Holdings ---
            params = portf_data.get('parameters')
            selected_stocks_df = portf_data.get('selected_stocks') # DataFrame with Ticker/Symbol as index, 'Weight' column

            # Basic validation
            if not isinstance(params, dict) or not isinstance(selected_stocks_df, pd.DataFrame) or 'Weight' not in selected_stocks_df.columns:
                print(f"Warning: Skipping {portf_name} - Invalid structure or missing 'Weight' column.")
                skipped_count += 1
                continue

            # Give an unnamed index a name before it is compared with the data index.
            # The previous membership test (`name in index.name`) ran while the name
            # was None and raised TypeError for every such portfolio. Work on a copy
            # so the DataFrame stored in all_portfolios is not renamed as a side effect.
            if selected_stocks_df.index.name is None:
                selected_stocks_df = selected_stocks_df.copy()
                selected_stocks_df.index.name = POTENTIAL_TICKER_COLS[0] if POTENTIAL_TICKER_COLS else 'Ticker'

            portf_date_str = params.get('date')
            raw_factor = params.get('raw_score_scale_factor')
            risk_adj_factor = params.get('risk_adj_score_scale_factor')
            penalty_factor = params.get('penalty_IntraCluster_Corr')

            if portf_date_str is None or raw_factor is None or risk_adj_factor is None or penalty_factor is None:
                print(f"Warning: Skipping {portf_name} - Missing essential parameter.")
                skipped_count += 1
                continue

            portf_date = pd.Timestamp(portf_date_str)

            if selected_stocks_df.empty:
                skipped_count += 1
                continue

            # --- 3.2 Ensure Portfolio Index Matches Data Index ---
            if selected_stocks_df.index.name != ticker_level_name:
                if not index_mismatch_warning_shown:
                    print(f"Warning: Portfolio index name ('{selected_stocks_df.index.name}') mismatches data index ('{ticker_level_name}'). Renaming will occur.")
                    index_mismatch_warning_shown = True
                try:
                    selected_stocks_df = selected_stocks_df.copy() # Avoid SettingWithCopyWarning
                    selected_stocks_df.index.name = ticker_level_name
                except Exception as rename_e:
                    print(f"Error: Failed to rename portfolio index for {portf_name}: {rename_e}. Skipping.")
                    skipped_count += 1
                    continue

            original_portfolio_tickers = selected_stocks_df.index.tolist()
            if not original_portfolio_tickers: # Double check after potential renaming/copying
                print(f"Info: Skipping {portf_name} - Portfolio has no tickers after index handling.")
                skipped_count += 1
                continue

            # --- Drop tickers that never appear in df_OHLCV (when the universe is known) ---
            tickers_in_portfolio_set = set(original_portfolio_tickers)
            if known_tickers_for_check is not None:
                missing_from_ohlcv = tickers_in_portfolio_set - known_tickers_for_check
            else:
                missing_from_ohlcv = set()  # Pre-check unavailable; rely on the price lookup below

            if missing_from_ohlcv:
                print(f"Warning for {portf_name}: The following tickers selected for the portfolio are completely missing from the historical price data (df_OHLCV) and will be dropped: {sorted(missing_from_ohlcv)}")
                tickers_to_process = [t for t in original_portfolio_tickers if t in known_tickers_for_check]

                if not tickers_to_process:
                    print(f"Info: Skipping {portf_name} - No tickers remaining after removing those completely missing from price data.")
                    skipped_count += 1
                    continue

                # Filter the holdings as well so weights stay consistent with the ticker list
                selected_stocks_df = selected_stocks_df.loc[tickers_to_process]
                original_portfolio_tickers = tickers_to_process

            # --- 3.3 Determine Buy and Sell Dates ---
            buy_date = get_next_trading_date(portf_date, trading_dates)
            if buy_date is None:
                skipped_count += 1
                continue
            sell_date = get_next_trading_date(buy_date, trading_dates)
            if sell_date is None:
                skipped_count += 1
                continue
            required_dates = [buy_date, sell_date]

            # --- 3.4 Fetch Price Data & Filter Available Tickers ---
            try:
                potential_data = df_OHLCV.loc[idx[original_portfolio_tickers, required_dates], :]
            except KeyError:
                # Some ticker/date combinations are absent even though the ticker exists
                # overall; reindex yields NaN rows for them instead of raising.
                multi_idx = pd.MultiIndex.from_product([original_portfolio_tickers, required_dates], names=[ticker_level_name, date_level_name])
                potential_data = df_OHLCV.reindex(multi_idx)

            # Keep only tickers with a non-NA price on BOTH required dates
            ticker_date_counts = potential_data.dropna(subset=[ADJ_CLOSE_COL])\
                                               .index.get_level_values(ticker_level_name)\
                                               .value_counts()

            available_tickers = ticker_date_counts[ticker_date_counts == len(required_dates)].index.tolist()

            if not available_tickers:
                skipped_count += 1
                continue

            # --- 3.5 Process Prices for Available Tickers ---
            try:
                idx_slice = pd.IndexSlice[available_tickers, required_dates]
                prices = potential_data.loc[idx_slice, ADJ_CLOSE_COL]

                # One row per ticker, one column per date
                prices_unstacked = prices.unstack(level=date_level_name)
                prices_unstacked = prices_unstacked.rename(columns={buy_date: 'Buy Price', sell_date: 'Sell Price'})
                prices_unstacked = prices_unstacked.dropna() # Drop rows with NaN in Buy or Sell Price

                if prices_unstacked.empty:
                    skipped_count += 1
                    continue

                final_available_tickers = prices_unstacked.index.tolist()

            except KeyError as e:
                print(f"Error: KeyError during price processing for {portf_name}: {e}. Check index slicing or column '{ADJ_CLOSE_COL}'. Skipping.")
                skipped_count += 1
                continue
            except Exception as e:
                print(f"Error: Unexpected error during price processing for {portf_name}: {e}. Skipping.")
                traceback.print_exc()
                skipped_count += 1
                continue

            # --- 3.6 Calculate Portfolio Return ---
            # Weights are restricted to the tickers that actually have prices and then
            # re-normalized so they sum to 1 over the holdings used in the calculation.
            aligned_weights = selected_stocks_df.loc[final_available_tickers, 'Weight']

            weight_sum = aligned_weights.sum()
            if weight_sum <= 1e-9:
                skipped_count += 1
                continue
            normalized_weights = aligned_weights / weight_sum

            buy_prices = prices_unstacked.loc[final_available_tickers, 'Buy Price']
            sell_prices = prices_unstacked.loc[final_available_tickers, 'Sell Price']

            # A (near-)zero buy price would blow up the return; drop those holdings
            # and re-normalize the remaining weights.
            zero_price_mask = (buy_prices.abs() < 1e-9)
            if zero_price_mask.any():
                valid_price_mask = ~zero_price_mask
                buy_prices = buy_prices[valid_price_mask]
                sell_prices = sell_prices[valid_price_mask]
                normalized_weights = normalized_weights.loc[valid_price_mask]
                if buy_prices.empty:
                    skipped_count += 1
                    continue
                weight_sum = normalized_weights.sum()
                if weight_sum <= 1e-9:
                    skipped_count += 1
                    continue
                normalized_weights = normalized_weights / weight_sum

            if buy_prices.isnull().any() or sell_prices.isnull().any() or normalized_weights.isnull().any() or \
               np.isinf(buy_prices).any() or np.isinf(sell_prices).any() or np.isinf(normalized_weights).any():
                print(f"Warning: Skipping {portf_name} - Found NaN or Inf in final arrays before calculation.")
                skipped_count += 1
                continue

            individual_returns = (sell_prices / (buy_prices + 1e-12)) - 1
            portfolio_return = (individual_returns * normalized_weights).sum()

            # --- 3.7 Store Results ---
            portfolio_returns_data.append({
                'portfolio_name': portf_name,
                'generation_date': portf_date,
                'buy_date': buy_date,
                'sell_date': sell_date,
                'raw_factor': raw_factor,
                'risk_adj_factor': risk_adj_factor,
                'penalty_factor': penalty_factor,
                'portfolio_return': portfolio_return,
                'num_stocks_selected': len(tickers_in_portfolio_set), # Distinct tickers in the original selection
                'num_stocks_in_ohlcv': len(original_portfolio_tickers), # After removing tickers missing from OHLCV
                'num_stocks_calc': len(final_available_tickers) # Holdings with prices on both dates
            })
            processed_count += 1

        except Exception as e:
            # Report and move on to the next portfolio; one bad portfolio must not abort the run.
            print(f"FATAL Error processing portfolio {portf_name}: {e}")
            traceback.print_exc()
            error_count += 1

    print("\nCurrent run portfolio return calculation finished.")
    print(f"Successfully processed: {processed_count}")
    print(f"Skipped (missing data/criteria/tickers): {skipped_count}")
    print(f"Errors during processing: {error_count}")

# =============================================================================
# 4. Load Historical Data, Combine, and Analyze
# =============================================================================
print("\n--- Loading Historical Data and Performing Combined Analysis ---")

# Start from an empty frame; it is replaced only if a usable history file is found.
all_returns_df = pd.DataFrame()

if not os.path.exists(OUTPUT_RETURNS_CSV):
    print(f"Historical data file {OUTPUT_RETURNS_CSV} not found. Starting fresh.")
else:
    try:
        print(f"Loading historical data from {OUTPUT_RETURNS_CSV}...")
        historical_returns_df = pd.read_csv(
            OUTPUT_RETURNS_CSV,
            parse_dates=['generation_date', 'buy_date', 'sell_date'],
        )
        # The history is only trusted when every column the analysis relies on is present.
        expected_cols = ['generation_date', 'buy_date', 'sell_date', 'raw_factor', 'risk_adj_factor', 'penalty_factor', 'portfolio_return']
        absent_cols = [name for name in expected_cols if name not in historical_returns_df.columns]
        if absent_cols:
            print(f"Warning: Historical data file {OUTPUT_RETURNS_CSV} is missing expected columns. Ignoring historical data.")
        else:
            all_returns_df = historical_returns_df
            print(f"Loaded {len(all_returns_df)} historical records.")
    except pd.errors.EmptyDataError:
        print(f"Historical data file {OUTPUT_RETURNS_CSV} is empty. Starting fresh.")
    except Exception as e:
        print(f"Error loading historical data from {OUTPUT_RETURNS_CSV}: {e}")
        print("Warning: Proceeding without historical data.")

# --- Combine with current run's data ---
if not portfolio_returns_data:
    print("No new portfolio returns were calculated in this run.")
else:
    current_returns_df = pd.DataFrame(portfolio_returns_data)
    print(f"Combining {len(current_returns_df)} new records with historical data.")

    # Make sure the date columns of the new records are datetime-typed
    for date_col in ('generation_date', 'buy_date', 'sell_date'):
        if date_col not in current_returns_df.columns:
            continue
        current_returns_df[date_col] = pd.to_datetime(current_returns_df[date_col])

    # Stack the new records under the history (or use them alone when there is none)
    all_returns_df = (
        current_returns_df
        if all_returns_df.empty
        else pd.concat([all_returns_df, current_returns_df], ignore_index=True)
    )

    # Overlapping runs can produce the same date/factor combination twice;
    # keep the most recent occurrence.
    key_cols = ['generation_date', 'buy_date', 'sell_date', 'raw_factor', 'risk_adj_factor', 'penalty_factor']
    initial_len = len(all_returns_df)
    all_returns_df = all_returns_df.drop_duplicates(subset=key_cols, keep='last')
    removed_count = initial_len - len(all_returns_df)
    if removed_count > 0:
        print(f"Removed {removed_count} duplicate records based on key columns.")

# --- Perform analysis ONLY if there's data ---
# Aggregate the combined (historical + current) returns per factor combination.
# `performance_summary` always exists afterwards; it is empty when nothing could
# be analyzed.
if not all_returns_df.empty:
    grouping_factors = ['raw_factor', 'risk_adj_factor', 'penalty_factor']

    # Validate the grouping columns BEFORE anything sorts or groups by them;
    # checking afterwards could never trigger because the sort would already
    # have raised KeyError.
    if not all(f in all_returns_df.columns for f in grouping_factors):
        print(f"Error: One or more grouping factors {grouping_factors} not found in the combined data columns: {all_returns_df.columns}")
        performance_summary = pd.DataFrame() # Create empty df to avoid error later
    else:
        # Ensure correct data types before grouping (especially factors)
        try:
            for numeric_col in grouping_factors + ['portfolio_return']:
                all_returns_df[numeric_col] = pd.to_numeric(all_returns_df[numeric_col])
        except Exception as e:
            print(f"Error converting factor/return columns to numeric: {e}")
            print("Analysis might be affected.")

        # Sort for easier viewing
        all_returns_df = all_returns_df.sort_values(by=['generation_date'] + grouping_factors)

        print("\n--- Combined Portfolio Return Analysis (All Historical Data) ---")
        print(f"Total return instances analyzed: {len(all_returns_df)}")

        print("\n--- Performance Analysis by Factor Combination (Based on All Data) ---")

        # NOTE(review): the 'std' column uses the population std (ddof=0) while
        # 'sharpe' uses pandas' default sample std (ddof=1) - confirm this
        # difference is intended.
        performance_summary = all_returns_df.groupby(grouping_factors)['portfolio_return'].agg(
            mean='mean',
            median='median',
            std=lambda x: x.std(ddof=0) if pd.notna(x).sum() > 1 else np.nan,
            count=lambda x: pd.notna(x).sum(), # Count only valid returns for this combo
            sharpe=lambda x: (x.mean() / (x.std() + 1e-9)) * np.sqrt(252) if pd.notna(x).sum() > 1 and x.std() > 1e-9 else np.nan # Annualized Sharpe (assuming daily returns, zero risk-free)
        ).reset_index() # Reset index to make factors columns again

        # Win rate: share of records with a strictly positive return per combination
        win_rate = all_returns_df[all_returns_df['portfolio_return'] > 0].groupby(grouping_factors).size() / \
                   all_returns_df.groupby(grouping_factors).size()
        performance_summary = pd.merge(performance_summary, win_rate.rename('win_rate').reset_index(), on=grouping_factors, how='left')
        performance_summary['win_rate'] = performance_summary['win_rate'].fillna(0) # Combos with no positive return get 0

        performance_summary = performance_summary.sort_values(by='mean', ascending=False) # Or sort by 'sharpe'
        print("\nPerformance per Factor Combination (sorted by mean return):")
        with pd.option_context('display.float_format', '{:.4f}'.format, 'display.max_rows', 200):
            print(performance_summary)

        # --- Analyze the number of stocks used ---
        # The historical file is not validated for this column, so a history-only
        # run may not have it; skip the table instead of raising KeyError.
        if 'num_stocks_calc' in all_returns_df.columns:
            stock_count_summary = all_returns_df.groupby(grouping_factors)['num_stocks_calc'].agg(['mean', 'min', 'max'])
            print("\nAverage/Min/Max Stocks Used in Calculation per Factor Combination (All Data):")
            with pd.option_context('display.float_format', '{:.2f}'.format):
                print(stock_count_summary.sort_values(by='mean', ascending=False))
        else:
            print("\nColumn 'num_stocks_calc' not found in the combined data. Skipping stock count summary.")

else:
    print("\nNo portfolio returns available (neither historical nor current) to analyze.")
    performance_summary = pd.DataFrame() # Ensure it exists as empty if no data

# =============================================================================
# 5. Save Results (Append Returns, Overwrite Summary)
# =============================================================================
# --- Save the individual returns (APPEND mode) ---
# Persist this run's return records by appending them to OUTPUT_RETURNS_CSV.
# When there is nothing new and no file yet, create a header-only placeholder.
if portfolio_returns_data: # Only append if there's new data from this run
    current_to_save_df = pd.DataFrame(portfolio_returns_data) # Use the list directly from this run
    try:
        # A header is needed when the file is missing OR exists with zero bytes;
        # otherwise the first appended rows would have no header at all.
        write_header = (not os.path.exists(OUTPUT_RETURNS_CSV)) or os.path.getsize(OUTPUT_RETURNS_CSV) == 0
        current_to_save_df.to_csv(OUTPUT_RETURNS_CSV, mode='a', header=write_header, index=False)
        print(f"\nAppended {len(current_to_save_df)} new return records to {OUTPUT_RETURNS_CSV}")
    except Exception as e:
        print(f"\nError appending returns to {OUTPUT_RETURNS_CSV}: {e}")
elif not os.path.exists(OUTPUT_RETURNS_CSV):
    # No new data AND no file yet: write just the header. The column list must
    # match the keys of the records built by the return calculation, because
    # later runs append rows without a header and rely on this one.
    try:
        empty_returns_columns = [
            'portfolio_name', 'generation_date', 'buy_date', 'sell_date',
            'raw_factor', 'risk_adj_factor', 'penalty_factor', 'portfolio_return',
            'num_stocks_selected', 'num_stocks_in_ohlcv', 'num_stocks_calc',
        ]
        pd.DataFrame(columns=empty_returns_columns).to_csv(OUTPUT_RETURNS_CSV, index=False)
        print(f"\nCreated empty returns file with header: {OUTPUT_RETURNS_CSV}")
    except Exception as e:
        print(f"\nError creating empty returns file {OUTPUT_RETURNS_CSV}: {e}")

# --- Save the performance summary (OVERWRITE mode) ---
# This summary is based on ALL data (historical + current), so we overwrite it each time
summary_has_rows = 'performance_summary' in locals() and not performance_summary.empty
if summary_has_rows:
    try:
        # Factors are already regular columns, so the index is not written
        performance_summary.to_csv(OUTPUT_SUMMARY_CSV, index=False)
        print(f"Updated performance summary saved to {OUTPUT_SUMMARY_CSV}")
    except Exception as e:
        print(f"\nError saving performance summary to {OUTPUT_SUMMARY_CSV}: {e}")
else:
    # Nothing to write: distinguish "no data at all" from "data but empty summary"
    if all_returns_df.empty:
        print(f"No data to generate performance summary. {OUTPUT_SUMMARY_CSV} not saved or updated.")
    else:
        print(f"Warning: Performance summary was calculated but is empty. Not saving {OUTPUT_SUMMARY_CSV}.")

print("\n--- Script Finished ---")

In [None]:
# import pandas as pd
# import numpy as np
# import sys # Needed for sys.exit
# import traceback # Import traceback for detailed error printing
# import os # <--- IMPORT OS MODULE

# # --- Parameters ---
# # Column name for adjusted close price in df_OHLCV
# ADJ_CLOSE_COL = 'Adj Close'
# # Potential column names for ticker/symbol if not in index initially
# POTENTIAL_TICKER_COLS = ['Symbol', 'Ticker']
# # Potential column name for date if not in index initially
# POTENTIAL_DATE_COL = 'Date'
# # --- Output Files ---
# OUTPUT_RETURNS_CSV = 'portfolio_returns.csv'
# OUTPUT_SUMMARY_CSV = 'portfolio_factor_performance.csv'


# print("--- Portfolio Performance Calculation Script ---")

# # =============================================================================
# # 1. Data and Index Preparation (NO CHANGES NEEDED HERE)
# # =============================================================================
# print("\nChecking and preparing df_OHLCV index structure...")
# # ... (Keep your existing Section 1 code as is) ...
# ticker_level_name = None
# date_level_name = None
# try:
#     # Check if df_OHLCV already has a suitable MultiIndex
#     if isinstance(df_OHLCV.index, pd.MultiIndex) and len(df_OHLCV.index.levels) >= 2:
#         actual_names = list(df_OHLCV.index.names)
#         print(f"Detected MultiIndex with names: {actual_names}")
#         ticker_level_name = actual_names[0] if actual_names[0] is not None else 0
#         date_level_name = actual_names[1] if actual_names[1] is not None else 1
#         if isinstance(ticker_level_name, int) or isinstance(date_level_name, int):
#              print(f"Warning: Using positional index levels ({ticker_level_name}, {date_level_name}) as names were missing.")
#     else:
#         print("Index is not a MultiIndex or has too few levels. Checking columns...")
#         ticker_col_to_use = None
#         for col in POTENTIAL_TICKER_COLS:
#             if col in df_OHLCV.columns:
#                 ticker_col_to_use = col
#                 break
#         date_col_to_use = POTENTIAL_DATE_COL if POTENTIAL_DATE_COL in df_OHLCV.columns else None
#         if ticker_col_to_use and date_col_to_use:
#             required_cols = [ticker_col_to_use, date_col_to_use]
#             print(f"Attempting to set index using columns: {required_cols}...")
#             # Ensure df_OHLCV exists and is a DataFrame before modification
#             if 'df_OHLCV' in locals() and isinstance(df_OHLCV, pd.DataFrame):
#                  df_OHLCV = df_OHLCV.set_index(required_cols)
#                  ticker_level_name = required_cols[0]
#                  date_level_name = required_cols[1]
#                  print(f"MultiIndex set successfully.")
#             else:
#                  print("ERROR: df_OHLCV not defined or not a DataFrame before setting index.")
#                  sys.exit("Exiting due to data structure issue.")

#         else:
#             missing = []
#             if not ticker_col_to_use: missing.extend(POTENTIAL_TICKER_COLS)
#             if not date_col_to_use: missing.append(POTENTIAL_DATE_COL)
#             print(f"ERROR: Cannot set index. Required columns for Ticker ({POTENTIAL_TICKER_COLS}) or Date ({POTENTIAL_DATE_COL}) not found.")
#             if 'df_OHLCV' in locals() and isinstance(df_OHLCV, pd.DataFrame):
#                  print("df_OHLCV columns:", df_OHLCV.columns)
#             else:
#                  print("df_OHLCV is not defined or not a DataFrame.")
#             sys.exit("Exiting due to incorrect df_OHLCV structure.")

#     print(f"Using '{ticker_level_name}' for ticker level and '{date_level_name}' for date level.")

#     if 'df_OHLCV' not in locals() or not isinstance(df_OHLCV, pd.DataFrame):
#         print("ERROR: df_OHLCV is not defined or not a DataFrame.")
#         sys.exit("Exiting due to missing data.")

#     if ADJ_CLOSE_COL not in df_OHLCV.columns:
#         print(f"ERROR: Required price column '{ADJ_CLOSE_COL}' not found in df_OHLCV columns: {df_OHLCV.columns}")
#         sys.exit("Exiting due to missing price column.")

#     # Convert date level to datetime if it's not already
#     date_level_values = df_OHLCV.index.get_level_values(date_level_name)
#     if not pd.api.types.is_datetime64_any_dtype(date_level_values):
#         print(f"Converting date level '{date_level_name}' to datetime objects...")
#         # Create a new index with the converted dates
#         new_levels = list(df_OHLCV.index.levels)
#         date_level_idx = df_OHLCV.index.names.index(date_level_name) # Find position of date level
#         new_levels[date_level_idx] = pd.to_datetime(date_level_values.unique()) # Convert unique dates
#         # Need to reconstruct the index carefully, preserving levels and codes
#         new_codes = list(df_OHLCV.index.codes)
#         new_index = pd.MultiIndex(levels=new_levels, codes=new_codes, names=df_OHLCV.index.names)
#         df_OHLCV.index = new_index
#         print("Date level converted.")


#     if not df_OHLCV.index.is_monotonic_increasing:
#          print("Sorting df_OHLCV index...")
#          df_OHLCV = df_OHLCV.sort_index()
#          print("Index sorted.")

# except Exception as e:
#     print(f"ERROR: An unexpected error occurred during index check/setup: {e}")
#     traceback.print_exc()
#     sys.exit("Exiting due to index setup error.")

# # --- Extract unique trading dates from the prepared index ---
# try:
#     all_dates_in_data = df_OHLCV.index.get_level_values(date_level_name).unique()
#     # Ensure dates are Timestamps and sorted
#     trading_dates = pd.DatetimeIndex(pd.to_datetime(all_dates_in_data)).sort_values()
#     if not trading_dates.empty:
#          print(f"\nTrading dates extracted using level '{date_level_name}'. Found {len(trading_dates)} unique dates (e.g., {trading_dates[0].date()} to {trading_dates[-1].date()}).")
#     else:
#          print(f"ERROR: No dates found in the index level '{date_level_name}'. Cannot proceed.")
#          sys.exit("Exiting - Cannot determine trading dates.")
# except (KeyError, IndexError) as e:
#     print(f"ERROR: Could not find or access level '{date_level_name}' to extract trading dates: {e}")
#     sys.exit("Exiting - Cannot determine trading dates.")
# except Exception as e:
#     print(f"ERROR: An unexpected error occurred extracting trading dates: {e}")
#     traceback.print_exc()
#     sys.exit("Exiting - Cannot determine trading dates.")


# # =============================================================================
# # 2. Helper Function for Trading Dates (NO CHANGES NEEDED HERE)
# # =============================================================================
# def get_next_trading_date(current_date, sorted_trading_dates):
#     """
#     Finds the first trading date strictly after current_date in a sorted DatetimeIndex.
#     Args:
#         current_date (str, datetime, Timestamp): The date after which to find the next trading date.
#         sorted_trading_dates (pd.DatetimeIndex): A sorted index of available trading dates.
#     Returns:
#         pd.Timestamp or None: The next trading date, or None if no date is found after current_date.
#     """
#     if not isinstance(sorted_trading_dates, pd.DatetimeIndex) or not sorted_trading_dates.is_monotonic_increasing:
#         print("Warning: sorted_trading_dates is not a sorted DatetimeIndex in get_next_trading_date.")
#         # Attempt to sort if possible, otherwise return None
#         try:
#             sorted_trading_dates = pd.DatetimeIndex(sorted_trading_dates).sort_values()
#         except Exception:
#             return None # Cannot proceed if dates aren't sortable Timestamps

#     if sorted_trading_dates.empty:
#         return None

#     current_date = pd.Timestamp(current_date)
#     try:
#         loc = sorted_trading_dates.searchsorted(current_date, side='right')
#         if loc < len(sorted_trading_dates):
#             return sorted_trading_dates[loc]
#         else:
#             return None
#     except Exception as e:
#         print(f"Warning: Error in get_next_trading_date for {current_date}: {e}")
#         return None


# # =============================================================================
# # 3. Portfolio Return Calculation Loop (NO CHANGES NEEDED HERE)
# # =============================================================================
# # Assume all_portfolios is populated correctly before this section
# # Example initialization if needed for testing:
# # all_portfolios = {} # Populate this dict from your portfolio generation logic

# portfolio_returns_data = [] # List to store results for *this run*

# print("\nCalculating portfolio returns for the current run...")

# # Flag to track if the index mismatch warning has been shown
# index_mismatch_warning_shown = False

# # Check if all_portfolios is populated
# if 'all_portfolios' not in locals() or not isinstance(all_portfolios, dict):
#     print("ERROR: 'all_portfolios' dictionary is not defined or not a dictionary.")
#     all_portfolios = {} # Initialize to empty to prevent NameError later, though calculation won't run

# if not all_portfolios:
#     print("Warning: 'all_portfolios' dictionary is empty. No returns to calculate for this run.")
# else:
#     processed_count = 0
#     skipped_count = 0
#     error_count = 0
#     # ... (Keep your existing Section 3 loop code as is) ...
#     for portf_name, portf_data in list(all_portfolios.items()):
#         try:
#             # --- 3.1 Extract Parameters and Holdings ---
#             params = portf_data.get('parameters')
#             selected_stocks_df = portf_data.get('selected_stocks')

#             if not isinstance(params, dict) or not isinstance(selected_stocks_df, pd.DataFrame):
#                 print(f"Warning: Skipping {portf_name} - Invalid structure.")
#                 skipped_count += 1
#                 continue

#             portf_date_str = params.get('date')
#             raw_factor = params.get('raw_score_scale_factor')
#             risk_adj_factor = params.get('risk_adj_score_scale_factor')
#             penalty_factor = params.get('penalty_IntraCluster_Corr')

#             if portf_date_str is None or raw_factor is None or risk_adj_factor is None or penalty_factor is None:
#                 print(f"Warning: Skipping {portf_name} - Missing essential parameter.")
#                 skipped_count += 1
#                 continue

#             portf_date = pd.Timestamp(portf_date_str)

#             if selected_stocks_df.empty:
#                 skipped_count += 1
#                 continue

#             # --- 3.2 Ensure Portfolio Index Matches Data Index ---
#             if selected_stocks_df.index.name != ticker_level_name:
#                 if not index_mismatch_warning_shown:
#                     print(f"Warning: Portfolio index name ('{selected_stocks_df.index.name}') mismatches data index ('{ticker_level_name}'). Renaming.")
#                     index_mismatch_warning_shown = True
#                 try:
#                     selected_stocks_df = selected_stocks_df.copy()
#                     selected_stocks_df.index.name = ticker_level_name
#                 except Exception as rename_e:
#                     print(f"Error: Failed to rename portfolio index for {portf_name}: {rename_e}. Skipping.")
#                     skipped_count += 1
#                     continue

#             original_portfolio_tickers = selected_stocks_df.index.tolist()

#             # --- 3.3 Determine Buy and Sell Dates ---
#             buy_date = get_next_trading_date(portf_date, trading_dates)
#             if buy_date is None:
#                 skipped_count += 1
#                 continue

#             sell_date = get_next_trading_date(buy_date, trading_dates)
#             if sell_date is None:
#                 skipped_count += 1
#                 continue

#             required_dates = [buy_date, sell_date]

#             # --- 3.4 Fetch Price Data & Filter Available Tickers ---
#             idx = pd.IndexSlice
#             potential_data = df_OHLCV.loc[
#                 idx[original_portfolio_tickers, required_dates], : # Slice using index directly
#             ]

#             ticker_date_counts = potential_data.index.get_level_values(ticker_level_name).value_counts()
#             available_tickers = ticker_date_counts[ticker_date_counts == len(required_dates)].index.tolist()

#             if not available_tickers:
#                 skipped_count += 1
#                 continue

#             # --- 3.5 Process Prices for Available Tickers ---
#             try:
#                 idx_slice = pd.IndexSlice[available_tickers, required_dates]
#                 prices = potential_data.loc[idx_slice, ADJ_CLOSE_COL]
#                 prices_unstacked = prices.unstack(level=date_level_name)
#                 prices_unstacked = prices_unstacked.rename(columns={buy_date: 'Buy Price', sell_date: 'Sell Price'})
#                 prices_unstacked = prices_unstacked.dropna()

#                 if prices_unstacked.empty:
#                     skipped_count += 1
#                     continue

#                 final_available_tickers = prices_unstacked.index.tolist()

#             except KeyError as e:
#                  print(f"Error: KeyError during price processing for {portf_name}: {e}. Skipping.")
#                  skipped_count += 1
#                  continue
#             except Exception as e:
#                  print(f"Error: Unexpected error during price processing for {portf_name}: {e}. Skipping.")
#                  traceback.print_exc()
#                  skipped_count += 1
#                  continue

#             # --- 3.6 Calculate Portfolio Return ---
#             aligned_weights = selected_stocks_df.loc[final_available_tickers, 'Weight']
#             weight_sum = aligned_weights.sum()
#             if weight_sum <= 1e-9:
#                  skipped_count += 1
#                  continue
#             normalized_weights = aligned_weights / weight_sum

#             buy_prices = prices_unstacked.loc[final_available_tickers, 'Buy Price']
#             sell_prices = prices_unstacked.loc[final_available_tickers, 'Sell Price']

#             zero_price_mask = (buy_prices.abs() < 1e-9)
#             if zero_price_mask.any():
#                 num_zero = zero_price_mask.sum()
#                 print(f"Warning: Found {num_zero} stock(s) with near-zero buy price in {portf_name} on {buy_date.date()}. Excluding.")
#                 valid_price_mask = ~zero_price_mask
#                 buy_prices = buy_prices[valid_price_mask]
#                 sell_prices = sell_prices[valid_price_mask]
#                 normalized_weights = normalized_weights.loc[valid_price_mask]

#                 if buy_prices.empty:
#                      skipped_count += 1
#                      continue
#                 weight_sum = normalized_weights.sum()
#                 if weight_sum <= 1e-9:
#                     skipped_count += 1
#                     continue
#                 normalized_weights = normalized_weights / weight_sum

#             if buy_prices.isnull().any() or sell_prices.isnull().any() or normalized_weights.isnull().any() or \
#                np.isinf(buy_prices).any() or np.isinf(sell_prices).any() or np.isinf(normalized_weights).any():
#                 print(f"Warning: Skipping {portf_name} - Found NaN or Inf in final arrays.")
#                 skipped_count += 1
#                 continue

#             individual_returns = (sell_prices / (buy_prices + 1e-12)) - 1
#             portfolio_return = (individual_returns * normalized_weights).sum()

#             # --- 3.7 Store Results ---
#             portfolio_returns_data.append({
#                 'portfolio_name': portf_name,
#                 'generation_date': portf_date,
#                 'buy_date': buy_date,
#                 'sell_date': sell_date,
#                 'raw_factor': raw_factor,
#                 'risk_adj_factor': risk_adj_factor,
#                 'penalty_factor': penalty_factor,
#                 'portfolio_return': portfolio_return,
#                 'num_stocks_initial': len(original_portfolio_tickers),
#                 'num_stocks_calc': len(final_available_tickers)
#             })
#             processed_count += 1

#         except Exception as e:
#             print(f"FATAL Error processing portfolio {portf_name}: {e}")
#             traceback.print_exc()
#             error_count += 1

#     print(f"\nCurrent run portfolio return calculation finished.")
#     print(f"Successfully processed: {processed_count}")
#     print(f"Skipped (missing data/criteria): {skipped_count}")
#     print(f"Errors during processing: {error_count}")

# # =============================================================================
# # 4. Load Historical Data, Combine, and Analyze
# # =============================================================================
# print("\n--- Loading Historical Data and Performing Combined Analysis ---")

# all_returns_df = pd.DataFrame() # Initialize an empty DataFrame

# # --- Load existing historical data ---
# if os.path.exists(OUTPUT_RETURNS_CSV):
#     try:
#         print(f"Loading historical data from {OUTPUT_RETURNS_CSV}...")
#         historical_returns_df = pd.read_csv(OUTPUT_RETURNS_CSV, parse_dates=['generation_date', 'buy_date', 'sell_date'])
#         # Optional: Add checks for expected columns
#         expected_cols = ['generation_date', 'buy_date', 'sell_date', 'raw_factor', 'risk_adj_factor', 'penalty_factor', 'portfolio_return']
#         if all(col in historical_returns_df.columns for col in expected_cols):
#             all_returns_df = historical_returns_df
#             print(f"Loaded {len(all_returns_df)} historical records.")
#         else:
#             print(f"Warning: Historical data file {OUTPUT_RETURNS_CSV} is missing expected columns. Ignoring historical data.")
#             # Decide if you want to rename/backup the bad file
#             # os.rename(OUTPUT_RETURNS_CSV, OUTPUT_RETURNS_CSV + ".bad_format")

#     except pd.errors.EmptyDataError:
#         print(f"Historical data file {OUTPUT_RETURNS_CSV} is empty. Starting fresh.")
#     except Exception as e:
#         print(f"Error loading historical data from {OUTPUT_RETURNS_CSV}: {e}")
#         print("Warning: Proceeding without historical data.")
# else:
#     print(f"Historical data file {OUTPUT_RETURNS_CSV} not found. Starting fresh.")

# # --- Combine with current run's data ---
# if portfolio_returns_data:
#     current_returns_df = pd.DataFrame(portfolio_returns_data)
#     print(f"Combining {len(current_returns_df)} new records with historical data.")

#     # Ensure date columns are datetime objects in the new data
#     for col in ['generation_date', 'buy_date', 'sell_date']:
#          if col in current_returns_df.columns:
#               current_returns_df[col] = pd.to_datetime(current_returns_df[col])

#     # Concatenate old and new data
#     if not all_returns_df.empty:
#         all_returns_df = pd.concat([all_returns_df, current_returns_df], ignore_index=True)
#     else:
#         all_returns_df = current_returns_df

#     # Optional: Remove potential duplicates based on key identifiers if runs might overlap accidentally
#     key_cols = ['generation_date', 'buy_date', 'sell_date', 'raw_factor', 'risk_adj_factor', 'penalty_factor']
#     initial_len = len(all_returns_df)
#     all_returns_df = all_returns_df.drop_duplicates(subset=key_cols, keep='last')
#     if len(all_returns_df) < initial_len:
#         print(f"Removed {initial_len - len(all_returns_df)} duplicate records based on key columns.")

# else:
#     print("No new portfolio returns were calculated in this run.")

# # --- Perform analysis ONLY if there's data ---
# if not all_returns_df.empty:
#     # Ensure correct data types before grouping (especially factors)
#     try:
#         all_returns_df['raw_factor'] = pd.to_numeric(all_returns_df['raw_factor'])
#         all_returns_df['risk_adj_factor'] = pd.to_numeric(all_returns_df['risk_adj_factor'])
#         all_returns_df['penalty_factor'] = pd.to_numeric(all_returns_df['penalty_factor'])
#         all_returns_df['portfolio_return'] = pd.to_numeric(all_returns_df['portfolio_return'])
#     except Exception as e:
#         print(f"Error converting factor/return columns to numeric: {e}")
#         print("Analysis might be affected.")


#     # Sort for easier viewing (optional, but good practice)
#     grouping_factors = ['raw_factor', 'risk_adj_factor', 'penalty_factor']
#     all_returns_df = all_returns_df.sort_values(by=['generation_date'] + grouping_factors)

#     print("\n--- Combined Portfolio Return Analysis (All Historical Data) ---")
#     print(f"Total return instances analyzed: {len(all_returns_df)}")
#     # print("Sample of combined data:") # Optional: Print head if needed
#     # print(all_returns_df.head())

#     # --- Group by factor combinations to analyze performance ---
#     print("\n--- Performance Analysis by Factor Combination (Based on All Data) ---")

#     # Check if grouping factors exist
#     if not all(f in all_returns_df.columns for f in grouping_factors):
#          print(f"Error: One or more grouping factors {grouping_factors} not found in the combined data columns: {all_returns_df.columns}")
#          # Handle error - perhaps exit or skip analysis
#          performance_summary = pd.DataFrame() # Create empty df to avoid error later
#     else:
#         # Group by all three factors and aggregate portfolio returns
#         performance_summary = all_returns_df.groupby(grouping_factors)['portfolio_return'].agg(
#             mean='mean',
#             median='median',
#             std=lambda x: x.std(ddof=0) if pd.notna(x).sum() > 1 else np.nan,
#             count=lambda x: pd.notna(x).sum(), # Count only valid returns for this combo
#             sharpe=lambda x: (x.mean() / (x.std() + 1e-9)) * np.sqrt(252) if pd.notna(x).sum() > 1 and x.std() > 1e-9 else np.nan # Example: annualized Sharpe (assumes daily returns, zero risk-free rate); uses pandas' default sample std (ddof=1), unlike the 'std' column above (ddof=0)
#         ).reset_index() # Reset index to make factors columns again

#         # Calculate additional metrics if desired, like total return, win rate etc.
#         # Example: Add win rate (fraction, 0-1, of return records per factor combination with a positive return)
#         win_rate = all_returns_df[all_returns_df['portfolio_return'] > 0].groupby(grouping_factors).size() / \
#                    all_returns_df.groupby(grouping_factors).size()
#         performance_summary = pd.merge(performance_summary, win_rate.rename('win_rate').reset_index(), on=grouping_factors, how='left')
#         performance_summary['win_rate'] = performance_summary['win_rate'].fillna(0) # NaN means the combo had no positive returns, so its win rate is 0

#         # Sort by a chosen metric to find the 'best' combinations
#         # Sorting by Sharpe ratio is common, or mean return, or a combination
#         performance_summary = performance_summary.sort_values(by='mean', ascending=False) # Or sort by 'sharpe'
#         print("\nPerformance per Factor Combination (sorted by mean return):")
#         with pd.option_context('display.float_format', '{:.4f}'.format, 'display.max_rows', 200):
#             print(performance_summary)

#         # --- Analyze the number of stocks used ---
#         stock_count_summary = all_returns_df.groupby(grouping_factors)['num_stocks_calc'].agg(['mean', 'min', 'max'])
#         print("\nAverage/Min/Max Stocks Used in Calculation per Factor Combination (All Data):")
#         with pd.option_context('display.float_format', '{:.2f}'.format):
#             print(stock_count_summary.sort_values(by='mean', ascending=False))

# else:
#     print("\nNo portfolio returns available (neither historical nor current) to analyze.")
#     performance_summary = pd.DataFrame() # Ensure it exists as empty if no data

# # =============================================================================
# # 5. Save Results (Append Returns, Overwrite Summary)
# # =============================================================================

# # --- Save the individual returns (APPEND mode) ---
# if portfolio_returns_data: # Only append if there's new data from this run
#     current_to_save_df = pd.DataFrame(portfolio_returns_data) # Use the list directly from this run
#     try:
#         # Check if file exists to determine if header is needed
#         write_header = not os.path.exists(OUTPUT_RETURNS_CSV)
#         # Append new data
#         current_to_save_df.to_csv(OUTPUT_RETURNS_CSV, mode='a', header=write_header, index=False)
#         print(f"\nAppended {len(current_to_save_df)} new return records to {OUTPUT_RETURNS_CSV}")
#     except Exception as e:
#         print(f"\nError appending returns to {OUTPUT_RETURNS_CSV}: {e}")
# elif not os.path.exists(OUTPUT_RETURNS_CSV):
#      # If no new data AND the file doesn't exist, create an empty file with header
#      try:
#          pd.DataFrame(columns=portfolio_returns_data[0].keys() if portfolio_returns_data else [ # portfolio_returns_data is empty in this branch, so this fallback column list is always used
#             'portfolio_name', 'generation_date', 'buy_date', 'sell_date',
#             'raw_factor', 'risk_adj_factor', 'penalty_factor', 'portfolio_return',
#             'num_stocks_initial', 'num_stocks_calc']).to_csv(OUTPUT_RETURNS_CSV, index=False)
#          print(f"\nCreated empty returns file with header: {OUTPUT_RETURNS_CSV}")
#      except Exception as e:
#          print(f"\nError creating empty returns file {OUTPUT_RETURNS_CSV}: {e}")


# # --- Save the performance summary (OVERWRITE mode) ---
# # This summary is based on ALL data (historical + current), so we overwrite it each time
# if 'performance_summary' in locals() and not performance_summary.empty:
#     try:
#         # No need to reset index if already done after groupby
#         performance_summary.to_csv(OUTPUT_SUMMARY_CSV, index=False)
#         print(f"Updated performance summary saved to {OUTPUT_SUMMARY_CSV}")
#     except Exception as e:
#         print(f"\nError saving performance summary to {OUTPUT_SUMMARY_CSV}: {e}")
# elif not all_returns_df.empty:
#      print(f"Warning: Performance summary was calculated but is empty. Not saving {OUTPUT_SUMMARY_CSV}.")
# else:
#     print(f"No data to generate performance summary. {OUTPUT_SUMMARY_CSV} not saved or updated.")


# print("\n--- Script Finished ---")

In [None]:
# # =============================================================================
# # 4b. Time-Based Performance Analysis (Example: By Year)
# # =============================================================================
# if not all_returns_df.empty:
#     print("\n--- Performance Analysis by Factor Combination AND Year ---")

#     # Ensure 'sell_date' is datetime and extract the year
#     try:
#         all_returns_df['sell_year'] = pd.to_datetime(all_returns_df['sell_date']).dt.year
        
#         # Define grouping factors including the year
#         time_grouping_factors = ['sell_year'] + grouping_factors # grouping_factors was ['raw_factor', 'risk_adj_factor', 'penalty_factor']

#         # Group by year and factors
#         performance_by_year = all_returns_df.groupby(time_grouping_factors)['portfolio_return'].agg(
#             mean='mean',
#             median='median',
#             std=lambda x: x.std(ddof=0) if pd.notna(x).sum() > 1 else np.nan,
#             count=lambda x: pd.notna(x).sum()
#             # Add other metrics like Sharpe, win rate per year if desired
#         ).reset_index()

#         # Sort for better readability (e.g., by year, then by mean return)
#         performance_by_year = performance_by_year.sort_values(by=['sell_year', 'mean'], ascending=[True, False])

#         print("\nPerformance per Factor Combination per Year (sorted by year, then mean return):")
#         with pd.option_context('display.float_format', '{:.4f}'.format, 'display.max_rows', 500): # Show more rows
#             print(performance_by_year)

#         # You could save this to a separate CSV if needed
#         # performance_by_year.to_csv('portfolio_factor_performance_by_year.csv', index=False)

#     except Exception as e:
#         print(f"Error during yearly performance analysis: {e}")
#         traceback.print_exc()
# else:
#     print("\nNo data available for yearly performance analysis.")