In [46]:
import sys
from pathlib import Path

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# # --- Execute the processor ---
# import utils
# from config import date_str, DOWNLOAD_DIR, DEST_DIR

# print(f'date_str: {date_str}')



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\python311.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\Lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src']


In [47]:
date_str = '2025-04-14'

In [48]:
import pandas as pd
# Set pandas display options to show more columns and rows
pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', 10)       # Limit to 10 rows for readability
pd.set_option('display.width', 1000)        # Let the display adjust to the window
# pd.set_option('display.max_colwidth', None) # Show full content of each cell
pd.set_option('display.max_rows', 200)
# pd.set_option('display.width', 120)
pd.set_option('display.float_format', '{:.4f}'.format)

In [49]:
zscore_df = pd.read_parquet(f'..\data\{date_str}_zscore_df.parquet', engine='pyarrow')
cluster_stats_df = pd.read_parquet(f'..\data\{date_str}_cluster_stats_df.parquet', engine='pyarrow')
detailed_clusters_df = pd.read_parquet(f'..\data\{date_str}_detailed_clusters_df.parquet', engine='pyarrow')
df_data = pd.read_parquet(f'..\data\{date_str}_df_finviz_merged.parquet', engine='pyarrow')

In [50]:
detailed_clusters_df

Unnamed: 0,Cluster_ID,Ticker,Raw_Score,Risk_Adj_Score,Volatility,Price,"MktCap AUM, M"
275,1,BOXX,0.3607,1446.0926,0.0002,111.6800,5760.0000
283,1,ARKB,0.3508,9.2181,0.0381,84.7500,3950.0000
281,1,FBTC,0.3530,9.1264,0.0387,74.2100,16430.0000
286,1,BITB,0.3492,9.1251,0.0383,46.2200,3140.0000
287,1,IBIT,0.3477,8.9469,0.0389,48.2800,47840.0000
...,...,...,...,...,...,...,...
23,60,NOC,1.6317,79.0729,0.0206,535.8200,77230.0000
59,60,LMT,1.2704,62.4901,0.0203,475.3400,111500.0000
101,60,CVS,0.9378,39.9583,0.0235,69.2000,87360.0000
882,60,GRAB,-0.4144,-6.9769,0.0594,3.9800,16220.0000


In [51]:
import pandas as pd # Assuming pandas is used

def select_stocks_from_clusters(cluster_stats_df, detailed_clusters_df,
                                select_top_n_clusters=3, max_selection_per_cluster=5,
                                min_cluster_size=5, penalty_IntraCluster_Corr=0.3,
                                date_str=date_str,
                                min_raw_score=None, # <-- Added argument
                                min_risk_adj_score=None): # <-- Added argument
    """
    Pipeline to select stocks from better performing clusters, with optional score thresholds.

    Parameters:
    - cluster_stats_df: DataFrame with cluster statistics.
    - detailed_clusters_df: DataFrame with detailed cluster information including
                            'Ticker', 'Cluster_ID', 'Raw_Score', 'Risk_Adj_Score', etc.
    - select_top_n_clusters: int, Number of top clusters to select (default=3).
    - max_selection_per_cluster: int, Max number of stocks to select from each cluster (default=5).
    - min_cluster_size: int, Minimum size for a cluster to be considered (default=5).
    - penalty_IntraCluster_Corr: float, Penalty weight for intra-cluster correlation in
                                     composite score (default=0.3).
    - date_str: str, Date string for tracking/parameter storage.
    - min_raw_score: float, optional (default=None)
        Minimum Raw_Score required for a stock to be considered for selection.
        If None, no threshold is applied based on Raw_Score.
    - min_risk_adj_score: float, optional (default=None)
        Minimum Risk_Adj_Score required for a stock to be considered for selection.
        If None, no threshold is applied based on Risk_Adj_Score.

    Returns:
    - dict: A dictionary containing:
        - 'selected_top_n_cluster_ids': List of top selected cluster IDs.
        - 'selected_stocks': DataFrame of selected stocks.
        - 'cluster_performance': DataFrame of selected cluster metrics.
        - 'parameters': Dictionary of the input parameters used.
    """

    # Store input parameters
    parameters = {
        'date_str': date_str,
        'select_top_n_clusters': select_top_n_clusters,
        'max_selection_per_cluster': max_selection_per_cluster,
        'min_cluster_size': min_cluster_size,
        'min_raw_score': min_raw_score,         # <-- Stored parameter
        'min_risk_adj_score': min_risk_adj_score, # <-- Stored parameter
        'penalty_IntraCluster_Corr': penalty_IntraCluster_Corr,
    }
    
    # ===== 1. Filter and Rank Clusters =====
    qualified_clusters = cluster_stats_df[cluster_stats_df['Size'] >= min_cluster_size].copy()
    if qualified_clusters.empty:
        print(f"Warning: No clusters met the minimum size criteria ({min_cluster_size}).")
        return {
            'selected_stocks': pd.DataFrame(),
            'cluster_performance': pd.DataFrame(),
            'parameters': parameters
        }

    qualified_clusters['Composite_Cluster_Score'] = (
        (1 - penalty_IntraCluster_Corr) * qualified_clusters['Avg_Raw_Score'] +
        penalty_IntraCluster_Corr * (1 - qualified_clusters['Avg_IntraCluster_Corr'])
    )
    ranked_clusters = qualified_clusters.sort_values('Composite_Cluster_Score', ascending=False)
    selected_clusters = ranked_clusters.head(select_top_n_clusters)
    cluster_ids = selected_clusters['Cluster_ID'].tolist()

    if not cluster_ids:
        print("Warning: No clusters were selected based on ranking.")
        return {
            'selected_stocks': pd.DataFrame(),
            'cluster_performance': selected_clusters, # Return empty selected clusters df
            'parameters': parameters
        }


    # ===== 2. Select Stocks from Each Cluster =====
    selected_stocks_list = []
    for cluster_id in cluster_ids:
        # Get all stocks for the current cluster
        cluster_stocks = detailed_clusters_df[detailed_clusters_df['Cluster_ID'] == cluster_id].copy()

        # ===> Apply Threshold Filters <===
        if min_raw_score is not None:
            cluster_stocks = cluster_stocks[cluster_stocks['Raw_Score'] >= min_raw_score]
        if min_risk_adj_score is not None:
            cluster_stocks = cluster_stocks[cluster_stocks['Risk_Adj_Score'] >= min_risk_adj_score]
        # ===> End of Added Filters <===

        # Proceed only if stocks remain after filtering
        if len(cluster_stocks) > 0:
            # Sort remaining stocks by Risk_Adj_Score and select top N
            top_stocks = cluster_stocks.sort_values('Risk_Adj_Score', ascending=False).head(max_selection_per_cluster)

            # Add cluster-level metrics to the selected stock rows
            cluster_metrics = selected_clusters[selected_clusters['Cluster_ID'] == cluster_id].iloc[0]
            for col in ['Composite_Cluster_Score', 'Avg_IntraCluster_Corr', 'Avg_Volatility',
                      'Avg_Raw_Score', 'Avg_Risk_Adj_Score', 'Size']: # Added Size for context
                # Use .get() for safety if a column might be missing
                top_stocks[f'Cluster_{col}'] = cluster_metrics.get(col, None)
            selected_stocks_list.append(top_stocks)

    # Consolidate selected stocks
    if selected_stocks_list:
        selected_stocks = pd.concat(selected_stocks_list)
        # Recalculate weights based on the final selection
        if selected_stocks['Risk_Adj_Score'].sum() != 0:
             selected_stocks['Weight'] = (selected_stocks['Risk_Adj_Score'] /
                                          selected_stocks['Risk_Adj_Score'].sum())
        else:
             # Handle case where all selected scores are zero (unlikely but possible)
             selected_stocks['Weight'] = 1 / len(selected_stocks) if len(selected_stocks) > 0 else 0

        selected_stocks = selected_stocks.sort_values(['Cluster_ID', 'Risk_Adj_Score'],
                                                    ascending=[True, False])
    else:
        selected_stocks = pd.DataFrame()
        print("Warning: No stocks met selection criteria (including score thresholds if applied).")


    # ===== 3. Prepare Enhanced Output Reports =====
    cluster_performance = selected_clusters.copy()
    # Calculate how many stocks were actually selected per cluster after filtering
    cluster_performance['Stocks_Selected'] = cluster_performance['Cluster_ID'].apply(
        lambda x: len(selected_stocks[selected_stocks['Cluster_ID'] == x]) if not selected_stocks.empty else 0)

    if not selected_stocks.empty:
         # Ensure Avg_IntraCluster_Corr exists before calculating diversification
        if 'Avg_IntraCluster_Corr' in cluster_performance.columns:
             cluster_performance['Intra_Cluster_Diversification'] = 1 - cluster_performance['Avg_IntraCluster_Corr']
        else:
             cluster_performance['Intra_Cluster_Diversification'] = pd.NA # Or None
    else:
      # Handle case where selected_stocks is empty
        cluster_performance['Intra_Cluster_Diversification'] = pd.NA # Or None

    # ===> Package results and parameters
    results_bundle = {
        'selected_top_n_cluster_ids': cluster_ids,
        'selected_stocks': selected_stocks,
        'cluster_performance': cluster_performance,
        'parameters': parameters
    }

    return results_bundle


In [52]:
import pandas as pd
from typing import Dict, Any

def print_stock_selection_report(output: Dict[str, Any]) -> None:
    """
    Prints a detailed report summarizing the results of the stock selection process,
    extracting all necessary information from the output dictionary.

    Args:
        output (Dict[str, Any]): The dictionary returned by the
                                 select_stocks_from_clusters function, containing:
                                 - 'selected_stocks': DataFrame of selected stocks.
                                 - 'cluster_performance': DataFrame of selected cluster metrics.
                                #  - 'parameters': Dictionary of the input parameters used.
                                #  - 'cluster_stats_df': Original cluster stats DataFrame.
                                #  - 'detailed_clusters_df': Original detailed clusters DataFrame.
    Returns:
        None: This function prints output to the console.
    """
    # Extract data from the output dictionary using .get() for safety
    selected_stocks = output.get('selected_stocks', pd.DataFrame())
    cluster_performance = output.get('cluster_performance', pd.DataFrame())
    used_params = output.get('parameters', {})
    # Extract the input DataFrames needed for the report
    # cluster_stats_df = output.get('input_cluster_stats_df') # Might be None
    cluster_stats_df = output.get('cluster_stats_df') # Might be None
    # detailed_clusters_df = output.get('input_detailed_clusters_df') # Might be None
    detailed_clusters_df = output.get('detailed_clusters_df') # Might be None

    # --- Start of Original Code Block (adapted) ---

    print("\n=== CLUSTER SELECTION CRITERIA ===")
    print("* Using Composite_Cluster_Score (balancing Raw Score and diversification) for cluster ranking.")
    print("* Using Risk_Adj_Score for stock selection within clusters.")

    num_selected_clusters = len(cluster_performance) if not cluster_performance.empty else 0
    # Use the extracted cluster_stats_df
    total_clusters = len(cluster_stats_df) if cluster_stats_df is not None and not cluster_stats_df.empty else 'N/A'

    print(f"* Selected top {num_selected_clusters} clusters from {total_clusters} total initial clusters.") # Adjusted wording slightly
    print(f"* Selection Criteria:")
    if used_params:
        for key, value in used_params.items():
            # Avoid printing the large input dataframes stored in parameters if they were added there too
            if not isinstance(value, pd.DataFrame):
                 print(f"    {key}: {value}")
    else:
        print("    Parameters not available.")


    if not cluster_performance.empty:
        print("\n=== SELECTED CLUSTERS (RANKED BY COMPOSITE SCORE) ===")
        display_cols_exist = [col for col in [
                                'Cluster_ID', 'Size', 'Avg_Raw_Score', 'Avg_Risk_Adj_Score',
                                'Avg_IntraCluster_Corr', 'Avg_Volatility', 'Composite_Cluster_Score',
                                'Stocks_Selected', 'Intra_Cluster_Diversification']
                              if col in cluster_performance.columns]
        print(cluster_performance[display_cols_exist].sort_values('Composite_Cluster_Score', ascending=False).to_string(index=False))

        # Print top 8 stocks by Raw_Score for each selected cluster
        # Check if detailed_clusters_df was successfully extracted
        if detailed_clusters_df is not None and not detailed_clusters_df.empty:
            print("\n=== TOP STOCKS BY RAW SCORE PER SELECTED CLUSTER ===")
            print("""* Volatility is the standard deviation of daily returns over the past 250 trading days (example context).
* Note: The stocks below are shown ranked by Raw_Score for analysis,
*       but actual selection within the cluster was based on Risk_Adj_Score.""")

            for cluster_id in cluster_performance['Cluster_ID']:
                 cluster_stocks = detailed_clusters_df[detailed_clusters_df['Cluster_ID'] == cluster_id]
                 if not cluster_stocks.empty:
                    required_cols = ['Ticker', 'Raw_Score', 'Risk_Adj_Score', 'Volatility']
                    if all(col in cluster_stocks.columns for col in required_cols):
                        top_raw = cluster_stocks.nlargest(8, 'Raw_Score')[required_cols]

                        print(f"\nCluster {cluster_id} - Top 8 by Raw Score:")
                        print(top_raw.to_string(index=False))
                        cluster_avg_raw = cluster_performance.loc[cluster_performance['Cluster_ID'] == cluster_id, 'Avg_Raw_Score'].values
                        cluster_avg_risk = cluster_performance.loc[cluster_performance['Cluster_ID'] == cluster_id, 'Avg_Risk_Adj_Score'].values
                        if len(cluster_avg_raw) > 0: print(f"Cluster Avg Raw Score: {cluster_avg_raw[0]:.2f}")
                        if len(cluster_avg_risk) > 0: print(f"Cluster Avg Risk Adj Score: {cluster_avg_risk[0]:.2f}")
                    else:
                        print(f"\nCluster {cluster_id} - Missing required columns in detailed_clusters_df to show top stocks.")
                 else:
                     print(f"\nCluster {cluster_id} - No stocks found in detailed_clusters_df for this cluster.")
        else:
            print("\n=== TOP STOCKS BY RAW SCORE PER SELECTED CLUSTER ===")
            print("Skipping - Detailed cluster information ('input_detailed_clusters_df') not found in the output dictionary.")

    else:
        print("\n=== SELECTED CLUSTERS ===")
        print("No clusters were selected based on the criteria.")


    print(f"\n=== FINAL SELECTED STOCKS (FILTERED & WEIGHTED) ===")
    if not selected_stocks.empty:
        print("* Stocks actually selected based on Risk_Adj_Score (and optional thresholds) within each cluster.")
        print("* Position weights assigned based on Risk_Adj_Score within the final selected portfolio.")

        desired_cols = ['Cluster_ID', 'Ticker', 'Raw_Score', 'Risk_Adj_Score',
                        'Volatility', 'Weight',
                        'Cluster_Avg_Raw_Score', 'Cluster_Avg_Risk_Adj_Score']
        available_cols = [col for col in desired_cols if col in selected_stocks.columns]
        print(selected_stocks[available_cols].sort_values(['Cluster_ID', 'Risk_Adj_Score'],
                                                        ascending=[True, False]).to_string(index=False))

        print("\n=== PORTFOLIO SUMMARY ===")
        print(f"Total Stocks Selected: {len(selected_stocks)}")
        print(f"Average Raw Score: {selected_stocks.get('Raw_Score', pd.Series(dtype=float)).mean():.2f}")
        print(f"Average Risk-Adjusted Score: {selected_stocks.get('Risk_Adj_Score', pd.Series(dtype=float)).mean():.2f}")
        print(f"Average Volatility: {selected_stocks.get('Volatility', pd.Series(dtype=float)).mean():.2f}")
        print(f"Total Weight (should be close to 1.0): {selected_stocks.get('Weight', pd.Series(dtype=float)).sum():.4f}")
        print("\nCluster Distribution:")
        print(selected_stocks['Cluster_ID'].value_counts().to_string())
    else:
        print("No stocks were selected after applying all filters and criteria.")


In [53]:
import pandas as pd
import numpy as np
import itertools # Import the itertools module

# --- Define Factor Ranges ---
# Generate the factors using numpy.arange for float steps
# Add a small epsilon to the end value to ensure the endpoint is included due to float precision
raw_score_factors = np.arange(0.5, 1.2 + 0.01, 0.1)
risk_adj_score_factors = np.arange(0.5, 1.2 + 0.01, 0.1)
penalty_factors = np.arange(0, 0.4 + 0.01, 0.1) # New factor range

print("--- Parameter Ranges ---")
print(f"Raw Score Factors: {np.round(raw_score_factors,1)}")
print(f"Risk Adj Score Factors: {np.round(risk_adj_score_factors,1)}")
print(f"Penalty Factors: {np.round(penalty_factors,1)}")


# --- Generate All Combinations ---
# Use itertools.product to create an iterator of all combinations
parameter_combinations = list(itertools.product(raw_score_factors, risk_adj_score_factors, penalty_factors))
total_combinations = len(parameter_combinations)
print(f"\nTotal parameter combinations to iterate: {total_combinations}")


# --- Store results ---
all_portfolios = {} # Dictionary to store portfolios by name

# --- Fixed Parameters (that don't vary in this loop) ---
select_top_n_clusters = 60
max_selection_per_cluster = 2
min_cluster_size = 3  # prevent extreme high risk adj scores
# You might want to get the date dynamically
portf_date_base = date_str # Example date, adjust as needed (Ensure date_str is defined before this)

# --- Iteration Loop (Single Loop over Combinations) ---
print("\nStarting portfolio generation loop...")
for i, (raw_scale, risk_adj_scale, penalty) in enumerate(parameter_combinations):

    # --- Calculate dynamic parameters based on current factors ---
    # Round factors slightly to avoid potential floating point representation issues in calculations/names
    raw_scale = round(raw_scale, 1)
    risk_adj_scale = round(risk_adj_scale, 1)
    penalty = round(penalty, 1) # Round the new penalty factor as well

    min_raw_score = 2.0 * raw_scale
    min_risk_adj_score = 100.0 * risk_adj_scale
    # penalty_IntraCluster_Corr is now the 'penalty' variable from the combination

    print(f"\nRunning combination {i+1}/{total_combinations}: "
          f"raw_F={raw_scale:.1f}, riskAdj_F={risk_adj_scale:.1f}, penalty={penalty:.1f}")
    print(f"Resulting thresholds: min_raw_score={min_raw_score:.2f}, min_risk_adj_score={min_risk_adj_score:.1f}")

    # --- Run the selection pipeline ---
    try:
        output = select_stocks_from_clusters(
            cluster_stats_df=cluster_stats_df,      # Ensure this DataFrame is loaded/defined
            detailed_clusters_df=detailed_clusters_df, # Ensure this DataFrame is loaded/defined
            select_top_n_clusters=select_top_n_clusters,
            max_selection_per_cluster=max_selection_per_cluster,
            min_cluster_size=min_cluster_size,
            penalty_IntraCluster_Corr=penalty, # Use the penalty from the current combination
            min_raw_score=min_raw_score,
            min_risk_adj_score=min_risk_adj_score,
            date_str=portf_date_base # Pass the date
        )

        # --- Process and Store Results ---
        portf_date = output['parameters']['date_str']
        portf_raw_score_val = output['parameters']['min_raw_score'] # Use the value returned by the function
        portf_risk_adj_score_val = output['parameters']['min_risk_adj_score'] # Use the value returned by the function
        portf_penalty_val = output['parameters']['penalty_IntraCluster_Corr'] # Get penalty used
        _selected_stocks = output['selected_stocks']

        if _selected_stocks is not None and not _selected_stocks.empty:
            portf_selected_stocks = _selected_stocks.set_index('Ticker')[['Weight']] # Select only Weight column after setting index

            # Format the name using the factors - now includes penalty
            portf_name = f'{portf_date}_portf_rawF_{raw_scale:.1f}_riskAdjF_{risk_adj_scale:.1f}_pen_{penalty:.1f}'
            # Alternative using resulting thresholds and penalty:
            # portf_name = f'{portf_date}_portf_raw_{portf_raw_score_val:.2f}_riskadj_{portf_risk_adj_score_val:.1f}_pen_{portf_penalty_val:.1f}'

            print(f'Generated Portfolio:')
            print(f'  Name: {portf_name}')
            print(f'  Number of stocks: {len(portf_selected_stocks)}')
            # print(f'port_selected_stocks:\n{portf_selected_stocks}') # Can be verbose

            # Store the results - using the portfolio name as the key
            all_portfolios[portf_name] = {
                'parameters': {
                    'raw_score_scale_factor': raw_scale,
                    'risk_adj_score_scale_factor': risk_adj_scale,
                    'penalty_IntraCluster_Corr': penalty, # Store the penalty factor used
                    'min_raw_score': portf_raw_score_val,
                    'min_risk_adj_score': portf_risk_adj_score_val,
                    'select_top_n_clusters': select_top_n_clusters,
                    'max_selection_per_cluster': max_selection_per_cluster,
                    'min_cluster_size': min_cluster_size,
                    'date': portf_date
                },
                'selected_stocks': portf_selected_stocks
            }
        else:
             print(f"No stocks selected for raw_F={raw_scale:.1f}, riskAdj_F={risk_adj_scale:.1f}, penalty={penalty:.1f}")

    except Exception as e:
        print(f"ERROR processing combination raw_F={raw_scale:.1f}, riskAdj_F={risk_adj_scale:.1f}, penalty={penalty:.1f}: {e}")
        # Decide if you want to continue or stop on error - currently continues

print(f"\n--- Portfolio Generation Complete ---")
print(f"Generated {len(all_portfolios)} portfolios out of {total_combinations} combinations attempted.")



--- Parameter Ranges ---
Raw Score Factors: [0.5 0.6 0.7 0.8 0.9 1.  1.1 1.2]
Risk Adj Score Factors: [0.5 0.6 0.7 0.8 0.9 1.  1.1 1.2]
Penalty Factors: [0.  0.1 0.2 0.3 0.4]

Total parameter combinations to iterate: 320

Starting portfolio generation loop...

Running combination 1/320: raw_F=0.5, riskAdj_F=0.5, penalty=0.0
Resulting thresholds: min_raw_score=1.00, min_risk_adj_score=50.0
Generated Portfolio:
  Name: 2025-04-14_portf_rawF_0.5_riskAdjF_0.5_pen_0.0
  Number of stocks: 35

Running combination 2/320: raw_F=0.5, riskAdj_F=0.5, penalty=0.1
Resulting thresholds: min_raw_score=1.00, min_risk_adj_score=50.0
Generated Portfolio:
  Name: 2025-04-14_portf_rawF_0.5_riskAdjF_0.5_pen_0.1
  Number of stocks: 35

Running combination 3/320: raw_F=0.5, riskAdj_F=0.5, penalty=0.2
Resulting thresholds: min_raw_score=1.00, min_risk_adj_score=50.0
Generated Portfolio:
  Name: 2025-04-14_portf_rawF_0.5_riskAdjF_0.5_pen_0.2
  Number of stocks: 35

Running combination 4/320: raw_F=0.5, riskAdj_

In [54]:
all_portfolios

{'2025-04-14_portf_rawF_0.5_riskAdjF_0.5_pen_0.0': {'parameters': {'raw_score_scale_factor': np.float64(0.5),
   'risk_adj_score_scale_factor': np.float64(0.5),
   'penalty_IntraCluster_Corr': np.float64(0.0),
   'min_raw_score': np.float64(1.0),
   'min_risk_adj_score': np.float64(50.0),
   'select_top_n_clusters': 60,
   'max_selection_per_cluster': 2,
   'min_cluster_size': 3,
   'date': '2025-04-14'},
  'selected_stocks':         Weight
  Ticker        
  CPRT    0.0086
  SFM     0.0081
  LHX     0.0079
  IAGG    0.0955
  CNP     0.0169
  XEL     0.0137
  BTI     0.0129
  VZ      0.0116
  WEC     0.0189
  SO      0.0152
  SBS     0.0105
  UL      0.0124
  NGG     0.0110
  ADC     0.0092
  HRB     0.0202
  SAIC    0.0106
  COR     0.0149
  EXC     0.0138
  MOH     0.0118
  TU      0.0100
  ICSH    0.4997
  MDLZ    0.0102
  HDB     0.0107
  HII     0.0085
  BUD     0.0107
  TJX     0.0147
  ROST    0.0126
  EA      0.0098
  KDP     0.0132
  ROL     0.0101
  SMMT    0.0081
  SGOL    0

In [55]:
# df_data = pd.read_parquet(f'..\data\{date_str}_df_finviz_merged.parquet', engine='pyarrow')
df_data = pd.read_parquet(r'..\data\2025-04-17_df_OHLCV_clean.parquet', engine='pyarrow')
df_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Adj Close,Volume,Adj Open,Adj High,Adj Low
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL,2025-04-17,197.1500,198.8300,194.4200,196.9800,196.9800,50423277,197.1500,198.8300,194.4200
AAPL,2025-04-16,198.3600,200.7000,192.3700,194.2700,194.2700,59732400,198.3600,200.7000,192.3700
AAPL,2025-04-15,201.8600,203.5100,199.8000,202.1400,202.1400,51343900,201.8600,203.5100,199.8000
AAPL,2025-04-14,211.4400,212.9400,201.1600,202.5200,202.5200,101352900,211.4400,212.9400,201.1600
AAPL,2025-04-11,186.1000,199.5400,186.0600,198.1500,198.1500,87435900,186.1000,199.5400,186.0600
...,...,...,...,...,...,...,...,...,...,...
ARGX,2024-04-24,380.8500,380.8500,372.6800,375.0000,375.0000,266400,380.8500,380.8500,372.6800
ARGX,2024-04-23,368.6900,377.9400,368.6900,375.0800,375.0800,340200,368.6900,377.9400,368.6900
ARGX,2024-04-22,362.9300,370.8000,359.8300,368.7500,368.7500,335800,362.9300,370.8000,359.8300
ARGX,2024-04-19,360.3800,362.5400,357.5500,360.2300,360.2300,329100,360.3800,362.5400,357.5500


In [56]:
import pandas as pd
import numpy as np
import sys # Needed for sys.exit
import traceback # Import traceback for detailed error printing


# --- Parameters ---
# Column name for adjusted close price in df_data
ADJ_CLOSE_COL = 'Adj Close'
# Potential column names for ticker/symbol if not in index initially
POTENTIAL_TICKER_COLS = ['Symbol', 'Ticker']
# Potential column name for date if not in index initially
POTENTIAL_DATE_COL = 'Date'

print("--- Portfolio Performance Calculation Script ---")

# =============================================================================
# 1. Data and Index Preparation
# =============================================================================
print("\nChecking and preparing df_data index structure...")

ticker_level_name = None
date_level_name = None

try:
    # Check if df_data already has a suitable MultiIndex
    if isinstance(df_data.index, pd.MultiIndex) and len(df_data.index.levels) >= 2:
        actual_names = list(df_data.index.names)
        print(f"Detected MultiIndex with names: {actual_names}")
        # Assign discovered names (handle unnamed levels using position)
        ticker_level_name = actual_names[0] if actual_names[0] is not None else 0
        date_level_name = actual_names[1] if actual_names[1] is not None else 1

        if isinstance(ticker_level_name, int) or isinstance(date_level_name, int):
             print(f"Warning: Using positional index levels ({ticker_level_name}, {date_level_name}) as names were missing.")
        # Optional: Add more sophisticated checks if needed (e.g., data types of levels)

    # Otherwise, try to create MultiIndex from columns
    else:
        print("Index is not a MultiIndex or has too few levels. Checking columns...")
        # Find the first matching potential ticker column
        ticker_col_to_use = None
        for col in POTENTIAL_TICKER_COLS:
            if col in df_data.columns:
                ticker_col_to_use = col
                break

        date_col_to_use = POTENTIAL_DATE_COL if POTENTIAL_DATE_COL in df_data.columns else None

        # If both columns are found, set the index
        if ticker_col_to_use and date_col_to_use:
            required_cols = [ticker_col_to_use, date_col_to_use]
            print(f"Attempting to set index using columns: {required_cols}...")
            df_data = df_data.set_index(required_cols)
            ticker_level_name = required_cols[0] # Set names based on columns used
            date_level_name = required_cols[1]
            print(f"MultiIndex set successfully.")
        else:
            # Report missing columns and exit
            missing = []
            if not ticker_col_to_use: missing.extend(POTENTIAL_TICKER_COLS)
            if not date_col_to_use: missing.append(POTENTIAL_DATE_COL)
            print(f"ERROR: Cannot set index. Required columns for Ticker ({POTENTIAL_TICKER_COLS}) or Date ({POTENTIAL_DATE_COL}) not found in df_data.")
            print("df_data columns:", df_data.columns)
            sys.exit("Exiting due to incorrect df_data structure.")

    print(f"Using '{ticker_level_name}' for ticker level and '{date_level_name}' for date level.")

    # Ensure the required price column exists
    if ADJ_CLOSE_COL not in df_data.columns:
        print(f"ERROR: Required price column '{ADJ_CLOSE_COL}' not found in df_data columns: {df_data.columns}")
        sys.exit("Exiting due to missing price column.")

    # Sort index for efficient lookup (essential for slicing performance)
    if not df_data.index.is_monotonic_increasing:
         print("Sorting df_data index...")
         df_data = df_data.sort_index()
         print("Index sorted.")

except Exception as e:
    print(f"ERROR: An unexpected error occurred during index check/setup: {e}")
    traceback.print_exc()
    sys.exit("Exiting due to index setup error.")

# --- Extract unique trading dates from the prepared index ---
try:
    all_dates_in_data = df_data.index.get_level_values(date_level_name).unique()
    # Convert to DatetimeIndex and sort (essential for get_next_trading_date)
    trading_dates = pd.DatetimeIndex(all_dates_in_data).sort_values()
    if not trading_dates.empty:
         print(f"\nTrading dates extracted using level '{date_level_name}'. Found {len(trading_dates)} unique dates (e.g., {trading_dates[0].date()} to {trading_dates[-1].date()}).")
    else:
         print(f"ERROR: No dates found in the index level '{date_level_name}'. Cannot proceed.")
         sys.exit("Exiting - Cannot determine trading dates.")

except (KeyError, IndexError) as e:
    print(f"ERROR: Could not find or access level '{date_level_name}' to extract trading dates: {e}")
    sys.exit("Exiting - Cannot determine trading dates.")
except Exception as e:
    print(f"ERROR: An unexpected error occurred extracting trading dates: {e}")
    traceback.print_exc()
    sys.exit("Exiting - Cannot determine trading dates.")


# =============================================================================
# 2. Helper Function for Trading Dates
# =============================================================================
def get_next_trading_date(current_date, sorted_trading_dates):
    """
    Finds the first trading date strictly after current_date in a sorted DatetimeIndex.

    Args:
        current_date (str, datetime, Timestamp): The date after which to find the next trading date.
        sorted_trading_dates (pd.DatetimeIndex): A sorted index of available trading dates.

    Returns:
        pd.Timestamp or None: The next trading date, or None if no date is found after current_date.
    """
    if not isinstance(sorted_trading_dates, pd.DatetimeIndex) or sorted_trading_dates.empty:
        return None # Should not happen if setup is correct, but good practice
    current_date = pd.Timestamp(current_date)
    try:
        # searchsorted finds the index where current_date would be inserted to maintain order.
        # 'right' ensures we find dates *strictly greater than* current_date.
        loc = sorted_trading_dates.searchsorted(current_date, side='right')
        if loc < len(sorted_trading_dates):
            return sorted_trading_dates[loc]
        else:
            return None # No trading date found after current_date
    except Exception as e:
        # Log error but allow potential continuation if it happens rarely
        print(f"Warning: Error in get_next_trading_date for {current_date}: {e}")
        return None

# =============================================================================
# 3. Portfolio Return Calculation Loop
# =============================================================================
portfolio_returns_data = [] # List to store results for each portfolio instance

print("\nCalculating portfolio returns...")

# Flag to track if the index mismatch warning has been shown
index_mismatch_warning_shown = False  # <---- Initialize flag HERE

if not all_portfolios:
    print("Warning: 'all_portfolios' dictionary is empty. No returns to calculate.")
else:
    processed_count = 0
    skipped_count = 0
    error_count = 0
    # Iterate through each generated portfolio
    # Using list() prevents issues if we were to modify the dict during iteration (though we aren't here)
    for portf_name, portf_data in list(all_portfolios.items()):
        try:
            # --- 3.1 Extract Parameters and Holdings ---
            params = portf_data.get('parameters')
            selected_stocks_df = portf_data.get('selected_stocks')

            # Basic validation of portfolio data structure
            if not isinstance(params, dict) or not isinstance(selected_stocks_df, pd.DataFrame):
                print(f"Warning: Skipping {portf_name} - Invalid structure (missing 'parameters' or 'selected_stocks').")
                skipped_count += 1
                continue

            portf_date_str = params.get('date')
            raw_factor = params.get('raw_score_scale_factor')
            risk_adj_factor = params.get('risk_adj_score_scale_factor')
            penalty_factor = params.get('penalty_IntraCluster_Corr') # Extract penalty factor

            # Check if essential parameters are present
            if portf_date_str is None or raw_factor is None or risk_adj_factor is None or penalty_factor is None:
                print(f"Warning: Skipping {portf_name} - Missing essential parameter (date, factors, or penalty).")
                skipped_count += 1
                continue

            portf_date = pd.Timestamp(portf_date_str)

            # Skip if the portfolio has no stocks selected
            if selected_stocks_df.empty:
                # This might be expected for some parameter combinations, so use info level
                # print(f"Info: Skipping {portf_name} - Portfolio contains no stocks.")
                skipped_count += 1
                continue

            # --- 3.2 Ensure Portfolio Index Matches Data Index ---
            # This ensures we can align weights with prices correctly
            if selected_stocks_df.index.name != ticker_level_name:
                # --- Check if we should show the warning ---
                if not index_mismatch_warning_shown:
                    print(f"Warning: Portfolio index name ('{selected_stocks_df.index.name}') "
                          f"mismatches data index ('{ticker_level_name}'). "
                          f"Renaming will occur for this and subsequent mismatches.")
                    index_mismatch_warning_shown = True # Set flag after showing warning once

                # --- Always perform the renaming if needed ---
                try:
                    selected_stocks_df = selected_stocks_df.copy()
                    selected_stocks_df.index.name = ticker_level_name
                except Exception as rename_e:
                    print(f"Error: Failed to rename portfolio index for {portf_name}: {rename_e}. Skipping.")
                    skipped_count += 1
                    continue # Skip this portfolio if renaming fails

            original_portfolio_tickers = selected_stocks_df.index.tolist()

            # --- 3.3 Determine Buy and Sell Dates ---
            buy_date = get_next_trading_date(portf_date, trading_dates)
            if buy_date is None:
                # print(f"Info: Skipping {portf_name} - No trading date found after generation date {portf_date.date()}.")
                skipped_count += 1
                continue

            sell_date = get_next_trading_date(buy_date, trading_dates)
            if sell_date is None:
                # print(f"Info: Skipping {portf_name} - No trading date found after buy date {buy_date.date()} (end of data?).")
                skipped_count += 1
                continue

            required_dates = [buy_date, sell_date]

            # --- 3.4 Fetch Price Data & Filter Available Tickers ---
            # Efficiently filter df_data for relevant tickers and dates
            potential_data = df_data.loc[
                (df_data.index.get_level_values(ticker_level_name).isin(original_portfolio_tickers)) &
                (df_data.index.get_level_values(date_level_name).isin(required_dates))
            ]

            # Identify tickers that have data for BOTH buy and sell dates
            ticker_date_counts = potential_data.index.get_level_values(ticker_level_name).value_counts()
            # We need exactly 2 dates (buy and sell) for the return calculation
            available_tickers = ticker_date_counts[ticker_date_counts == len(required_dates)].index.tolist()

            # Skip if no tickers have data for both dates
            if not available_tickers:
                # print(f"Info: Skipping {portf_name} - No stocks found with data for BOTH Buy({buy_date.date()}) and Sell({sell_date.date()}) dates.")
                skipped_count += 1
                continue

            # Report if some tickers were dropped due to missing data (optional, can be verbose)
            # dropped_tickers_count = len(original_portfolio_tickers) - len(available_tickers)
            # if dropped_tickers_count > 0:
            #      print(f"Info for {portf_name}: {dropped_tickers_count} ticker(s) dropped due to missing data on required dates.")

            # --- 3.5 Process Prices for Available Tickers ---
            try:
                # Select 'Adj Close' for the available tickers and dates
                idx_slice = pd.IndexSlice[available_tickers, required_dates]
                prices = potential_data.loc[idx_slice, ADJ_CLOSE_COL]

                # Unstack to get tickers as rows, dates as columns
                prices_unstacked = prices.unstack(level=date_level_name)

                # Rename columns for clarity
                prices_unstacked = prices_unstacked.rename(columns={buy_date: 'Buy Price', sell_date: 'Sell Price'})

                # Drop any tickers that *still* have NaN after unstacking (e.g., if data was missing for one of the dates)
                prices_unstacked = prices_unstacked.dropna()

                if prices_unstacked.empty:
                    # print(f"Info: Skipping {portf_name} - All remaining stocks had NaN prices after unstacking.")
                    skipped_count += 1
                    continue

                final_available_tickers = prices_unstacked.index.tolist()

            except KeyError as e:
                 print(f"Error: KeyError during price processing for {portf_name}: {e}. Check index slicing or column '{ADJ_CLOSE_COL}'. Skipping.")
                 skipped_count += 1
                 continue # Skip this portfolio
            except Exception as e:
                 print(f"Error: Unexpected error during price processing for {portf_name}: {e}. Skipping.")
                 traceback.print_exc()
                 skipped_count += 1
                 continue # Skip this portfolio

            # --- 3.6 Calculate Portfolio Return ---
            # Align weights to the tickers that actually have valid price data
            aligned_weights = selected_stocks_df.loc[final_available_tickers, 'Weight']

            # Normalize weights for the stocks used in calculation (sum might not be 1 if some were dropped)
            weight_sum = aligned_weights.sum()
            if weight_sum <= 1e-9: # Use tolerance for floating point comparison
                 # print(f"Info: Skipping {portf_name} - Sum of weights for available stocks is near zero ({weight_sum:.4f}).")
                 skipped_count += 1
                 continue
            normalized_weights = aligned_weights / weight_sum

            # Get final buy and sell prices
            buy_prices = prices_unstacked.loc[final_available_tickers, 'Buy Price']
            sell_prices = prices_unstacked.loc[final_available_tickers, 'Sell Price']

            # Handle potential zero prices in the denominator (can cause Inf returns)
            zero_price_mask = (buy_prices.abs() < 1e-9)
            if zero_price_mask.any():
                num_zero = zero_price_mask.sum()
                print(f"Warning: Found {num_zero} stock(s) with near-zero buy price in {portf_name} on {buy_date.date()}. Excluding them from return calculation.")
                valid_price_mask = ~zero_price_mask
                # Filter prices and weights to exclude zero-price stocks
                buy_prices = buy_prices[valid_price_mask]
                sell_prices = sell_prices[valid_price_mask]
                normalized_weights = normalized_weights.loc[valid_price_mask]

                # If excluding them removed all stocks, skip
                if buy_prices.empty:
                     print(f"Info: Skipping {portf_name} - No stocks remaining after excluding zero buy prices.")
                     skipped_count += 1
                     continue

                # Re-normalize weights after exclusion
                weight_sum = normalized_weights.sum()
                if weight_sum <= 1e-9:
                    print(f"Info: Skipping {portf_name} - Sum of weights became near-zero after removing zero-price stocks.")
                    skipped_count += 1
                    continue
                normalized_weights = normalized_weights / weight_sum

            # Final check for NaNs/Infs before calculation (should be rare after dropna)
            if buy_prices.isnull().any() or sell_prices.isnull().any() or normalized_weights.isnull().any() or \
               np.isinf(buy_prices).any() or np.isinf(sell_prices).any() or np.isinf(normalized_weights).any():
                print(f"Warning: Skipping {portf_name} - Found NaN or Inf in final arrays before calculation.")
                skipped_count += 1
                continue

            # Calculate individual stock returns (add small epsilon to avoid division by strict zero)
            individual_returns = (sell_prices / (buy_prices + 1e-12)) - 1

            # Calculate the weighted portfolio return for this period
            portfolio_return = (individual_returns * normalized_weights).sum()

            # --- 3.7 Store Results ---
            portfolio_returns_data.append({
                'portfolio_name': portf_name,
                'generation_date': portf_date,
                'buy_date': buy_date,
                'sell_date': sell_date,
                'raw_factor': raw_factor,
                'risk_adj_factor': risk_adj_factor,
                'penalty_factor': penalty_factor, # Store the penalty factor
                'portfolio_return': portfolio_return,
                'num_stocks_initial': len(original_portfolio_tickers),
                'num_stocks_calc': len(final_available_tickers) # Number used in calculation
            })
            processed_count += 1

        # Catch errors specific to processing a single portfolio
        except Exception as e:
            print(f"FATAL Error processing portfolio {portf_name}: {e}")
            print("--- Traceback for FATAL Error ---")
            traceback.print_exc()
            print("--- End Traceback ---")
            error_count += 1
            # Decide whether to continue processing other portfolios or stop
            # continue

    print(f"\nPortfolio return calculation finished.")
    print(f"Successfully processed: {processed_count}")
    print(f"Skipped (missing data/criteria): {skipped_count}")
    print(f"Errors during processing: {error_count}")

# =============================================================================
# 4. Analysis of Results
# =============================================================================
if portfolio_returns_data:
    # Convert the list of results into a DataFrame
    returns_df = pd.DataFrame(portfolio_returns_data)
    # Sort for easier viewing (optional)
    grouping_factors = ['raw_factor', 'risk_adj_factor', 'penalty_factor']
    returns_df = returns_df.sort_values(by=['generation_date'] + grouping_factors)

    print("\n--- Portfolio Return Calculation Summary ---")
    print(returns_df.head())
    print(f"\nTotal calculated return instances: {len(returns_df)}")

    # --- Group by factor combinations to analyze performance ---
    print("\n--- Performance Analysis by Factor Combination ---")

    # Define the grouping factors (ensure this list is defined correctly above)
    grouping_factors = ['raw_factor', 'risk_adj_factor', 'penalty_factor']

    # Group by all three factors and aggregate portfolio returns using keyword arguments
    performance_summary = returns_df.groupby(grouping_factors)['portfolio_return'].agg(
        # Simple aggregations by name
        mean='mean',
        median='median',
        # Custom aggregations using lambda functions
        std=lambda x: x.std(ddof=0) if pd.notna(x).sum() > 1 else np.nan, # Stddev meaningful only for >1 non-NA points
        count=lambda x: pd.notna(x).sum() # Count only valid (non-NA) returns
    )

    # Sort by mean return for easier interpretation
    performance_summary = performance_summary.sort_values(by='mean', ascending=False)
    print("\nPerformance per Factor Combination (sorted by mean return):")
    with pd.option_context('display.float_format', '{:.4f}'.format, 'display.max_rows', 200): # Show more rows if needed
        print(performance_summary)

    # --- Analyze the number of stocks used in calculation --- # (This part remains the same)
    stock_count_summary = returns_df.groupby(grouping_factors)['num_stocks_calc'].agg(['mean', 'min', 'max'])
    # ... rest of the printing ...


    with pd.option_context('display.float_format', '{:.4f}'.format, 'display.max_rows', 200): # Show more rows if needed
        print(performance_summary)

    # --- Analyze the number of stocks used in calculation ---
    stock_count_summary = returns_df.groupby(grouping_factors)['num_stocks_calc'].agg(['mean', 'min', 'max'])
    print("\nAverage/Min/Max Stocks Used in Calculation per Factor Combination:")
    with pd.option_context('display.float_format', '{:.2f}'.format):
        print(stock_count_summary)

else:
    print("\nNo portfolio returns were successfully calculated to analyze.")

# =============================================================================
# 5. Save Results (Optional)
# =============================================================================
output_returns_csv = 'portfolio_returns.csv'
output_summary_csv = 'portfolio_factor_performance.csv'

# Check if DataFrames exist before saving
if 'returns_df' in locals() and not returns_df.empty:
    try:
        returns_df.to_csv(output_returns_csv, index=False)
        print(f"\nIndividual portfolio returns saved to {output_returns_csv}")
    except Exception as e:
        print(f"\nError saving returns DataFrame to {output_returns_csv}: {e}")

if 'performance_summary' in locals() and not performance_summary.empty:
    try:
        # Reset index to save factor columns properly
        performance_summary.reset_index().to_csv(output_summary_csv, index=False)
        print(f"Performance summary saved to {output_summary_csv}")
    except Exception as e:
        print(f"\nError saving performance summary to {output_summary_csv}: {e}")

print("\n--- Script Finished ---")

--- Portfolio Performance Calculation Script ---

Checking and preparing df_data index structure...
Detected MultiIndex with names: ['Symbol', 'Date']
Using 'Symbol' for ticker level and 'Date' for date level.
Sorting df_data index...
Index sorted.

Trading dates extracted using level 'Date'. Found 251 unique dates (e.g., 2024-04-18 to 2025-04-17).

Calculating portfolio returns...

Portfolio return calculation finished.
Successfully processed: 320
Skipped (missing data/criteria): 0
Errors during processing: 0

--- Portfolio Return Calculation Summary ---
                                   portfolio_name generation_date   buy_date  sell_date  raw_factor  risk_adj_factor  penalty_factor  portfolio_return  num_stocks_initial  num_stocks_calc
0  2025-04-14_portf_rawF_0.5_riskAdjF_0.5_pen_0.0      2025-04-14 2025-04-15 2025-04-16      0.5000           0.5000          0.0000           -0.0001                  35               35
1  2025-04-14_portf_rawF_0.5_riskAdjF_0.5_pen_0.1      2025-04