In [3]:
import sys
from pathlib import Path
import pandas as pd

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\python310.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py310\\stocks\\src', 'c:\\Users\\ping\\Files_win10\\python\\py310\\stocks\\src']


In [5]:
# Data from the Excel example
# data = {'X': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#         'Y': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}
data = {'X': [1, -2, 3, -4, 5, -6, 7, -8, 9, -10],
        'Y': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}

# Create the DataFrame
df = pd.DataFrame(data)

print(f'df:\n{df}\n')
for span in [1, 3, 21, 1e12]:
  print(f'\n\nspan:\n{span}')
  cov_matrix, corr_matrix = utils.get_cov_corr_ewm_matrices(df, span=21, return_corr=True, return_cov=True)
  print(f'\ncode_cov_matrix:\n{cov_matrix}')
  print(f'\ncode_corr_matrix:\n{corr_matrix}')

  # # print(f'\n\ncorrelation_matrix:\n{correlation_matrix}')
  print('==============')

df:
    X   Y
0   1   2
1  -2   4
2   3   6
3  -4   8
4   5  10
5  -6  12
6   7  14
7  -8  16
8   9  18
9 -10  20



span:
1

code_cov_matrix:
           X          Y
X  43.512963 -12.977287
Y -12.977287  67.675888

code_corr_matrix:
          X         Y
X  1.000000 -0.239143
Y -0.239143  1.000000


span:
3

code_cov_matrix:
           X          Y
X  43.512963 -12.977287
Y -12.977287  67.675888

code_corr_matrix:
          X         Y
X  1.000000 -0.239143
Y -0.239143  1.000000


span:
21

code_cov_matrix:
           X          Y
X  43.512963 -12.977287
Y -12.977287  67.675888

code_corr_matrix:
          X         Y
X  1.000000 -0.239143
Y -0.239143  1.000000


span:
1000000000000.0

code_cov_matrix:
           X          Y
X  43.512963 -12.977287
Y -12.977287  67.675888

code_corr_matrix:
          X         Y
X  1.000000 -0.239143
Y -0.239143  1.000000


In [1]:
import numpy as np
import pandas as pd

def corrected_ewm_corr(df, span=21):
    """
    Calculates the exponentially weighted moving (EWM) correlation matrix,
    correcting for potential biases introduced by standard EWM calculations
    and handling cases with zero variance.

    Args:
        df (pd.DataFrame): Input DataFrame containing time series data.  Each column represents a different asset.
        span (int, optional): The span parameter for the EWM calculation.  A larger span
                             gives more weight to recent data. Defaults to 21.

    Returns:
        pd.DataFrame: EWM correlation matrix, indexed and columned by the
                      columns of the input DataFrame.
    """
    alpha = 2 / (span + 1)

    ewm_mean = df.ewm(alpha=alpha, adjust=False).mean()
    demeaned = df - ewm_mean

    # Compute weights for valid observations
    weights = (1 - alpha) ** np.arange(len(df), 0, -1)
    weights /= weights.sum()

    # Compute covariance using clean data
    cov_matrix = np.einsum('t,tij->ij', weights,
                          np.einsum('ti,tj->tij', demeaned.values, demeaned.values))

    # Handle zero variances to avoid division by zero
    variances = np.diag(cov_matrix).copy()
    variances[variances <= 0] = 1e-10  # Prevent NaN/Inf during normalization
    std_devs = np.sqrt(variances)

    # Calculate correlation matrix
    correlation_matrix = cov_matrix / np.outer(std_devs, std_devs)

    return pd.DataFrame(correlation_matrix, index=df.columns, columns=df.columns)

# Data from the Excel example
# data = {'X': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#         'Y': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}
data = {'X': [1, -2, 3, -4, 5, -6, 7, -8, 9, -10],
        'Y': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}

# Create the DataFrame
df = pd.DataFrame(data)

print(f'df:\n{df}\n')
for span in [1, 3, 21, 1e12]:
  print(f'\n\nspan:\n{span}')
  correlation_matrix = corrected_ewm_corr(df, span=span)
  print(f'\ncode_corr_matrix:\n{correlation_matrix}')
  print(f'\npd_corr_matrix:\n{df.corr()}')

  # # print(f'\n\ncorrelation_matrix:\n{correlation_matrix}')
  print('==============')


# if __name__ == '__main__':
#     # Example usage:
#     # Create a sample DataFrame
#     np.random.seed(42)  # for reproducibility
#     data = np.random.randn(100, 3)
#     df = pd.DataFrame(data, columns=['Asset1', 'Asset2', 'Asset3'])

#     # Calculate the EWM correlation matrix
#     corr_matrix = corrected_ewm_corr(df, span=21)

#     # Print the correlation matrix
#     print("EWM Correlation Matrix:")
#     print(corr_matrix)

    # You can also visualize the correlation matrix using matplotlib or seaborn
    # import matplotlib.pyplot as plt
    # import seaborn as sns

    # plt.figure(figsize=(8, 6))
    # sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=.5)
    # plt.title('EWM Correlation Matrix')
    # plt.show()

df:
    X   Y
0   1   2
1  -2   4
2   3   6
3  -4   8
4   5  10
5  -6  12
6   7  14
7  -8  16
8   9  18
9 -10  20



span:
1

code_corr_matrix:
    X   Y
X NaN NaN
Y NaN NaN

pd_corr_matrix:
          X         Y
X  1.000000 -0.154807
Y -0.154807  1.000000


span:
3

code_corr_matrix:
          X         Y
X  1.000000 -0.382551
Y -0.382551  1.000000

pd_corr_matrix:
          X         Y
X  1.000000 -0.154807
Y -0.154807  1.000000


span:
21

code_corr_matrix:
          X         Y
X  1.000000 -0.239143
Y -0.239143  1.000000

pd_corr_matrix:
          X         Y
X  1.000000 -0.154807
Y -0.154807  1.000000


span:
1000000000000.0

code_corr_matrix:
          X         Y
X  1.000000 -0.279623
Y -0.279623  1.000000

pd_corr_matrix:
          X         Y
X  1.000000 -0.154807
Y -0.154807  1.000000


  weights /= weights.sum()


In [2]:
import numpy as np
import pandas as pd
import random
from pathlib import Path
import sys


# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils

def corrected_ewm_cov(df, span=21):
    alpha = 2 / (span + 1)
    
    ewm_mean = df.ewm(alpha=alpha, adjust=False).mean()  # Remove shift(1)
    demeaned = df - ewm_mean
    
    # Compute weights for valid observations
    weights = (1 - alpha) ** np.arange(len(df), 0, -1)
    weights /= weights.sum()
    
    # Compute covariance using clean data
    cov_matrix = np.einsum('t,tij->ij', weights,
                          np.einsum('ti,tj->tij', demeaned.values, demeaned.values))

    return pd.DataFrame(cov_matrix, index=df.columns, columns=df.columns)

# Generate stationary data with constant mean
# np.random.seed(0)
# df = pd.DataFrame(np.random.randn(10, 2), columns=["A", "B"])

# # Data from the Excel example
# # data = {'X': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
# #         'Y': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}
# data = {'X': [1, -2, 3, -4, 5, -6, 7, -8, 9, -10],
#         'Y': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}

# # Create the DataFrame
# df = pd.DataFrame(data)

print(f'df:\n{df}\n')
for span in [1, 3, 21, 1e12]:
  print(f'\n\nspan:\n{span}')
  covariance_matrix = corrected_ewm_cov(df, span=span)
  print(f'\ncode_cov_matrix:\n{covariance_matrix}')
  print(f'\npd_cov_matrix:\n{df.corr()}')

  # # print(f'\n\ncovariance_matrix:\n{covariance_matrix}')
  print('==============')
  # covariance_matrix = corrected_ewm_cov(df, span=span)



# df['B'] = -df['A'].copy()

# # Method 1: Compute EWM covariance and derive correlation
# cov_matrix_1 = corrected_ewm_cov(df, span=10000)
# variances = np.diag(cov_matrix_1)
# correlation_matrix_1 = cov_matrix_1 / np.outer(np.sqrt(variances), np.sqrt(variances))

# # Method 2: Standard Pearson correlation
# correlation_matrix_2 = df.corr()

# # Compare results
# print(f'df:\n{df}')
# print("Method 1 (EWM):\n", correlation_matrix_1)
# print("\nMethod 2 (Pearson):\n", correlation_matrix_2)

Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\python310.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py310\\stocks\\src']
df:
    X   Y
0   1   2
1  -2   4
2   3   6
3  -4   8
4   5  10
5  -6  12
6   7  14
7  -8  16
8   9  18
9 -10  20



span:
1

code_cov_matrix:
    X   Y
X NaN NaN
Y NaN NaN

pd_cov_matrix:
          X         Y
X  1.000000 -0.154807
Y -0.154807  1.000000


span:
3

co

  weights /= weights.sum()


In [None]:
import numpy as np
import pandas as pd

def corrected_ewm_cov(df, span=21):
    alpha = 2 / (span + 1)
    ewm_mean = df.ewm(alpha=alpha, adjust=False).mean()  # Remove shift(1)
    demeaned = df - ewm_mean
    
    # Compute weights for valid observations
    weights = (1 - alpha) ** np.arange(len(df), 0, -1)
    weights /= weights.sum()
    
    # Compute covariance using clean data
    cov_matrix = np.einsum('t,tij->ij', weights,
                          np.einsum('ti,tj->tij', demeaned.values, demeaned.values))
    
    # Handle zero variances to avoid division by zero
    variances = np.diag(cov_matrix).copy()
    variances[variances <= 0] = 1e-10  # Prevent NaN
    std_devs = np.sqrt(variances)
    
    correlation_matrix = cov_matrix / np.outer(std_devs, std_devs)
    return pd.DataFrame(correlation_matrix, index=df.columns, columns=df.columns)


# Create the DataFrame
df = pd.DataFrame(data)

print(f'df:\n{df}\n')
for span in [1, 3, 21, 1e12]:
  print(f'\n\nspan:\n{span}')
  correlation_matrix = corrected_ewm_cov(df, span=span)
  print(f'\ncode_corr_matrix:\n{correlation_matrix}')
  print(f'\npd_corr_matrix:\n{df.corr()}')

  # # print(f'\n\ncorrelation_matrix:\n{correlation_matrix}')
  print('==============')
  # correlation_matrix = corrected_ewm_cov(df, span=span)


# # Generate stationary data with constant mean
# np.random.seed(0)
# df = pd.DataFrame(np.random.randn(100000, 2), columns=["A", "B"])
# # df['B'] = -df['A'].copy()

# # Method 1: Compute EWM covariance and derive correlation
# cov_matrix_1 = corrected_ewm_cov(df, span=10000)
# variances = np.diag(cov_matrix_1)
# correlation_matrix_1 = cov_matrix_1 / np.outer(np.sqrt(variances), np.sqrt(variances))

# # Method 2: Standard Pearson correlation
# correlation_matrix_2 = df.corr()

# # Compare results
# print(f'df:\n{df}')
# print("Method 1 (EWM):\n", correlation_matrix_1)
# print("\nMethod 2 (Pearson):\n", correlation_matrix_2)

In [None]:
#### String Representation of a List of Dictionaries

you are a highly skilled and experienced quantitative trader working for Goldman Sachs. you are also an expert python coder experience writing codes for quantitative trading. Your primary responsibility is to 
write python codes to identify and construct portfolios that will outperform the market based on rigorous data analysis and sophisticated financial modeling. 

Write python code to select 50 Tickers that is expected to outperform the market in the next 5, 10, 15, 30, 60 and 120 days. Use the correlation and or covariance matrices to reduce the 50 selected tickers to 10 tickers portfolio that is expected to outperform the market in the next 5, 10, 15, 30, 60 and 120 days.

You are given three pandas dataframes: df_data, df_correlation_matrix, and df_covariance_matrix.
df_data index:
---
Ticker
---  
df_data.info():
---
<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, UBS to PCVX
Data columns (total 49 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Dividend %       1118 non-null   float64
 1   Perf Week %      1379 non-null   float64
 2   Perf Month %     1379 non-null   float64
 3   Perf Quart %     1379 non-null   float64
 4   Perf Half %      1379 non-null   float64
 5   Perf Year %      1379 non-null   float64
 6   Perf YTD %       1379 non-null   float64
 7   Beta             1378 non-null   object 
 8   ATR              1379 non-null   object 
 9   Volatility W %   1379 non-null   float64
 10  Volatility M %   1379 non-null   float64
 11  SMA20 %          1379 non-null   float64
 12  SMA50 %          1379 non-null   float64
 13  SMA200 %         1379 non-null   float64
 14  50D High %       1379 non-null   float64
 15  50D Low %        1379 non-null   float64
 16  52W High %       1379 non-null   float64
 17  52W Low %        1379 non-null   float64
 18  All-Time High %  1378 non-null   float64
 19  All-Time Low %   1377 non-null   float64
 20  RSI              1379 non-null   object 
 21  Gap %            1379 non-null   float64
 22  Rel Volume       1379 non-null   object 
 23  Price            1379 non-null   object 
 24  Change %         1379 non-null   float64
 25  MktCap AUM, M    980 non-null    float64
 26  Avg Volume, M    1379 non-null   float64
 27  Volume, M        1379 non-null   float64
 28  Sharpe 5d        1379 non-null   float64
 29  Sortino 5d       1379 non-null   float64
 30  Omega 5d         1329 non-null   float64
 31  Sharpe 10d       1379 non-null   float64
 32  Sortino 10d      1379 non-null   float64
 33  Omega 10d        1379 non-null   float64
 34  Sharpe 15d       1379 non-null   float64
 35  Sortino 15d      1379 non-null   float64
 36  Omega 15d        1379 non-null   float64
 37  Sharpe 30d       1379 non-null   float64
 38  Sortino 30d      1379 non-null   float64
 39  Omega 30d        1379 non-null   float64
 40  Sharpe 60d       1379 non-null   float64
 41  Sortino 60d      1379 non-null   float64
 42  Omega 60d        1379 non-null   float64
 43  Sharpe 120d      1379 non-null   float64
 44  Sortino 120d     1379 non-null   float64
 45  Omega 120d       1379 non-null   float64
 46  Sharpe 250d      1379 non-null   float64
 47  Sortino 250d     1379 non-null   float64
 48  Omega 250d       1379 non-null   float64
dtypes: float64(44), object(5)
memory usage: 538.7+ KB
---
df_data column descripton:
---  
Ticker: The stock symbol or identifier for the company.

Dividend %: The annual dividend yield expressed as a percentage of the current stock price.

Perf Week %: The percentage price change over the past week.

Perf Month %: The percentage price change over the past month.

Perf Quart %: The percentage price change over the past quarter (3 months).

Perf Half %: The percentage price change over the past half-year (6 months).

Perf Year %: The percentage price change over the past year (52 weeks).

Perf YTD %: The percentage price change from the beginning of the current calendar year to date.

Beta: A measure of a stock's volatility relative to the overall market (usually S&P 500). A beta of 1 indicates the stock moves with the market. Greater than 1 is more volatile, less than 1 is less volatile.

ATR (Average True Range): A technical analysis indicator measuring the average price range of a stock over a specific period (often 14 days). It represents the stock's volatility.

Volatility W %: The volatility (standard deviation of price changes) calculated over the past week.

Volatility M %: The volatility calculated over the past month.

SMA20 %: The percentage difference between the current price and the 20-day Simple Moving Average.

SMA50 %: The percentage difference between the current price and the 50-day Simple Moving Average.

SMA200 %: The percentage difference between the current price and the 200-day Simple Moving Average.

50D High %: The percentage difference between the current price and the 50-day high price.

50D Low %: The percentage difference between the current price and the 50-day low price.

52W High %: The percentage difference between the current price and the 52-week high price.

52W Low %: The percentage difference between the current price and the 52-week low price.

All-Time High %: The percentage difference between the current price and the all-time high price.

All-Time Low %: The percentage difference between the current price and the all-time low price.

RSI (Relative Strength Index): A momentum oscillator that measures the speed and change of price movements. Ranges from 0-100, commonly used to identify overbought (above 70) and oversold (below 30) conditions.

Gap %: The percentage difference between the previous day's closing price and the current day's opening price.

Rel Volume (Relative Volume): The ratio of the current trading volume to the average trading volume over a specified period (often the past 30 days). Values above 1 indicate higher than average volume.

Price: The current trading price of the stock.

Change %: The percentage change in the stock price from the previous day's close.

MktCap AUM, M (Market Capitalization or Assets Under Management, Millions): The total market value of a company's outstanding shares (market cap) or the total value of assets managed by a fund (AUM), expressed in millions of dollars.

Avg Volume, M (Average Volume, Millions): The average number of shares traded per day, expressed in millions.

Volume, M (Volume, Millions): The number of shares traded on the current day, expressed in millions.

Sharpe (5d, 10d, 15d, 30d, 60d, 120d, 250d): The Sharpe ratio measures risk-adjusted return. It represents the excess return earned per unit of risk (volatility). Higher is better. Calculated over different timeframes (5 days, 10 days, etc.).

Sortino (5d, 10d, 15d, 30d, 60d, 120d, 250d): Similar to the Sharpe ratio, but only penalizes downside risk (negative volatility). Higher is better. Calculated over different timeframes.

Omega (5d, 10d, 15d, 30d, 60d, 120d, 250d): A risk-return performance measure that uses the entire return distribution. Values above 1 indicate more upside potential than downside risk. Calculated over different timeframes.
---
df_correlation_matrix.info():
---
<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, UBS to PCVX
Columns: 1379 entries, UBS to PCVX
dtypes: float64(1379)
memory usage: 14.5+ MB
---
df_covariance_matrix.info():
---
<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, UBS to PCVX
Columns: 1379 entries, UBS to PCVX
dtypes: float64(1379)
memory usage: 14.5+ MB
---

In [None]:
import sys
from pathlib import Path
import pandas as pd

# # Notebook cell
# %load_ext autoreload
# %autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils


SOURCE_PATH, _ = utils.main_processor(
    data_dir='..\data',  # search project ..\data
    downloads_dir='',  # None searchs Downloads dir, '' omits search
    downloads_limit=10,  # search the first 10 files
    clean_name_override=None,  # override filename
    start_file_pattern='df_finviz_n_ratios' # search for files starting with 'df_'
)



In [None]:
# Set pandas display options to show more columns and rows
pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', 10)       # Limit to 10 rows for readability
# pd.set_option('display.width', None)        # Let the display adjust to the window
# pd.set_option('display.max_colwidth', None) # Show full content of each cell

In [None]:
df = pd.read_pickle(SOURCE_PATH)
display(df.info(), df.head())

In [None]:
# Drop 'Company' column from the DataFrame
# df_data = df.drop('Company', axis=1)
df_data = df.copy()
df_data

In [None]:
# Convert DataFrame to list of dictionaries with Ticker as index
finviz_dict = df_finviz_ratios.reset_index().to_dict('records')

# Preview the first 2 entries
print("First 2 entries of the list:")
print(finviz_dict[:2])

# Print total number of records
print(f"\nTotal number of records: {len(finviz_dict)}")
# finviz_dict[:2]
finviz_dict

In [None]:
df_finviz_ratios.info()

In [None]:
SOURCE_PATH, _ = utils.main_processor(
    data_dir='..\data',  # search project ..\data
    downloads_dir=None,  # None searchs Downloads dir, '' omits search
    downloads_limit=10,  # search the first 10 files
    clean_name_override=None,  # override filename
    start_file_pattern='df_correlation' # search for files starting with 'df_'
)

In [None]:
df_correlation_matrix = pd.read_pickle(SOURCE_PATH)
display(df.info(), df.head())

In [None]:
# Convert DataFrame to list of dictionaries with Ticker as index
correlation_dict = df.reset_index().to_dict('records')

# Preview the first 1 entries
print("First 1 entries of the list:")
print(correlation_dict[:1])

# Print total number of records
print(f"\nTotal number of records: {len(correlation_dict)}")
# correlation_dict[:1]
correlation_dict

In [None]:
SOURCE_PATH, _ = utils.main_processor(
    data_dir='..\data',  # search project ..\data
    downloads_dir=None,  # None searchs Downloads dir, '' omits search
    downloads_limit=10,  # search the first 10 files
    clean_name_override=None,  # override filename
    start_file_pattern='df_covariance' # search for files starting with 'df_'
)

In [None]:
df_covariance_matrix = pd.read_pickle(SOURCE_PATH)
display(df_covariance_matrix.info(), df_covariance_matrix.head())

In [None]:
# Convert DataFrame to list of dictionaries with Ticker as index
covariance_dict = df.reset_index().to_dict('records')

# Preview the first 1 entries
print("First 1 entries of the list:")
print(covariance_dict[:1])

# Print total number of records
print(f"\nTotal number of records: {len(covariance_dict)}")

In [None]:
# ===================================================================

In [None]:
# Sample data generation (replace with your actual data loading)
# def generate_sample_data():
#     tickers = ['PUK', 'OKTA', 'CVS', 'TEF', 'KEP', 
#               'ABEV', 'LYG', 'TAK', 'ICL', 'SAN',
#               'FXI', 'NOK', 'ITCI', 'EBR', 'XPEV']
    
#     # Create sample df_data
#     np.random.seed(42)
#     data = {
#         'Sharpe 5d': np.random.normal(1, 0.5, 15),
#         'Sortino 5d': np.random.normal(1.2, 0.3, 15),
#         'Volatility W %': np.abs(np.random.normal(1.5, 0.5, 15)),
#         # Add other required columns with random values...
#     }
#     df_data = pd.DataFrame(data, index=tickers)
    
#     # Create sample correlation matrix
#     corr_matrix = pd.DataFrame(
#         np.random.uniform(-0.2, 0.8, (15, 15)),
#         index=tickers, columns=tickers
#     )
#     np.fill_diagonal(corr_matrix.values, 1)
    
#     # Create sample covariance matrix
#     cov_matrix = pd.DataFrame(
#         np.random.uniform(0.5, 2.5, (15, 15)),
#         index=tickers, columns=tickers
#     )
#     np.fill_diagonal(cov_matrix.values, 1)
    
#     return df_data, corr_matrix, cov_matrix

# Execution block
if __name__ == "__main__":
    print("🚀 Generating sample data...")
    # df_data_sample, corr_sample, cov_sample = generate_sample_data()
    
    print("\n🔧 Running portfolio optimizer...")
    final_portfolio = portfolio_optimizer(
        df_data_sample, 
        corr_sample,
        cov_sample
    )
    
    print("\n✅ Execution completed. Final portfolio:")
    print(final_portfolio)

In [None]:
print("\n🔧 Running portfolio optimizer...")
final_portfolio = portfolio_optimizer(
    df_data, 
    df_correlation_matrix,
    df_correlation_matrix
)

print("\n✅ Execution completed. Final portfolio:")
print(final_portfolio)

In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

def portfolio_optimizer(df_data, df_correlation_matrix, df_covariance_matrix):
    # [Previous preprocessing and scoring code remains identical...]

    # Modified final selection section with explanations
    print("\n" + "="*80)
    print("🎯 Final Portfolio Selection with Risk Adjustment")
    print("="*80)
    
    final_portfolio = []
    for cluster_id in sorted(cluster_df['cluster'].unique()):
        cluster_members = cluster_df[cluster_df['cluster'] == cluster_id].copy()
        selected = cluster_members.nlargest(1, 'risk_adj_score')
        final_portfolio.append(selected['ticker'].values[0])
        
        # Cluster-specific explanation
        print(f"\n📦 Cluster {cluster_id} ({len(cluster_members)} members):")
        print(f"   🏆 Selected: {selected['ticker'].values[0]} "
              f"(Score: {selected['score'].values[0]:.2f}, "
              f"Volatility: {np.sqrt(selected['variance'].values[0]):.2f})")
        
        # Special case explanation for Cluster 1
        if cluster_id == 1 and 'OKTA' in cluster_members.ticker.values:
            okta = cluster_members[cluster_members.ticker == 'OKTA'].iloc[0]
            puk = cluster_members[cluster_members.ticker == 'PUK'].iloc[0]
            
            print("\n   🔍 Selection Rationale:")
            print(f"   - OKTA Score: {okta.score:.2f} vs PUK Score: {puk.score:.2f}")
            print(f"   - OKTA Volatility: {np.sqrt(okta.variance):.2f} vs PUK Volatility: {np.sqrt(puk.variance):.2f}")
            print(f"   - Risk-Adjusted Scores:")
            print(f"     OKTA: {okta.score/np.sqrt(okta.variance):.2f}")
            print(f"     PUK:  {puk.score/np.sqrt(puk.variance):.2f}")
            print("   - Despite lower raw score, PUK provides better risk efficiency")
            print("   - Cluster correlation 0.67 indicates high similarity")
            print("   - Lower volatility allows larger position sizing")
        
        print(f"   📊 Members: {', '.join(cluster_members['ticker'].tolist())}")

    # New explanatory section
    print("\n" + "="*80)
    print("📚 Portfolio Selection Methodology Documentation")
    print("="*80)
    print("1. Composite Score Components:")
    print("   - 25% Sharpe Ratio (5-120 days)")
    print("   - 25% Sortino Ratio (5-120 days)")
    print("   - 20% Omega Ratio (5-120 days)")
    print("   - 15% Momentum Factors (Weekly-YTD)")
    print("   - 10% SMA Positioning (20-200 days)")
    print("   - -5% Volatility Penalty (Weekly/Monthly)")
    
    print("\n2. Cluster Selection Criteria:")
    print("   - Ward's hierarchical clustering on correlation matrix")
    print("   - Optimal cluster count determined by market regime analysis")
    print("   - Risk-adjusted score = Composite Score / √Variance")
    print("   - Variance derived from covariance matrix diagonal")
    
    print("\n3. Key Optimization Tradeoffs:")
    print("   - Balances raw performance vs risk efficiency")
    print("   - Penalizes correlated positions via cluster limits")
    print("   - Prevents overexposure to single risk factors")
    print("   - Prefers lower volatility at equal risk-adjusted returns")
    
    print("\n4. Example: PUK vs OKTA Decision")
    print("   - OKTA: Higher raw score (1.79 vs 1.43)")
    print("   - PUK:  Lower volatility (1.28 vs 1.60 estimated)")
    print("   - Risk-Adjusted Score Calculation:")
    print("     PUK: 1.43/1.28 = 1.12")
    print("     OKTA: 1.79/1.60 = 1.12")
    print("   - Tiebreaker: Lower volatility allows larger position size")
    print("   - Correlation 0.67 makes them partial substitutes")

    # [Remaining output code unchanged...]

    return final_portfolio

In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Configure display settings
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 120)
pd.set_option('display.float_format', '{:.2f}'.format)

def portfolio_optimizer(df_data, df_correlation_matrix, df_covariance_matrix):
    # Stage 1: Data Preparation
    print("⏳ Preprocessing data...")
    object_cols = ['Beta', 'ATR', 'RSI', 'Rel Volume', 'Price']
    for col in object_cols:
        df_data[col] = pd.to_numeric(
            df_data[col].astype(str).str.replace(r'[^0-9.-]', '', regex=True),
            errors='coerce'
        )

    # Scoring configuration
    time_horizons = [5, 10, 15, 30, 60, 120]
    feature_weights = {
        'sharpe': 0.25,
        'sortino': 0.25,
        'omega': 0.20,
        'momentum': 0.15,
        'sma': 0.10,
        'volatility': -0.05
    }

    # Column definitions
    sharpe_cols = [f'Sharpe {days}d' for days in time_horizons]
    sortino_cols = [f'Sortino {days}d' for days in time_horizons]
    omega_cols = [f'Omega {days}d' for days in time_horizons]
    momentum_cols = ['Perf Week %', 'Perf Month %', 'Perf Quart %', 
                    'Perf Half %', 'Perf Year %', 'Perf YTD %']
    sma_cols = ['SMA20 %', 'SMA50 %', 'SMA200 %']
    volatility_cols = ['Volatility W %', 'Volatility M %']

    # Data cleaning
    print("🧹 Cleaning dataset...")
    required_cols = (sharpe_cols + sortino_cols + omega_cols + 
                    momentum_cols + sma_cols + volatility_cols)
    clean_mask = df_data[required_cols].notna().all(axis=1)
    df_clean = df_data.loc[clean_mask].copy()
    
    if len(df_clean) < 50:
        raise ValueError(f"Only {len(df_clean)} valid tickers after cleaning")

    # Composite score calculation
    print("🧮 Calculating composite scores...")
    def calculate_weighted_score(df):
        components = {}
        for category, cols in [('sharpe', sharpe_cols),
                              ('sortino', sortino_cols),
                              ('omega', omega_cols),
                              ('momentum', momentum_cols),
                              ('sma', sma_cols),
                              ('volatility', volatility_cols)]:
            z_scores = df[cols].apply(lambda x: (x - x.mean()) / x.std())
            components[category] = z_scores.mean(axis=1) * feature_weights[category]
        return pd.concat(components, axis=1).sum(axis=1)
    
    df_clean['composite_score'] = calculate_weighted_score(df_clean)

    # Top 50 Selection
    print("\n" + "="*80)
    print("🔝 Stage 1: Top 50 Ticker Selection")
    print("="*80)
    print("\n📊 Selection Criteria for Top 50:")
    print("- Weighted combination of risk-adjusted returns and momentum factors")
    print("- Components: Sharpe(25%), Sortino(25%), Omega(20%), Momentum(15%), SMA(10%), Volatility(-5%)")
    print(f"- Time horizons: {', '.join(map(str, time_horizons))} days")

    top_50 = df_clean.nlargest(50, 'composite_score')
    top_50_tickers = top_50.index.tolist()
    
    print("\n🏆 Top 50 Candidates (5-Column Format):")
    top_50_sorted = top_50[['composite_score']].sort_values('composite_score', ascending=False).round(2)
    for i in range(0, 50, 5):
        row = []
        for j in range(5):
            try:
                idx, score = top_50_sorted.iloc[i+j].name, top_50_sorted.iloc[i+j].values[0]
                row.append(f"{idx:5} {score:4.2f}")
            except IndexError:
                break
        print(" | ".join(row))

    # Stage 2: Portfolio Optimization
    print("\n" + "="*80)
    print("📊 Stage 2: Portfolio Optimization (50 → 10 Tickers)")
    print("="*80)
    print("\n🔀 Cluster-Based Selection Strategy:")
    print("1. 10 clusters from correlation patterns using Ward's method")
    print("2. 1 ticker selected per cluster regardless of size")
    print("3. Intra-cluster selection by highest risk-adjusted score")
    print("4. Diversification enforced through correlation matrix clustering")

    # Cluster analysis
    corr_subset = df_correlation_matrix.loc[top_50_tickers, top_50_tickers]
    distance_matrix = 1 - np.abs(corr_subset)
    np.fill_diagonal(distance_matrix.values, 0)
    linkage_matrix = linkage(squareform(distance_matrix), method='ward')
    clusters = fcluster(linkage_matrix, t=10, criterion='maxclust')

    # Cluster dataframe construction
    cluster_df = pd.DataFrame({
        'ticker': top_50_tickers,
        'cluster': clusters,
        'score': top_50['composite_score']
    }).merge(
        df_clean[['Price', 'MktCap AUM, M', 'Volatility M %']],
        left_on='ticker',
        right_index=True
    )

    # Risk calculations
    epsilon = 1e-6
    cluster_df = cluster_df.assign(
        variance=cluster_df['ticker'].apply(lambda x: df_covariance_matrix.loc[x, x]),
        risk_adj_score=lambda x: x['score'] / (np.sqrt(x['variance']) + epsilon)
    )

    # Cluster statistics
    print("\n📈 Cluster Statistics:")
    cluster_stats = cluster_df.groupby('cluster').agg(
        Size=('ticker', 'count'),
        Avg_Correlation=('ticker', lambda x: corr_subset.loc[x,x].values.mean()),
        Avg_Score=('score', 'mean'),
    ).reset_index().round(2)
    print(cluster_stats[['cluster', 'Size', 'Avg_Correlation', 'Avg_Score']].to_string(index=False))

    # Final selection
    print("\n" + "="*80)
    print("🎯 Final Portfolio Selection with Risk Adjustment")
    print("="*80)
    
    final_portfolio = []
    for cluster_id in sorted(cluster_df['cluster'].unique()):
        cluster_members = cluster_df[cluster_df['cluster'] == cluster_id].copy()
        selected = cluster_members.nlargest(1, 'risk_adj_score')
        final_portfolio.append(selected['ticker'].values[0])
        
        print(f"\n📦 Cluster {cluster_id} ({len(cluster_members)} members):")
        print(f"   🏆 Selected: {selected['ticker'].values[0]} "
              f"(Score: {selected['score'].values[0]:.2f}, "
              f"Volatility: {np.sqrt(selected['variance'].values[0]):.2f})")
        print(f"   📊 Members: {', '.join(cluster_members['ticker'].tolist())}")

    # Final output
    print("\n" + "="*80)
    print("💼 Optimized 10-Ticker Portfolio:")
    print(final_portfolio)
    print("="*80)
    print("Rationale: Combines correlation-based diversification with volatility-aware selection.")
    print("1. Enhanced scoring incorporates explicit volatility penalty")
    print("2. Cluster selection uses covariance-derived risk adjustment")
    print("3. Maintains exposure to multiple market regimes through correlation clustering")

    return final_portfolio

# Example execution
if __name__ == "__main__":
    # Load your dataframes here
    # df_data = ...
    # df_correlation_matrix = ...
    # df_covariance_matrix = ...
    
    optimized_portfolio = portfolio_optimizer(df_data, df_correlation_matrix, df_covariance_matrix)

In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Configure display settings
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 120)
pd.set_option('display.float_format', '{:.2f}'.format)

def main(df_data, df_correlation_matrix, df_covariance_matrix):
    # [Previous preprocessing and cleaning code remains identical...]
    
    # Calculate composite scores
    print("🧮 Calculating composite scores...")
    
    # [Composite score calculation code remains identical...]

    print("\n" + "="*80)
    print("🔝 Stage 1: Top 50 Ticker Selection")
    print("="*80)
    
    print("\n📊 Selection Criteria for Top 50:")
    print("- Weighted combination of risk-adjusted returns and momentum factors")
    print("- Components: Sharpe(25%), Sortino(25%), Omega(20%), Momentum(20%), SMA(10%)")
    print(f"- Time horizons: {', '.join(map(str, time_horizons))} days")

    top_50 = df_clean.nlargest(50, 'composite_score')
    top_50_sorted = top_50[['composite_score']].sort_values('composite_score', ascending=False).round(2)

    print("\n🏆 Top 50 Candidates (5-Column Format):")
    for i in range(0, 50, 5):
        row = []
        for j in range(5):
            try:
                idx, score = top_50_sorted.iloc[i+j].name, top_50_sorted.iloc[i+j].values[0]
                row.append(f"{idx:5} {score:4.2f}")
            except IndexError:
                break
        print(" | ".join(row))

    # [Cluster analysis code remains identical until cluster_stats...]

    print("\n🔀 Cluster-Based Selection Strategy:")
    print("1. 10 clusters from correlation patterns using Ward's method")
    print("2. 1 ticker selected per cluster regardless of size")
    print("3. Intra-cluster selection by highest composite score")
    print("4. Diversification enforced through correlation matrix clustering")

    print("\n📈 Cluster Statistics:")
    cluster_stats = cluster_df.groupby('cluster').agg(
        Size=('ticker', 'count'),
        Avg_Correlation=('ticker', lambda x: corr_subset.loc[x,x].values.mean()),
        Avg_Score=('score', 'mean'),
    ).reset_index()
    print(cluster_stats[['cluster', 'Size', 'Avg_Correlation', 'Avg_Score']]
          .round(2).to_string(index=False))

    # [Final portfolio selection code remains identical...]
    
    print("\n" + "="*80)
    print("💼 Final 10-Ticker Portfolio:")
    print(final_portfolio)
    print("="*80)
    print("Rationale: Diversified across 10 distinct market behavior patterns")
    print("identified through correlation clustering, while selecting the")
    print("highest-rated tickers in each cluster based on multi-factor scoring.")

    return final_portfolio

# [Rest of the code remains identical...]

In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Configure display settings
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 120)
pd.set_option('display.float_format', '{:.2f}'.format)

def main(df_data, df_correlation_matrix, df_covariance_matrix):
    # Preprocess data
    print("⏳ Preprocessing data...")
    object_cols = ['Beta', 'ATR', 'RSI', 'Rel Volume', 'Price']
    for col in object_cols:
        df_data[col] = pd.to_numeric(
            df_data[col].astype(str).str.replace(r'[^0-9.-]', '', regex=True),
            errors='coerce'
        )

    # Enhanced scoring parameters
    time_horizons = [5, 10, 15, 30, 60, 120]
    feature_weights = {
        'sharpe': 0.25,
        'sortino': 0.25,
        'omega': 0.20,
        'momentum': 0.15,
        'sma': 0.10,
        'volatility': -0.05
    }

    sharpe_cols = [f'Sharpe {days}d' for days in time_horizons]
    sortino_cols = [f'Sortino {days}d' for days in time_horizons]
    omega_cols = [f'Omega {days}d' for days in time_horizons]
    momentum_cols = ['Perf Week %', 'Perf Month %', 'Perf Quart %', 
                    'Perf Half %', 'Perf Year %', 'Perf YTD %']
    sma_cols = ['SMA20 %', 'SMA50 %', 'SMA200 %']
    volatility_cols = ['Volatility W %', 'Volatility M %']

    # Clean data
    print("🧹 Cleaning dataset...")
    required_cols = (sharpe_cols + sortino_cols + omega_cols + 
                    momentum_cols + sma_cols + volatility_cols)
    clean_mask = df_data[required_cols].notna().all(axis=1)
    df_clean = df_data.loc[clean_mask].copy()

    if len(df_clean) < 50:
        raise ValueError(f"Only {len(df_clean)} valid tickers after cleaning")

    # Calculate composite scores
    print("🧮 Calculating enhanced composite scores...")
    def calculate_weighted_score(df):
        score_components = {}
        
        for category, cols in [('sharpe', sharpe_cols),
                              ('sortino', sortino_cols),
                              ('omega', omega_cols),
                              ('momentum', momentum_cols),
                              ('sma', sma_cols),
                              ('volatility', volatility_cols)]:
            z_scores = df[cols].apply(lambda x: (x - x.mean()) / x.std())
            score_components[category] = z_scores.mean(axis=1) * feature_weights[category]
        
        return pd.concat(score_components, axis=1).sum(axis=1)

    df_clean['composite_score'] = calculate_weighted_score(df_clean)

    # Select top 50
    print("\n" + "="*80)
    print("🔝 Stage 1: Top 50 Ticker Selection")
    print("="*80)
    top_50 = df_clean.nlargest(50, 'composite_score')
    top_50_tickers = top_50.index.tolist()

    # Cluster analysis
    print("\n" + "="*80)
    print("📊 Stage 2: Portfolio Optimization (50 → 10 Tickers)")
    print("="*80)
    corr_subset = df_correlation_matrix.loc[top_50_tickers, top_50_tickers]

    # Hierarchical clustering
    distance_matrix = 1 - np.abs(corr_subset)
    np.fill_diagonal(distance_matrix.values, 0)
    linkage_matrix = linkage(squareform(distance_matrix), method='ward')
    clusters = fcluster(linkage_matrix, t=10, criterion='maxclust')

    # Create cluster analysis
    cluster_df = pd.DataFrame({
        'ticker': top_50_tickers,
        'cluster': clusters,
        'score': top_50['composite_score']
    }).merge(
        df_clean[['Price', 'MktCap AUM, M', 'Volatility M %']],
        left_on='ticker',
        right_index=True
    )

    # Precompute risk metrics
    print("\n📈 Calculating cluster risk metrics...")
    epsilon = 1e-6
    cluster_df = cluster_df.assign(
        variance=cluster_df['ticker'].apply(lambda x: df_covariance_matrix.loc[x, x]),
        risk_adj_score=lambda x: x['score'] / (np.sqrt(x['variance']) + epsilon)
    )

    # Final selection
    print("\n" + "="*80)
    print("🎯 Final Portfolio Selection with Risk Adjustment")
    print("="*80)

    final_portfolio = []
    for cluster_id in sorted(cluster_df['cluster'].unique()):
        cluster_mask = cluster_df['cluster'] == cluster_id
        cluster_members = cluster_df[cluster_mask].copy()
        
        selected = cluster_members.nlargest(1, 'risk_adj_score')
        final_portfolio.append(selected['ticker'].values[0])
        
        print(f"\n📦 Cluster {cluster_id} ({len(cluster_members)} members):")
        print(f"   🏆 Selected: {selected['ticker'].values[0]} "
              f"(Score: {selected['score'].values[0]:.2f}, "
              f"Volatility: {np.sqrt(selected['variance'].values[0]):.2f})")
        print(f"   📊 Members: {', '.join(cluster_members['ticker'].tolist())}")

    print("\n" + "="*80)
    print("💼 Optimized 10-Ticker Portfolio:")
    print(final_portfolio)
    print("="*80)
    print("Rationale: Combines correlation-based diversification with volatility-aware selection.")
    print("1. Enhanced scoring incorporates explicit volatility penalty")
    print("2. Cluster selection uses covariance-derived risk adjustment")
    print("3. Maintains exposure to multiple market regimes through correlation clustering")

    return final_portfolio

# Execute the main function
if __name__ == "__main__":
    # Assuming dataframes are loaded externally
    final_portfolio = main(df_data, df_correlation_matrix, df_covariance_matrix)

In [None]:
# ==================================================

In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Configure display settings
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 120)
pd.set_option('display.float_format', '{:.2f}'.format)

# Preprocess data
print("⏳ Preprocessing data...")
object_cols = ['Beta', 'ATR', 'RSI', 'Rel Volume', 'Price']
for col in object_cols:
    df_data[col] = pd.to_numeric(
        df_data[col].astype(str).str.replace(r'[^0-9.-]', '', regex=True),
        errors='coerce'
    )

# Define scoring parameters
time_horizons = [5, 10, 15, 30, 60, 120]
feature_weights = {
    'sharpe': 0.25,
    'sortino': 0.25,
    'omega': 0.20,
    'momentum': 0.20,
    'sma': 0.10
}

sharpe_cols = [f'Sharpe {days}d' for days in time_horizons]
sortino_cols = [f'Sortino {days}d' for days in time_horizons]
omega_cols = [f'Omega {days}d' for days in time_horizons]
momentum_cols = ['Perf Week %', 'Perf Month %', 'Perf Quart %', 
                 'Perf Half %', 'Perf Year %', 'Perf YTD %']
sma_cols = ['SMA20 %', 'SMA50 %', 'SMA200 %']

# Clean data
print("🧹 Cleaning dataset...")
required_cols = sharpe_cols + sortino_cols + omega_cols + momentum_cols + sma_cols
clean_mask = df_data[required_cols].notna().all(axis=1)
df_clean = df_data.loc[clean_mask].copy()

if len(df_clean) < 50:
    raise ValueError(f"Only {len(df_clean)} valid tickers after cleaning, need at least 50")

# Calculate composite scores
print("🧮 Calculating composite scores...")
def calculate_weighted_score(df):
    score_components = {}
    
    for category, cols in [('sharpe', sharpe_cols),
                          ('sortino', sortino_cols),
                          ('omega', omega_cols),
                          ('momentum', momentum_cols),
                          ('sma', sma_cols)]:
        z_scores = df[cols].apply(lambda x: (x - x.mean()) / x.std())
        score_components[category] = z_scores.mean(axis=1) * feature_weights[category]
    
    return pd.concat(score_components, axis=1).sum(axis=1)

df_clean['composite_score'] = calculate_weighted_score(df_clean)

# Select top 50
print("\n" + "="*80)
print("🔝 Stage 1: Top 50 Ticker Selection")
print("="*80)
top_50 = df_clean.nlargest(50, 'composite_score')
top_50_tickers = top_50.index.tolist()

print("\n📊 Selection Criteria for Top 50:")
print("- Weighted combination of risk-adjusted returns and momentum factors")
print("- Components: Sharpe(25%), Sortino(25%), Omega(20%), Momentum(20%), SMA(10%)")
print(f"- Time horizons: {', '.join(map(str, time_horizons))} days")

print("\n🏆 Top 50 Candidates (5-Column Format):")
top_50_sorted = top_50[['composite_score']].sort_values('composite_score', ascending=False).round(2)
for i in range(0, 50, 5):
    row = []
    for j in range(5):
        try:
            idx, score = top_50_sorted.iloc[i+j].name, top_50_sorted.iloc[i+j].values[0]
            row.append(f"{idx:5} {score:4.2f}")
        except IndexError:
            break
    print(" | ".join(row))

# Cluster analysis
print("\n" + "="*80)
print("📊 Stage 2: Portfolio Optimization (50 → 10 Tickers)")
print("="*80)

print("\n🔀 Cluster-Based Selection Strategy:")
print("1. 10 clusters from correlation patterns using Ward's method")
print("2. 1 ticker selected per cluster regardless of size")
print("3. Intra-cluster selection by highest composite score")
print("4. Diversification enforced through correlation matrix clustering")

corr_subset = df_correlation_matrix.loc[top_50_tickers, top_50_tickers]

# Hierarchical clustering
distance_matrix = 1 - np.abs(corr_subset)
np.fill_diagonal(distance_matrix.values, 0)

linkage_matrix = linkage(squareform(distance_matrix), method='ward')
clusters = fcluster(linkage_matrix, t=10, criterion='maxclust')

# Create cluster analysis
cluster_df = pd.DataFrame({
    'ticker': top_50_tickers,
    'cluster': clusters,
    'score': top_50['composite_score']
}).merge(
    df_clean[['Price', 'MktCap AUM, M', 'Volatility M %']],
    left_on='ticker',
    right_index=True
)

# Print cluster stats
print("\n📈 Cluster Statistics:")
cluster_stats = cluster_df.groupby('cluster').agg(
    Tickers=('ticker', list),
    Avg_Score=('score', 'mean'),
    Size=('ticker', 'count'),
    Avg_Correlation=('ticker', lambda x: corr_subset.loc[x,x].values.mean()),
).reset_index()

print(cluster_stats[['cluster', 'Size', 'Avg_Correlation', 'Avg_Score']].to_string(index=False))

# Final selection
print("\n" + "="*80)
print("🎯 Final Portfolio Selection")
print("="*80)

final_portfolio = []
for cluster_id in sorted(cluster_df['cluster'].unique()):
    cluster_data = cluster_df[cluster_df['cluster'] == cluster_id]
    selected = cluster_data.nlargest(1, 'score')
    final_portfolio.append(selected['ticker'].values[0])
    
    print(f"\n📦 Cluster {cluster_id} ({cluster_data.shape[0]} members):")
    print(f"   🏆 Selected: {selected['ticker'].values[0]} (Score: {selected['score'].values[0]:.2f})")
    print(f"   📈 Avg Cluster Correlation: {cluster_stats[cluster_stats['cluster'] == cluster_id]['Avg_Correlation'].values[0]:.2f}")
    print(f"   📊 Members: {', '.join(cluster_data['ticker'].tolist())}")

print("\n" + "="*80)
print("💼 Final 10-Ticker Portfolio:")
print(final_portfolio)
print("="*80)
print("Rationale: Diversified across 10 distinct market behavior patterns")
print("identified through correlation clustering, while selecting the")
print("highest-rated tickers in each cluster based on multi-factor scoring.")

In [None]:
['OKTA', 'EBR', 'KEP', 'TEF', 'HALO', 'ING', 'XPEV', 'BBVA', 'BABA', 'ERJ']