In [None]:
import pandas as pd
# Notebook-wide pandas display settings: never truncate columns, show up to
# 100 rows, and render floats with four decimal places.
# Other knobs that can be enabled here when needed: 'display.width' and
# 'display.max_colwidth'.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.4f}'.format),
):
    pd.set_option(option_name, option_value)

In [None]:
from config import date_str, DOWNLOAD_DIR, DEST_DIR

# Input/output locations, relative to the notebook's working directory.
# Forward slashes are used deliberately: the previous backslash literals relied
# on invalid escape sequences (`\d`, `\p`, `\{`), which Python 3.12+ reports as
# a SyntaxWarning, and they only resolved on Windows. Forward slashes are
# accepted by Windows as well as POSIX systems.
path_data = '../data/df_finviz_n_ratios_merged.parquet'
path_corr = '../data/df_corr_emv_matrix.parquet'
path_cov = '../data/df_cov_emv_matrix.parquet'
path_output = f'../picks/{date_str}_portf.txt'

In [None]:
import pandas as pd

def get_column_values_above_threshold(df, column_name='Avg Volume, M', threshold=1):
  """
  Keep the rows whose value in ``column_name`` is strictly greater than ``threshold``
  and print how many rows survived.

  Args:
    df (pd.DataFrame): The input DataFrame. It is not modified.
    column_name (str): The name of the column to compare. Defaults to 'Avg Volume, M'.
    threshold (float): Rows are kept when their value is > threshold. Defaults to 1.

  Returns:
    pd.DataFrame: The rows of ``df`` where the column's value is above the threshold.
      Rows whose value is NaN are dropped, because ``NaN > threshold`` is False.

  Raises:
    KeyError: If ``column_name`` is not a column of ``df``.
  """

  count_before = len(df)
  above_threshold_df = df[df[column_name] > threshold]
  count_after = len(above_threshold_df)
  # An empty input has no meaningful ratio; report 0% instead of dividing by zero.
  percentage = (count_after / count_before) * 100 if count_before else 0.0

  print(f"count_before: {count_before}")
  print(f"count_after above threshold ({threshold}): {count_after}")
  print(f"Percentage above threshold ({threshold}): {percentage:.2f}%")

  return above_threshold_df


In [None]:
# Load the merged ticker table and apply the liquidity filter in one step:
# only rows with 'Avg Volume, M' > 0.75 are kept.
df_data = get_column_values_above_threshold(
    pd.read_parquet(path_data),
    column_name='Avg Volume, M',
    threshold=0.75,
)

# Remove three columns that are not used downstream
# (per the original note, they contain NaNs).
df_data = df_data.drop(columns=['All-Time High %', 'All-Time Low %', 'Dividend %'])

# Pairwise correlation and covariance matrices
# (presumably indexed by ticker on both axes - verify against the producer).
df_corr = pd.read_parquet(path_corr)
df_cov = pd.read_parquet(path_cov)

# Quick visual sanity check of everything that was loaded.
print(f'\ndf_cov.shape: {df_cov.shape}')
display(df_cov.head())

print(f'\ndf_corr.shape: {df_corr.shape}')
display(df_corr.head())

print(f'\ndf_data.shape: {df_data.shape}')
display(df_data.head())
display(df_data.describe())

In [None]:
# Report whether either matrix contains any missing value at all.
has_nan_corr = df_corr.isna().to_numpy().any()
print(f"Are there any NaNs in df_corr? {has_nan_corr}")

has_nan_cov = df_cov.isna().to_numpy().any()
print(f"Are there any NaNs in df_cov? {has_nan_cov}")

In [None]:
# Show the column labels of df_data (rendered as this cell's output).
df_data.columns

In [None]:
import logging

# Root-logger configuration for the notebook: records at DEBUG level and above
# are written to `output_log`, encoded as UTF-8 (the `encoding` argument needs
# Python 3.9+).
# Note: once the root logger has handlers, any later `logging.basicConfig(...)`
# call without `force=True` does nothing.
output_log = 'output.log'
logging.basicConfig(filename=output_log, level=logging.DEBUG, encoding='utf-8')


In [None]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

def portfolio_optimizer_60days(df_data, df_corr, df_cov, num_clusters=60, num_tickers=100, output_file="portfolio_output.txt"):
    """
    Score tickers, keep the top ``num_tickers`` and group them into correlation clusters.

    Pipeline:
      1. Work on a copy of ``df_data``; text-formatted numeric columns
         ('Beta', 'ATR', 'RSI', 'Rel Volume', 'Price') are coerced to numbers and
         rows missing any scoring input are discarded.
      2. A composite score is built from column-wise z-scores of the Sharpe,
         Sortino, Omega, SMA and volatility columns (equal weight inside each
         group), decay-weighted momentum z-scores, and a flat penalty for RSI > 70.
      3. The ``num_tickers`` best scores are clustered with Ward linkage on the
         distance ``1 - |correlation|`` into at most ``num_clusters`` clusters.
      4. Each ticker gets ``risk_adj_score = score / (sqrt(variance) + 1e-6)``,
         where the variance is the ticker's diagonal entry of ``df_cov``.
      5. Log lines and the three result tables are written to ``output_file``.

    Args:
        df_data (pd.DataFrame): Per-ticker data, indexed by ticker. Not modified.
        df_corr (pd.DataFrame): Correlation matrix labelled by ticker on both axes.
        df_cov (pd.DataFrame): Covariance matrix labelled by ticker on both axes.
        num_clusters (int): Maximum number of clusters to form.
        num_tickers (int): Number of top-scoring tickers to cluster.
        output_file (str): Report file. It is truncated at the start of the call
            and then receives the log lines followed by the result tables.

    Returns:
        tuple: (zscore_df, cluster_stats_df, detailed_clusters_df)
            - zscore_df: z-scores, raw inputs, composite score and cluster id for
              every ticker that survived cleaning (cluster is NaN outside the top N).
            - cluster_stats_df: one row per cluster (size, average correlation,
              average scores, average volatility), rounded to 2 decimals.
            - detailed_clusters_df: one row per selected ticker, sorted by cluster
              and descending risk-adjusted score.

    Raises:
        ValueError: If fewer than ``num_tickers`` rows survive cleaning.
        KeyError: If a required column is missing from ``df_data`` or a selected
            ticker is missing from ``df_corr`` / ``df_cov``.

    Logging:
        Messages go to a dedicated logger named 'portfolio_optimizer_60days'.
        For the duration of the call that logger writes to ``output_file`` and
        to the console; both handlers are removed and closed afterwards, so the
        root logger configuration is never changed and no file handle is leaked.
        Records still propagate to whatever handlers the root logger has.
    """
    # Start from an empty report file. The log handler and the result tables
    # both *append* to it, so neither can overwrite the other's output.
    with open(output_file, 'w'):
        pass

    logger = logging.getLogger('portfolio_optimizer_60days')
    previous_level = logger.level
    log_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handlers = [logging.FileHandler(output_file, mode='a'), logging.StreamHandler()]
    for handler in handlers:
        handler.setFormatter(log_format)
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    try:
        logger.info("Starting optimized portfolio selection")

        # ===== STAGE 1: DATA PREPARATION =====
        # Never touch the caller's frame: all conversions happen on a copy.
        df_data = df_data.copy()

        # Strip currency symbols, thousands separators, etc. from text columns.
        # Columns that are already numeric are left alone: sending them through
        # str() and the regex would turn values such as 1e-05 into "1-05" -> NaN.
        numeric_cols = ['Beta', 'ATR', 'RSI', 'Rel Volume', 'Price']
        for col in numeric_cols:
            if col not in df_data.columns or pd.api.types.is_numeric_dtype(df_data[col]):
                continue
            df_data[col] = pd.to_numeric(
                df_data[col].astype(str).str.replace(r'[^0-9.-]', '', regex=True),
                errors='coerce'
            )

        # ===== SCORING CONFIGURATION =====
        time_horizons = [3, 5, 10, 15, 30, 60]
        feature_weights = {
            'sharpe': 0.20,
            'sortino': 0.20,
            'omega': 0.15,
            'momentum': 0.20,
            'sma': 0.15,
            'volatility': -0.10,  # higher volatility lowers the score
            'rsi': -0.05          # flat penalty applied when RSI > 70
        }

        # Column definitions
        sharpe_cols = [f'Sharpe {days}d' for days in time_horizons]
        sortino_cols = [f'Sortino {days}d' for days in time_horizons]
        omega_cols = [f'Omega {days}d' for days in time_horizons]
        momentum_cols = ['Perf 3D %', 'Perf Week %', 'Perf Month %', 'Perf Quart %']
        sma_cols = ['SMA20 %', 'SMA50 %', 'SMA200 %']
        volatility_cols = ['Volatility W %', 'Volatility M %']
        # One weight per momentum column, in the same order (shortest horizon first).
        momentum_weights = np.array([0.4, 0.3, 0.2, 0.1])

        # Keep only rows that have every scoring input.
        required_cols = (sharpe_cols + sortino_cols + omega_cols +
                         momentum_cols + sma_cols + volatility_cols + ['RSI'])
        clean_mask = df_data[required_cols].notna().all(axis=1)
        df_clean = df_data.loc[clean_mask].copy()
        logger.info("%d of %d tickers have complete scoring inputs", len(df_clean), len(df_data))

        if len(df_clean) < num_tickers:
            raise ValueError(f"Only {len(df_clean)} valid tickers after cleaning")

        # ===== SCORE CALCULATION =====
        def zscore(frame):
            # Column-wise standardization using the sample standard deviation.
            return (frame - frame.mean()) / frame.std()

        def calculate_weighted_score(df):
            components = {}
            intermediate_values = {}
            raw_values = {}

            def add_equal_weight_component(category, cols):
                # Average the z-scores of a column group, then apply the group weight.
                z_scores = zscore(df[cols])
                intermediate_values[f'{category}_zscores'] = z_scores
                raw_values[f'{category}_raw'] = df[cols]
                components[category] = z_scores.mean(axis=1) * feature_weights[category]

            # Risk-adjusted metrics
            add_equal_weight_component('sharpe', sharpe_cols)
            add_equal_weight_component('sortino', sortino_cols)
            add_equal_weight_component('omega', omega_cols)

            # Momentum with decay weights (recent performance counts most)
            momentum_zscores = zscore(df[momentum_cols])
            intermediate_values['momentum_zscores'] = momentum_zscores
            raw_values['momentum_raw'] = df[momentum_cols]
            components['momentum'] = (momentum_zscores @ momentum_weights) * feature_weights['momentum']

            # RSI penalty: a fixed deduction for RSI above 70, otherwise nothing.
            components['rsi'] = pd.Series(
                np.where(df['RSI'] > 70, feature_weights['rsi'], 0),
                index=df.index
            )

            # Technical indicators
            add_equal_weight_component('sma', sma_cols)
            add_equal_weight_component('volatility', volatility_cols)

            composite_score = pd.concat(components, axis=1).sum(axis=1)
            return composite_score, intermediate_values, raw_values

        df_clean['composite_score'], intermediates, raw_vals = calculate_weighted_score(df_clean)

        # ===== STAGE 2: CLUSTERING =====
        top_n = df_clean.nlargest(num_tickers, 'composite_score')
        top_n_tickers = top_n.index.tolist()
        corr_subset = df_corr.loc[top_n_tickers, top_n_tickers]

        # Average with the transpose so tiny floating-point asymmetries cannot
        # make squareform() reject the matrix.
        corr_subset = (corr_subset + corr_subset.T) / 2

        # Build the distance matrix as a fresh NumPy array. Writing into
        # DataFrame.values is not reliable: it may be a copy, or read-only when
        # pandas Copy-on-Write is enabled.
        distance = 1 - np.abs(corr_subset.to_numpy(dtype=float))
        np.fill_diagonal(distance, 0)

        condensed_dist = squareform(distance)
        linkage_matrix = linkage(condensed_dist, method='ward')
        clusters = fcluster(linkage_matrix, t=num_clusters, criterion='maxclust')

        # ===== STAGE 3: PORTFOLIO SELECTION =====
        cluster_df = pd.DataFrame({
            'ticker': top_n_tickers,
            'cluster': clusters,
            'score': top_n['composite_score']
        }).merge(
            df_clean[['Price', 'MktCap AUM, M', 'Volatility M %']],
            left_on='ticker',
            right_index=True
        )

        epsilon = 1e-6  # keeps the division finite when a variance is zero
        cluster_df = cluster_df.assign(
            variance=cluster_df['ticker'].apply(lambda x: df_cov.loc[x, x]),
            risk_adj_score=lambda x: x['score'] / (np.sqrt(x['variance']) + epsilon),
            volatility=lambda x: np.sqrt(x['variance'])
        )

        # Prepare outputs
        detailed_clusters_df = cluster_df.sort_values(['cluster', 'risk_adj_score'],
                                                      ascending=[True, False])
        detailed_clusters_df = detailed_clusters_df[['cluster', 'ticker', 'score',
                                                     'risk_adj_score', 'volatility']]
        detailed_clusters_df.columns = ['Cluster_ID', 'Ticker', 'Raw_Score',
                                        'Risk_Adj_Score', 'Volatility']

        # NOTE: Avg_Correlation averages the full within-cluster block, including
        # the diagonal of self-correlations, so a single-member cluster reports 1.0.
        cluster_stats_df = cluster_df.groupby('cluster').agg(
            Size=('ticker', 'count'),
            Avg_Correlation=('ticker', lambda x: corr_subset.loc[x, x].values.mean()),
            Avg_Raw_Score=('score', 'mean'),
            Avg_Risk_Adj_Score=('risk_adj_score', 'mean'),
            Avg_Volatility=('volatility', 'mean')
        ).reset_index().round(2)
        cluster_stats_df.columns = ['Cluster_ID', 'Size', 'Avg_Correlation',
                                    'Avg_Raw_Score', 'Avg_Risk_Adj_Score', 'Avg_Volatility']

        # ===== SCORING DETAILS TABLE =====
        # Each dict entry is a DataFrame; prefix its columns with the entry's key
        # and join everything side by side in a single concat.
        intermediates_df = pd.concat(
            [frame.add_prefix(f'{key}_') for key, frame in intermediates.items()],
            axis=1
        )
        raw_vals_df = pd.concat(
            [frame.add_prefix(f'{key}_') for key, frame in raw_vals.items()],
            axis=1
        )

        zscore_df = pd.concat([
            intermediates_df,
            raw_vals_df,
            df_clean['composite_score'],
            cluster_df.set_index('ticker')['cluster'].reindex(df_clean.index)
        ], axis=1)

        # Append the result tables to the report file.
        with open(output_file, 'a') as f:
            f.write("\n\n=== CLUSTER STATISTICS ===\n")
            cluster_stats_df.to_string(f, index=False)
            f.write("\n\n=== DETAILED CLUSTERS ===\n")
            detailed_clusters_df.to_string(f, index=False)
            f.write("\n\n=== SCORING DETAILS (1-20) ===\n")
            zscore_df.head(20).to_string(f)  # Only write top 20 rows for brevity

        logger.info("Portfolio optimization completed successfully")
        return zscore_df, cluster_stats_df, detailed_clusters_df

    except Exception as e:
        logger.error(f"Optimization failed: {str(e)}", exc_info=True)
        raise

    finally:
        # Detach and close the per-call handlers so the report file is released
        # and repeated calls do not stack up duplicate handlers.
        for handler in handlers:
            logger.removeHandler(handler)
            handler.close()
        logger.setLevel(previous_level)



In [None]:
# Score and cluster the liquidity-filtered universe; the report goes to path_output.
print("\n🔧 Running portfolio optimizer...")

# num_tickers=len(df_data) asks for every row of df_data. The optimizer raises
# ValueError when fewer rows than that survive its own missing-value cleaning,
# so this call expects df_data to have no NaNs in the scoring columns.
zscore_df, cluster_stats_df, detailed_clusters_df = portfolio_optimizer_60days(
    df_data, 
    df_corr,
    df_cov,
    num_clusters=60,
    num_tickers=len(df_data),
    output_file=path_output,
)

print(f"\n✅ Execution completed.\nSee output save to: {path_output}\nOutput log: {output_log}")


In [None]:
# Flush and close every logging handler so the log files are completely written.
# The logging module should not be used again after this call.
logging.shutdown()

In [None]:
import pandas as pd
import numpy as np

def select_stocks_from_clusters(cluster_stats_df, detailed_clusters_df, 
                               num_clusters=3, stocks_per_cluster=5,
                               min_cluster_size=5, volatility_threshold=0.3):
    """
    Pick the best clusters, then the best stocks inside each of them.

    Clusters with at least ``min_cluster_size`` members are ranked by
    ``0.7 * Avg_Raw_Score + 0.3 * (1 - Avg_Correlation)`` and the top
    ``num_clusters`` are kept. Inside each kept cluster, stocks whose
    ``Volatility`` is at most ``volatility_threshold`` are ranked by
    ``Risk_Adj_Score`` and the top ``stocks_per_cluster`` are selected.

    Parameters:
    - cluster_stats_df: DataFrame with columns Cluster_ID, Size, Avg_Correlation,
      Avg_Raw_Score, Avg_Risk_Adj_Score, Avg_Volatility. Not modified.
    - detailed_clusters_df: DataFrame with columns Cluster_ID, Ticker, Raw_Score,
      Risk_Adj_Score, Volatility. Not modified.
    - num_clusters: Number of top clusters to select
    - stocks_per_cluster: Number of stocks to select from each cluster
    - min_cluster_size: Minimum size for a cluster to be considered
    - volatility_threshold: Maximum allowed volatility for selected stocks

    Returns:
    - selected_stocks: the chosen rows of ``detailed_clusters_df`` plus the
      columns ``Cluster_<metric>`` (metrics of the owning cluster) and ``Weight``,
      sorted by cluster and descending risk-adjusted score. When nothing
      qualifies, an empty DataFrame that still carries all of these columns is
      returned, so column lookups by the caller keep working.
    - cluster_performance: the selected clusters with ``Composite_Cluster_Score``,
      ``Stocks_Selected`` and ``Intra_Cluster_Diversification`` added.

    Notes:
    - ``Weight`` is each stock's share of the summed ``Risk_Adj_Score``. If that
      sum is zero or not finite, equal weights are used instead. Negative scores
      are not treated specially: when they are present, individual weights can
      be negative or larger than 1.
    """
    cluster_metric_cols = ['Composite_Cluster_Score', 'Avg_Correlation', 'Avg_Volatility',
                           'Avg_Raw_Score', 'Avg_Risk_Adj_Score']

    # ===== 1. Filter and Rank Clusters =====
    qualified_clusters = cluster_stats_df[cluster_stats_df['Size'] >= min_cluster_size].copy()

    # Reward a high average score and a low internal correlation.
    qualified_clusters['Composite_Cluster_Score'] = (
        0.7 * qualified_clusters['Avg_Raw_Score'] +
        0.3 * (1 - qualified_clusters['Avg_Correlation'])
    )

    selected_clusters = (
        qualified_clusters
        .sort_values('Composite_Cluster_Score', ascending=False)
        .head(num_clusters)
    )

    # ===== 2. Select Stocks from Each Cluster =====
    selected_stocks_list = []

    for cluster_id in selected_clusters['Cluster_ID'].tolist():
        in_cluster = detailed_clusters_df['Cluster_ID'] == cluster_id
        calm_enough = detailed_clusters_df['Volatility'] <= volatility_threshold
        candidates = detailed_clusters_df[in_cluster & calm_enough]
        if candidates.empty:
            continue

        # Explicit copy: columns are added below and must not be written onto a
        # slice of the caller's frame (avoids SettingWithCopyWarning).
        top_stocks = (
            candidates
            .sort_values('Risk_Adj_Score', ascending=False)
            .head(stocks_per_cluster)
            .copy()
        )

        # Attach the owning cluster's metrics to every selected stock.
        cluster_metrics = selected_clusters[selected_clusters['Cluster_ID'] == cluster_id].iloc[0]
        for col in cluster_metric_cols:
            top_stocks[f'Cluster_{col}'] = cluster_metrics[col]

        selected_stocks_list.append(top_stocks)

    if selected_stocks_list:
        selected_stocks = pd.concat(selected_stocks_list)

        # Position sizing: share of the total risk-adjusted score.
        score_total = selected_stocks['Risk_Adj_Score'].sum()
        if np.isfinite(score_total) and score_total != 0:
            selected_stocks['Weight'] = selected_stocks['Risk_Adj_Score'] / score_total
        else:
            # A zero or non-finite total would give inf/NaN weights.
            selected_stocks['Weight'] = 1.0 / len(selected_stocks)

        selected_stocks = selected_stocks.sort_values(['Cluster_ID', 'Risk_Adj_Score'],
                                                      ascending=[True, False])
    else:
        # Empty result that keeps the full column layout of the non-empty case.
        selected_stocks = detailed_clusters_df.iloc[0:0].reindex(
            columns=list(detailed_clusters_df.columns)
            + [f'Cluster_{col}' for col in cluster_metric_cols]
            + ['Weight']
        )
        print("Warning: No stocks met selection criteria")

    # ===== 3. Prepare Enhanced Output Reports =====
    cluster_performance = selected_clusters.copy()
    picks_per_cluster = selected_stocks['Cluster_ID'].value_counts()
    cluster_performance['Stocks_Selected'] = (
        cluster_performance['Cluster_ID'].map(picks_per_cluster).fillna(0).astype(int)
    )

    # This depends only on the cluster statistics, so it is always provided.
    cluster_performance['Intra_Cluster_Diversification'] = 1 - cluster_performance['Avg_Correlation']

    return selected_stocks, cluster_performance


In [None]:
# Run the selection pipeline
selected_stocks, cluster_performance = select_stocks_from_clusters(
    cluster_stats_df=cluster_stats_df,
    detailed_clusters_df=detailed_clusters_df,
    num_clusters=3,
    stocks_per_cluster=3,
    min_cluster_size=5,
    volatility_threshold=0.3
)

# Explain how the selection below was made.
print("\n=== CLUSTER SELECTION CRITERIA ===")
print("* Using Avg_Raw_Score for cluster selection")
print("* Using Risk_Adj_Score for stock selection within clusters")
print(f"* Selected top {len(cluster_performance)} clusters from {len(cluster_stats_df)} total")

print("\n=== SELECTED CLUSTERS (RANKED BY RAW SCORE) ===")
# Some report columns may be absent (for example when no stock was selected),
# so only the columns that actually exist are shown instead of raising KeyError.
display_cols = [col for col in ['Cluster_ID', 'Size', 'Avg_Raw_Score', 'Avg_Risk_Adj_Score', 
                                'Avg_Correlation', 'Avg_Volatility', 'Composite_Cluster_Score',
                                'Stocks_Selected', 'Intra_Cluster_Diversification']
                if col in cluster_performance.columns]
print(cluster_performance[display_cols].sort_values('Avg_Raw_Score', ascending=False).to_string(index=False))

# For context, list the eight highest raw scores of every selected cluster
# (these are not necessarily the stocks that end up in the portfolio).
print("\n=== TOP STOCKS BY RAW SCORE PER CLUSTER ===")
for cluster_id in cluster_performance['Cluster_ID']:
    cluster_stocks = detailed_clusters_df[detailed_clusters_df['Cluster_ID'] == cluster_id]
    top_raw = cluster_stocks.nlargest(8, 'Raw_Score')[['Ticker', 'Raw_Score', 'Risk_Adj_Score', 'Volatility']]
    cluster_row = cluster_performance[cluster_performance['Cluster_ID'] == cluster_id]

    print(f"\nCluster {cluster_id} - Top 8 by Raw Score:")
    print(top_raw.to_string(index=False))
    print(f"Cluster Avg Raw Score: {cluster_row['Avg_Raw_Score'].values[0]:.2f}")
    print(f"Cluster Avg Risk Adj Score: {cluster_row['Avg_Risk_Adj_Score'].values[0]:.2f}")

print("\n=== FINAL SELECTED STOCKS (BY RISK-ADJ SCORE) ===")
print("* Stocks actually selected based on Risk_Adj_Score within each cluster")
print("* Position weights assigned based on Risk_Adj_Score")

available_cols = [col for col in ['Cluster_ID', 'Ticker', 'Raw_Score', 'Risk_Adj_Score', 
                                'Volatility', 'Weight', 'Cluster_Avg_Raw_Score',
                                'Cluster_Avg_Risk_Adj_Score'] 
                  if col in selected_stocks.columns]

if selected_stocks.empty:
    # An empty selection may have no columns at all; sorting it would raise KeyError.
    print("No stocks were selected.")
else:
    print(selected_stocks[available_cols].sort_values(['Cluster_ID', 'Risk_Adj_Score'], 
                                                    ascending=[True, False]).to_string(index=False))

    # Portfolio summary
    print("\n=== PORTFOLIO SUMMARY ===")
    print(f"Total Stocks Selected: {len(selected_stocks)}")
    print(f"Average Raw Score: {selected_stocks['Raw_Score'].mean():.2f}")
    print(f"Average Risk-Adjusted Score: {selected_stocks['Risk_Adj_Score'].mean():.2f}")
    print(f"Average Volatility: {selected_stocks['Volatility'].mean():.2f}")
    print("\nCluster Distribution:")
    print(selected_stocks['Cluster_ID'].value_counts().to_string())

In [None]:
selected_stocks

In [None]:
# Manual overrides: tickers and one cluster to exclude from the final portfolio.
drop_tickers = ['BECN']
drop_cluster = 0

# Step 1: Measure the weight held by the rows that are about to be removed.
# This has to happen BEFORE the weights are renormalized; afterwards the
# weights sum to 1 again and the removed share can no longer be recovered.
# (Re-running this cell reports 0%, because the rows are already gone.)
removed_mask = (selected_stocks['Ticker'].isin(drop_tickers)
                | (selected_stocks['Cluster_ID'] == drop_cluster))
removed_weight = selected_stocks.loc[removed_mask, 'Weight'].sum()
selected_stocks = selected_stocks[~removed_mask].copy()

# Step 2: Recalculate weights based on remaining stocks' Risk_Adj_Scores
selected_stocks['Weight'] = selected_stocks['Risk_Adj_Score'] / selected_stocks['Risk_Adj_Score'].sum()

# Step 3: Display the updated portfolio
print(f"\n=== UPDATED PORTFOLIO (CLUSTER {drop_cluster}, TICKER {drop_tickers} REMOVED) ===")
print(selected_stocks[['Cluster_ID', 'Ticker', 'Raw_Score', 'Risk_Adj_Score', 'Weight', 'Volatility']]
      .sort_values(['Weight', 'Cluster_ID'], ascending=[False, False])
      .to_string(index=False))

# Weight redistribution summary
new_total = selected_stocks['Weight'].sum()
print(f"\nWeights redistributed from Cluster {drop_cluster} & Ticker {drop_tickers}: {removed_weight:.1%}")
print(f"New total weights sum to: {new_total:.0%}")

In [None]:
# Manual overrides: tickers and one cluster to exclude from the final portfolio.
drop_tickers = ['BECN']
drop_cluster = 0

# Step 1: Measure the weight held by the rows that are about to be removed.
# This has to happen BEFORE the weights are renormalized; afterwards the
# weights sum to 1 again and the removed share can no longer be recovered.
# (If the rows were already removed earlier, this reports 0%.)
removed_mask = (selected_stocks['Ticker'].isin(drop_tickers)
                | (selected_stocks['Cluster_ID'] == drop_cluster))
removed_weight = selected_stocks.loc[removed_mask, 'Weight'].sum()
selected_stocks = selected_stocks[~removed_mask].copy()

# Step 2: Recalculate weights based on remaining stocks' Risk_Adj_Scores
selected_stocks['Weight'] = selected_stocks['Risk_Adj_Score'] / selected_stocks['Risk_Adj_Score'].sum()

# Step 3: Display the updated portfolio, heaviest position first
print(f"\n=== UPDATED PORTFOLIO (CLUSTER {drop_cluster}, TICKER {drop_tickers} REMOVED) ===")
sorted_df = selected_stocks[['Cluster_ID', 'Ticker', 'Raw_Score', 'Risk_Adj_Score', 'Weight', 'Volatility']] \
              .sort_values(['Weight', 'Cluster_ID'], ascending=[False, False])

print(sorted_df.to_string(index=False))

# Step 4: Keep the tickers in the displayed (weight-sorted) order for later lookups
ticker_order = sorted_df['Ticker'].tolist()

# Weight redistribution summary
new_total = selected_stocks['Weight'].sum()
print(f"\nWeights redistributed from Cluster {drop_cluster} & Ticker {drop_tickers}: {removed_weight:.1%}")
print(f"New total weights sum to: {new_total:.0%}")

In [None]:
# Show the full df_data rows of the final picks, in the same order as the
# weight-sorted portfolio table (ticker_order).
display(df_data.loc[ticker_order])

In [None]:
# Persist the final selection and both cluster tables next to the text report.
# Forward slashes replace the previous backslash literals, whose `\p` and `\{`
# are invalid escape sequences (SyntaxWarning on Python 3.12+) and which only
# resolved on Windows; forward slashes work on Windows and POSIX alike.
selected_stocks.to_parquet(f'../picks/{date_str}_selected_stocks.parquet')
cluster_stats_df.to_parquet(f'../picks/{date_str}_cluster_stats_df.parquet')
detailed_clusters_df.to_parquet(f'../picks/{date_str}_detailed_clusters_df.parquet')

In [None]:
import matplotlib.pyplot as plt

PLOT_COLORS = ['lightgreen', 'skyblue', 'salmon', 'gold', 'orchid',
                     'lightcoral', 'deepskyblue', 'mediumpurple', 'darkseagreen', 'tan']

# Five stacked bar charts, one per cluster statistic, all keyed by Cluster_ID.
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(5, 1, figsize=(12, 12))
fig.suptitle('Cluster Statistics Analysis')

# One entry per panel: (axis, statistic column, bar color, title, y-axis label).
panel_specs = [
    (ax1, 'Avg_Risk_Adj_Score', 'skyblue',
     'Average Risk-Adjusted Scores by Cluster', 'Average Risk-Adjusted Score'),
    (ax2, 'Avg_Raw_Score', 'lightgreen',
     'Average RawScores by Cluster', 'Average Raw Score'),
    (ax3, 'Avg_Correlation', 'salmon',
     'Average Correlation within Clusters', 'Average Correlation'),
    (ax4, 'Avg_Volatility', 'gold',
     'Average Volatility within Clusters', 'Average Volatility'),
    (ax5, 'Size', 'orchid',
     'Cluster Sizes', 'Number of Members'),
]

for panel_ax, stat_column, bar_color, panel_title, y_label in panel_specs:
    panel_ax.bar(cluster_stats_df['Cluster_ID'], cluster_stats_df[stat_column], color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Cluster_ID')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

In [None]:
# Per-column summary of cluster_stats_df. Mean and standard deviation are
# computed once and reused for the one- and two-sigma bands.
cluster_col_mean = cluster_stats_df.mean(numeric_only=True)
cluster_col_std = cluster_stats_df.std(numeric_only=True)

stats_summary = pd.DataFrame({
    'Count': cluster_stats_df.count(numeric_only=True),
    'Sum': cluster_stats_df.sum(numeric_only=True),
    'Mean': cluster_col_mean,
    'Std': cluster_col_std,
    'Mean+1Std (68%)': cluster_col_mean + cluster_col_std,
    'Mean-1Std (68%)': cluster_col_mean - cluster_col_std,
    'Mean+2Std (95%)': cluster_col_mean + 2*cluster_col_std,
    'Mean-2Std (95%)': cluster_col_mean - 2*cluster_col_std,
    'Min': cluster_stats_df.min(numeric_only=True),
    'Max': cluster_stats_df.max(numeric_only=True),
})

print("Summary Statistics for Cluster Data:")
display(stats_summary.round(4))

In [None]:
# Rank clusters from highest to lowest average raw score and show the top ten.
sorted_by_Avg_Raw_Score = cluster_stats_df.sort_values(by='Avg_Raw_Score', ascending=False)
print('sorted_by_Avg_Raw_Score')
display(sorted_by_Avg_Raw_Score.head(10))