# CUDA-Accelerated HRP - Quick Test Version (5 Random Years + 5k Stock Limit)

**Purpose**: Fast testing version with limits for debugging

**Limits Applied**:
- Only 5 random years (instead of full 1980-2024)
- Maximum 5,000 stocks per rebalance (instead of 8k+)
- Extensive debugging enabled

**Runtime**: ~1-3 minutes instead of 40+ minutes

**Use this for**:
- Testing code changes quickly
- Validating the fix works
- Debugging HRP algorithm issues

**For production**: Use the full CUDA-HRP.ipynb notebook


In [None]:
# Cell 1: Imports and GPU Setup
import pandas as pd
import numpy as np
import scipy.cluster.hierarchy as sch
from tqdm import tqdm
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time
import matplotlib.pyplot as plt
import random

# Try to bring up the optional CUDA stack. Any failure -- a missing package,
# but also a missing/broken driver or device raised while querying the GPU --
# must fall back to NumPy on the CPU instead of aborting the notebook.
try:
    import cupy as cp
    from cuml.cluster import AgglomerativeClustering

    # Query the active device before declaring the GPU usable
    device = cp.cuda.Device()
    props = cp.cuda.runtime.getDeviceProperties(device.id)
    gpu_name = props['name'].decode('utf-8')
    total_mem = props['totalGlobalMem'] / 1e9
    cuda_version = cp.cuda.runtime.runtimeGetVersion()

    GPU_AVAILABLE = True

    print(f"‚úì GPU Detected: {gpu_name}")
    print(f"  CUDA Version: {cuda_version}")
    print(f"  Memory: {total_mem:.2f} GB")
    print(f"  cuML AgglomerativeClustering: Available")
except Exception as e:
    # Deliberately broad: this is the single boundary where the optional GPU
    # dependency is probed, and every failure mode has the same remedy.
    print(f"‚ö† GPU libraries not available: {e}")
    print("  Falling back to CPU mode.")
    GPU_AVAILABLE = False
    cp = np  # Fallback to numpy

# For CPU fallback on Ledoit-Wolf
from sklearn.covariance import LedoitWolf

# Input CSV and the folder that receives this quick-test run's outputs
data_path = r'ADA-HRP-Preprocessed-DATA.csv'
rolling_dir = r'Rolling Windows Test'
os.makedirs(rolling_dir, exist_ok=True)

# Report the active compute backend and where results will be written
print("\nMode: {}".format('GPU (CUDA)' if GPU_AVAILABLE else 'CPU'))
print("Output Directory: {}".format(rolling_dir))
print("\n‚ö° QUICK TEST MODE: Processing 5 random years only")

In [None]:
# Cell 2: Load Data and Prepare Dates
df = pd.read_csv(data_path)
print(f"Loaded data: {df.shape[0]} rows, {df.shape[1]} columns")

# Every column other than the two identifier columns is treated as a date column
date_cols = [col for col in df.columns if col not in ['PERMNO', 'Company_Ticker']]

# Column names carry '_' where a timestamp would have ':'; undo that before parsing
parsed_strs = [col.replace('_', ':') for col in date_cols]
parsed_dates = pd.to_datetime(parsed_strs, errors='coerce')

# Names that do not parse come back as NaT, and NaT.strftime() raises below.
# Drop those columns (with a notice) instead of crashing.
parseable = ~parsed_dates.isna()
if not parseable.all():
    unparsed_cols = [col for col, ok in zip(date_cols, parseable) if not ok]
    print(f"Warning: ignoring {len(unparsed_cols)} non-date column(s), e.g. {unparsed_cols[:5]}")
    date_cols = [col for col, ok in zip(date_cols, parseable) if ok]
    parsed_dates = parsed_dates[parseable]

# Sort columns chronologically; date_cols, dates and date_strs stay index-aligned
sort_order = np.argsort(parsed_dates)
date_cols = [date_cols[i] for i in sort_order]
dates = parsed_dates[sort_order]
date_strs = [d.strftime('%Y-%m-%d') for d in dates]

# Force the date columns to numeric; unparseable cells become NaN
for col in date_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep only rows that carry a PERMNO (rows without one are excluded as benchmarks)
stocks_df = df[df['PERMNO'].notna()].copy()
print(f"Stocks: {len(stocks_df)} securities")

# Function to get quarterly end dates
def get_quarterly_dates(dates):
    """Return the latest date found in each (year, quarter) group of ``dates``.

    The result is a list ordered by year and then by quarter.
    """
    stamps = pd.Series(dates)
    per_quarter = stamps.groupby([stamps.dt.year, stamps.dt.quarter])
    return per_quarter.max().tolist()

quarterly_rebalance_dates = get_quarterly_dates(dates)
print(f"Total quarterly rebalance dates: {len(quarterly_rebalance_dates)}")

# Quick test: keep only 5 randomly chosen calendar years
random.seed(42)  # For reproducibility
# Sort the candidate pool so the seeded draw depends only on the data,
# not on the iteration order of a set.
available_years = sorted({d.year for d in quarterly_rebalance_dates})
selected_years = sorted(random.sample(available_years, min(5, len(available_years))))

# Filter to only the selected years (set gives O(1) membership tests)
selected_year_set = set(selected_years)
quarterly_rebalance_dates = [d for d in quarterly_rebalance_dates if d.year in selected_year_set]

print(f"\n‚ö° QUICK TEST MODE:")
print(f"  Selected years: {selected_years}")
print(f"  Quarterly dates to process: {len(quarterly_rebalance_dates)}")

In [None]:
# Cell 3: Define CUDA-Accelerated HRP Functions

def get_correlation_distance_gpu(corr_gpu):
    """Convert a correlation matrix into distances sqrt((1 - corr) / 2).

    Correlations are clamped to [-1, 1] first. If any resulting distance is
    NaN/Inf a warning is printed and that entry is replaced with 0.5.
    Array operations go through the module-level ``cp`` namespace.
    """
    bounded_corr = cp.clip(corr_gpu, -1.0, 1.0)
    half_gap = (1 - bounded_corr) / 2
    distances = cp.sqrt(cp.clip(half_gap, 0.0, None))
    has_bad_values = cp.any(~cp.isfinite(distances))
    if has_bad_values:
        print("‚ö† WARNING: NaN/Inf detected in correlation distance matrix")
    return cp.nan_to_num(distances, nan=0.5, posinf=0.5, neginf=0.5)

def get_euclidean_distance_gpu(dist_gpu):
    """Return the matrix of Euclidean distances between the rows of ``dist_gpu``.

    Uses the expansion |a|^2 + |b|^2 - 2 a.b, clamps negative values to zero
    before the square root, and replaces any NaN/Inf result with 1e-4 after
    printing a warning. Array operations go through the module-level ``cp``.
    """
    row_sq_norms = cp.sum(dist_gpu ** 2, axis=1, keepdims=True)
    cross_terms = cp.dot(dist_gpu, dist_gpu.T)
    sq_dist = row_sq_norms + row_sq_norms.T - 2 * cross_terms
    pairwise = cp.sqrt(cp.clip(sq_dist, 0.0, None))
    if cp.any(~cp.isfinite(pairwise)):
        print("‚ö† WARNING: NaN/Inf detected in Euclidean distance matrix")
    return cp.nan_to_num(pairwise, nan=1e-4, posinf=1e-4, neginf=1e-4)

def compute_covariance_gpu(returns_np, returns_gpu=None):
    """Estimate a Ledoit-Wolf shrunk covariance matrix, on the GPU when available.

    The GPU path follows the same estimator as scikit-learn's ``LedoitWolf``
    (used by the CPU fallback): the empirical covariance uses the 1/n
    convention and is shrunk towards ``mu * I`` with coefficient
    ``beta / delta``.

    Args:
        returns_np: 2-D array of returns, one row per observation and one
            column per asset.
        returns_gpu: Optional device-side copy of ``returns_np``. When omitted
            in GPU mode, ``returns_np`` is uploaded here.

    Returns:
        Tuple ``(cov, shrinkage, cov_gpu)``: the shrunk covariance as a host
        array, the shrinkage coefficient in [0, 1], and the device-side
        covariance (``None`` in CPU mode).
    """
    if not GPU_AVAILABLE:
        lw = LedoitWolf().fit(returns_np)
        return lw.covariance_, lw.shrinkage_, None

    if returns_gpu is None:
        returns_gpu = cp.asarray(returns_np)

    n_samples, n_features = returns_gpu.shape
    centered = returns_gpu - cp.mean(returns_gpu, axis=0, keepdims=True)

    # Empirical (1/n) covariance and the scaled-identity shrinkage target
    emp_cov = (centered.T @ centered) / n_samples
    mu = cp.trace(emp_cov) / n_features
    target = mu * cp.eye(n_features)

    # delta: squared Frobenius distance between emp_cov and the target, per feature
    delta = float(cp.sum((emp_cov - target) ** 2)) / n_features

    # beta: estimated sampling error of emp_cov. The 1/n_samples factor is
    # essential; without it the coefficient is ~n_samples times too large and
    # gets clipped to 1, collapsing the estimate to mu * I.
    # sum(X2.T @ X2) equals sum_t (sum_i x_ti^2)^2, which avoids a p x p product.
    centered_sq = centered ** 2
    sum_sq_products = float(cp.sum(cp.sum(centered_sq, axis=1) ** 2))
    beta = (sum_sq_products / n_samples - float(cp.sum(emp_cov ** 2))) / (n_features * n_samples)

    # Never shrink by more than 1; a zero delta means emp_cov already equals the target
    if delta > 0.0:
        shrinkage = min(max(beta, 0.0), delta) / delta
    else:
        shrinkage = 0.0

    cov_shrunk_gpu = shrinkage * target + (1 - shrinkage) * emp_cov
    return cp.asnumpy(cov_shrunk_gpu), shrinkage, cov_shrunk_gpu

def get_quasi_diag(link):
    """Return the leaf indices of a linkage matrix in left-to-right tree order.

    ``link`` is read like SciPy linkage output: columns 0 and 1 of each row
    hold the two merged node ids, ids at or above the leaf count refer to
    earlier rows, and the last row's column 3 holds the total leaf count.
    Each cluster is expanded in place into its first child followed by its
    second child until only leaves remain.
    """
    merges = link.astype(int)
    n_leaves = merges[-1, 3]

    ordered = []
    # Explicit stack; the first child sits on top so it is emitted first
    pending = [merges[-1, 1], merges[-1, 0]]
    while pending:
        node = pending.pop()
        if node < n_leaves:
            ordered.append(int(node))
            continue
        first_child = merges[node - n_leaves, 0]
        second_child = merges[node - n_leaves, 1]
        pending.append(second_child)
        pending.append(first_child)
    return ordered

def gpu_single_linkage_clustering(eucl_dist_np):
    """Build a single-linkage hierarchy from pairwise distances with SciPy.

    The name is kept for existing callers; the clustering itself runs on the
    CPU through ``scipy.cluster.hierarchy.linkage``.

    Args:
        eucl_dist_np: Either a square (n x n) pairwise distance matrix or an
            already condensed 1-D distance vector. A non-square 2-D array is
            forwarded unchanged and is therefore interpreted by SciPy as
            observation vectors.

    Returns:
        The SciPy linkage matrix, one row per merge.
    """
    from scipy.spatial.distance import squareform

    distances = np.asarray(eucl_dist_np, dtype=float)
    if distances.ndim == 2 and distances.shape[0] == distances.shape[1]:
        # linkage() treats any 2-D input as raw observations and would compute
        # a fresh set of distances between the rows. Hand it the condensed
        # upper triangle so the supplied distances are the ones clustered.
        # checks=False tolerates tiny floating-point asymmetry / non-zero diagonal.
        distances = squareform(distances, checks=False)
    return sch.linkage(distances, method='single')

def get_cluster_var(cov, c_items):
    """Return the variance of the inverse-variance-weighted portfolio of ``c_items``.

    ``cov`` is a label-indexed covariance DataFrame; ``c_items`` selects the
    rows and columns forming the cluster.
    """
    sub_cov = cov.loc[c_items, c_items].values
    inv_var = 1.0 / np.diag(sub_cov)
    cluster_weights = inv_var / inv_var.sum()
    return float(cluster_weights @ sub_cov @ cluster_weights)

def get_recursive_bisection(cov, sort_ix):
    """Allocate HRP weights by recursively bisecting the ordered asset list.

    Args:
        cov: Covariance DataFrame indexed (rows and columns) by asset label.
        sort_ix: Asset labels in quasi-diagonal (clustered) order.

    Returns:
        A ``pd.Series`` of weights that sums to 1. If ``cov`` and ``sort_ix``
        disagree (compared by ``str`` of the labels), only the shared assets
        are used, kept in ``sort_ix`` order. With fewer than two shared
        assets, equal weights over ``sort_ix`` are returned (an empty Series
        when ``sort_ix`` is empty).

    Raises:
        Exception: Any error from the per-split variance computation is
            printed with context and re-raised.
    """
    sort_ix = list(sort_ix)
    # Compare labels by their string form, but remember cov's own label objects
    cov_labels = {str(label): label for label in cov.index}
    cov_index_set = set(cov_labels)
    sort_ix_set = {str(item) for item in sort_ix}

    if cov_index_set != sort_ix_set:
        print(f"üîç DEBUG: Index mismatch detected!")
        print(f"   cov has {len(cov_index_set)} stocks")
        print(f"   sort_ix has {len(sort_ix_set)} stocks")
        print(f"   In cov but not sort_ix: {len(cov_index_set - sort_ix_set)}")
        print(f"   In sort_ix but not cov: {len(sort_ix_set - cov_index_set)}")

        # Keep the shared assets in sort_ix order: the bisection below is only
        # meaningful on the clustered ordering, which a set intersection loses.
        common_stocks = list(dict.fromkeys(
            cov_labels[str(item)] for item in sort_ix if str(item) in cov_labels
        ))
        print(f"   Using {len(common_stocks)} common stocks")

        if len(common_stocks) < 2:
            print(f"   ERROR: Not enough common stocks!")
            if not sort_ix:
                return pd.Series(dtype=float)
            # Equal weights that sum to 1 (divide by the count exactly once)
            return pd.Series(1.0 / len(sort_ix), index=sort_ix)

        cov = cov.loc[common_stocks, common_stocks]
        sort_ix = common_stocks

    w = pd.Series(1.0, index=sort_ix)
    clusters = [sort_ix]
    iteration = 0

    while clusters:
        # Split every cluster with more than one asset into two halves;
        # the halves of one parent sit next to each other in the new list.
        clusters = [
            cluster[start:stop]
            for cluster in clusters
            for start, stop in ((0, len(cluster) // 2), (len(cluster) // 2, len(cluster)))
            if len(cluster) > 1
        ]
        for k in range(0, len(clusters), 2):
            c_items0 = clusters[k]
            c_items1 = clusters[k + 1]

            try:
                c_var0 = get_cluster_var(cov, c_items0)
                c_var1 = get_cluster_var(cov, c_items1)
                # The lower-variance half receives the larger share
                alpha = 1 - c_var0 / (c_var0 + c_var1)
                w.loc[c_items0] *= alpha
                w.loc[c_items1] *= 1 - alpha
                iteration += 1
            except Exception as e:
                print(f"   ERROR in iteration {iteration}: {e}")
                print(f"   c_items0 sample: {c_items0[:3]}")
                print(f"   c_items1 sample: {c_items1[:3]}")
                raise

    # Normalize so the weights sum to exactly 1
    return w / w.sum()

# Confirmation message printed once the function definitions in this cell have run
print("‚úì CUDA-accelerated HRP functions defined (with bug fixes AND debugging)")


In [None]:
# Cell 4: Process Quarterly Rebalances - FIXED VERSION

# Per-rebalance results and wall-clock timings (in seconds) for each pipeline stage.
# A rebalance that is skipped part-way only contributes to the stages it completed,
# so the timing lists can have different lengths.
weights_list = []
timing_stats = {'cov': [], 'corr': [], 'dist': [], 'cluster': [], 'weights': [], 'io': [], 'total': []}
skipped_count = 0  # incremented on every early `continue` below, whatever the reason

for rebal_date in tqdm(quarterly_rebalance_dates, desc="Processing rebalance dates"):
    t_start = time.time()
    t_io_start = time.time()
    rebal_str = rebal_date.strftime('%Y-%m-%d')
    
    # Locate the rebalance date among the sorted date columns
    try:
        rebal_idx = date_strs.index(rebal_str)
    except ValueError:
        skipped_count += 1
        continue
    
    # The window needs 11 earlier columns plus the rebalance column itself
    if rebal_idx < 11:
        skipped_count += 1
        continue
    
    # 12 consecutive date columns ending at (and including) the rebalance date
    window_indices = list(range(rebal_idx - 11, rebal_idx + 1))
    actual_window_cols = [date_cols[i] for i in window_indices]
    # NOTE(review): window_dates is not read again in the visible code
    window_dates = [dates[i] for i in window_indices]
    
    if len(actual_window_cols) != 12:
        skipped_count += 1
        continue
    
    # Restrict to rows that have a ticker
    window_df = stocks_df[['PERMNO', 'Company_Ticker'] + actual_window_cols].copy()
    window_df = window_df[window_df['Company_Ticker'].notna()]
    
    # Keep only rows with a value in every one of the 12 window columns
    valid_mask = window_df[actual_window_cols].notna().sum(axis=1) == 12
    window_df = window_df[valid_mask]
    
    # NOTE(review): this holds by construction of the boolean filter just above
    assert len(window_df) == valid_mask.sum(), "Row filtering mismatch!"
    
    # Require at least 20 remaining rows
    if len(window_df) < 20:
        skipped_count += 1
        continue
    
    # Transpose so rows are window dates and columns are stocks labelled by str(PERMNO)
    returns = window_df[actual_window_cols].T
    returns.columns = window_df['PERMNO'].astype(str)
    
    timing_stats['io'].append(time.time() - t_io_start)

    # === FILTER OUT STOCKS WITH ZERO OR NEAR-ZERO VARIANCE ===
    stock_variance = returns.values.var(axis=0, ddof=1)
    min_variance = 1e-10
    valid_variance_mask = (stock_variance > min_variance) & np.isfinite(stock_variance)

    if valid_variance_mask.sum() < 2:
        skipped_count += 1
        continue

    # Labels of the stocks that passed the variance filter
    valid_permnos = returns.columns[valid_variance_mask].tolist()
    
    # Quick-test cap: at most 5000 stocks per rebalance, sampled with a fixed seed
    if len(valid_permnos) > 5000:
        # NOTE(review): `random` is already imported at the top of the file,
        # so this in-loop import is redundant
        import random
        random.seed(42)  # same seed on every rebalance date
        # random.sample also changes the order of valid_permnos
        valid_permnos = random.sample(valid_permnos, 5000)
        print(f"‚Ñπ {rebal_str}: Limited to 5000 stocks (from {valid_variance_mask.sum()})")
    
    # Select (and order) the return columns by valid_permnos
    returns = returns[valid_permnos]
    returns_np = returns.values
    
    # Keep window_df restricted to the same PERMNOs as `returns`
    # NOTE(review): window_df is not read again after this line in the visible code
    window_df = window_df[window_df['PERMNO'].astype(str).isin(valid_permnos)].reset_index(drop=True)
    
    # === GPU-ACCELERATED COVARIANCE ===
    t0 = time.time()
    
    # cov_gpu is the device-side matrix in GPU mode and None in CPU mode
    if GPU_AVAILABLE:
        returns_gpu = cp.asarray(returns_np)
        cov_array, shrinkage, cov_gpu = compute_covariance_gpu(returns_np, returns_gpu)
    else:
        cov_array, shrinkage, cov_gpu = compute_covariance_gpu(returns_np)
    
    timing_stats['cov'].append(time.time() - t0)
    
    # Label the covariance with the same stock ids, in the same order, as `returns`
    cov = pd.DataFrame(cov_array, index=returns.columns, columns=returns.columns)
    
    # === GPU-ACCELERATED CORRELATION ===
    t0 = time.time()
    
    # corr = cov / (std_i * std_j); std is floored at 1e-10 to avoid dividing by zero
    if GPU_AVAILABLE:
        std_gpu = cp.sqrt(cp.diag(cov_gpu))
        std_gpu = cp.where(std_gpu < 1e-10, 1e-10, std_gpu)
        corr_gpu = cov_gpu / cp.outer(std_gpu, std_gpu)
        corr_gpu = cp.clip(corr_gpu, -1.0, 1.0)
    else:
        std = np.sqrt(np.diag(cov_array))
        std = np.where(std < 1e-10, 1e-10, std)
        corr_array = cov_array / np.outer(std, std)
        corr_array = np.clip(corr_array, -1.0, 1.0)
    
    timing_stats['corr'].append(time.time() - t0)
    
    # === GPU-ACCELERATED DISTANCES ===
    t0 = time.time()
    if GPU_AVAILABLE:
        dist_gpu = get_correlation_distance_gpu(corr_gpu)
        eucl_dist_gpu = get_euclidean_distance_gpu(dist_gpu)
        eucl_dist_np = cp.asnumpy(eucl_dist_gpu)
        # NOTE(review): this host copy of the correlation matrix is not read
        # again in the visible code
        corr_array = cp.asnumpy(corr_gpu)
    else:
        # CPU branch: inline equivalents of the two distance helpers
        dist = np.sqrt(np.clip((1 - corr_array) / 2, 0.0, None))
        dist = np.nan_to_num(dist, nan=0.5, posinf=0.5, neginf=0.5)
        
        n = dist.shape[0]  # NOTE(review): unused in the visible code
        squared_norms = np.sum(dist ** 2, axis=1, keepdims=True)
        eucl_dist_np = np.sqrt(np.clip(squared_norms + squared_norms.T - 2 * np.dot(dist, dist.T), 0.0, None))
        eucl_dist_np = np.nan_to_num(eucl_dist_np, nan=1e-4, posinf=1e-4, neginf=1e-4)
    
    timing_stats['dist'].append(time.time() - t0)
    
    # === CLUSTERING ===
    t0 = time.time()
    try:
        link = gpu_single_linkage_clustering(eucl_dist_np)
        sort_ix = get_quasi_diag(link)
        # Translate positional leaf indices into stock labels
        sort_ix = [returns.columns[i] for i in sort_ix]
    except Exception as e:
        print(f"‚ö† Clustering failed for {rebal_str}: {e}, skipping")
        skipped_count += 1
        continue
    
    timing_stats['cluster'].append(time.time() - t0)
    
    # === COMPUTE HRP WEIGHTS ===
    t0 = time.time()
    try:
        hrp_weights = get_recursive_bisection(cov, sort_ix)
        
        # Renormalize if the returned weights do not sum to 1 within 1e-6
        weight_sum = hrp_weights.sum()
        if abs(weight_sum - 1.0) > 1e-6:
            print(f"‚ö† WARNING {rebal_str}: Weights sum to {weight_sum:.10f}, renormalizing...")
            hrp_weights = hrp_weights / weight_sum
        
    except Exception as e:
        print(f"‚ö† Weight computation failed for {rebal_str}: {e}, skipping")
        skipped_count += 1
        continue
    
    timing_stats['weights'].append(time.time() - t0)
    # 'total' is recorded here, so it excludes the bookkeeping below
    timing_stats['total'].append(time.time() - t_start)
    
    # Store weights: start from 0.0 for every stock in stocks_df, then overwrite
    # the entries of the stocks that received an HRP weight on this date
    weight_series = pd.Series(0.0, index=stocks_df['PERMNO'].astype(str))
    weight_series.update(hrp_weights)
    weights_list.append({
        'date': rebal_date,
        'weights': weight_series
    })

# Assemble one weight column per processed rebalance date and write them to CSV
if not weights_list:
    print("‚ö† No weights computed!")
    all_weights = pd.DataFrame()
else:
    all_weights = stocks_df[['PERMNO', 'Company_Ticker']].copy()
    for w_dict in weights_list:
        # Values are assigned positionally, in stocks_df row order
        col_name = w_dict['date'].strftime('%Y-%m-%d')
        all_weights[col_name] = w_dict['weights'].values

    # Persist next to the other quick-test outputs
    rolling_dir = 'Rolling Windows Test'
    os.makedirs(rolling_dir, exist_ok=True)
    output_path = os.path.join(rolling_dir, 'hrp_weights_quicktest.csv')
    all_weights.to_csv(output_path, index=False)

# Print timing statistics
print("\n" + "="*60)
print("QUICK TEST PERFORMANCE SUMMARY")
print("="*60)
print(f"Mode: {'GPU (CUDA)' if GPU_AVAILABLE else 'CPU'}")
print(f"Selected years: {selected_years}")
print(f"Total quarterly dates processed: {len(weights_list)}")
print(f"Skipped (insufficient history): {skipped_count}")
print(f"\nAverage timing per rebalance:")

# (label, timing_stats key) pairs, printed in pipeline order
timing_rows = [
    ("  Covariance:      ", 'cov'),
    ("  Correlation:     ", 'corr'),
    ("  Distances:       ", 'dist'),
    ("  Clustering:      ", 'cluster'),
    ("  Weight Calc:     ", 'weights'),
    ("  Total:           ", 'total'),
]
for row_label, stage_key in timing_rows:
    stage_samples = timing_stats[stage_key]
    # A stage with no samples reports nan without triggering NumPy's
    # "mean of empty slice" warning
    stage_mean_ms = np.mean(stage_samples) * 1000 if stage_samples else float('nan')
    print(f"{row_label}{stage_mean_ms:.2f} ms")

print(f"\nTotal runtime:    {np.sum(timing_stats['total']):.2f} seconds")

# Only claim the file was saved when there were weights to write
if weights_list:
    print("\n‚úì Saved test weights to hrp_weights_quicktest.csv")


In [None]:
# Cell 5: Validate the Fix - Check for Equal Weights Bug

print("="*80)
print("VALIDATION: CHECKING FOR EQUAL WEIGHTS BUG")
print("="*80)

weights_file = os.path.join(rolling_dir, 'hrp_weights_quicktest.csv')
df_weights = pd.read_csv(weights_file)

# Every column other than the identifiers holds the weights of one rebalance date
date_columns = [col for col in df_weights.columns if col not in ['PERMNO', 'Company_Ticker']]

print(f"\nAnalyzing {len(date_columns)} rebalance dates...")

equal_count = 0
dispersed_count = 0

for date_col in date_columns:
    weights = df_weights[date_col].dropna()
    # Stocks outside a date's universe are stored as 0.0. Leaving them in
    # would make every date look "dispersed" even when all invested weights
    # are identical, and would make the Max/Min ratio divide by zero.
    weights = weights[weights > 0]
    
    if len(weights) == 0:
        continue
    
    # Equal weighting means every invested stock holds exactly 1/N.
    # atol=0 keeps the comparison purely relative to the size of the weights.
    equal_weight = 1.0 / len(weights)
    is_equal = np.allclose(weights.values, equal_weight, rtol=1e-10, atol=0.0)
    
    status = "‚ùå EQUAL" if is_equal else "‚úÖ DISPERSED"
    
    print(f"\n{date_col}: {len(weights)} stocks")
    print(f"  Min weight: {weights.min():.10f}")
    print(f"  Max weight: {weights.max():.10f}")
    print(f"  Std dev: {weights.std():.10f}")
    print(f"  Max/Min ratio: {weights.max()/weights.min():.2f}x")
    print(f"  Status: {status}")
    
    if is_equal:
        equal_count += 1
    else:
        dispersed_count += 1

print(f"\n{'='*80}")
print("FINAL RESULT:")
print(f"{'='*80}")
print(f"Dates with EQUAL weights: {equal_count}")
print(f"Dates with DISPERSED weights: {dispersed_count}")

if equal_count + dispersed_count == 0:
    # Nothing was analysed, so neither success nor failure can be claimed
    print("\nNo rebalance dates with non-zero weights were found; nothing to validate.")
elif equal_count == 0:
    print("\n‚úÖ ‚úÖ ‚úÖ SUCCESS! Bug is FIXED! All weights show proper HRP dispersion!")
elif dispersed_count == 0:
    print("\n‚ùå ‚ùå ‚ùå PROBLEM STILL EXISTS! All weights are equal!")
else:
    print(f"\n‚ö†Ô∏è PARTIAL FIX: {dispersed_count}/{len(date_columns)} dates are correct")

## Next Steps

If the validation shows **✅ SUCCESS**, the bug is fixed! You can now:

1. **Run the full CUDA-HRP.ipynb** notebook to process all years
2. The full run will take ~40 minutes but produce correct, dispersed weights
3. Use the output for your portfolio analysis

If the validation shows **❌ PROBLEM**, check:
- Did you run all cells in order from top to bottom?
- Are there any error messages in the output?
- Share the error messages for further debugging