In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
df_sentiment = pd.read_csv('./data/sentiment_data.csv')
df_regime = pd.read_csv('./data/pivot_regime.csv')
df_returns = pd.read_csv('./data/pivot_returns.csv')

In [3]:
df_sentiment.head()

Unnamed: 0,date,AAPL,AMZN,GOOG,MSFT,NFLX
0,2025-04-03,0.108453,0.152113,0.250044,0.086048,0.213562
1,2025-04-04,0.108453,0.152113,0.250044,0.086048,0.321464
2,2025-04-05,0.108453,0.152113,0.250044,0.086048,0.21542
3,2025-04-06,0.108453,0.152113,0.250044,0.086048,0.280937
4,2025-04-07,0.108453,0.152113,0.250044,0.086048,0.262796


In [5]:
df_regime.dropna(inplace=True)
returns = df_regime['Return']
returns

2       0.001073
3      -0.024641
4       0.008879
5      -0.003811
6      -0.027140
          ...   
2508    0.011478
2509    0.003176
2510   -0.013242
2511   -0.013263
2512   -0.007058
Name: Return, Length: 2511, dtype: float64

In [6]:
df_sentiment.dropna(inplace=True)
sentiment_scores = df_sentiment['api_sentiment_score']

aligned_sentiment = np.interp(
    np.linspace(0, 49, 2511),
    np.arange(50),
    sentiment_scores
)

scaler = MinMaxScaler()
sentiment_scaled = scaler.fit_transform(aligned_sentiment.reshape(-1,1)).flatten()


In [7]:
sentiment_scaled.shape

(2511,)

In [14]:
sentiment_weighted_returns = returns * sentiment_scaled

# Converting this to DataFrame
sentiment_weighted_df = pd.DataFrame(sentiment_weighted_returns)
sentiment_weighted_df

Unnamed: 0,Return
2,0.000716
3,-0.016394
4,0.005884
5,-0.002515
6,-0.017845
...,...
2508,0.006644
2509,0.001841
2510,-0.007692
2511,-0.007717


In [15]:
# Market Regime filtering
# Using only bullish regime for filtering (initially)
bullish_dates = df_regime[df_regime['Market Regime']=='Bullish'].index
filtered_returns = sentiment_weighted_df.loc[bullish_dates]

In [32]:
filtered_returns

Unnamed: 0,Return
8,0.016802
10,0.016835
24,0.014313
44,0.010099
47,0.009212
...,...
2441,0.023910
2454,0.010494
2458,0.009017
2476,0.011680


## HRP Implementation

In [18]:
# Correltaion matrix -> distance matrix
def correl_dist(corr):
    return ((1 - corr) / 2.) ** 0.5

In [19]:
# Compute inverse portfolio weights
def get_ivp(cov):
    ivp = 1. / np.diag(cov)
    ivp /= ivp.sum()
    return ivp

In [20]:
# Compute total variance of a given cluster
def get_cluster_var(cov, cluster_items):
    cov_ = cov.loc[cluster_items, cluster_items]
    w_ = get_ivp(cov_).reshape(-1, 1)
    return np.dot(np.dot(w_.T, cov_), w_)[0, 0]

In [21]:
# Reorder asset in quasi-diagonal form after hierarchical clustering
def get_quasi_diag(link):
    link = link.astype(int)
    sort_ix = pd.Series([link[-1, 0], link[-1, 1]])
    num_items = link[-1, 3]
    while sort_ix.max() >= num_items:
        sort_ix.index = range(0, sort_ix.shape[0] * 2, 2)
        df0 = sort_ix[sort_ix >= num_items]
        i = df0.index
        j = df0.values - num_items
        sort_ix[i] = link[j, 0]
        df1 = pd.Series(link[j, 1], index=i + 1)
        sort_ix = pd.concat([sort_ix, df1])
        sort_ix = sort_ix.sort_index()
    return sort_ix.tolist()


In [24]:
# Implement the full HRP algorithm
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

def get_hrp_weights(cov, corr):
    dist = correl_dist(corr)
    link = linkage(squareform(dist), 'single')
    sort_ix = get_quasi_diag(link)
    sorted_tickers = corr.index[sort_ix].tolist()

    hrp = pd.Series(1, index=sorted_tickers)
    clustered_items = [sorted_tickers]
    while len(clustered_items) > 0:
        clustered_items = [i[j:k] for i in clustered_items for j, k in ((0, len(i) // 2), (len(i) // 2, len(i))) if len(i) > 1]
        for i in range(0, len(clustered_items), 2):
            c_items0 = clustered_items[i]
            c_items1 = clustered_items[i + 1]
            var0 = get_cluster_var(cov, c_items0)
            var1 = get_cluster_var(cov, c_items1)
            alpha = 1 - var0 / (var0 + var1)
            hrp[c_items0] *= alpha
            hrp[c_items1] *= 1 - alpha
    return hrp / hrp.sum()

In [33]:
# Compute HRP weights on filtered returns
cov_matrix = filtered_returns.cov()
corr_matrix = filtered_returns.corr()


In [34]:
cov_matrix

Unnamed: 0,Return
Return,2.9e-05


In [35]:
corr_matrix

Unnamed: 0,Return
Return,1.0


In [36]:
weights = get_hrp_weights(cov_matrix, corr_matrix)

ValueError: The number of observations cannot be determined on an empty distance matrix.