In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from reinforcetrader.data_pipeline import RawDataLoader

# Reward Function Definition

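We first define the notation used in the reward functions. $H_b$ and $H_s$ are the sets of look-ahead horizons (in trading days) used on the buy side and the sell side. $R_h^{\mathrm{ex}}(t)$ is the $h$-day-ahead log return of the stock price $P$ in excess of the log return of the benchmark index $B$. $w^{+}$ and $w^{-}$ are softmax weights over a horizon set $S$ with temperature $\tau > 0$: $w^{+}$ puts more weight on horizons with higher excess return, $w^{-}$ on horizons with lower excess return.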

$$
H_b = \{10, 15, 20, 25\}, \quad H_s = \{3, 5, 10\}.
$$

$$
R_h^{\mathrm{ex}}(t)
= \ln\!\left(\frac{P_{t+h}}{P_t}\right)
  - \ln\!\left(\frac{B_{t+h}}{B_t}\right).
$$

$$
w_{h,S,\tau}^{+}(t)
= \frac{e^{R_h^{\mathrm{ex}}(t)/\tau}}
       {\sum_{j \in S} e^{R_j^{\mathrm{ex}}(t)/\tau}},
\quad
w_{h,S,\tau}^{-}(t)
= \frac{e^{-R_h^{\mathrm{ex}}(t)/\tau}}
       {\sum_{j \in S} e^{-R_j^{\mathrm{ex}}(t)/\tau}},
\quad
\text{where } h \in S.
$$

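The reward for each action at time $t$ is defined below, where $\mathrm{cost}$ is the transaction cost per unit change in position and $\bigl|\Delta \mathrm{pos}\bigr|$ is the size of that change: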

$$
r_{\mathrm{buy}}^{t}
= \sum_{h \in H_b} w_{h,H_b,\tau_b}^{+}(t)\, R_h^{\mathrm{ex}}(t)
  - \mathrm{cost} \cdot \bigl|\Delta \mathrm{pos}\bigr|.
$$

$$
r_{\mathrm{sell}}^{t}
= \max\!\left(0,\,
  -\sum_{h \in H_s} w_{h,H_s,\tau_s}^{-}(t)\, R_h^{\mathrm{ex}}(t)\right)
  - \mathrm{cost} \cdot \bigl|\Delta \mathrm{pos}\bigr|.
$$

$$
r_{\mathrm{hold\mbox{-}in}}^{t}
= \max\!\left(0,\,
  \sum_{h \in H_b} w_{h,H_b,\tau_b}^{+}(t)\, R_h^{\mathrm{ex}}(t)
  + \sum_{h \in H_s} w_{h,H_s,\tau_s}^{-}(t)\, R_h^{\mathrm{ex}}(t)\right).
$$

$$
r_{\mathrm{hold\mbox{-}out}}^{t}
= \max\!\left(0,\,
  -\sum_{h \in H_b} w_{h,H_b,\tau_b}^{-}(t)\, R_h^{\mathrm{ex}}(t)\right)
  - \max\!\left(0,\,
  \sum_{h \in H_b} w_{h,H_b,\tau_b}^{+}(t)\, R_h^{\mathrm{ex}}(t)\right).
$$


## Compute Benchmark Returns
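We download the daily close prices of the stock and of the benchmark index, then compute the stock's forward log return in excess of the benchmark's for every look-ahead horizon in $H_b \cup H_s$.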

In [2]:
# Tickers (yfinance symbols): the traded stock and the benchmark index
stock, index = 'AAPL', 'DJI'

# Fetch the raw price history of both tickers over the sample window
data_loader = RawDataLoader(
    start_date='2000-01-01',
    end_date='2000-05-01',
    tickers=[stock, index],
    verbose=False,
)

# Keep only the 'Close' slice of the 'Price' column level
hist_prices = data_loader.get_hist_prices().xs(
    key='Close',
    level='Price',
    axis=1,
)

In [10]:
# Compute the excess returns R_h^ex(t) of the stock over the benchmark index.
# Look-ahead horizons: the sell set H_s = {3, 5, 10} together with the
# buy set H_b = {10, 15, 20, 25} from the reward definition.
horizons = [3, 5, 10, 15, 20, 25]

# Collect one excess-return series per horizon, keyed by its column name
er_by_horizon = {}
for h in horizons:
    # h-step forward log return of every ticker: ln(P_{t+h} / P_t)
    hFR = np.log(hist_prices.shift(-h) / hist_prices)
    er_by_horizon[f'{h}ER'] = hFR[stock] - hFR[index]

# Build the frame in one go and drop the rows with a missing value (the last
# rows have no price h steps ahead). Chaining .dropna() instead of mutating
# with inplace=True keeps this cell idempotent when it is re-run.
excess_returns = pd.DataFrame(er_by_horizon, index=hist_prices.index).dropna()

# Inspect the excess returns
excess_returns.head()

Unnamed: 0_level_0,3ER,5ER,10ER,15ER,20ER,25ER
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,-0.154847,-0.154254,-0.091884,0.032056,-0.08202,0.061749
2000-01-04,-0.076311,-0.145563,-0.004852,0.069134,-0.037121,0.121735
2000-01-05,-0.101594,-0.214121,0.067069,0.064641,0.003234,0.131442
2000-01-06,-0.046619,-0.010572,0.158607,0.114201,0.154317,0.21161
2000-01-07,-0.134567,-0.007865,0.111307,0.093665,0.191604,0.242866


In [41]:
# define function to compute the weights for excess return horizons
def softmax_weights(t: str, h: int, S: list[int], tau: float, positive: bool = True) -> float:
    """Softmax weight of horizon ``h`` among the horizons ``S`` at date ``t``.

    Implements w^{+}_{h,S,tau}(t) when ``positive`` is True and
    w^{-}_{h,S,tau}(t) otherwise, reading the excess returns from the
    module-level ``excess_returns`` frame (columns named ``f"{h}ER"``).

    Args:
        t: Date of the observation; parsed with ``pd.to_datetime``.
        h: Horizon whose weight is returned. Must be a member of ``S``.
        S: Horizons the softmax is normalised over.
        tau: Softmax temperature, must be > 0.
        positive: True for w^{+} (sign +1), False for w^{-} (sign -1).

    Returns:
        The weight of ``h``; the weights of all members of ``S`` sum to 1.

    Raises:
        ValueError: If ``tau <= 0`` or ``h`` is not in ``S``.
        KeyError: If ``t`` or a horizon column is missing from ``excess_returns``.
    """
    if tau <= 0:
        raise ValueError("tau must be > 0")
    # The weight is only defined for h in S; outside S the value is not
    # normalised (it can exceed 1), so reject it instead of returning it.
    if h not in S:
        raise ValueError(f"h={h} must be one of the horizons in S={list(S)}")
    t_idx = pd.to_datetime(t)

    cols = [f"{hzn}ER" for hzn in S]
    row = excess_returns.loc[t_idx, cols].astype(float).to_numpy()

    sign = 1.0 if positive else -1.0
    logits = (sign * row) / tau
    # Shift by the largest logit so np.exp cannot overflow; the shift cancels
    # between numerator and denominator.
    exp_shifted = np.exp(logits - np.max(logits))
    # The numerator is the entry of h itself, already present in the row.
    return float(exp_shifted[list(S).index(h)] / exp_shifted.sum())
    
# define function to compute the weighted sum of excess returns
def sum_weighted_returns(t: str, S: list[int], tau: float, positive: bool = True) -> float:
    """Softmax-weighted sum of the excess returns over the horizons ``S``.

    Computes sum_{h in S} w_{h,S,tau}(t) * R_h^ex(t), with w^{+} weights when
    ``positive`` is True and w^{-} weights otherwise. Excess returns are read
    from the module-level ``excess_returns`` frame (columns ``f"{h}ER"``).

    Args:
        t: Date of the observation; parsed with ``pd.to_datetime``.
        S: Horizons to aggregate over.
        tau: Softmax temperature, must be > 0.
        positive: True for w^{+} weights, False for w^{-} weights.

    Returns:
        The weighted sum, or 0.0 when ``S`` is empty.

    Raises:
        ValueError: If ``S`` is non-empty and ``tau <= 0``.
        KeyError: If ``t`` or a horizon column is missing from ``excess_returns``.
    """
    t_idx = pd.to_datetime(t)
    # An empty horizon set contributes nothing
    if len(S) == 0:
        return 0.0
    if tau <= 0:
        raise ValueError("tau must be > 0")

    # Read the row once and derive every weight from it, instead of
    # re-reading the frame and recomputing the denominator per horizon.
    cols = [f"{h}ER" for h in S]
    rex = excess_returns.loc[t_idx, cols].astype(float).to_numpy()

    sign = 1.0 if positive else -1.0
    logits = (sign * rex) / tau
    # Shift by the largest logit so np.exp cannot overflow
    unnormalised = np.exp(logits - np.max(logits))
    weights = unnormalised / unnormalised.sum()
    return float(np.dot(weights, rex))

In [45]:
# Define the reward functions as outlined earlier
def buy_reward(t: str, Hb: list[int], tb: float, cost: float) -> float:
    """Reward for buying at date ``t``.

    The w^{+}-weighted excess return over the buy horizons ``Hb`` (temperature
    ``tb``) minus ``cost``. The cost is subtracted once, i.e. the position
    change |delta pos| is taken as 1.
    """
    weighted_upside = sum_weighted_returns(t, Hb, tb, positive=True)
    return weighted_upside - cost

def sell_reward(t: str, Hs: list[int], ts: float, cost: float) -> float:
    """Reward for selling at date ``t``.

    The negated w^{-}-weighted excess return over the sell horizons ``Hs``
    (temperature ``ts``), floored at zero, minus ``cost``. The cost is
    subtracted once, i.e. the position change |delta pos| is taken as 1.
    """
    # A negative weighted excess return becomes a positive "loss avoided"
    loss_avoided = -sum_weighted_returns(t, Hs, ts, positive=False)
    floored = loss_avoided if loss_avoided > 0.0 else 0.0
    return floored - cost

def hold_in_reward(t: str, Hb: list[int], Hs: list[int], tb: float, ts: float) -> float:
    """Reward for staying in the position at date ``t``.

    Adds the w^{+}-weighted excess return over ``Hb`` (temperature ``tb``) to
    the w^{-}-weighted excess return over ``Hs`` (temperature ``ts``) and
    floors the total at zero. No transaction cost is charged.
    """
    upside = sum_weighted_returns(t, Hb, tb, positive=True)
    downside = sum_weighted_returns(t, Hs, ts, positive=False)
    net = upside + downside
    return net if net > 0.0 else 0.0

def hold_out_reward(t: str, Hb: list[int], tb: float) -> float:
    """Reward for staying out of the position at date ``t``.

    Credit for the loss avoided (negated w^{-}-weighted excess return over
    ``Hb``, floored at zero) minus a penalty for the gain missed
    (w^{+}-weighted excess return over ``Hb``, floored at zero). Both terms
    use temperature ``tb``.
    """
    avoided = -sum_weighted_returns(t, Hb, tb, positive=False)
    missed = sum_weighted_returns(t, Hb, tb, positive=True)
    credit = avoided if avoided > 0.0 else 0.0
    penalty = missed if missed > 0.0 else 0.0
    return credit - penalty