In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Load the daily factor file; the market excess return ('Mkt-RF') is the series studied below.
dfs = pd.read_csv('/Users/ryanhuang/Developer/transformers/FFdaily.CSV')
dfs['Return'] = dfs['Mkt-RF']

# Map +/-inf to NaN first so that one dropna() removes every unusable row.
dfs.replace([np.inf, -np.inf], np.nan, inplace=True)
dfs.dropna(inplace=True)

# Direction dummy: 1 for a strictly positive return, 0 otherwise.
dfs['Returndmy'] = (dfs['Return'] > 0).astype('int64')

# 'Date' is handled as text only long enough to slice off its first four
# characters as the year; both columns end up numeric.
date_text = dfs['Date'].astype(str)
dfs['Year'] = pd.to_numeric(date_text.str[:4])
dfs['Date'] = pd.to_numeric(date_text)

# Keep observations from 1927 onward.
# (Optional experiments, disabled: cap the sample with `Year <= 1932`, or drop
#  a trailing block of rows via `dfs.drop(dfs.index[-k:])`.)
dfs = dfs[dfs['Year'] >= 1927]

# Renumber rows from 0, then expose that numbering as an explicit 'index' column.
dfs = dfs.reset_index(drop=True).reset_index()

# NOTE: df and df0 are additional names for the same object as dfs, not copies.
df = df0 = dfs
print(df0)

# Extremes of the return series and the row labels where they occur.
max_value, max_index = df['Return'].max(), df['Return'].idxmax()
min_value, min_index = df['Return'].min(), df['Return'].idxmin()

print("Maximum value:", max_value, "at index:", max_index)
print("Minimum value:", min_value, "at index:", min_index)

# AR(1) baseline by OLS: regress Return_t on a constant and Return_{t-1}.
column_data = df['Return']

# Shifting by one leaves a NaN in the first row; dropping it makes the regressor
# line up with the dependent series taken from its second observation onward.
lagged_data = sm.add_constant(column_data.shift(1).dropna())

ar_ols_model = sm.OLS(column_data.iloc[1:], lagged_data)
ar_ols_result = ar_ols_model.fit()

# Coefficient table and fit statistics.
print(ar_ols_result.summary())


       index      Date  Mkt-RF   idxd  Return  Returndmy  Year
0          0  19270103   -0.79    151   -0.79          0  1927
1          1  19270104    0.31    152    0.31          1  1927
2          2  19270105    0.14    153    0.14          1  1927
3          3  19270106   -0.17    154   -0.17          0  1927
4          4  19270107    0.30    155    0.30          1  1927
...      ...       ...     ...    ...     ...        ...   ...
25325  25325  20230424    0.00  25476    0.00          0  2023
25326  25326  20230425   -1.76  25477   -1.76          0  2023
25327  25327  20230426   -0.41  25478   -0.41          0  2023
25328  25328  20230427    1.85  25479    1.85          1  2023
25329  25329  20230428    0.77  25480    0.77          1  2023

[25330 rows x 7 columns]
Maximum value: 15.76 at index: 1836
Minimum value: -17.44 at index: 16377
                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:               

## Neural Network OneShot

In [3]:
# ---------------------------------------------------------
# 2) Load and clean the dataset
# ---------------------------------------------------------
# Written as a single chain so that every step yields a new frame:
#   * 'Return' mirrors 'Mkt-RF'
#   * +/-inf become NaN and incomplete rows are dropped
#   * 'Returndmy' flags strictly positive returns
#   * 'Year' is the first four characters of the date; 'Date' is stored as int
#     (it can be converted to datetime later if needed)
#   * rows from 1927 onward are kept, ordered by date, and renumbered from 0
dfs = (
    pd.read_csv('/Users/ryanhuang/Developer/transformers/FFdaily.CSV')
    .assign(Return=lambda frame: frame['Mkt-RF'])
    .replace([-np.inf, np.inf], np.nan)
    .dropna()
    .assign(
        Returndmy=lambda frame: frame['Return'].apply(lambda x: 1 if x > 0 else 0),
        Year=lambda frame: frame['Date'].astype(str).str[:4].astype(int),
        Date=lambda frame: frame['Date'].astype(str).astype(int),
    )
    .loc[lambda frame: frame['Year'] >= 1927]
    .sort_values('Date')
    .reset_index(drop=True)
)

# Record whether an 'idxd' column is available for alignment.
idxd_col_exists = 'idxd' in dfs.columns

print("Data shape after cleaning:", dfs.shape)
print(dfs.head())

Data shape after cleaning: (25330, 6)
       Date  Mkt-RF  idxd  Return  Returndmy  Year
0  19270103   -0.79   151   -0.79          0  1927
1  19270104    0.31   152    0.31          1  1927
2  19270105    0.14   153    0.14          1  1927
3  19270106   -0.17   154   -0.17          0  1927
4  19270107    0.30   155    0.30          1  1927


In [4]:
# Feature table: each row carries the five preceding values of 'Return'.
# Rows are ordered by 'idxd', which also serves as the index while shifting.
df_lags5 = dfs.sort_values('idxd').set_index('idxd')

# lag_k holds the return observed k rows earlier (k = 1..5).
df_lags5 = df_lags5.assign(
    **{f'lag_{k}': df_lags5['Return'].shift(k) for k in range(1, 6)}
)

# Bring 'idxd' back as a regular column, then discard the leading rows whose
# lag history is incomplete.
df_lags5 = (
    df_lags5
    .reset_index()
    .dropna(subset=[f'lag_{k}' for k in range(1, 6)])
)

print("Dataset with 5 lags:")
print(df_lags5.head())


Dataset with 5 lags:
   idxd      Date  Mkt-RF  Return  Returndmy  Year  lag_1  lag_2  lag_3  \
5   156  19270108    0.39    0.39          1  1927   0.30  -0.17   0.14   
6   157  19270110    0.14    0.14          1  1927   0.39   0.30  -0.17   
7   158  19270111   -0.28   -0.28          0  1927   0.14   0.39   0.30   
8   159  19270112    0.07    0.07          1  1927  -0.28   0.14   0.39   
9   160  19270113   -0.04   -0.04          0  1927   0.07  -0.28   0.14   

   lag_4  lag_5  
5   0.31  -0.79  
6   0.14   0.31  
7  -0.17   0.14  
8   0.30  -0.17  
9   0.39   0.30  


In [5]:
class SimpleNN(nn.Module):
    """Feed-forward regressor: input -> hidden_size -> 32 -> 16 -> 8 -> 1.

    Each of the four hidden linear layers is followed by ReLU and dropout;
    the final linear layer emits a single value with no activation.
    """

    def __init__(self, input_size, hidden_size, dropout_rate):
        """Create the five linear layers plus the shared ReLU and dropout modules.

        Args:
            input_size: Number of input features.
            hidden_size: Output width of the first linear layer.
            dropout_rate: Dropout probability passed to ``nn.Dropout``.
        """
        super().__init__()

        # Consecutive pairs of widths define linear1 .. linear5, in that order.
        widths = [input_size, hidden_size, 32, 16, 8, 1]
        for position, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f'linear{position}', nn.Linear(n_in, n_out))

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Run ``x`` through the hidden stack and return the output layer's result."""
        out = x
        # Hidden layers 1-4: linear -> ReLU -> dropout.
        for hidden_layer in (self.linear1, self.linear2, self.linear3, self.linear4):
            out = self.dropout(self.relu(hidden_layer(out)))

        # Output layer: plain linear projection.
        return self.linear5(out)


def train_model(num_epochs,
                hidden_size,
                dropout_rate,
                learning_rate,
                X_train_fold,
                y_train_fold,
                X_val_fold,
                y_val_fold,
                device='cpu'):
    """
    Fit a SimpleNN on one training fold and score it on one validation fold.

    Every epoch is a single full-batch Adam step on the MSE loss. Every 100th
    epoch the current train and validation losses are printed.

    Args:
        num_epochs: Number of full-batch optimisation steps.
        hidden_size: Width of the network's first hidden layer.
        dropout_rate: Dropout probability used inside the network.
        learning_rate: Adam learning rate.
        X_train_fold, y_train_fold: Training features / target; both are read
            through ``.values``.
        X_val_fold, y_val_fold: Validation features / target; both are read
            through ``.values``.
        device: Device the tensors and the model are moved to.

    Returns:
        Tuple ``(model, mse_val)``: the trained network (left in eval mode) and
        its mean squared error on the validation fold.
    """
    def _to_tensor(values, as_column=False):
        # float32 tensor on the target device; targets become (n, 1) columns.
        array = values.reshape(-1, 1) if as_column else values
        return torch.tensor(array, dtype=torch.float32).to(device)

    X_train_t = _to_tensor(X_train_fold.values)
    y_train_t = _to_tensor(y_train_fold.values, as_column=True)
    X_val_t = _to_tensor(X_val_fold.values)
    y_val_t = _to_tensor(y_val_fold.values, as_column=True)

    n_features = X_train_fold.shape[1]
    model = SimpleNN(n_features, hidden_size, dropout_rate).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    model.train()
    for epoch in range(num_epochs):
        # One full-batch step: forward, loss, gradients, parameter update.
        loss = criterion(model(X_train_t), y_train_t)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Only every 100th epoch gets a progress report.
        if (epoch+1) % 100 != 0:
            continue

        # Score the validation fold with dropout off, then resume training mode.
        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val_t), y_val_t)
        model.train()
        print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {loss.item():.6f} | Val Loss: {val_loss.item():.6f}")

    # Final validation score from the fully trained network.
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val_t)
    mse_val = mean_squared_error(y_val_fold, val_pred.cpu().numpy())

    return model, mse_val

In [9]:
from itertools import product

feature_cols = [f'lag_{i}' for i in range(1, 6)]
target_col = 'Return'

param_grid = {
    'num_epochs':   [500],
    'hidden_size':  [32, 64],
    'dropout_rate': [0.1, 0.2],
    'learning_rate':[0.001, 0.01],
}

# The candidate list and the splitter are identical for every window, so they
# are built once here instead of once per test year. Each entry is a dict with
# the same keys, in the same order, as param_grid.
param_list = [
    dict(zip(param_grid, combo))
    for combo in product(*param_grid.values())
]
tscv = TimeSeriesSplit(n_splits=5)

rolling_results = []
all_predictions = []

min_year = df_lags5['Year'].min()
max_year = df_lags5['Year'].max()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fix the PyTorch seed so weight initialisation and dropout masks (and hence
# the selected hyper-parameters and predictions) repeat on a fresh
# Restart & Run All.
torch.manual_seed(0)

# Rolling scheme: for each test year the five preceding calendar years are
# pooled (four "train" years plus the "val" year) and used for a 5-split
# TimeSeriesSplit grid search; the winning configuration is then refit on the
# whole pool and evaluated on the test year.
for test_year in range(min_year + 5, max_year + 1):
    train_years = list(range(test_year - 5, test_year - 1))  # test_year-5 .. test_year-2
    val_year = test_year - 1

    # Row masks for the three calendar blocks
    train_mask = df_lags5['Year'].isin(train_years)
    val_mask   = (df_lags5['Year'] == val_year)
    test_mask  = (df_lags5['Year'] == test_year)

    # Feature / target slices
    X_train = df_lags5.loc[train_mask, feature_cols]
    y_train = df_lags5.loc[train_mask, target_col]

    X_val   = df_lags5.loc[val_mask, feature_cols]
    y_val   = df_lags5.loc[val_mask, target_col]

    X_test  = df_lags5.loc[test_mask, feature_cols]
    y_test  = df_lags5.loc[test_mask, target_col]

    # Pool train + val; this is the sample the grid search and the final fit see
    X_train_val = pd.concat([X_train, X_val], axis=0)
    y_train_val = pd.concat([y_train, y_val], axis=0)

    best_val_mse = float('inf')
    best_params  = None

    # Positional 0..n-1 index so the integer arrays yielded by tscv.split can
    # be used directly with .iloc
    Xtv = X_train_val.reset_index(drop=True)
    ytv = y_train_val.reset_index(drop=True)

    # ----------------------
    # Manual grid search: mean validation MSE over the time-series folds
    # ----------------------
    for params in param_list:
        fold_mses = []
        for train_index, val_index in tscv.split(Xtv):
            X_train_fold = Xtv.iloc[train_index]
            y_train_fold = ytv.iloc[train_index]
            X_val_fold   = Xtv.iloc[val_index]
            y_val_fold   = ytv.iloc[val_index]

            model_fold, mse_val = train_model(
                num_epochs   = params['num_epochs'],
                hidden_size  = params['hidden_size'],
                dropout_rate = params['dropout_rate'],
                learning_rate= params['learning_rate'],
                X_train_fold = X_train_fold,
                y_train_fold = y_train_fold,
                X_val_fold   = X_val_fold,
                y_val_fold   = y_val_fold,
                device=device
            )
            fold_mses.append(mse_val)

        avg_mse = np.mean(fold_mses)
        # Strict '<' keeps the earliest candidate when two configurations tie
        if avg_mse < best_val_mse:
            best_val_mse = avg_mse
            best_params  = params.copy()

    # ---------------
    # Retrain final model on full train+val with best_params
    # ---------------
    final_num_epochs   = best_params['num_epochs']
    final_hidden_size  = best_params['hidden_size']
    final_dropout_rate = best_params['dropout_rate']
    final_learning_rate= best_params['learning_rate']

    # Convert train+val to torch
    Xtv_torch = torch.tensor(X_train_val.values, dtype=torch.float32).to(device)
    ytv_torch = torch.tensor(y_train_val.values.reshape(-1,1), dtype=torch.float32).to(device)

    input_dim = Xtv_torch.shape[1]
    final_model = SimpleNN(input_dim, final_hidden_size, final_dropout_rate).to(device)
    optimizer = optim.Adam(final_model.parameters(), lr=final_learning_rate)
    criterion = nn.MSELoss()

    # Full-batch training: one optimiser step per epoch
    final_model.train()
    for epoch in range(final_num_epochs):
        y_pred = final_model(Xtv_torch)
        loss   = criterion(y_pred, ytv_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---------------
    # Evaluate on test
    # ---------------
    X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).to(device)
    final_model.eval()
    with torch.no_grad():
        preds_test_torch = final_model(X_test_torch).cpu().numpy().ravel()

    test_mse = mean_squared_error(y_test, preds_test_torch)

    print(f"[Year={test_year}] Best Params={best_params} CV-MSE={best_val_mse:.6f} Test MSE={test_mse:.6f}")

    # Store rolling result
    rolling_results.append({
        'test_year': test_year,
        'train_years': train_years,
        'val_year': val_year,
        'best_params': best_params,
        'cv_mse': best_val_mse,
        'test_mse': test_mse
    })

    # Store the out-of-sample predictions next to the realised returns
    df_test_window = df_lags5.loc[test_mask, ['Date', 'idxd', 'Return']].copy()
    df_test_window['prediction'] = preds_test_torch
    df_test_window['test_year'] = test_year
    all_predictions.append(df_test_window)

    print(f"Year {test_year} done.")
    print("---------------------------------------------------")


Epoch [100/500] | Train Loss: 0.260135 | Val Loss: 0.531476
Epoch [200/500] | Train Loss: 0.197307 | Val Loss: 0.581761
Epoch [300/500] | Train Loss: 0.200111 | Val Loss: 0.632990
Epoch [400/500] | Train Loss: 0.151793 | Val Loss: 0.655815
Epoch [500/500] | Train Loss: 0.156948 | Val Loss: 0.655757
Epoch [100/500] | Train Loss: 0.390132 | Val Loss: 0.856215
Epoch [200/500] | Train Loss: 0.360550 | Val Loss: 0.874559
Epoch [300/500] | Train Loss: 0.342784 | Val Loss: 0.863543
Epoch [400/500] | Train Loss: 0.317804 | Val Loss: 0.879505
Epoch [500/500] | Train Loss: 0.294335 | Val Loss: 0.893375
Epoch [100/500] | Train Loss: 0.543151 | Val Loss: 4.573287
Epoch [200/500] | Train Loss: 0.516792 | Val Loss: 4.449750
Epoch [300/500] | Train Loss: 0.479005 | Val Loss: 5.309966
Epoch [400/500] | Train Loss: 0.449880 | Val Loss: 5.846119
Epoch [500/500] | Train Loss: 0.432281 | Val Loss: 5.831481
Epoch [100/500] | Train Loss: 1.232120 | Val Loss: 2.771364
Epoch [200/500] | Train Loss: 0.871461 |

In [10]:
# Stack the per-year prediction frames into one table with lower-case column
# names: 'Return' becomes 'actual', 'Date' becomes 'date'.
results_df = (
    pd.concat(all_predictions, ignore_index=True)
    .rename(columns={'Return': 'actual', 'Date': 'date'})
    .loc[:, ['date', 'idxd', 'actual', 'prediction', 'test_year']]
)

# One row per rolling window: years used, chosen hyper-parameters, CV and test MSE.
rolling_results_df = pd.DataFrame(rolling_results)

print("\n=== Rolling Window Results (summary) ===")
print(rolling_results_df)

print("\n=== Sample of combined predictions ===")
print(results_df.head())

# Persist the combined predictions.
results_df.to_csv("/Users/ryanhuang/Developer/transformers/results/RollingNNBlock5.csv", index=False)



=== Rolling Window Results (summary) ===
    test_year               train_years  val_year  \
0        1932  [1927, 1928, 1929, 1930]      1931   
1        1933  [1928, 1929, 1930, 1931]      1932   
2        1934  [1929, 1930, 1931, 1932]      1933   
3        1935  [1930, 1931, 1932, 1933]      1934   
4        1936  [1931, 1932, 1933, 1934]      1935   
..        ...                       ...       ...   
87       2019  [2014, 2015, 2016, 2017]      2018   
88       2020  [2015, 2016, 2017, 2018]      2019   
89       2021  [2016, 2017, 2018, 2019]      2020   
90       2022  [2017, 2018, 2019, 2020]      2021   
91       2023  [2018, 2019, 2020, 2021]      2022   

                                          best_params    cv_mse  test_mse  
0   {'num_epochs': 500, 'hidden_size': 32, 'dropou...  2.799084  8.607568  
1   {'num_epochs': 500, 'hidden_size': 32, 'dropou...  4.877116  6.882683  
2   {'num_epochs': 500, 'hidden_size': 32, 'dropou...  5.794985  1.903509  
3   {'num_epochs'

In [11]:
# Regress the realised return on the network's forecast and on the previous
# day's realised return.
#
# Work on a copy: a plain `df1 = results_df` only creates a second name for the
# same frame, so the helper columns added below would also appear in results_df.
df1 = results_df.copy()

# Previous row's realised value
df1['Lagged_Actual'] = df1['actual'].shift(1)

# Numeric forecast; anything that cannot be parsed becomes NaN
df1['Prediction'] = pd.to_numeric(df1['prediction'], errors='coerce')

# Drop incomplete rows. The filter uses the coerced 'Prediction' column so that
# values turned into NaN by the conversion above are actually removed.
df1 = df1.dropna(subset=['actual', 'Prediction', 'Lagged_Actual'])

# Regressors (coerced forecast + lagged actual) and dependent variable
X = df1[['Prediction', 'Lagged_Actual']]
y = df1['actual']

# Add a constant to the independent variables (for the intercept)
X = sm.add_constant(X)

# Run the regression and show the coefficient table
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 actual   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     54.37
Date:                Fri, 14 Mar 2025   Prob (F-statistic):           2.76e-24
Time:                        17:38:33   Log-Likelihood:                -35023.
No. Observations:               23844   AIC:                         7.005e+04
Df Residuals:                   23841   BIC:                         7.008e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0285      0.007      4.173

In [12]:
import pandas as pd
import statsmodels.api as sm

# ----------------------
# 1. Prepare the dataset
# ----------------------
# assign() returns a new frame, so results_df itself is left untouched.
# 'Lagged_Actual' is the previous row's realised value; 'Prediction' is the
# forecast coerced to numeric. Rows missing any of the three inputs are dropped.
df1 = (
    results_df
    .assign(
        Lagged_Actual=lambda frame: frame['actual'].shift(1),
        Prediction=lambda frame: pd.to_numeric(frame['prediction'], errors='coerce'),
    )
    .dropna(subset=['actual', 'Prediction', 'Lagged_Actual'])
)

# ---------------------------
# 2. Split data into two halves
# ---------------------------
# With an odd row count the extra row lands in the second half.
n = len(df1)
midpoint = n // 2

df1_first_half, df1_second_half = df1.iloc[:midpoint].copy(), df1.iloc[midpoint:].copy()

# ---------------------------
# 3. Define X and y for each half
# ---------------------------
# Same specification in both halves: intercept + forecast + lagged actual.
X1 = sm.add_constant(df1_first_half[['Prediction', 'Lagged_Actual']])
y1 = df1_first_half['actual']

X2 = sm.add_constant(df1_second_half[['Prediction', 'Lagged_Actual']])
y2 = df1_second_half['actual']

# ---------------------------
# 4. Fit OLS models and summarize
# ---------------------------
model_first_half = sm.OLS(y1, X1).fit()
model_second_half = sm.OLS(y2, X2).fit()

print("=== First Half Results ===")
print(model_first_half.summary())
print("\n=== Second Half Results ===")
print(model_second_half.summary())

=== First Half Results ===
                            OLS Regression Results                            
Dep. Variable:                 actual   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     105.4
Date:                Fri, 14 Mar 2025   Prob (F-statistic):           4.09e-46
Time:                        17:38:36   Log-Likelihood:                -17097.
No. Observations:               11922   AIC:                         3.420e+04
Df Residuals:                   11919   BIC:                         3.422e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0

In [13]:
import pandas as pd
import statsmodels.api as sm

# Regress the realised return on the forecast plus five lags of the realised
# return.
#
# Lag{k}_Actual is 'actual' shifted down by k rows (k = 1..5); 'Prediction' is
# the forecast coerced to numeric. assign() yields a new frame each time, so
# results_df is not modified.
df1 = (
    results_df
    .assign(**{f'Lag{k}_Actual': results_df['actual'].shift(k) for k in range(1, 6)})
    .assign(Prediction=lambda frame: pd.to_numeric(frame['prediction'], errors='coerce'))
    # Keep only rows where the target, the forecast and all five lags exist.
    .dropna(subset=['actual', 'Prediction',
                    'Lag1_Actual', 'Lag2_Actual',
                    'Lag3_Actual', 'Lag4_Actual',
                    'Lag5_Actual'])
)

# Dependent variable
y = df1['actual']

# Regressors: intercept, the forecast, then the five lags in order.
# (Alternative specification, disabled: use only ['Lag1_Actual'].)
X = sm.add_constant(
    df1[['Prediction', 'Lag1_Actual', 'Lag2_Actual',
         'Lag3_Actual', 'Lag4_Actual', 'Lag5_Actual']]
)

# Fit and report
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 actual   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     18.42
Date:                Fri, 14 Mar 2025   Prob (F-statistic):           1.80e-21
Time:                        17:38:40   Log-Likelihood:                -34994.
No. Observations:               23840   AIC:                         7.000e+04
Df Residuals:                   23833   BIC:                         7.006e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.0289      0.007      4.226      

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Overlay realised values and forecasts against the row position.
# NOTE: relies on df1 produced by an earlier cell.
df = df1

actuals = df['actual']
predictions = df['prediction']

# Positional x-axis: 0, 1, ..., number of rows - 1
x_values = np.arange(len(actuals))

plt.figure(figsize=(12, 6))

# (series, legend label, colour, marker) -- drawn in this order
for series, series_label, series_color, series_marker in (
    (actuals, 'Actual Values', 'red', 'x'),
    (predictions, 'Predictions', 'blue', 'o'),
):
    plt.plot(x_values, series, label=series_label, color=series_color,
             marker=series_marker, linestyle='-', markersize=4)

# Titles, axis labels, legend and grid
plt.title('Predictions vs Actual Values')
plt.xlabel('Days')
plt.ylabel('Values')
plt.legend()
plt.grid()

plt.show()

NameError: name 'df1' is not defined

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1) Order the predictions by the time index and renumber the rows.
# 2) 'actual' and 'prediction' are treated as PERCENT values (2 means +2%), so
#    each is divided by 100 before compounding into a cumulative return.
df_plot = (
    results_df
    .sort_values("idxd", ascending=True)
    .reset_index(drop=True)
    .assign(
        Actual_CumRet=lambda frame: (1 + frame['actual'] / 100).cumprod() - 1,
        Pred_CumRet=lambda frame: (1 + frame['prediction'] / 100).cumprod() - 1,
    )
)

# 3) Plot both cumulative series against idxd, using small markers.
plt.figure(figsize=(10, 6))
for cum_column, cum_label, cum_marker in (
    ('Actual_CumRet', 'Actual Cumulative Return', 'o'),
    ('Pred_CumRet', 'Predicted Cumulative Return', 'x'),
):
    plt.plot(df_plot['idxd'], df_plot[cum_column],
             label=cum_label,
             marker=cum_marker,
             markersize=2)

plt.title('Cumulative Return: Actual vs. Predicted')
plt.xlabel('idxd (Time Index)')
plt.ylabel('Cumulative Return')
plt.legend()
plt.grid()
plt.show()


NameError: name 'results_df' is not defined