##### Import

In [14]:
import os
import sys
import warnings
import gc
from pathlib import Path

import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from tqdm import tqdm
import shap

# Filter out warning messages
warnings.filterwarnings('ignore')

# Set pandas display options
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Set seaborn style
sns.set_style('whitegrid')

# Add the parent directory to sys.path
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# Index and deciles for data slicing
idx = pd.IndexSlice

#### Test on unseen data

In [1]:
from pathlib import Path
import pandas as pd
from utils import rank_stocks_and_quantile
import re

top = 250

model_path = "models/250_best_model_TARGET_ret_fwd_01d_rank_quantiled.txt"

# Use regular expression to extract the target name
match = re.search(r"best_model_(.*).txt", model_path)

if match:
    target = match.group(1)
else:
    print("Target not found in the provided path.")

# unseen_key = '/data/YEAR_20220803_20230803'
# unseen_store_path = Path(f'data/{top}_unseen_dataset.h5')

# # Load dataset and rank stocks
# with pd.HDFStore(unseen_store_path) as store:
#     dataset = store[unseen_key]
#     dataset_ranked = rank_stocks_and_quantile(dataset, target)




# Localize datetime index if not already done
datetime_level = 0  # Assuming the datetime is the first level
if dataset_ranked.index.levels[datetime_level].tz is None:
    localized_level = dataset_ranked.index.levels[datetime_level].tz_localize('UTC')
    dataset_ranked.index = dataset_ranked.index.set_levels(localized_level, level=datetime_level)

# Get unique dates and sort them
unique_dates = dataset_ranked.index.get_level_values('date').unique().sort_values()

# Extract feature columns and label columns
features = [col for col in dataset_ranked.columns if col.startswith('FEATURE_')]
label_cols = [col for col in dataset_ranked.columns if col.startswith('TARGET_')]

# Adjust for the look-ahead gap and get test dates
look_ahead = 1
test_dates = unique_dates[-21*9:]

# Extract the test data subset
test_data = dataset_ranked[dataset_ranked.index.isin(test_dates, level='date')]

print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 47963 entries, (Timestamp('2022-11-01 00:00:00+0000', tz='UTC'), 'AA') to (Timestamp('2023-08-03 00:00:00+0000', tz='UTC'), 'ZTS')
Columns: 598 entries, FEATURE_open to FEATURE_alpha_101
dtypes: float32(360), float64(31), int32(198), int8(9)
memory usage: 114.1+ MB
None


In [2]:
import lightgbm as lgb
import pandas as pd

def predict_and_format(model_path: str, \
                    test_data: pd.DataFrame, \
                    features: list, label: str) -> pd.DataFrame:
    """
    Load a LightGBM model from the specified path, make predictions on the test data, and format the results.

    Parameters:
    - model_path: Path to the saved LightGBM model.
    - test_data: Test dataframe containing features and labels.
    - features: List of feature column names.
    - label: Column name of the label.

    Returns:
    - preds: Formatted dataframe with predictions and selected feature data.
    """

    # Load the model
    best_model = lgb.Booster(model_file=model_path)

    # Extract features and labels
    test_features = test_data[features]
    test_labels = test_data[label]

    # Make predictions
    y_pred = best_model.predict(test_features)

    # Format the predictions dataframe
    preds = test_labels.reset_index(name='actual').assign(predicted=\
        y_pred).set_index(['date', 'ticker'])

    # Rename columns to add 'feature_' prefix
    cols_to_rename = ['open', 'high', 'low', 'close', 'volume']
    new_col_names = ["FEATURE_" + col for col in cols_to_rename]
    rename_dict = dict(zip(cols_to_rename, new_col_names))

    test_data_renamed = test_data.rename(columns=rename_dict)

    # Join with selected feature data
    preds = preds.reset_index().merge(test_data_renamed[new_col_names].reset_index(), 
                                      on=['ticker', 'date'], 
                                      how='left')

    # Filter columns of interest
    preds = preds[['date', 'ticker', 'actual', 'predicted'] + new_col_names].set_index(['ticker', 'date'])
    
    return preds
    
preds = predict_and_format(model_path, test_data, features, target)



KeyError: 'TARGET_ret_fwd_01d_rank_quantiled'

In [17]:
preds

Unnamed: 0_level_0,Unnamed: 1_level_0,actual,predicted,FEATURE_open,FEATURE_high,FEATURE_low,FEATURE_close,FEATURE_volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA,2022-11-01 00:00:00+00:00,3.086847,0.489147,40.770000,42.130001,40.259998,40.880001,6.356120e+06
AAL,2022-11-01 00:00:00+00:00,1.381899,0.458259,14.440000,14.535000,14.095000,14.110000,2.560694e+07
AAPL,2022-11-01 00:00:00+00:00,8.293037,0.519029,154.822998,155.192993,148.882996,150.401001,8.051268e+07
ABBV,2022-11-01 00:00:00+00:00,9.931825,0.509770,146.660004,147.699997,144.550003,146.910004,6.351399e+06
ABT,2022-11-01 00:00:00+00:00,5.635224,0.487728,99.620003,100.317398,98.639999,99.309998,5.249432e+06
...,...,...,...,...,...,...,...,...
WYNN,2023-08-03 00:00:00+00:00,17.513336,0.512552,104.250000,106.849998,102.129997,105.610001,1.614293e+06
X,2023-08-03 00:00:00+00:00,4.920916,0.461782,24.360001,24.865000,23.750000,24.260000,9.083687e+06
XOM,2023-08-03 00:00:00+00:00,9.916818,0.497602,105.889999,107.879997,105.300003,107.120003,1.628311e+07
YUM,2023-08-03 00:00:00+00:00,8.683042,0.491302,134.410004,136.259995,133.470001,135.619995,2.392271e+06


In [18]:
def daily_spearman(group):
    return spearmanr(group['actual'], group['predicted'])[0]

daily_correlations = preds.groupby('date').apply(daily_spearman)

In [19]:
# Calculate the mean and standard deviation of daily correlations
mean_daily_correlation = daily_correlations.mean()
std_daily_correlation = daily_correlations.std()

# Calculate Sharpe ratio for each date
papermill_era_scores = daily_sharpe_ratios = mean_daily_correlation / std_daily_correlation

# Check if papermill_era_scores is a series or scalar and convert to dataframe
if isinstance(papermill_era_scores, pd.Series):
    papermill_era_scores_df = papermill_era_scores.to_frame()
    papermill_era_scores_df.columns = papermill_era_scores_df.columns.astype(str)
else:
    # Convert scalar to dataframe
    papermill_era_scores_df = pd.DataFrame([papermill_era_scores], columns=["Sharpe_Ratio"])

sb.glue("papermill_era_scores", papermill_era_scores_df, display=True)

papermill_era_scores_list = papermill_era_scores.tolist() if isinstance(papermill_era_scores, pd.Series) else [papermill_era_scores]
sb.glue("papermill_era_scores", papermill_era_scores_list)


Unnamed: 0,Sharpe_Ratio
0,57.917082


In [20]:
# import matplotlib.pyplot as plt
# from pathlib import Path

# # Create a list of colors based on the sign of the Sharpe Ratios
# colors = ['blue' if value > 0 else 'red' for value in daily_sharpe_ratios]

# plt.figure(figsize=(12,6))
# daily_sharpe_ratios.plot(kind='bar', color=colors)
# plt.title('Daily Sharpe Ratios')
# plt.xlabel('Date')
# plt.ylabel('Sharpe Ratio')
# plt.grid(axis='y')
# plt.tight_layout()
# plt.axhline(y=0, color='black', linestyle='-')  # Add horizontal line at y=0
# plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility

# # Define directory and clean up the dataset_key
# plot_dir = Path("plots")
# clean_dataset_key = unseen_key.replace("/", "_")

# # Create the plots directory if it doesn't exist
# plot_dir.mkdir(exist_ok=True)

# # Define the save path for the plot using the cleaned key
# plot_path = plot_dir / f"sharpe_ratios_{clean_dataset_key}.png"

# # Save the plot
# plt.savefig(plot_path)
# plt.close()

# # Convert the path to string and glue it
# papermill_plot_path_str = str(plot_path)
# sb.glue("papermill_plot_path", papermill_plot_path_str, display=True)

In [21]:
lr_r, lr_p = spearmanr(preds.actual, preds.predicted)
print(f'Information Coefficient (overall): {lr_r:.3%} (p-value: {lr_p:.8%})')

# Return the Information Coefficient and its p-value
information_coefficient = lr_r
p_value = lr_p

# information_coefficient = papermill_information_coefficient, p_value = papermill_p_value
sb.glue("information_coefficient", information_coefficient, display=True)
sb.glue("p_value", p_value, display=True)

Information Coefficient (overall): 94.590% (p-value: 0.00000000%)


0.945899868381928

0.0

In [22]:
preds.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 49685 entries, ('AA', Timestamp('2022-11-01 00:00:00+0000', tz='UTC')) to ('ZTS', Timestamp('2023-08-03 00:00:00+0000', tz='UTC'))
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   actual          49685 non-null  float32
 1   predicted       49685 non-null  float64
 2   FEATURE_open    49685 non-null  float32
 3   FEATURE_high    49685 non-null  float32
 4   FEATURE_low     49685 non-null  float32
 5   FEATURE_close   49685 non-null  float32
 6   FEATURE_volume  49685 non-null  float64
dtypes: float32(5), float64(2)
memory usage: 1.9+ MB


In [23]:
def add_quantile_signals(df, buy_threshold=0.95, sell_threshold=0.05):
    buy_cutoff = df['predicted'].quantile(buy_threshold)
    sell_cutoff = df['predicted'].quantile(sell_threshold)
    
    df['signal'] = 0  # Neutral by default
    df.loc[df['predicted'] >= buy_cutoff, 'signal'] = 1  # Buy
    df.loc[df['predicted'] <= sell_cutoff, 'signal'] = -1  # Sell
    return df

# Apply the function
preds = add_quantile_signals(preds, \
    buy_threshold=0.95, sell_threshold=0.05)

In [24]:
preds.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 49685 entries, ('AA', Timestamp('2022-11-01 00:00:00+0000', tz='UTC')) to ('ZTS', Timestamp('2023-08-03 00:00:00+0000', tz='UTC'))
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   actual          49685 non-null  float32
 1   predicted       49685 non-null  float64
 2   FEATURE_open    49685 non-null  float32
 3   FEATURE_high    49685 non-null  float32
 4   FEATURE_low     49685 non-null  float32
 5   FEATURE_close   49685 non-null  float32
 6   FEATURE_volume  49685 non-null  float64
 7   signal          49685 non-null  int64  
dtypes: float32(5), float64(2), int64(1)
memory usage: 2.3+ MB


##### Backtesting

In [25]:
import pandas as pd
import backtrader as bt
import backtrader.indicators as btind  # <-- NEW: For adding indicators
import backtrader.analyzers as btanalyzers
from pypfopt import EfficientFrontier, expected_returns, risk_models
import yfinance as yf

# Data Class for Predictions
class PandasPredictions(bt.feeds.PandasData):
    lines = ('signal',)
    params = (
        ('signal', -1),
        ('open', 'FEATURE_open'),
        ('high', 'FEATURE_high'),
        ('low', 'FEATURE_low'),
        ('close', 'FEATURE_close'),
        ('volume', 'FEATURE_volume')
    )

# Function to optimize weights using PyPortfolioOpt
def optimize_weights(datas):
    prices = {}
    
    for data in datas:
        s = pd.Series(data.close.array, index=data.datetime.array, name=data._name)
        prices[data._name] = s

    df = pd.DataFrame(index=prices[next(iter(prices))].index)

    for ticker, s in prices.items():
        df = df.merge(s, left_index=True, right_index=True, \
            how='left').rename(columns={s.name: ticker})

    df = df.dropna()

    mu = expected_returns.mean_historical_return(df)
    S = risk_models.sample_cov(df)
    ef = EfficientFrontier(mu, S, solver="SCS", verbose=True)
    weights = ef.max_sharpe(risk_free_rate=0.005)
    return ef.clean_weights()

# Strategy Class
class TradeAndRebalanceStrategy(bt.Strategy):
    lines = ('benchmark',)
    
    params = (
        ('stop_loss', 0.05),
        ('take_profit', 0.10),
        ('benchmark_MA_period', 21)  # <-- NEW: Moving Average period for the benchmark
    )
    
    def __init__(self):
        self.rebalance_days = 0
        self.max_loss = -0.15
        self.start_cash = self.broker.get_cash()
        self.benchmark_data = self.getdatabyname("S&P 500")
        self.benchmark_MA = btind.SimpleMovingAverage(self.benchmark_data, \
            period=self.params.benchmark_MA_period)
        self.orders = {}  # to store buy order references
        self.atr_dict = {data: btind.ATR(data, period=14) for data in self.datas if data._name != "S&P 500"}


    def log(self, txt, dt=None):
        ''' Logging function for the strategy. It logs the date and the message provided. '''
        dt = dt or self.datas[0].datetime.date(0)
        print(f"{dt.isoformat()}, {txt}")

    def notify_order(self, order):
        # If an order is completed, remove it from the orders dict
        if order.status == order.Completed:
            if order.ref in self.orders:
                del self.orders[order.ref]

    def next(self):
        # Use the moving average of the benchmark for decisions
        if self.benchmark_data.close[0] > self.benchmark_MA[0] * 1.01:  # Bullish scenario
            benchmark_trend = 1
        else:  # Bearish scenario
            benchmark_trend = -1
            
        self.log(f"Benchmark Trend: {'Bullish' if benchmark_trend == 1 else 'Bearish'}")

        benchmark_return = (self.benchmark_data.close[0] - \
            self.benchmark_data.close[-1]) / self.benchmark_data.close[-1]
        self.log(f"Benchmark Return: {benchmark_return * 100:.2f}%")

        if (self.broker.get_cash() - self.start_cash) / self.start_cash <= self.max_loss:
            return
        
        for data in self.datas:
            if data._name == "S&P 500":  # Skip the benchmark for trading signals
                continue
            atr_value = self.atr_dict[data][0] if data in self.atr_dict else 0

            # Making decisions based on benchmark's performance
            if benchmark_return > 0:  # Benchmark shows positive returns
                if data.signal[0] == 1:
                    order = self.buy(data)
                    self.orders[order.ref] = order
                    
                    # Setting dynamic stop-loss and take-profit levels using ATR
                    stop_price = data.close[0] - atr_value * 2  # Using 2 times ATR as stop loss
                    limit_price = data.close[0] + atr_value * 2  # Using 2 times ATR as take profit
                    
                    self.sell(data=data, exectype=bt.Order.Stop, price=stop_price, parent=order.ref)
                    self.sell(data=data, exectype=bt.Order.Limit, price=limit_price, parent=order.ref)
                    
            elif benchmark_return < 0:  # Benchmark shows negative returns
                self.sell(data)

        if self.rebalance_days == 0:
            weights = optimize_weights([data for data in self.datas if data._name != "S&P 500"])
            for asset, weight in weights.items():
                if weight > 0.30:
                    weights[asset] = 0.30
            
            for data in self.datas:
                if data._name == "S&P 500":
                    continue
                if data._name in weights:
                    self.order_target_percent(data, target=weights[data._name])
                else:
                    self.close(data)
            self.rebalance_days = 20
        else:
            self.rebalance_days -= 1


# Fetch S&P 500 data using yfinance
def fetch_data(ticker, start_date, end_date):
    df = yf.download(ticker, start=start_date, end=end_date)
    return df

# Assume preds is defined somewhere earlier in your code
start_date = preds.index.get_level_values(1).min()
end_date = preds.index.get_level_values(1).max()
sp500_data = fetch_data('^GSPC', start_date, end_date)

# Convert it into Backtrader format
benchmark = bt.feeds.PandasData(dataname=sp500_data, name="S&P 500")

cerebro = bt.Cerebro()
cerebro.broker.setcommission(commission=0.001)
cerebro.addanalyzer(btanalyzers.PyFolio, _name='pyfolio')
cerebro.adddata(benchmark)

data_dict = {ticker: preds.xs(ticker) for ticker in preds.index.get_level_values(0).unique()}
for ticker, data_df in data_dict.items():
    data = PandasPredictions(dataname=data_df, name=ticker)
    cerebro.adddata(data)

cerebro.addstrategy(TradeAndRebalanceStrategy)
results = cerebro.run()

# Performance Analysis
returns, positions, transactions, gross_lev \
    = results[0].analyzers.pyfolio.get_pf_items()



[*********************100%***********************]  1 of 1 completed
2022-11-30, Benchmark Trend: Bullish
2022-11-30, Benchmark Return: 3.09%
                                     CVXPY                                     
                                     v1.3.2                                    
(CVXPY) Oct 03 10:57:05 PM: Your problem has 251 variables, 5 constraints, and 0 parameters.
(CVXPY) Oct 03 10:57:05 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Oct 03 10:57:05 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 03 10:57:05 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Oct 03 10:57:

In [26]:
import quantstats as qs
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")


# Extend the Quantstats reports to consider the entire dataframe
qs.extend_pandas()

# # # # Print extended, in-depth performance stats
# qs.reports.full(returns)

# # Plotting various graphs
# qs.plots.snapshot(returns, title='Performance Snapshot')  # Snapshot of the performance
# qs.plots.monthly_heatmap(returns)  # Monthly returns heatmap
# qs.plots.yearly_returns(returns)   # Yearly returns

# plt.show()
print(f"top: {top} target: {target}")

# Print only the metrics
qs.reports.metrics(returns)

top: 250 target: TARGET_ret_fwd_frac_order
                    Strategy
------------------  ----------
Start Period        2022-11-01
End Period          2023-08-03
Risk-Free Rate      0.0%
Time in Market      89.0%

Cumulative Return   14.86%
CAGR﹪              13.53%

Sharpe              1.27
Prob. Sharpe Ratio  86.35%
Sortino             1.91
Sortino/√2          1.35
Omega               1.23

Max Drawdown        -8.42%
Longest DD Days     51

Gain/Pain Ratio     0.23
Gain/Pain (1M)      1.16

Payoff Ratio        1.12
Profit Factor       1.23
Common Sense Ratio  1.29
CPC Index           0.72
Tail Ratio          1.05
Outlier Win Ratio   2.6
Outlier Loss Ratio  2.7

MTD                 -2.25%
3M                  3.65%
6M                  9.72%
YTD                 22.9%
1Y                  14.86%
3Y (ann.)           13.53%
5Y (ann.)           13.53%
10Y (ann.)          13.53%
All-time (ann.)     13.53%

Avg. Drawdown       -2.91%
Avg. Drawdown Days  16
Recovery Factor     1.75
Ulcer Ind