##### Import

In [19]:
import os
import sys
import warnings
import gc
from pathlib import Path

import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from tqdm import tqdm
import shap

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and friends.
warnings.filterwarnings('ignore')

# Let pandas print up to 1000 columns / 1000 rows before truncating output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Use seaborn's white-grid theme for all subsequent plots.
sns.set_style('whitegrid')

# Put the parent of sys.path[0] on the import search path (at position 1),
# presumably so project-local modules such as `utils` resolve — TODO confirm.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# Short alias for pandas' IndexSlice helper (label-based MultiIndex slicing).
idx = pd.IndexSlice

#### Test on unseen data

In [20]:
from pathlib import Path
import pandas as pd
import lightgbm as lgb
from utils import rank_stocks_and_quantile
import pickle

def load_model_and_parameters(path):
    """Unpickle the bundle stored at ``path`` and split it into its two parts.

    The pickled object must support lookup by the keys ``'model'`` and
    ``'params'``.

    Args:
        path: Location of the pickle file; opened in binary read mode.

    Returns:
        A ``(model, params)`` tuple taken from the unpickled object.

    Note:
        ``pickle.load`` executes arbitrary code embedded in the file, so only
        point this at files from a trusted source.
    """
    with open(path, 'rb') as handle:
        bundle = pickle.load(handle)

    model = bundle['model']
    params = bundle['params']
    return model, params


def read_and_process_data(store_path, key, target_substring):
    """Read one table from an HDF5 store, rank it, and strip the timezone.

    Args:
        store_path: Path of the HDF5 file opened with ``pd.HDFStore``.
        key: Key of the table to read from the store.
        target_substring: Forwarded to ``rank_stocks_and_quantile`` as its
            ``target_substring`` keyword.

    Returns:
        The frame returned by ``rank_stocks_and_quantile`` with the first
        index level made timezone-naive (presumably the dates — TODO confirm).
    """
    with pd.HDFStore(store_path) as store:
        raw = store[key]

    ranked = rank_stocks_and_quantile(raw, target_substring=target_substring)

    # Build a replacement index whose first level has no timezone and install
    # it on the frame; the index levels are not mutated in place.
    naive_first_level = ranked.index.levels[0].tz_localize(None)
    replacement_index = ranked.index.set_levels(naive_first_level, level=0)
    ranked.set_index(replacement_index, inplace=True)

    return ranked

# def filter_by_date(data, look_ahead):
#     unique_dates = data.index.get_level_values('date').unique().sort_values()
#     cut_off_date = unique_dates[0] - pd.Timedelta(days=look_ahead)
#     return data[data.index.get_level_values('date') > cut_off_date]

def filter_by_date(data, look_ahead):
    """Drop the earliest rows of ``data`` based on its ``'date'`` index level.

    The cut-off is the unique date at sorted position ``look_ahead``; only rows
    dated strictly after it are kept, so the first ``look_ahead + 1`` distinct
    dates are removed.

    Args:
        data: Frame whose index has a level named ``'date'``.
        look_ahead: Zero-based position of the cut-off date among the sorted
            unique dates.

    Returns:
        The rows of ``data`` dated after the cut-off.

    Raises:
        AssertionError: If there are not more unique dates than ``look_ahead``.
    """
    row_dates = data.index.get_level_values('date')
    ordered_dates = row_dates.unique().sort_values()

    # Guard against indexing past the end of the available dates
    assert len(ordered_dates) > look_ahead, f"Insufficient unique dates for a look_ahead value of {look_ahead}"

    keep_rows = row_dates > ordered_dates[look_ahead]
    return data[keep_rows]

def synchronize_and_merge_predictions(data, features, model, label):
    """Score ``data`` with ``model`` and return actuals, predictions and prices.

    Args:
        data: Frame indexed by ``'date'`` and ``'ticker'`` that contains the
            ``features`` columns, the ``label`` column and the raw
            ``open``/``high``/``low``/``close``/``volume`` columns.
        features: Column names handed to ``model.predict``.
        model: Object exposing ``predict(frame)``.
        label: Name of the column holding the realised target.

    Returns:
        A frame indexed by ``('ticker', 'date')`` with the columns ``actual``,
        ``predicted`` and the five price columns prefixed with ``FEATURE_``.
    """
    raw_predictions = model.predict(data[features])

    # Keep label rows that also appear in the feature frame, then trim the
    # prediction vector to the same length.
    target = data[label]
    aligned_target = target[data[label].index.isin(data[features].index)]
    aligned_predictions = raw_predictions[:len(aligned_target)]

    scored = aligned_target.reset_index(name='actual')
    scored = scored.assign(predicted=aligned_predictions)
    scored = scored.set_index(['date', 'ticker'])

    # Price columns are exposed under a FEATURE_ prefix in the output
    price_fields = ['open', 'high', 'low', 'close', 'volume']
    prefixed_by_field = {field: "FEATURE_" + field for field in price_fields}
    prefixed_fields = [prefixed_by_field[field] for field in price_fields]

    prices = data.rename(columns=prefixed_by_field)[prefixed_fields].reset_index()
    merged = scored.reset_index().merge(prices, on=['ticker', 'date'], how='left')

    output_columns = ['date', 'ticker', 'actual', 'predicted'] + prefixed_fields
    return merged[output_columns].set_index(['ticker', 'date'])

# Inputs for the out-of-sample run: universe size, HDF5 key/store and the
# pickled model bundle.
TOP = 250
UNSEEN_KEY = '/data/YEAR_20220803_20230803'
UNSEEN_STORE = Path(f'data/{TOP}_unseen_dataset.h5')
MODEL_PATH = "/home/sayem/Desktop/Project/models/250_combined_model_and_params_TARGET_ret_fwd_01d_rank_quantiled.pkl"

# Load the model and the unseen data set
best_model, params = load_model_and_parameters(MODEL_PATH)
test_data = read_and_process_data(UNSEEN_STORE, UNSEEN_KEY, 'TARGET_ret_fwd_')

# The forecast horizon drives both the date cut-off and the label column name;
# it falls back to 1 when the stored params do not carry it.
look_ahead = params.get('look_ahead', 1)
filtered_test_data = filter_by_date(test_data, look_ahead)
label = f'TARGET_ret_fwd_{look_ahead:02d}d_rank_quantiled'

# Every column prefixed with FEATURE_ is a model input
features = [name for name in test_data.columns if name.startswith('FEATURE_')]
preds = synchronize_and_merge_predictions(filtered_test_data, features, best_model, label)

preds.info()

# # Assuming your DataFrame is named `df`
# df = preds.swaplevel(0, 1).sort_index()
# print(df.info())

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 65484 entries, ('AA', Timestamp('2022-08-05 00:00:00')) to ('ZTS', Timestamp('2023-08-03 00:00:00'))
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   actual          65484 non-null  float64
 1   predicted       65484 non-null  float64
 2   FEATURE_open    65484 non-null  float32
 3   FEATURE_high    65484 non-null  float32
 4   FEATURE_low     65484 non-null  float32
 5   FEATURE_close   65484 non-null  float32
 6   FEATURE_volume  65484 non-null  float64
dtypes: float32(4), float64(3)
memory usage: 2.8+ MB


In [None]:
# from scipy.stats import spearmanr

# def calculate_daily_spearman_correlation(group):
#     """Calculate the daily Spearman correlation between actual and predicted values."""
#     return spearmanr(group['actual'], group['predicted'])[0]

# # Compute the daily Spearman correlation for each date
# spearman_correlations_daily = preds.groupby('date').apply(calculate_daily_spearman_correlation)

# # Calculate the average Spearman correlation over all dates
# average_spearman_correlation = spearman_correlations_daily.mean()

# # Calculate the Sharpe ratio for each date
# daily_sharpe_ratios = spearman_correlations_daily / spearman_correlations_daily.std()

# # Convert the Sharpe ratios to a DataFrame
# sharpe_ratios_dataframe = daily_sharpe_ratios.to_frame()
# sharpe_ratios_dataframe.columns = sharpe_ratios_dataframe.columns.astype(str)
# sb.glue("sharpe_ratios_per_day", sharpe_ratios_dataframe, display=False)

from scipy.stats import spearmanr

def calculate_daily_spearman_correlation(group):
    """Return the Spearman rank correlation of one group's actual vs. predicted.

    Args:
        group: Frame with ``'actual'`` and ``'predicted'`` columns.

    Returns:
        The correlation coefficient; the accompanying p-value is discarded.
    """
    rank_correlation, _p_value = spearmanr(group['actual'], group['predicted'])
    return rank_correlation

# One Spearman correlation (information coefficient) per date
spearman_correlations_daily = preds.groupby('date').apply(calculate_daily_spearman_correlation)

# Scale each day's correlation by the dispersion of the whole daily series
ic_dispersion = spearman_correlations_daily.std()
daily_sharpe_ratios = spearman_correlations_daily / ic_dispersion

# Record the scaled series with scrapbook; the column label is converted to a
# string before gluing.
sharpe_ratios_dataframe = daily_sharpe_ratios.to_frame()
sharpe_ratios_dataframe.columns = sharpe_ratios_dataframe.columns.map(str)
sb.glue("sharpe_ratios_per_day", sharpe_ratios_dataframe, display=False)

# Mean over standard deviation of the scaled daily series
sharpe_mean = daily_sharpe_ratios.mean()
sharpe_spread = daily_sharpe_ratios.std()
sharpe_of_sharpe = sharpe_mean / sharpe_spread
print(f"Sharpe of the daily Sharpe ratios: {sharpe_of_sharpe:.4f}")

In [22]:
# import matplotlib.pyplot as plt

# # Create a list of colors based on the sign of the Sharpe Ratios
# colors = ['blue' if value > 0 else 'red' for value in daily_sharpe_ratios]

# plt.figure(figsize=(12,6))
# daily_sharpe_ratios.plot(kind='bar', color=colors)
# plt.title('Daily Sharpe Ratios')
# plt.xlabel('Date')
# plt.ylabel('Sharpe Ratio')
# plt.grid(axis='y')
# plt.tight_layout()
# plt.axhline(y=0, color='black', linestyle='-')  # Add a horizontal line at y=0
# plt.xticks(rotation=45)  # Rotate the x-axis labels for better visibility
# plt.show()

In [23]:
from scipy.stats import spearmanr

# Spearman correlation pooled over every (ticker, date) row at once
lr_r, lr_p = spearmanr(preds['actual'], preds['predicted'])
print(f'Information Coefficient (overall): {lr_r:.3%} (p-value: {lr_p:.8%})')

# Expose both numbers under descriptive names
information_coefficient, p_value = lr_r, lr_p

# Record them with scrapbook and echo each value in the cell output
sb.glue("information_coefficient", information_coefficient, display=True)
sb.glue("p_value", p_value, display=True)

Information Coefficient (overall): 4.471% (p-value: 0.00000000%)


0.044710549866929974

2.4380537218931725e-30

In [24]:
# Print the index range, column dtypes and non-null counts of the predictions.
preds.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 65484 entries, ('AA', Timestamp('2022-08-05 00:00:00')) to ('ZTS', Timestamp('2023-08-03 00:00:00'))
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   actual          65484 non-null  float64
 1   predicted       65484 non-null  float64
 2   FEATURE_open    65484 non-null  float32
 3   FEATURE_high    65484 non-null  float32
 4   FEATURE_low     65484 non-null  float32
 5   FEATURE_close   65484 non-null  float32
 6   FEATURE_volume  65484 non-null  float64
dtypes: float32(4), float64(3)
memory usage: 2.8+ MB


In [25]:
def add_quantile_signals(df, col_name='predicted', buy_threshold=0.8, sell_threshold=0.1):
    """Write a ``'signal'`` column into ``df`` from quantiles of ``col_name``.

    Rows at or above the ``buy_threshold`` quantile get ``1``, rows at or below
    the ``sell_threshold`` quantile get ``-1`` and everything else stays ``0``.
    Both quantiles are taken over the whole column.

    Args:
        df: Frame to annotate; it is modified in place.
        col_name: Column whose values are ranked.
        buy_threshold: Quantile level (0..1) marking the buy cut-off.
        sell_threshold: Quantile level (0..1) marking the sell cut-off.

    Returns:
        The same ``df`` object, now carrying the ``'signal'`` column.
    """
    upper_cutoff = df[col_name].quantile(buy_threshold)
    lower_cutoff = df[col_name].quantile(sell_threshold)

    # Start neutral, mark buys, then sells; sells are written last, so they
    # win whenever the two regions overlap.
    df['signal'] = 0
    df.loc[df[col_name].ge(upper_cutoff), 'signal'] = 1
    df.loc[df[col_name].le(lower_cutoff), 'signal'] = -1
    return df

# Flag the top 5% of all predictions as buys (1) and the bottom 5% as sells (-1).
# A copy is passed because add_quantile_signals writes the 'signal' column in place.
preds = add_quantile_signals(preds.copy(), buy_threshold=0.95, sell_threshold=0.05)

In [26]:
# Re-inspect the frame to confirm the 'signal' column is now present.
preds.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 65484 entries, ('AA', Timestamp('2022-08-05 00:00:00')) to ('ZTS', Timestamp('2023-08-03 00:00:00'))
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   actual          65484 non-null  float64
 1   predicted       65484 non-null  float64
 2   FEATURE_open    65484 non-null  float32
 3   FEATURE_high    65484 non-null  float32
 4   FEATURE_low     65484 non-null  float32
 5   FEATURE_close   65484 non-null  float32
 6   FEATURE_volume  65484 non-null  float64
 7   signal          65484 non-null  int64  
dtypes: float32(4), float64(3), int64(1)
memory usage: 3.3+ MB


##### Backtesting

In [27]:
# preds = preds.head(10**4)

In [28]:
import pandas as pd
import backtrader as bt
import backtrader.indicators as btind
import backtrader.analyzers as btanalyzers
from pypfopt import EfficientFrontier, expected_returns, risk_models
import numpy as np
import yfinance as yf

import quantstats as qs
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
qs.extend_pandas()

# Data Class for Predictions
class PandasPredictions(bt.feeds.PandasData):
    """Backtrader pandas feed that carries the model's trading signal.

    Adds a ``signal`` line to the standard feed and points the OHLCV fields at
    the ``FEATURE_``-prefixed columns of the predictions frame.
    """
    # Extra data line exposed to strategies as `data.signal`
    lines = ('signal',)
    params = (
        # NOTE(review): in backtrader's PandasData, -1 is documented as
        # "auto-detect the column by name" — confirm against the installed version.
        ('signal', -1),
        # Map each standard price/volume field to its prefixed column name
        ('open', 'FEATURE_open'),
        ('high', 'FEATURE_high'),
        ('low', 'FEATURE_low'),
        ('close', 'FEATURE_close'),
        ('volume', 'FEATURE_volume')
    )

# Function to optimize weights using PyPortfolioOpt
def optimize_weights(datas):
    """Compute portfolio weights from the close prices seen so far.

    For every feed the closes delivered up to the current bar are collected
    into one price table, from which PyPortfolioOpt's mean historical return
    and sample covariance are estimated. The covariance is shrunk 5% towards
    the identity matrix and given a small extra diagonal term before the
    max-Sharpe problem is solved.

    Only the first ``len(data)`` bars of each feed are read. Reading the raw
    ``.array`` buffers instead would hand the optimiser the whole preloaded
    series, including prices that lie in the future of the current bar.

    Args:
        datas: Iterable of backtrader data feeds exposing ``close``,
            ``datetime`` and ``_name``.

    Returns:
        The weights mapping returned by ``EfficientFrontier.max_sharpe`` or,
        when no asset's expected return exceeds the risk-free rate, by
        ``EfficientFrontier.min_volatility``.

    Raises:
        ValueError: Any optimiser ``ValueError`` other than the
            "expected return exceeding the risk-free rate" case.
    """
    prices = {}
    for data in datas:
        # Number of bars this feed has delivered up to and including now
        bars_seen = len(data)
        closes = list(data.close.get(size=bars_seen))
        stamps = list(data.datetime.get(size=bars_seen))

        s = pd.Series(closes, index=stamps, name=data._name)
        s = s.groupby(s.index).first()  # keep one close per timestamp
        prices[data._name] = s
    # Keep only timestamps for which every feed has a price
    df = pd.concat(prices, axis=1).dropna()

    mu = expected_returns.mean_historical_return(df)
    S_original = risk_models.sample_cov(df)
    # Shrink towards the identity and nudge the diagonal so the matrix stays
    # well conditioned for the solver.
    S = (1 - 0.05) * S_original + 0.05 * np.eye(S_original.shape[0])
    S += 1e-6 * np.eye(S_original.shape[0])

    ef = EfficientFrontier(mu, S, solver="SCS", verbose=True)
    try:
        weights = ef.max_sharpe(risk_free_rate=0.005)
    except ValueError as e:
        if "expected return exceeding the risk-free rate" in str(e):
            print("Using alternative method due to low expected returns.")
            weights = ef.min_volatility()
        else:
            # Re-raise unchanged so the original traceback is preserved
            raise
    return weights

# Strategy Class
class TradeAndRebalanceStrategy(bt.Strategy):
    """Trade on per-feed signals and periodically rebalance to optimised weights.

    On every bar each feed's ``signal`` line is inspected: ``1`` opens a long
    position protected by a stop and a profit target placed two ATRs away from
    the close, ``-1`` issues a market sell. When the ``rebalance_days`` counter
    reaches zero the whole portfolio is rebalanced to the weights returned by
    ``optimize_weights`` (each capped at 30%) and the counter is reset to 20.

    NOTE(review): the ``stop_loss`` and ``take_profit`` params are not read
    anywhere in this class (exits are ATR based), so optimising over them
    yields identical runs. ``max_loss`` and ``start_cash`` are likewise only
    stored.
    """
    params = {'stop_loss': 0.05, 'take_profit': 0.10}

    def __init__(self):
        """Initialise counters, order bookkeeping and one ATR per feed."""
        self.rebalance_days = 0
        self.max_loss = -0.15
        self.start_cash = self.broker.get_cash()
        self.orders = {}  # entry orders still being worked, keyed by order ref
        self.atr_dict = {data: btind.ATR(data) for data in self.datas}

    def notify_order(self, order):
        """Forget tracked orders once they reach any terminal state.

        Completed, cancelled, rejected, expired and margin-failed orders are
        all removed; otherwise dead orders would pile up in ``self.orders``.
        """
        if not order.alive():
            self.orders.pop(order.ref, None)

    def next(self):
        """Act on the current bar's signals, then rebalance when due."""
        for data in self.datas:
            signal = data.signal[0]
            if signal == 1:
                # Every feed gets an ATR in __init__, so a direct lookup is safe
                band = self.atr_dict[data][0] * 2
                # A bracket links the stop and the target to the entry: they
                # only become active once the entry fills, and one cancels the
                # other. Passing an order *ref* as ``parent`` after the entry
                # was already sent leaves both exits as independent sells.
                entry, _stop_order, _target_order = self.buy_bracket(
                    data=data,
                    exectype=bt.Order.Market,
                    stopprice=data.close[0] - band,
                    limitprice=data.close[0] + band,
                )
                self.orders[entry.ref] = entry
            elif signal == -1:
                self.sell(data)

        if not self.rebalance_days:
            weights = optimize_weights(self.datas)
            # Cap every single position at 30% of the portfolio
            weights = {k: min(v, 0.30) for k, v in weights.items()}
            for data in self.datas:
                # Feeds missing from the optimiser output are targeted at 0%
                self.order_target_percent(data, target=weights.get(data._name, 0))
            self.rebalance_days = 20
        else:
            self.rebalance_days -= 1

# Setup
cerebro = bt.Cerebro()
cerebro.broker.setcommission(commission=0.001)
cerebro.addanalyzer(btanalyzers.PyFolio, _name='pyfolio')

data_dict = {ticker: preds.xs(ticker) for ticker in \
    preds.index.get_level_values(0).unique()}
for ticker, data_df in data_dict.items():
    data = PandasPredictions(dataname=data_df, name=ticker)
    cerebro.adddata(data)

cerebro.optstrategy(TradeAndRebalanceStrategy, \
    stop_loss=[0.03, 0.05, 0.07], take_profit=[0.10, 0.12, 0.14])

# cerebro.optstrategy(TradeAndRebalanceStrategy, \
#     stop_loss=[0.03], take_profit=[0.10])

# cerebro.addstrategy(TradeAndRebalanceStrategy)
# results = cerebro.run()

results = cerebro.run(maxcpus=1)

                                     CVXPY                                     
                                     v1.3.2                                    
(CVXPY) Oct 05 10:30:26 AM: Your problem has 251 variables, 5 constraints, and 0 parameters.
(CVXPY) Oct 05 10:30:26 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Oct 05 10:30:26 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 05 10:30:26 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Oct 05 10:30:26 AM: Compiling problem (target solver=SCS).
(CVXPY) Oct 05 10:30:26 AM: Reduction chain: Dcp2Cone -> CvxAttr2Constr -> ConeMatrixStuffing ->

In [None]:
import os
import pickle
# Create the output folder on first use
RESULTS_DIR = 'backtest_results'
RESULTS_FILE = 'backtest_results/results.pkl'
if not os.path.exists(RESULTS_DIR):
    os.makedirs(RESULTS_DIR)

# Write the backtest results to disk
with open(RESULTS_FILE, 'wb') as sink:
    pickle.dump(results, sink)

# Drop the in-memory object and reload it from the file just written
del results

with open(RESULTS_FILE, 'rb') as source:
    results = pickle.load(source)

In [None]:
# For every optimised parameter combination, store (Sharpe ratio, max drawdown)
# under a "name=value, ..." key.
metrics_dict = {}

for res in results:
    for r in res:
        # Readable identifier built from the strategy's parameter values
        params_str = ', '.join(f"{k}={v}" for k, v in r.params._getitems())

        returns, positions, transactions, gross_lev = r.analyzers.pyfolio.get_pf_items()

        # NOTE(review): resampling to calendar days adds zero-return rows for
        # any day without data (e.g. weekends), which changes annualised
        # statistics — confirm this is intended.
        daily_returns = returns.resample('D').sum()

        sharpe_ratio = qs.stats.sharpe(daily_returns)
        drawdown = qs.stats.max_drawdown(daily_returns)

        metrics_dict[params_str] = (sharpe_ratio, drawdown)

        # Full metrics table for this combination
        qs.reports.metrics(daily_returns)


def _selection_key(name):
    """Rank a parameter set by Sharpe ratio, then by shallowest drawdown.

    quantstats reports max drawdown as a non-positive number, so the larger
    (closer to zero) value is the better one and is used as-is; negating it
    would prefer the deepest drawdown. NaN metrics are mapped to -inf because
    NaN compares false to everything and would make ``max`` order-dependent.
    """
    worst = float('-inf')
    return tuple(worst if pd.isna(value) else value for value in metrics_dict[name])


# Best combination: highest Sharpe, ties broken by the shallowest drawdown
best_params = max(metrics_dict, key=_selection_key)
best_sharpe = metrics_dict[best_params][0]
best_drawdown = metrics_dict[best_params][1]

print(f"The best parameters in terms of Sharpe ratio and lowest drawdown are: \
    {best_params} with a Sharpe ratio of {best_sharpe:.2f} and a max drawdown of {best_drawdown:.2f}")


In [None]:
import quantstats as qs
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")


# Register quantstats' helpers on pandas objects.
# NOTE(review): already called in the backtesting cell; repeated here so this
# cell can presumably run on its own — confirm.
qs.extend_pandas()

# Optional full report (disabled):
# qs.reports.full(returns)

# Optional plots (disabled):
# qs.plots.snapshot(returns, title='Performance Snapshot')  # Snapshot of the performance
# qs.plots.monthly_heatmap(returns)  # Monthly returns heatmap
# qs.plots.yearly_returns(returns)   # Yearly returns
# plt.show()
# print(f"top: {top} target: {target}")

# Print the metrics table only.
# NOTE(review): `returns` is whatever the last iteration of the parameter loop
# left behind, which is not necessarily the best parameter set — confirm.
qs.reports.metrics(returns)