##### Import

In [None]:
import warnings
import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from tqdm import tqdm
import shap
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.pipeline import Pipeline
import os
import gc
import sys

# Filter out warning messages
warnings.filterwarnings('ignore')

# Set pandas display options
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Set seaborn style
sns.set_style('whitegrid')

# Add the parent directory to sys.path
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# Index and deciles for data slicing
idx = pd.IndexSlice

In [None]:
# from pathlib import Path
# import pandas as pd
# from utils import rank_stocks_and_quantile
# # UNSEEN_KEY = '/data/YEAR_20220803_20230803'
# top = 250  # parameters -> papermill
# DATA_STORE = Path(f'data/{top}_dataset.h5')
# with pd.HDFStore(DATA_STORE) as store:
#     # unseen = store[UNSEEN_KEY]
#     print(store.keys())

In [None]:
from utils import rank_stocks_and_quantile

top = 250 # parameters -> papermill

DATA_STORE = Path(f'data/{top}_dataset.h5')
dataset_key = '/data/YEAR_20200930_20220802'
# dataset_key = None

with pd.HDFStore(DATA_STORE) as store:
    dataset = store[dataset_key]
    # dataset = store['/data/YEAR_20161115_20181022']
    dataset = rank_stocks_and_quantile(dataset, TARGET_col='TARGET_ret_fwd_frac_order')

In [None]:
dataset.info()

In [None]:
# Define the features and label columns
features = [col for col in dataset.columns if col.startswith('FEATURE_')]
label = 'TARGET_ret_fwd_frac_order_rank_quantiled'

# Print the number of features and the label
print(f"Number of features: {len(features)}")
print(f"Label: {label}")

# Remove timezone information from the date index
dataset.index.set_levels(dataset.index.levels[0].tz_localize(None), level=0, inplace=True)

### Since we have unseen dataset
# # Get unique dates and sort them
# unique_dates = dataset.index.get_level_values('date').unique().sort_values()

# # Define the look-ahead gap
# look_ahead = 1

# # Split dates for training and testing with a gap
# train_dates = unique_dates[:-21-look_ahead]
# test_dates = unique_dates[-21:]

# # Split the dataset using the train and test dates
# train_data = dataset[dataset.index.get_level_values('date').isin(train_dates)]
# test_data = dataset[dataset.index.get_level_values('date').isin(test_dates)]


In [None]:
def get_fi(model):
    fi = model.feature_importance(importance_type='gain')
    return (pd.Series(fi / fi.sum(),
                index=model.feature_name()))
                
def ic_lgbm(preds, train_data):
    """Custom IC eval metric for lightgbm"""
    is_higher_better = True
    return 'ic', spearmanr(preds, train_data.get_label())[0], \
        is_higher_better

def sharpe_ratio_lgbm(preds, train_data):
    """Custom Sharpe ratio eval metric for lightgbm that calculates daily Spearman correlations."""
    labels = train_data.get_label()
    # print(type(preds))
    # print(type(train_data))
    # print(labels)
    # Assuming the data index is a MultiIndex with date as the first level
    if not isinstance(train_data.data.index, pd.MultiIndex):
        raise ValueError("Expecting a MultiIndex with date as the first level")

    # Group by the first level of the MultiIndex (date) and compute the Spearman correlation for each group
    grouped_labels = pd.Series(labels, \
        index=train_data.data.index).groupby(level=0)
    # print(len(grouped_labels))
    grouped_preds = pd.Series(preds, \
        index=train_data.data.index).groupby(level=0)
    # print(len(grouped_preds))

    daily_scores = []
    for (_, actuals_for_day), (_, preds_for_day) in zip(grouped_labels, grouped_preds):
        score_for_day = spearmanr(actuals_for_day, preds_for_day)[0]
        if np.isnan(score_for_day):
            score_for_day = 0
        daily_scores.append(score_for_day)

    # Calculate the Sharpe ratio
    sharpe_ratio = np.mean(daily_scores) / (np.std(daily_scores) + 1e-9)  
    # added epsilon to avoid division by zero

    return 'sharpe_ratio', sharpe_ratio, True


def custom_eval_metrics(preds, train_data):
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    labels = train_data.get_label()
    mae = mean_absolute_error(labels, preds)
    mse = mean_squared_error(labels, preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, preds)

    return [("MAE", mae, False), 
            ("MSE", mse, False), 
            ("RMSE", rmse, False), 
            ("R2", r2, True)]

In [None]:
def metrics_on_fold(era_scores):
    era_scores = pd.Series(era_scores)
    
    # Calculate metrics
    mean_correlation = np.mean(era_scores)
    std_deviation = np.std(era_scores)
    sharpe_ratio = mean_correlation / std_deviation
    max_dd = (era_scores.cummax() - era_scores).max()

    # Smart Sharpe
    smart_sharpe = mean_correlation / (std_deviation + np.std(era_scores.diff()))
    
    # Autocorrelation
    autocorrelation = era_scores.autocorr()

    metrics = pd.Series({
        'mean_correlation': mean_correlation,
        'std_deviation': std_deviation,
        'sharpe_ratio': sharpe_ratio,
        'smart_sharpe': smart_sharpe,
        'autocorrelation': autocorrelation,
        'max_dd': max_dd,
        'min_correlation': era_scores.min(),
        'max_correlation': era_scores.max(),
    })

    # Cleanup
    _ = gc.collect()
    
    return metrics

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
import mlflow
import mlflow.lightgbm
from optuna.integration import LightGBMPruningCallback
from scipy.stats import spearmanr
from utils import CustomBackwardMultipleTimeSeriesCV
import warnings

warnings.filterwarnings('ignore')

evals_result = {}

def get_categoricals(dataset, threshold):
    return [col for col in dataset.columns if \
            dataset[col].nunique() < threshold and \
            dataset[col].ge(0).all() and col.startswith("FEATURE_")]

def objective(trial, data, features, cv):
    # Dynamic categoricals based on the trial's suggested threshold
    cat_threshold = trial.suggest_int('cat_threshold', 5, 50)
    categoricals = [col for col in data.columns if data[col].nunique() < cat_threshold 
                    and data[col].ge(0).all() and col.startswith("FEATURE_")]

    params = {
        'boosting': 'gbdt',
        'objective': 'regression',
        'verbose': -1,
        'metric': 'None',
        'device': 'gpu',
        'num_leaves': trial.suggest_int('num_leaves', 30, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
    }

    early_stopping = lgb.early_stopping(stopping_rounds=500, \
        verbose=True, first_metric_only=True)
    daily_scores_in_fold = []

    for train_idx, val_idx in cv:
        train_features = data.loc[train_idx, features]
        train_labels = data.loc[train_idx, label]
        lgb_train = lgb.Dataset(data=train_features, label=train_labels, 
                        categorical_feature=categoricals, free_raw_data=False)

        val_features = data.loc[val_idx, features]
        val_labels = data.loc[val_idx, label]
        lgb_val = lgb.Dataset(data=val_features, label=val_labels, 
                categorical_feature=categoricals, free_raw_data=False)

        model = lgb.train(params=params,
                          train_set=lgb_train,
                          num_boost_round=5000,
                          valid_sets=[lgb_train, lgb_val],
                          valid_names=['train', 'valid_0'],
                          feval=[sharpe_ratio_lgbm, custom_eval_metrics],
                          callbacks=[lgb.record_evaluation(evals_result),
                                     early_stopping,
                                     LightGBMPruningCallback(trial, 'sharpe_ratio')])
        
        daily_scores_in_fold.extend(evals_result['valid_0']['sharpe_ratio'])

    metrics = metrics_on_fold(daily_scores_in_fold)
    score = metrics['sharpe_ratio']

    # Log parameters, metrics, and evaluation results to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        
        # Log metrics from metrics dictionary
        mlflow.log_metrics(metrics)
        
        # Logging each score and metric in evals_result
        for valid_set, metrics_dict in evals_result.items():
            for metric, values in metrics_dict.items():
                for idx, value in enumerate(values):
                    metric_name = f"{valid_set}_{metric}_{idx}"
                    mlflow.log_metric(metric_name, value)

        # Log the average sharpe ratio
        mlflow.log_metric("avg_sharpe_ratio_across_folds", score)

        # mlflow.lightgbm.log_model(model, "lightgbm_model")

    score = metrics['sharpe_ratio']
    # print(score)
    return score if not np.isnan(score) else 1e-9

In [None]:
import os
import optuna

cv = CustomBackwardMultipleTimeSeriesCV(dataset, train_period_length=142, 
                                        test_period_length=5, 
                                        lookahead=1, 
                                        date_idx='date')

def progress_bar(study, trial, n_trials):
    progress = (trial.number + 1) / n_trials
    best_trial_msg = ""
    if study.best_trial is not None:
        best_trial_msg = f"Best is trial {study.best_trial.number} \
            with value: {study.best_trial.value}."
    print(f'Trial {trial.number + 1}/{n_trials} finished with value: \
        {trial.value} and parameters: {trial.params}. {best_trial_msg}')

# Check if 'study' directory exists, if not, create it.
if not os.path.exists("study"):
    os.makedirs("study")

# Use SQLite to store optimization results.
# The study results are stored in the "study" folder as "study.db".
storage_name = "sqlite:///study/study.db"

# Name of the study. This should be consistent for resuming the study later.
study_name = "lgbm_optimization"

# Try to load the study. If it doesn't exist, create a new one.
study = optuna.create_study(study_name=study_name,
                            storage=storage_name,
                            direction='maximize',
                            load_if_exists=True, 
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=10, \
                            n_warmup_steps=5))

n_trials = 5
study.optimize(lambda trial: objective(trial, dataset, features, cv), 
               n_trials=n_trials, 
               callbacks=[lambda study, trial: progress_bar(study, trial, n_trials)])

# Printing the optimization results
print(f'Best trial score: {study.best_trial.value}')
print('Best hyperparameters:')
for key, value in study.best_trial.params.items():
    print(f'{key}: {value}')

best_params = study.best_params
print("Best parameters found by Optuna:")
print(best_params)

# Remove the study database
os.remove("study/study.db")
print("Database has been deleted.")

In [None]:
evals_result['valid_0']

In [None]:
# Assuming evals_result contains MAE, MSE, RMSE, R2 as well
metrics = ['sharpe_ratio', 'MAE', 'MSE', 'RMSE', 'R2']

fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 4 * len(metrics)))

for ax, metric in zip(axes, metrics):
    cv_result = pd.DataFrame({'Train Set': evals_result['train'][metric], 
                              'Validation Set': evals_result['valid_0'][metric]})
    
    ax1 = ax
    ax2 = ax1.twinx()  # instantiate a second axes sharing the same x-axis
    
    ax1.plot(cv_result.index, cv_result['Train Set'], 'g-', label=f'Train Set {metric}')
    ax2.plot(cv_result.index, cv_result['Validation Set'], 'b-', label=f'Validation Set {metric}')
    
    ax1.set_ylabel(f'Train Set {metric}', color='g')
    ax2.set_ylabel(f'Validation Set {metric}', color='b')
    
    if metric != 'R2':
        ax1.axvline(cv_result['Validation Set'].idxmin(), c='k', ls='--', lw=1)
    else:
        ax1.axvline(cv_result['Validation Set'].idxmax(), c='k', ls='--', lw=1)

    ax1.legend(loc='upper left')
    ax2.legend(loc='upper right')

plt.tight_layout()
plt.show()


In [14]:
# # Extract cat_threshold from best_params
# cat_threshold = best_params.get('cat_threshold', 50)  # default to 50 if not in best_params

# Extract cat_threshold from best_params
cat_threshold = best_params.pop('cat_threshold', 50)  # default to 50 if not in best_params


# Determine the categorical columns based on cat_threshold
categoricals = [col for col in dataset.columns if dataset[col].nunique() < cat_threshold 
                and dataset[col].ge(0).all() and col.startswith("FEATURE_")]

# Create the training dataset
lgb_train_all = lgb.Dataset(data=dataset[features], label=dataset[label], 
                            categorical_feature=categoricals, free_raw_data=False)

best_params['force_col_wise'] = True
# Get the best iteration from your previous training
optimal_boosting_rounds = evals_result['valid_0']['sharpe_ratio'].index(max(evals_result['valid_0']['sharpe_ratio']))

best_model = lgb.train(params=best_params,
                       train_set=lgb_train_all,
                       num_boost_round=optimal_boosting_rounds,  # Use the optimal number of rounds
                       feval=sharpe_ratio_lgbm,
                       callbacks=[lgb.record_evaluation(evals_result)])


KeyError: 'valid_0'

In [None]:
from pathlib import Path

# Define the models folder path
models = Path("./models")

# Ensure the folder exists
models.mkdir(exist_ok=True)

# Ensure that dataset_key doesn't contain invalid characters like slashes
clean_dataset_key = dataset_key.replace("/", "_")

# Formulate the clean save path
save_path = models / f"{clean_dataset_key}_best_model.txt"

# Try saving again
best_model.save_model(save_path)
print(f"Model saved to {save_path}")

#### Test on unseen data

In [None]:
test_features = test_data[features]
test_labels = test_data[label]

model_path = "/home/sayem/Desktop/Project/models/_data_YEAR_20200929_20220902_best_model.txt"
# Load the model
best_model = lgb.Booster(model_file=model_path)

y_pred = best_model.predict(test_features)

preds = test_labels.reset_index(name=\
    'actual').assign(predicted=y_pred).set_index(['date', 'ticker'])

# Rename columns to add 'feature_' prefix
cols_to_rename = ['open', 'high', 'low', 'close', 'volume']
new_col_names = ["FEATURE_" + col for col in cols_to_rename]
rename_dict = dict(zip(cols_to_rename, new_col_names))

test_data_renamed = test_data.rename(columns=rename_dict)

# Using the 'merge' method to join on MultiIndex levels 'date' and 'ticker'
preds = preds.reset_index().merge(test_data_renamed[new_col_names].reset_index(), 
                                  on=['ticker', 'date'], 
                                  how='left')

### Only select columns of interest
preds = preds[['date', 'ticker', 'actual', 'predicted'] \
    + new_col_names].set_index(['ticker', 'date'])


In [None]:
preds.info()

In [None]:
import pandas as pd
import alphalens as al

def generate_alphalens_tearsheet(df: pd.DataFrame, label_col: str, price_col: str) -> None:
    """
    Generate the Alphalens full tearsheet given the input dataframe, 
    label column, and price data column.
    """
    # Extract the factor and trade_prices series
    factor = df[label_col]
    trade_prices = df[price_col]
    
    # Remove duplicated indices from factor and trade_prices
    factor = factor[~factor.index.duplicated(keep='first')]
    trade_prices = trade_prices[~trade_prices.index.duplicated(keep='first')]
    
    # Convert the trade_prices series into unstacked format
    trade_prices_unstacked = trade_prices.unstack(level='ticker')
    
    # Ensure the factor's DatetimeIndex level has the same timezone as trade_prices_unstacked
    if isinstance(factor.index, pd.MultiIndex):
        level_0 = factor.index.get_level_values(0)
        if hasattr(level_0, 'tz') and level_0.tz is not None:
            if level_0.tz != trade_prices_unstacked.index.tz:
                factor.index.set_levels(level_0.tz_convert(trade_prices_unstacked.index.tz), \
                    level=0, inplace=True)
        factor = factor.swaplevel().sort_index()
    else:
        if hasattr(factor.index, 'tz') and factor.index.tz is not None:
            if factor.index.tz != trade_prices_unstacked.index.tz:
                factor.index = factor.index.tz_convert(trade_prices_unstacked.index.tz)

    # Create the factor_data dataframe with forward returns
    alphalen_analysis = al.utils.get_clean_factor_and_forward_returns(
        factor=factor,
        prices=trade_prices_unstacked,
        periods=[1, 5, 10],
        max_loss=0.9
    )

    factor_returns = al.performance.factor_returns(alphalen_analysis)
    sharpe_ratios = factor_returns.mean() / factor_returns.std()

    print("\nSharpe Ratios:\n", sharpe_ratios)
    alphalen_analysis = alphalen_analysis[~alphalen_analysis.index.duplicated(keep='first')]
    alphalen_analysis = alphalen_analysis.groupby(level=[0, 1]).mean()
    return alphalen_analysis

target = 'actual'
alphalens_analysis = generate_alphalens_tearsheet(preds, \
    label_col=target, price_col='FEATURE_close')

import alphalens as al
al.tears.create_full_tear_sheet(alphalens_analysis)

In [None]:
# import pandas as pd
# import alphalens as al

# def generate_alphalens_tearsheet(df: pd.DataFrame, \
#     label_col: str, price_col: str) -> None:
#     """
#     Generate the Alphalens full tearsheet given the input dataframe, 
#     label column, and price data column.
#     """
#     # Extract the factor and trade_prices series
#     factor = df[label_col]
#     trade_prices = df[price_col]
    
#     # Handle duplicate indices in trade_prices
#     trade_prices = trade_prices[~trade_prices.index.duplicated(keep='first')]
    
#     # Convert the trade_prices series into unstacked format
#     trade_prices_unstacked = trade_prices.unstack(level='ticker')
    
#     # Ensure the factor's DatetimeIndex level has the same timezone as trade_prices_unstacked
#     if isinstance(factor.index, pd.MultiIndex):
#         level_0 = factor.index.get_level_values(0)
#         if hasattr(level_0, 'tz') and level_0.tz is not None:
#             if level_0.tz != trade_prices_unstacked.index.tz:
#                 factor.index.set_levels(level_0.tz_convert(trade_prices_unstacked.index.tz), level=0, inplace=True)
#         factor = factor.swaplevel().sort_index()
#     else:
#         if hasattr(factor.index, 'tz') and factor.index.tz is not None:
#             if factor.index.tz != trade_prices_unstacked.index.tz:
#                 factor.index = factor.index.tz_convert(trade_prices_unstacked.index.tz)

#     # Create the factor_data dataframe with forward returns
#     alphalen_analysis = al.utils.get_clean_factor_and_forward_returns(
#         factor=factor,
#         prices=trade_prices_unstacked,
#         periods=[1, 5, 10],
#         max_loss=0.5
#     )

#     factor_returns = al.performance.factor_returns(alphalen_analysis)
#     sharpe_ratios = factor_returns.mean() / factor_returns.std()

#     print("\nSharpe Ratios:\n", sharpe_ratios)
#     alphalen_analysis = alphalen_analysis[~alphalen_analysis.index.duplicated(keep='first')]
#     alphalen_analysis = alphalen_analysis.groupby(level=[0, 1]).mean()
#     return alphalen_analysis

# target = 'predicted'
# alphalens_analysis = generate_alphalens_tearsheet(preds, label_col=target, price_col='FEATURE_close')

# import alphalens as al
# al.tears.create_full_tear_sheet(alphalens_analysis)

In [None]:
def daily_spearman(group):
    return spearmanr(group['actual'], group['predicted'])[0]

daily_correlations = preds.groupby('date').apply(daily_spearman)

In [None]:
# Calculate the mean and standard deviation of daily correlations
mean_daily_correlation = daily_correlations.mean()
std_daily_correlation = daily_correlations.std()

# Calculate Sharpe ratio for each date
papermill_era_scores = daily_sharpe_ratios = (daily_correlations - \
    mean_daily_correlation) / std_daily_correlation

papermill_era_scores_df = papermill_era_scores.to_frame()
papermill_era_scores_df.columns = papermill_era_scores_df.columns.astype(str)
sb.glue("papermill_era_scores", papermill_era_scores_df, display=True)

# papermill_era_scores_list = papermill_era_scores.tolist()
# sb.glue("papermill_era_scores", papermill_era_scores_list)


In [None]:
import matplotlib.pyplot as plt

# Create a list of colors based on the sign of the Sharpe Ratios
colors = ['blue' if value > 0 else 'red' for value in daily_sharpe_ratios]

plt.figure(figsize=(12,6))
daily_sharpe_ratios.plot(kind='bar', color=colors)
plt.title('Daily Sharpe Ratios')
plt.xlabel('Date')
plt.ylabel('Sharpe Ratio')
plt.grid(axis='y')
plt.tight_layout()
plt.axhline(y=0, color='black', linestyle='-')  # Here's where we add the horizontal line at y=0
plt.xticks(rotation=45)  # rotates the x-axis labels for better visibility
plt.show()

In [None]:
plot_dir = Path("plots")
plot_dir.mkdir(exist_ok=True)
plot_path = plot_dir / f"sharpe_ratios_{key}.png"
plt.savefig(plot_path)
plt.close()

papermill_plot_path_str = str(plot_path)  # Convert to string
sb.glue("papermill_plot_path", papermill_plot_path_str, display=True)  # Glue the string

In [None]:
lr_r, lr_p = spearmanr(preds.actual, preds.predicted)
print(f'Information Coefficient (overall): {lr_r:.3%} (p-value: {lr_p:.8%})')

# Return the Information Coefficient and its p-value
information_coefficient = lr_r
p_value = lr_p

# information_coefficient = papermill_information_coefficient, p_value = papermill_p_value
sb.glue("information_coefficient", information_coefficient, display=True)
sb.glue("p_value", p_value, display=True)

In [None]:
# papermill_plot_path_str = str(plot_path)  # Convert to string
# sb.glue("papermill_plot_path", papermill_plot_path_str)  # Glue the string

# sb.glue("information_coefficient", information_coefficient)
# sb.glue("p_value", p_value)