## Importing Packages

In [None]:
import numpy as np
import pandas as pd

## Reading and Separating Data

In [None]:
# Load the full dataset. Later cells rely on it having a "date" column plus the
# identifier and label columns listed in `columns_to_drop`.
df_ml = pd.read_csv("../data/data_ml.csv")

In [None]:
# Chronological split: rows dated before the cut-off go to the training set,
# rows from the cut-off up to (but excluding) 2018-12-31 go to the test set.
separation_date = "2013-12-31"
row_dates = df_ml["date"]
df_train = df_ml.loc[row_dates < separation_date].copy()
df_test = df_ml.loc[(row_dates >= separation_date) & (row_dates < "2018-12-31")].copy()

In [None]:
# Every column that is neither an identifier nor a label is treated as a feature.
columns_to_drop = [
    # non-feature identifiers
    "stock_id",
    "date",
    # numerical labels
    "R1M_Usd",
    "R3M_Usd",
    "R6M_Usd",
    "R12M_Usd",
    # categorical labels
    "R1M_Usd_C",
    "R12M_Usd_C",
]

# Drop on the column index only; like DataFrame.drop, this raises KeyError if
# any listed column is missing.
features = df_ml.columns.drop(columns_to_drop).tolist()

In [None]:
# Training inputs (all feature columns) and the R1M_Usd_C label.
X_train = df_train.loc[:, features]
y_train = df_train.loc[:, "R1M_Usd_C"]

In [None]:
# Test inputs (all feature columns) and the R1M_Usd_C label.
X_test = df_test.loc[:, features]
y_test = df_test.loc[:, "R1M_Usd_C"]

In [None]:
# Read the stored feature selection and keep its "feature" column as a plain list.
df_features_selected = pd.read_csv("../data/features_selected.csv")
features_selected = df_features_selected["feature"].tolist()

## Backtesting Equally Weighted Market Portfolio

In [None]:
# Date and return columns of the test period, re-indexed 0..n-1 (reset_index returns a new frame).
df_market = df_test.loc[:, ["date", "R1M_Usd"]].reset_index(drop=True)

In [None]:
# Equal-weighted benchmark: average R1M_Usd over all rows of each date, then
# compound the per-date growth factors into an equity curve.
df_market_monthly_returns = (
    df_market
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualised return: final equity raised to 12 / (number of rows), minus 1.
# Assumes one row per month in the table.
final_equity = df_market_monthly_returns["equity_curve"].iloc[-1]
n_periods = len(df_market_monthly_returns)
final_equity ** (12 / n_periods) - 1

0.08005470346536026

In [None]:
# Sharpe-style ratio: mean over standard deviation of the per-date returns,
# scaled by sqrt(12); no risk-free rate is subtracted.
period_returns = df_market_monthly_returns["R1M_Usd"]
mean_to_std = period_returns.mean() / period_returns.std()
mean_to_std * np.sqrt(12)

0.633808935267311

## Backtesting Hard Predictions Strategy with All Features

In [None]:
# Date and return columns of the test period, re-indexed 0..n-1 (reset_index returns a new frame).
df_hard_predictions = df_test.loc[:, ["date", "R1M_Usd"]].reset_index(drop=True)

In [None]:
# Stored inference output for the all-features run; later cells read its
# `hard` and `probability_1` columns.
# NOTE(review): presumably one row per test-set row, in the same order as df_test — confirm.
df_inferences_all_features = pd.read_csv("../data/inference_xgboost_untuned_all_features.csv")

In [None]:
# Attach the all-features inferences to the test-period dates and returns.
#
# The left-hand frame is rebuilt from df_test rather than from
# df_hard_predictions itself, so re-running this cell cannot append the
# inference columns a second time and create duplicate column names.
#
# pd.concat(axis=1) aligns rows on the index and pads mismatches with NaN, so
# both frames must share the same 0..n-1 index. A differing row count is
# rejected up front instead of silently producing NaN rows.
# NOTE(review): assumes the inference file is in the same row order as df_test — confirm.
assert len(df_test) == len(df_inferences_all_features), (
    "inference file and test set have different numbers of rows"
)
df_hard_predictions = pd.concat(
    [
        df_test[["date", "R1M_Usd"]].reset_index(drop=True),
        df_inferences_all_features,
    ],
    axis=1,
)
df_hard_predictions

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1,0.443008,0.556992
1,2015-04-30,-0.106,1,0.407426,0.592574
2,2015-05-31,-0.185,1,0.417241,0.582759
3,2015-06-30,-0.418,1,0.392114,0.607886
4,2015-07-31,0.575,1,0.392114,0.607886
...,...,...,...,...,...
70484,2018-07-31,0.028,1,0.479271,0.520729
70485,2018-08-31,-0.101,0,0.513952,0.486048
70486,2018-09-30,0.013,1,0.483383,0.516617
70487,2018-10-31,0.039,1,0.477990,0.522010


In [None]:
# Hard-prediction strategy: on each date, average R1M_Usd over the rows with
# hard == 1, then compound the per-date growth factors into an equity curve.
df_hard_predictions_all_features_monthly_returns = (
    df_hard_predictions
    .loc[df_hard_predictions["hard"] == 1]
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualised return: final equity raised to 12 / (number of rows), minus 1.
# Assumes one row per month in the table.
final_equity = df_hard_predictions_all_features_monthly_returns["equity_curve"].iloc[-1]
n_periods = len(df_hard_predictions_all_features_monthly_returns)
final_equity ** (12 / n_periods) - 1

0.09978557870948612

In [None]:
# Sharpe-style ratio: mean over standard deviation of the per-date returns,
# scaled by sqrt(12); no risk-free rate is subtracted.
period_returns = df_hard_predictions_all_features_monthly_returns["R1M_Usd"]
mean_to_std = period_returns.mean() / period_returns.std()
mean_to_std * np.sqrt(12)

0.6999616707661639

## Backtesting Hard Predictions Strategy with Selected Features

In [None]:
# Date and return columns of the test period, re-indexed 0..n-1 (reset_index returns a new frame).
df_hard_predictions = df_test.loc[:, ["date", "R1M_Usd"]].reset_index(drop=True)

In [None]:
# Stored inference output for the selected-features run; later cells read its
# `hard` and `probability_1` columns.
# NOTE(review): presumably one row per test-set row, in the same order as df_test — confirm.
df_inferences_selected_features = pd.read_csv("../data/inference_xgboost_untuned_selected_features.csv")

In [None]:
# Attach the selected-features inferences to the test-period dates and returns.
#
# The left-hand frame is rebuilt from df_test rather than from
# df_hard_predictions itself, so re-running this cell cannot append the
# inference columns a second time and create duplicate column names.
#
# pd.concat(axis=1) aligns rows on the index and pads mismatches with NaN, so
# both frames must share the same 0..n-1 index. A differing row count is
# rejected up front instead of silently producing NaN rows.
# NOTE(review): assumes the inference file is in the same row order as df_test — confirm.
assert len(df_test) == len(df_inferences_selected_features), (
    "inference file and test set have different numbers of rows"
)
df_hard_predictions = pd.concat(
    [
        df_test[["date", "R1M_Usd"]].reset_index(drop=True),
        df_inferences_selected_features,
    ],
    axis=1,
)
df_hard_predictions

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,0,0.536871,0.463129
1,2015-04-30,-0.106,0,0.536871,0.463129
2,2015-05-31,-0.185,0,0.536871,0.463129
3,2015-06-30,-0.418,0,0.546819,0.453181
4,2015-07-31,0.575,0,0.546819,0.453181
...,...,...,...,...,...
70484,2018-07-31,0.028,1,0.430574,0.569426
70485,2018-08-31,-0.101,0,0.525031,0.474969
70486,2018-09-30,0.013,1,0.473333,0.526667
70487,2018-10-31,0.039,0,0.530257,0.469743


In [None]:
# Hard-prediction strategy: on each date, average R1M_Usd over the rows with
# hard == 1, then compound the per-date growth factors into an equity curve.
df_hard_predictions_selected_features_monthly_returns = (
    df_hard_predictions
    .loc[df_hard_predictions["hard"] == 1]
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualised return: final equity raised to 12 / (number of rows), minus 1.
# Assumes one row per month in the table.
final_equity = df_hard_predictions_selected_features_monthly_returns["equity_curve"].iloc[-1]
n_periods = len(df_hard_predictions_selected_features_monthly_returns)
final_equity ** (12 / n_periods) - 1

0.0898742403072399

In [None]:
# Sharpe-style ratio: mean over standard deviation of the per-date returns,
# scaled by sqrt(12); no risk-free rate is subtracted.
period_returns = df_hard_predictions_selected_features_monthly_returns["R1M_Usd"]
mean_to_std = period_returns.mean() / period_returns.std()
mean_to_std * np.sqrt(12)

0.6398779755001034

## Backtesting Top-200 Strategy with All Features

In [None]:
# Date and return columns of the test period, re-indexed 0..n-1 (reset_index returns a new frame).
df_screen = df_test.loc[:, ["date", "R1M_Usd"]].reset_index(drop=True)

In [None]:
# Attach the all-features inferences to the test-period dates and returns.
#
# The left-hand frame is rebuilt from df_test rather than from df_screen
# itself, so re-running this cell cannot append the inference columns a second
# time and create duplicate column names.
#
# pd.concat(axis=1) aligns rows on the index and pads mismatches with NaN, so
# both frames must share the same 0..n-1 index. A differing row count is
# rejected up front instead of silently producing NaN rows.
# NOTE(review): assumes the inference file is in the same row order as df_test — confirm.
assert len(df_test) == len(df_inferences_all_features), (
    "inference file and test set have different numbers of rows"
)
df_screen = pd.concat(
    [
        df_test[["date", "R1M_Usd"]].reset_index(drop=True),
        df_inferences_all_features,
    ],
    axis=1,
)
df_screen

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1,0.443008,0.556992
1,2015-04-30,-0.106,1,0.407426,0.592574
2,2015-05-31,-0.185,1,0.417241,0.582759
3,2015-06-30,-0.418,1,0.392114,0.607886
4,2015-07-31,0.575,1,0.392114,0.607886
...,...,...,...,...,...
70484,2018-07-31,0.028,1,0.479271,0.520729
70485,2018-08-31,-0.101,0,0.513952,0.486048
70486,2018-09-30,0.013,1,0.483383,0.516617
70487,2018-10-31,0.039,1,0.477990,0.522010


In [None]:
# Distinct dates present in the screen, in ascending order.
rebalance_dates = sorted(df_screen["date"].unique())

In [None]:
# For every rebalance date keep the 200 rows with the highest probability_1,
# then stack the per-date selections into a single trade list.
lst_trades = [
    df_screen.loc[df_screen["date"] == rebalance_date]
    .sort_values(by="probability_1", ascending=False)
    .iloc[:200]
    .copy()
    for rebalance_date in rebalance_dates
]
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
3895,2013-12-31,-0.143,1,0.263827,0.736173
50794,2013-12-31,-0.084,1,0.311379,0.688621
3253,2013-12-31,-0.020,1,0.317741,0.682259
5936,2013-12-31,-0.078,1,0.345943,0.654057
36072,2013-12-31,0.081,1,0.349769,0.650231
...,...,...,...,...,...
58237,2018-11-30,-0.238,1,0.459861,0.540139
27113,2018-11-30,-0.115,1,0.459999,0.540001
63723,2018-11-30,0.008,1,0.460141,0.539859
62200,2018-11-30,-0.062,1,0.460433,0.539566


In [None]:
# Top-200 strategy: average R1M_Usd over the selected rows of each date, then
# compound the per-date growth factors into an equity curve.
df_top_200_all_features_monthly_returns = (
    df_trades
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualised return: final equity raised to 12 / (number of rows), minus 1.
# Assumes one row per month in the table.
final_equity = df_top_200_all_features_monthly_returns["equity_curve"].iloc[-1]
n_periods = len(df_top_200_all_features_monthly_returns)
final_equity ** (12 / n_periods) - 1

0.1261464514697337

In [None]:
# Sharpe-style ratio: mean over standard deviation of the per-date returns,
# scaled by sqrt(12); no risk-free rate is subtracted.
period_returns = df_top_200_all_features_monthly_returns["R1M_Usd"]
mean_to_std = period_returns.mean() / period_returns.std()
mean_to_std * np.sqrt(12)

0.7222519665321232

## Backtesting Top-200 Strategy with Selected Features

In [None]:
# Date and return columns of the test period, re-indexed 0..n-1 (reset_index returns a new frame).
df_screen = df_test.loc[:, ["date", "R1M_Usd"]].reset_index(drop=True)

In [None]:
# Attach the selected-features inferences to the test-period dates and returns.
#
# The left-hand frame is rebuilt from df_test rather than from df_screen
# itself, so re-running this cell cannot append the inference columns a second
# time and create duplicate column names.
#
# pd.concat(axis=1) aligns rows on the index and pads mismatches with NaN, so
# both frames must share the same 0..n-1 index. A differing row count is
# rejected up front instead of silently producing NaN rows.
# NOTE(review): assumes the inference file is in the same row order as df_test — confirm.
assert len(df_test) == len(df_inferences_selected_features), (
    "inference file and test set have different numbers of rows"
)
df_screen = pd.concat(
    [
        df_test[["date", "R1M_Usd"]].reset_index(drop=True),
        df_inferences_selected_features,
    ],
    axis=1,
)
df_screen

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,0,0.536871,0.463129
1,2015-04-30,-0.106,0,0.536871,0.463129
2,2015-05-31,-0.185,0,0.536871,0.463129
3,2015-06-30,-0.418,0,0.546819,0.453181
4,2015-07-31,0.575,0,0.546819,0.453181
...,...,...,...,...,...
70484,2018-07-31,0.028,1,0.430574,0.569426
70485,2018-08-31,-0.101,0,0.525031,0.474969
70486,2018-09-30,0.013,1,0.473333,0.526667
70487,2018-10-31,0.039,0,0.530257,0.469743


In [None]:
# Distinct dates present in the screen, in ascending order.
rebalance_dates = sorted(df_screen["date"].unique())

In [None]:
# For every rebalance date keep the 200 rows with the highest probability_1,
# then stack the per-date selections into a single trade list.
lst_trades = [
    df_screen.loc[df_screen["date"] == rebalance_date]
    .sort_values(by="probability_1", ascending=False)
    .iloc[:200]
    .copy()
    for rebalance_date in rebalance_dates
]
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
5310,2013-12-31,0.082,1,0.326796,0.673204
53759,2013-12-31,-0.083,1,0.342406,0.657594
3895,2013-12-31,-0.143,1,0.347458,0.652542
36072,2013-12-31,0.081,1,0.352051,0.647949
53393,2013-12-31,-0.094,1,0.354849,0.645151
...,...,...,...,...,...
37478,2018-11-30,-0.203,1,0.462411,0.537589
49588,2018-11-30,-0.082,1,0.462438,0.537562
16075,2018-11-30,-0.144,1,0.462517,0.537483
37921,2018-11-30,-0.021,1,0.462539,0.537461


In [None]:
# Top-200 strategy: average R1M_Usd over the selected rows of each date, then
# compound the per-date growth factors into an equity curve.
df_top_200_selected_features_monthly_returns = (
    df_trades
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualised return: final equity raised to 12 / (number of rows), minus 1.
# Assumes one row per month in the table.
final_equity = df_top_200_selected_features_monthly_returns["equity_curve"].iloc[-1]
n_periods = len(df_top_200_selected_features_monthly_returns)
final_equity ** (12 / n_periods) - 1

0.111150118774473

In [None]:
# Sharpe-style ratio: mean over standard deviation of the per-date returns,
# scaled by sqrt(12); no risk-free rate is subtracted.
period_returns = df_top_200_selected_features_monthly_returns["R1M_Usd"]
mean_to_std = period_returns.mean() / period_returns.std()
mean_to_std * np.sqrt(12)

0.6506864531883617