## Importing Packages

In [None]:
import numpy as np
import pandas as pd

## Reading and Separating Data

In [None]:
# Load the full dataset from the project's data directory.
data_ml_path = "../data/data_ml.csv"
df_ml = pd.read_csv(data_ml_path)

In [None]:
# Boundaries of the chronological split, compared against the `date` column
# inside `query` (assumes `date` holds ISO "YYYY-MM-DD" strings so that string
# comparison is chronological -- TODO confirm).
separation_date = "2013-12-31"  # first date that belongs to the test window
test_end_date = "2018-12-31"    # exclusive upper bound of the test window

# Training rows: strictly before the separation date.
df_train = df_ml.query("date < @separation_date").copy()
# Test rows: separation_date <= date < test_end_date.
df_test = df_ml.query("@separation_date <= date and date < @test_end_date").copy()

In [None]:
# Columns that are identifiers or labels and therefore not model inputs.
columns_to_drop = [
    "stock_id", "date", # non-feature identifiers
    "R1M_Usd", "R3M_Usd", "R6M_Usd", "R12M_Usd", # numerical labels
    "R1M_Usd_C", "R12M_Usd_C" # categorical labels
]

# Only the column *names* are needed, so drop the labels from the column Index
# instead of materialising a copy of the whole frame with DataFrame.drop.
# Index.drop still raises KeyError if any listed column is missing.
features = list(df_ml.columns.drop(columns_to_drop))

In [None]:
# Training inputs (feature columns) and the `R1M_Usd_C` categorical label.
X_train = df_train.loc[:, features]
y_train = df_train.loc[:, "R1M_Usd_C"]

In [None]:
# Test inputs (feature columns) and the `R1M_Usd_C` categorical label.
X_test = df_test.loc[:, features]
y_test = df_test.loc[:, "R1M_Usd_C"]

In [None]:
# Read the stored feature selection and keep the names from its `feature` column.
features_selected_path = "../data/features_selected.csv"
df_features_selected = pd.read_csv(features_selected_path)
features_selected = [
    feature_name for feature_name in df_features_selected["feature"].values
]

## Backtesting Equally Weighted Market Portfolio

In [None]:
# Only the date and the `R1M_Usd` column are needed for the market benchmark;
# the index is renumbered from 0.
df_market = (
    df_test.loc[:, ["date", "R1M_Usd"]]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Equal-weight portfolio: per-date mean of `R1M_Usd` across all test rows,
# then compound (1 + return) cumulatively into an equity curve.
df_market_monthly_returns = (
    df_market
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualized compound return: final equity ** (12 / number of periods) - 1.
(
    df_market_monthly_returns["equity_curve"].iat[-1]
    ** (12 / len(df_market_monthly_returns.index))
) - 1

0.08005470346536026

In [None]:
# Mean / standard deviation of the per-date returns, scaled by sqrt(12)
# (no risk-free rate is subtracted).
np.sqrt(12) * (
    df_market_monthly_returns["R1M_Usd"].mean()
    / df_market_monthly_returns["R1M_Usd"].std()
)

0.633808935267311

## Backtesting Hard Predictions Strategy with All Features

In [None]:
# Date and `R1M_Usd` of every test row, renumbered from 0.
df_hard_predictions = (
    df_test.loc[:, ["date", "R1M_Usd"]]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Stored inference output of the untuned random forest (all features).
inference_all_features_path = "../data/inference_random_forest_untuned_all_features.csv"
df_inferences_all_features = pd.read_csv(inference_all_features_path)

In [None]:
# The inference rows are attached to the test rows purely by position, so a
# row-count mismatch would silently pad one side with NaN. Fail loudly instead.
assert len(df_inferences_all_features) == len(df_test), (
    "inference file and test set must have the same number of rows"
)

# Build the frame from df_test rather than from df_hard_predictions itself, so
# re-running this cell cannot append the prediction columns a second time.
df_hard_predictions = pd.concat(
    [
        df_test[["date", "R1M_Usd"]].reset_index(drop=True),
        df_inferences_all_features.reset_index(drop=True),
    ],
    axis=1,
)
df_hard_predictions

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1.0,0.499717,0.500283
1,2015-04-30,-0.106,1.0,0.493392,0.506608
2,2015-05-31,-0.185,1.0,0.497036,0.502964
3,2015-06-30,-0.418,1.0,0.491069,0.508931
4,2015-07-31,0.575,1.0,0.485765,0.514235
...,...,...,...,...,...
70484,2018-07-31,0.028,0.0,0.547788,0.452212
70485,2018-08-31,-0.101,0.0,0.547826,0.452174
70486,2018-09-30,0.013,0.0,0.548498,0.451502
70487,2018-10-31,0.039,0.0,0.541317,0.458683


In [None]:
# Every date of the test window, in sorted order.
all_test_dates = np.sort(df_hard_predictions["date"].unique())

# Per-date mean `R1M_Usd` of the rows predicted as class 1, compounded into an
# equity curve.
df_hard_predictions_all_features_monthly_returns = (
    df_hard_predictions
    .query("hard == 1")
    .groupby("date")["R1M_Usd"]
    .mean()
    # A date with no class-1 prediction would otherwise disappear from the
    # result and shorten the series used for annualization. Keep it with a
    # 0 return (no position held on that date).
    .reindex(all_test_dates, fill_value=0.0)
    .rename_axis("date")
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualized compound return: final equity ** (12 / number of periods) - 1.
(
    df_hard_predictions_all_features_monthly_returns["equity_curve"].iat[-1]
    ** (12 / len(df_hard_predictions_all_features_monthly_returns.index))
) - 1

0.09579464029372597

In [None]:
# Mean / standard deviation of the per-date returns, scaled by sqrt(12)
# (no risk-free rate is subtracted).
np.sqrt(12) * (
    df_hard_predictions_all_features_monthly_returns["R1M_Usd"].mean()
    / df_hard_predictions_all_features_monthly_returns["R1M_Usd"].std()
)

0.6875318804255143

## Backtesting Hard Predictions Strategy with Selected Features

In [None]:
# Date and `R1M_Usd` of every test row, renumbered from 0.
df_hard_predictions = (
    df_test.loc[:, ["date", "R1M_Usd"]]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Stored inference output of the untuned random forest (selected features).
inference_selected_features_path = "../data/inference_random_forest_untuned_selected_features.csv"
df_inferences_selected_features = pd.read_csv(inference_selected_features_path)

In [None]:
# The inference rows are attached to the test rows purely by position, so a
# row-count mismatch would silently pad one side with NaN. Fail loudly instead.
assert len(df_inferences_selected_features) == len(df_test), (
    "inference file and test set must have the same number of rows"
)

# Build the frame from df_test rather than from df_hard_predictions itself, so
# re-running this cell cannot append the prediction columns a second time.
df_hard_predictions = pd.concat(
    [
        df_test[["date", "R1M_Usd"]].reset_index(drop=True),
        df_inferences_selected_features.reset_index(drop=True),
    ],
    axis=1,
)
df_hard_predictions

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1.0,0.489014,0.510986
1,2015-04-30,-0.106,1.0,0.486188,0.513812
2,2015-05-31,-0.185,1.0,0.486254,0.513746
3,2015-06-30,-0.418,1.0,0.489839,0.510161
4,2015-07-31,0.575,1.0,0.489839,0.510161
...,...,...,...,...,...
70484,2018-07-31,0.028,0.0,0.548990,0.451010
70485,2018-08-31,-0.101,0.0,0.542726,0.457274
70486,2018-09-30,0.013,0.0,0.539801,0.460199
70487,2018-10-31,0.039,0.0,0.539634,0.460366


In [None]:
# Every date of the test window, in sorted order.
all_test_dates = np.sort(df_hard_predictions["date"].unique())

# Per-date mean `R1M_Usd` of the rows predicted as class 1, compounded into an
# equity curve.
df_hard_predictions_selected_features_monthly_returns = (
    df_hard_predictions
    .query("hard == 1")
    .groupby("date")["R1M_Usd"]
    .mean()
    # A date with no class-1 prediction would otherwise disappear from the
    # result and shorten the series used for annualization. Keep it with a
    # 0 return (no position held on that date).
    .reindex(all_test_dates, fill_value=0.0)
    .rename_axis("date")
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualized compound return: final equity ** (12 / number of periods) - 1.
(
    df_hard_predictions_selected_features_monthly_returns["equity_curve"].iat[-1]
    ** (12 / len(df_hard_predictions_selected_features_monthly_returns.index))
) - 1

0.09176931678667954

In [None]:
# Mean / standard deviation of the per-date returns, scaled by sqrt(12)
# (no risk-free rate is subtracted).
np.sqrt(12) * (
    df_hard_predictions_selected_features_monthly_returns["R1M_Usd"].mean()
    / df_hard_predictions_selected_features_monthly_returns["R1M_Usd"].std()
)

0.6469248883152346

## Backtesting Top-200 Strategy with All Features

In [None]:
# Date and `R1M_Usd` of every test row, renumbered from 0.
df_screen = (
    df_test.loc[:, ["date", "R1M_Usd"]]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# The inference rows are attached to the test rows purely by position, so a
# row-count mismatch would silently pad one side with NaN. Fail loudly instead.
assert len(df_inferences_all_features) == len(df_test), (
    "inference file and test set must have the same number of rows"
)

# Build the frame from df_test rather than from df_screen itself, so re-running
# this cell cannot append the prediction columns a second time.
df_screen = pd.concat(
    [
        df_test[["date", "R1M_Usd"]].reset_index(drop=True),
        df_inferences_all_features.reset_index(drop=True),
    ],
    axis=1,
)
df_screen

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1.0,0.499717,0.500283
1,2015-04-30,-0.106,1.0,0.493392,0.506608
2,2015-05-31,-0.185,1.0,0.497036,0.502964
3,2015-06-30,-0.418,1.0,0.491069,0.508931
4,2015-07-31,0.575,1.0,0.485765,0.514235
...,...,...,...,...,...
70484,2018-07-31,0.028,0.0,0.547788,0.452212
70485,2018-08-31,-0.101,0.0,0.547826,0.452174
70486,2018-09-30,0.013,0.0,0.548498,0.451502
70487,2018-10-31,0.039,0.0,0.541317,0.458683


In [None]:
# Sorted list of the distinct dates present in the screen.
rebalance_dates = [
    rebalance_date
    for rebalance_date in np.sort(pd.unique(df_screen["date"]))
]

In [None]:
# Number of rows (stocks) kept per date.
TOP_N = 200

# For each date keep the TOP_N rows with the highest `probability_1`.
# Iterating the groupby visits the dates in sorted order and hands over each
# date's rows once, instead of re-filtering the whole frame with `query` for
# every date. The per-date sort is the same as before, so ties resolve
# identically.
lst_trades = [
    rows_on_date.sort_values(by="probability_1", ascending=False).head(TOP_N)
    for _, rows_on_date in df_screen.groupby("date")
]
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
3895,2013-12-31,-0.143,1.0,0.470221,0.529779
3652,2013-12-31,-0.068,1.0,0.471442,0.528558
8438,2013-12-31,-0.022,1.0,0.472757,0.527243
14516,2013-12-31,-0.060,1.0,0.474221,0.525779
3367,2013-12-31,-0.077,1.0,0.475476,0.524524
...,...,...,...,...,...
53944,2018-11-30,-0.087,1.0,0.490301,0.509699
17226,2018-11-30,-0.072,1.0,0.490318,0.509682
19586,2018-11-30,0.015,1.0,0.490341,0.509659
34035,2018-11-30,-0.093,1.0,0.490345,0.509655


In [None]:
# Per-date mean `R1M_Usd` of the selected rows, compounded into an equity curve.
df_top_200_all_features_monthly_returns = (
    df_trades
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualized compound return: final equity ** (12 / number of periods) - 1.
(
    df_top_200_all_features_monthly_returns["equity_curve"].iat[-1]
    ** (12 / len(df_top_200_all_features_monthly_returns.index))
) - 1

0.18479709551964008

In [None]:
# Mean / standard deviation of the per-date returns, scaled by sqrt(12)
# (no risk-free rate is subtracted).
np.sqrt(12) * (
    df_top_200_all_features_monthly_returns["R1M_Usd"].mean()
    / df_top_200_all_features_monthly_returns["R1M_Usd"].std()
)

0.9738602282823526

## Backtesting Top-200 Strategy with Selected Features

In [None]:
# Date and `R1M_Usd` of every test row, renumbered from 0.
df_screen = (
    df_test.loc[:, ["date", "R1M_Usd"]]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# The inference rows are attached to the test rows purely by position, so a
# row-count mismatch would silently pad one side with NaN. Fail loudly instead.
assert len(df_inferences_selected_features) == len(df_test), (
    "inference file and test set must have the same number of rows"
)

# Build the frame from df_test rather than from df_screen itself, so re-running
# this cell cannot append the prediction columns a second time.
df_screen = pd.concat(
    [
        df_test[["date", "R1M_Usd"]].reset_index(drop=True),
        df_inferences_selected_features.reset_index(drop=True),
    ],
    axis=1,
)
df_screen

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1.0,0.489014,0.510986
1,2015-04-30,-0.106,1.0,0.486188,0.513812
2,2015-05-31,-0.185,1.0,0.486254,0.513746
3,2015-06-30,-0.418,1.0,0.489839,0.510161
4,2015-07-31,0.575,1.0,0.489839,0.510161
...,...,...,...,...,...
70484,2018-07-31,0.028,0.0,0.548990,0.451010
70485,2018-08-31,-0.101,0.0,0.542726,0.457274
70486,2018-09-30,0.013,0.0,0.539801,0.460199
70487,2018-10-31,0.039,0.0,0.539634,0.460366


In [None]:
# Sorted list of the distinct dates present in the screen.
rebalance_dates = [
    rebalance_date
    for rebalance_date in np.sort(pd.unique(df_screen["date"]))
]

In [None]:
# Number of rows (stocks) kept per date.
TOP_N = 200

# For each date keep the TOP_N rows with the highest `probability_1`.
# Iterating the groupby visits the dates in sorted order and hands over each
# date's rows once, instead of re-filtering the whole frame with `query` for
# every date. The per-date sort is the same as before, so ties resolve
# identically.
lst_trades = [
    rows_on_date.sort_values(by="probability_1", ascending=False).head(TOP_N)
    for _, rows_on_date in df_screen.groupby("date")
]
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
14516,2013-12-31,-0.060,1.0,0.471717,0.528283
14357,2013-12-31,-0.076,1.0,0.474764,0.525236
11241,2013-12-31,-0.057,1.0,0.476402,0.523598
8438,2013-12-31,-0.022,1.0,0.476753,0.523247
14216,2013-12-31,-0.039,1.0,0.479568,0.520432
...,...,...,...,...,...
31932,2018-11-30,-0.169,1.0,0.490727,0.509273
19235,2018-11-30,-0.052,1.0,0.490743,0.509257
29458,2018-11-30,-0.202,1.0,0.490786,0.509214
32761,2018-11-30,-0.225,1.0,0.490801,0.509199


In [None]:
# Per-date mean `R1M_Usd` of the selected rows, compounded into an equity curve.
df_top_200_selected_features_monthly_returns = (
    df_trades
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)

In [None]:
# Annualized compound return: final equity ** (12 / number of periods) - 1.
(
    df_top_200_selected_features_monthly_returns["equity_curve"].iat[-1]
    ** (12 / len(df_top_200_selected_features_monthly_returns.index))
) - 1

0.15409358555484154

In [None]:
# Mean / standard deviation of the per-date returns, scaled by sqrt(12)
# (no risk-free rate is subtracted).
np.sqrt(12) * (
    df_top_200_selected_features_monthly_returns["R1M_Usd"].mean()
    / df_top_200_selected_features_monthly_returns["R1M_Usd"].std()
)

0.8374881081242436