## Importing Packages

In [None]:
import numpy as np
import pandas as pd

## Reading and Separating Data

In [None]:
# Load the dataset that every later cell in this notebook derives from.
# The path is relative, so it resolves against the notebook's working directory.
data_ml_path = "../data/data_ml.csv"
df_ml = pd.read_csv(data_ml_path)

In [None]:
# Chronological split on the raw `date` column:
#   train -> rows strictly before `separation_date`
#   test  -> rows from `separation_date` (inclusive) up to 2018-12-31 (exclusive)
# NOTE(review): the comparison is done against string bounds; presumably `date`
# holds ISO-formatted strings so that ordering is chronological — confirm.
separation_date = "2013-12-31"
is_train_row = df_ml["date"] < separation_date
is_test_row = (df_ml["date"] >= separation_date) & (df_ml["date"] < "2018-12-31")
df_train = df_ml.loc[is_train_row].copy()
df_test = df_ml.loc[is_test_row].copy()

In [None]:
# Every column that is neither an identifier nor a label is kept as a feature.
identifier_columns = ["stock_id", "date"]  # non-feature identifiers
numerical_label_columns = ["R1M_Usd", "R3M_Usd", "R6M_Usd", "R12M_Usd"]  # numerical labels
categorical_label_columns = ["R1M_Usd_C", "R12M_Usd_C"]  # categorical labels
columns_to_drop = identifier_columns + numerical_label_columns + categorical_label_columns

# Index.drop raises KeyError when a listed column is absent, so a schema change
# in the CSV is surfaced here rather than silently ignored.
features = df_ml.columns.drop(columns_to_drop).tolist()

In [None]:
# Training inputs (all feature columns) and the categorical R1M_Usd_C label.
y_train = df_train["R1M_Usd_C"]
X_train = df_train.loc[:, features]

In [None]:
# Test-period inputs (same feature columns) and the categorical R1M_Usd_C label.
y_test = df_test["R1M_Usd_C"]
X_test = df_test.loc[:, features]

In [None]:
# Read the stored feature subset; the names live in the file's "feature" column.
df_features_selected = pd.read_csv("../data/features_selected.csv")
features_selected = [name for name in df_features_selected["feature"].values]

## Backtesting Equally Weighted Market Portfolio

In [None]:
# Keep only what the benchmark needs (date and R1M_Usd) and renumber rows from 0.
market_columns = ["date", "R1M_Usd"]
df_market = df_test.loc[:, market_columns].reset_index(drop=True)

In [None]:
# Equal-weighted benchmark: average R1M_Usd over all rows sharing a date, then
# compound the per-date returns into an equity curve that starts from 1.
df_market_monthly_returns = (
    df_market.groupby(["date"])[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(growth_factor=lambda per_date: 1 + per_date["R1M_Usd"])
    .assign(equity_curve=lambda per_date: per_date["growth_factor"].cumprod())
)

In [None]:
# Annualised compound return: final equity raised to (12 / number of per-date
# observations), minus 1.
market_final_equity = df_market_monthly_returns['equity_curve'].iloc[-1]
market_n_periods = len(df_market_monthly_returns)
market_final_equity ** (12 / market_n_periods) - 1

0.08005470346536026

In [None]:
# Mean over standard deviation of the per-date returns, scaled by sqrt(12).
# No risk-free rate is subtracted.
market_period_returns = df_market_monthly_returns["R1M_Usd"]
(market_period_returns.mean() / market_period_returns.std()) * np.sqrt(12)

0.633808935267311

## Backtesting Hard Predictions Strategy with All Features

In [None]:
# Base frame for the hard-prediction backtest: date and R1M_Usd of every test
# row, re-indexed 0..n-1.
base_columns = ["date", "R1M_Usd"]
df_hard_predictions = df_test.loc[:, base_columns].reset_index(drop=True)

In [None]:
# Stored model outputs for the all-features logistic regression
# (per the file name; produced outside this notebook).
inference_all_features_path = "../data/inference_logistic_regression_all_features.csv"
df_inferences_all_features = pd.read_csv(inference_all_features_path)

In [None]:
# Attach the all-features model outputs to the test-period returns, column-wise.
# NOTE(review): rows are matched purely by position (both frames use a 0..n-1
# index); presumably the inference file was written in df_test row order — confirm.
#
# concat(axis=1) outer-joins on the index, so a row-count mismatch would pad
# with NaN instead of failing. Check the lengths explicitly first.
if len(df_hard_predictions) != len(df_inferences_all_features):
    raise ValueError(
        f"Row count mismatch: {len(df_hard_predictions)} test rows vs "
        f"{len(df_inferences_all_features)} all-features inference rows; "
        "cannot align by position."
    )
df_hard_predictions = pd.concat(
    [
        # Re-select the two base columns so that re-running this cell does not
        # append the prediction columns a second time.
        df_hard_predictions[["date", "R1M_Usd"]],
        df_inferences_all_features.reset_index(drop=True),
    ],
    axis=1,
)
df_hard_predictions

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1.0,0.470104,0.529896
1,2015-04-30,-0.106,1.0,0.470435,0.529565
2,2015-05-31,-0.185,1.0,0.470004,0.529996
3,2015-06-30,-0.418,1.0,0.459854,0.540146
4,2015-07-31,0.575,1.0,0.457712,0.542288
...,...,...,...,...,...
70484,2018-07-31,0.028,0.0,0.541795,0.458205
70485,2018-08-31,-0.101,0.0,0.540781,0.459219
70486,2018-09-30,0.013,0.0,0.508696,0.491304
70487,2018-10-31,0.039,0.0,0.506801,0.493199


In [None]:
# Equal-weighted return, per date, of the rows the model labelled hard == 1.
# A date with no hard == 1 row would otherwise disappear from the series, which
# shortens the horizon used by the annualisation (12 / len) and by the
# mean/std ratio below. Such dates are kept with a 0 return.
# NOTE(review): 0 assumes the portfolio sits in cash when nothing is selected.
all_prediction_dates = np.sort(df_hard_predictions["date"].dropna().unique())
df_hard_predictions_all_features_monthly_returns = (
    df_hard_predictions.query("hard==1")
    .groupby(["date"])[["R1M_Usd"]]
    .mean()
    .reindex(all_prediction_dates, fill_value=0.0)
    .rename_axis("date")
    .reset_index()
)
# Compound the per-date returns into an equity curve that starts from 1.
df_hard_predictions_all_features_monthly_returns['growth_factor'] = 1 + df_hard_predictions_all_features_monthly_returns['R1M_Usd']
df_hard_predictions_all_features_monthly_returns['equity_curve'] = df_hard_predictions_all_features_monthly_returns['growth_factor'].cumprod()

In [None]:
# Annualised compound return: final equity raised to (12 / number of per-date
# observations), minus 1.
hard_all_final_equity = df_hard_predictions_all_features_monthly_returns['equity_curve'].iloc[-1]
hard_all_n_periods = len(df_hard_predictions_all_features_monthly_returns)
hard_all_final_equity ** (12 / hard_all_n_periods) - 1

0.09659453049076672

In [None]:
# Mean over standard deviation of the per-date returns, scaled by sqrt(12).
# No risk-free rate is subtracted.
hard_all_period_returns = df_hard_predictions_all_features_monthly_returns["R1M_Usd"]
(hard_all_period_returns.mean() / hard_all_period_returns.std()) * np.sqrt(12)

0.671668754158758

## Backtesting Hard Predictions Strategy with Selected Features

In [None]:
# Start again from the raw test rows (date and R1M_Usd only) so the
# selected-features predictions can be attached to a clean frame.
df_hard_predictions = (
    df_test.loc[:, ["date", "R1M_Usd"]]
    .reset_index(drop=True)
)

In [None]:
# Stored model outputs for the selected-features logistic regression
# (per the file name; produced outside this notebook).
inference_selected_features_path = "../data/inference_logistic_regression_selected_features.csv"
df_inferences_selected_features = pd.read_csv(inference_selected_features_path)

In [None]:
# Attach the selected-features model outputs to the test-period returns, column-wise.
# NOTE(review): rows are matched purely by position (both frames use a 0..n-1
# index); presumably the inference file was written in df_test row order — confirm.
#
# concat(axis=1) outer-joins on the index, so a row-count mismatch would pad
# with NaN instead of failing. Check the lengths explicitly first.
if len(df_hard_predictions) != len(df_inferences_selected_features):
    raise ValueError(
        f"Row count mismatch: {len(df_hard_predictions)} test rows vs "
        f"{len(df_inferences_selected_features)} selected-features inference rows; "
        "cannot align by position."
    )
df_hard_predictions = pd.concat(
    [
        # Re-select the two base columns so that re-running this cell does not
        # append the prediction columns a second time.
        df_hard_predictions[["date", "R1M_Usd"]],
        df_inferences_selected_features.reset_index(drop=True),
    ],
    axis=1,
)
df_hard_predictions

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1.0,0.469486,0.530514
1,2015-04-30,-0.106,1.0,0.469902,0.530098
2,2015-05-31,-0.185,1.0,0.468476,0.531524
3,2015-06-30,-0.418,1.0,0.475378,0.524622
4,2015-07-31,0.575,1.0,0.471177,0.528823
...,...,...,...,...,...
70484,2018-07-31,0.028,0.0,0.527576,0.472424
70485,2018-08-31,-0.101,0.0,0.531423,0.468577
70486,2018-09-30,0.013,0.0,0.516458,0.483542
70487,2018-10-31,0.039,0.0,0.509134,0.490866


In [None]:
# Equal-weighted return, per date, of the rows the model labelled hard == 1.
# A date with no hard == 1 row would otherwise disappear from the series, which
# shortens the horizon used by the annualisation (12 / len) and by the
# mean/std ratio below. Such dates are kept with a 0 return.
# NOTE(review): 0 assumes the portfolio sits in cash when nothing is selected.
all_prediction_dates = np.sort(df_hard_predictions["date"].dropna().unique())
df_hard_predictions_selected_features_monthly_returns = (
    df_hard_predictions.query("hard==1")
    .groupby(["date"])[["R1M_Usd"]]
    .mean()
    .reindex(all_prediction_dates, fill_value=0.0)
    .rename_axis("date")
    .reset_index()
)
# Compound the per-date returns into an equity curve that starts from 1.
df_hard_predictions_selected_features_monthly_returns['growth_factor'] = 1 + df_hard_predictions_selected_features_monthly_returns['R1M_Usd']
df_hard_predictions_selected_features_monthly_returns['equity_curve'] = df_hard_predictions_selected_features_monthly_returns['growth_factor'].cumprod()

In [None]:
# Annualised compound return: final equity raised to (12 / number of per-date
# observations), minus 1.
hard_selected_final_equity = df_hard_predictions_selected_features_monthly_returns['equity_curve'].iloc[-1]
hard_selected_n_periods = len(df_hard_predictions_selected_features_monthly_returns)
hard_selected_final_equity ** (12 / hard_selected_n_periods) - 1

0.10265756213460264

In [None]:
# Mean over standard deviation of the per-date returns, scaled by sqrt(12).
# No risk-free rate is subtracted.
hard_selected_period_returns = df_hard_predictions_selected_features_monthly_returns["R1M_Usd"]
(hard_selected_period_returns.mean() / hard_selected_period_returns.std()) * np.sqrt(12)

0.7129060010350087

## Backtesting Top-200 Strategy with All Features

In [None]:
# Base frame for the top-200 screen: date and R1M_Usd of every test row,
# re-indexed 0..n-1.
screen_columns = ["date", "R1M_Usd"]
df_screen = df_test.loc[:, screen_columns].reset_index(drop=True)

In [None]:
# Attach the all-features model outputs to the screening frame, column-wise.
# NOTE(review): rows are matched purely by position (both frames use a 0..n-1
# index); presumably the inference file was written in df_test row order — confirm.
#
# concat(axis=1) outer-joins on the index, so a row-count mismatch would pad
# with NaN instead of failing. Check the lengths explicitly first.
if len(df_screen) != len(df_inferences_all_features):
    raise ValueError(
        f"Row count mismatch: {len(df_screen)} test rows vs "
        f"{len(df_inferences_all_features)} all-features inference rows; "
        "cannot align by position."
    )
df_screen = pd.concat(
    [
        # Re-select the two base columns so that re-running this cell does not
        # append the prediction columns a second time.
        df_screen[["date", "R1M_Usd"]],
        df_inferences_all_features.reset_index(drop=True),
    ],
    axis=1,
)
df_screen

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1.0,0.470104,0.529896
1,2015-04-30,-0.106,1.0,0.470435,0.529565
2,2015-05-31,-0.185,1.0,0.470004,0.529996
3,2015-06-30,-0.418,1.0,0.459854,0.540146
4,2015-07-31,0.575,1.0,0.457712,0.542288
...,...,...,...,...,...
70484,2018-07-31,0.028,0.0,0.541795,0.458205
70485,2018-08-31,-0.101,0.0,0.540781,0.459219
70486,2018-09-30,0.013,0.0,0.508696,0.491304
70487,2018-10-31,0.039,0.0,0.506801,0.493199


In [None]:
# One rebalance per distinct date in the screening frame, in ascending order.
distinct_screen_dates = df_screen["date"].unique()
rebalance_dates = list(np.sort(distinct_screen_dates))

In [None]:
# For every rebalance date keep the 200 rows with the highest probability_1
# (fewer if that date has fewer rows), then stack the per-date picks in date order.
lst_trades = [
    df_screen.loc[df_screen["date"] == ix_date]
    .sort_values(by="probability_1", ascending=False)
    .iloc[:200]
    .copy()
    for ix_date in rebalance_dates
]
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
1271,2013-12-31,-0.055,1.0,0.394118,0.605882
28623,2013-12-31,0.114,1.0,0.394824,0.605176
64379,2013-12-31,0.080,1.0,0.397836,0.602164
34486,2013-12-31,-0.035,1.0,0.411254,0.588746
51090,2013-12-31,0.033,1.0,0.412173,0.587827
...,...,...,...,...,...
63821,2018-11-30,-0.206,1.0,0.472301,0.527699
11857,2018-11-30,-0.155,1.0,0.472338,0.527662
32520,2018-11-30,-0.313,1.0,0.472515,0.527485
31282,2018-11-30,-0.100,1.0,0.472560,0.527440


In [None]:
# Equal-weighted return of the selected rows per date, compounded into an
# equity curve that starts from 1.
df_top_200_all_features_monthly_returns = (
    df_trades.groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(growth_factor=lambda per_date: 1 + per_date["R1M_Usd"])
    .assign(equity_curve=lambda per_date: per_date["growth_factor"].cumprod())
)

In [None]:
# Annualised compound return: final equity raised to (12 / number of per-date
# observations), minus 1.
top_200_all_final_equity = df_top_200_all_features_monthly_returns['equity_curve'].iloc[-1]
top_200_all_n_periods = len(df_top_200_all_features_monthly_returns)
top_200_all_final_equity ** (12 / top_200_all_n_periods) - 1

0.1344945032814202

In [None]:
# Mean over standard deviation of the per-date returns, scaled by sqrt(12).
# No risk-free rate is subtracted.
top_200_all_period_returns = df_top_200_all_features_monthly_returns["R1M_Usd"]
(top_200_all_period_returns.mean() / top_200_all_period_returns.std()) * np.sqrt(12)

0.7392758913232144

## Backtesting Top-200 Strategy with Selected Features

In [None]:
# Start again from the raw test rows (date and R1M_Usd only) so the
# selected-features probabilities can be attached to a clean frame.
df_screen = (
    df_test.loc[:, ["date", "R1M_Usd"]]
    .reset_index(drop=True)
)

In [None]:
# Attach the selected-features model outputs to the screening frame, column-wise.
# NOTE(review): rows are matched purely by position (both frames use a 0..n-1
# index); presumably the inference file was written in df_test row order — confirm.
#
# concat(axis=1) outer-joins on the index, so a row-count mismatch would pad
# with NaN instead of failing. Check the lengths explicitly first.
if len(df_screen) != len(df_inferences_selected_features):
    raise ValueError(
        f"Row count mismatch: {len(df_screen)} test rows vs "
        f"{len(df_inferences_selected_features)} selected-features inference rows; "
        "cannot align by position."
    )
df_screen = pd.concat(
    [
        # Re-select the two base columns so that re-running this cell does not
        # append the prediction columns a second time.
        df_screen[["date", "R1M_Usd"]],
        df_inferences_selected_features.reset_index(drop=True),
    ],
    axis=1,
)
df_screen

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
0,2015-03-31,0.174,1.0,0.469486,0.530514
1,2015-04-30,-0.106,1.0,0.469902,0.530098
2,2015-05-31,-0.185,1.0,0.468476,0.531524
3,2015-06-30,-0.418,1.0,0.475378,0.524622
4,2015-07-31,0.575,1.0,0.471177,0.528823
...,...,...,...,...,...
70484,2018-07-31,0.028,0.0,0.527576,0.472424
70485,2018-08-31,-0.101,0.0,0.531423,0.468577
70486,2018-09-30,0.013,0.0,0.516458,0.483542
70487,2018-10-31,0.039,0.0,0.509134,0.490866


In [None]:
# One rebalance per distinct date in the screening frame, in ascending order.
rebalance_dates = [
    screen_date for screen_date in np.sort(df_screen["date"].unique())
]

In [None]:
# For every rebalance date keep the 200 rows with the highest probability_1
# (fewer if that date has fewer rows), then stack the per-date picks in date order.
lst_trades = []
for rebalance_date in rebalance_dates:
    on_this_date = df_screen["date"] == rebalance_date
    ranked = df_screen.loc[on_this_date].sort_values(by="probability_1", ascending=False)
    lst_trades.append(ranked.head(200).copy())
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,hard,probability_0,probability_1
28623,2013-12-31,0.114,1.0,0.395729,0.604271
50794,2013-12-31,-0.084,1.0,0.400473,0.599527
51090,2013-12-31,0.033,1.0,0.412809,0.587191
64379,2013-12-31,0.080,1.0,0.416037,0.583963
14882,2013-12-31,0.063,1.0,0.422220,0.577780
...,...,...,...,...,...
13433,2018-11-30,-0.036,1.0,0.473852,0.526148
7269,2018-11-30,-0.083,1.0,0.473982,0.526018
16075,2018-11-30,-0.144,1.0,0.474062,0.525938
31932,2018-11-30,-0.169,1.0,0.474307,0.525693


In [None]:
# Equal-weighted return of the selected rows per date, compounded into an
# equity curve that starts from 1.
df_top_200_selected_features_monthly_returns = (
    df_trades.groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(growth_factor=lambda per_date: 1 + per_date["R1M_Usd"])
    .assign(equity_curve=lambda per_date: per_date["growth_factor"].cumprod())
)

In [None]:
# Annualised compound return: final equity raised to (12 / number of per-date
# observations), minus 1.
top_200_selected_final_equity = df_top_200_selected_features_monthly_returns['equity_curve'].iloc[-1]
top_200_selected_n_periods = len(df_top_200_selected_features_monthly_returns)
top_200_selected_final_equity ** (12 / top_200_selected_n_periods) - 1

0.15969790374333703

In [None]:
# Mean over standard deviation of the per-date returns, scaled by sqrt(12).
# No risk-free rate is subtracted.
top_200_selected_period_returns = df_top_200_selected_features_monthly_returns["R1M_Usd"]
(top_200_selected_period_returns.mean() / top_200_selected_period_returns.std()) * np.sqrt(12)

0.8443428473551822