In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the modelling dataset from CSV; the path is relative to the working directory.
df_ml = pd.read_csv(filepath_or_buffer="../data/data_ml.csv")

In [None]:
# Chronological train/test split on the `date` column.
# The recorded outputs show `date` as ISO "YYYY-MM-DD" strings, for which plain
# string comparison matches chronological order.
separation_date = "2013-12-31"  # first date of the test window (inclusive)
test_end_date = "2018-12-31"  # end of the test window (exclusive)

# Train: every row strictly before the separation date.
df_train = df_ml.query("date < @separation_date").copy()

# Test: rows in [separation_date, test_end_date). `.copy()` gives an independent
# frame so columns can be added to it later without touching df_ml.
df_test = df_ml.query("@separation_date <= date and date < @test_end_date").copy()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Columns that must not be fed to the model as predictors.
columns_to_drop = [
    # non-feature identifiers
    "stock_id",
    "date",
    # numerical labels
    "R1M_Usd",
    "R3M_Usd",
    "R6M_Usd",
    "R12M_Usd",
    # categorical labels
    "R1M_Usd_C",
    "R12M_Usd_C",
]

# Every remaining column of df_ml is treated as a feature.
# A KeyError is raised if any listed column is missing from df_ml.
features = df_ml.columns.drop(columns_to_drop).tolist()

In [None]:
# Training inputs: the feature columns only.
X_train = df_train.loc[:, features]
# Training target: the categorical 1-month label.
y_train = df_train.loc[:, "R1M_Usd_C"]

In [None]:
# Held-out inputs and target, selected with the same feature list and label
# column as the training set so both matrices share the same column order.
X_test = df_test.loc[:, features]
y_test = df_test.loc[:, "R1M_Usd_C"]

In [None]:
# Random-forest classifier used for the long-only screen below.
model = RandomForestClassifier(
    max_depth=5,  # cap the depth of every tree
    random_state=0,  # fixed seed so refits are reproducible
    n_jobs=-1,  # use all available processors
)

In [None]:
# Fit the forest on the training window only; the fitted estimator is the cell's output.
model.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy on the training data (in-sample).
# NOTE(review): this is not an out-of-sample figure; model.score(X_test, y_test)
# would give the held-out accuracy.
model.score(X=X_train, y=y_train)

0.5316985706348806

In [None]:
# Attach the model's test-set predictions to df_test.
# predict_proba is evaluated once and sliced, rather than re-scoring the whole
# test set for each probability column.
test_proba = model.predict_proba(X_test)

# Hard class label per row.
df_test["prediction_hard"] = model.predict(X_test)

# Probability columns follow the order of model.classes_.
# NOTE(review): the column names assume classes_ is [0, 1], which matches the
# 0.0 / 1.0 labels in the recorded output; confirm if the label encoding changes.
df_test["predict_prob_0"] = test_proba[:, 0]
df_test["predict_prob_1"] = test_proba[:, 1]

In [None]:
# Narrow view used for portfolio construction: the date, the 1-month return
# column and the three prediction columns.
df_screen = df_test.loc[
    :,
    [
        "date",
        "R1M_Usd",
        "prediction_hard",
        "predict_prob_0",
        "predict_prob_1",
    ],
]
df_screen

Unnamed: 0,date,R1M_Usd,prediction_hard,predict_prob_0,predict_prob_1
3,2015-03-31,0.174,1.0,0.499717,0.500283
4,2015-04-30,-0.106,1.0,0.493392,0.506608
5,2015-05-31,-0.185,1.0,0.497036,0.502964
6,2015-06-30,-0.418,1.0,0.491069,0.508931
7,2015-07-31,0.575,1.0,0.485765,0.514235
...,...,...,...,...,...
269373,2018-07-31,0.028,0.0,0.547788,0.452212
269374,2018-08-31,-0.101,0.0,0.547826,0.452174
269375,2018-09-30,0.013,0.0,0.548498,0.451502
269376,2018-10-31,0.039,0.0,0.541317,0.458683


In [None]:
# Equal-weighted portfolio of every row predicted as class 1:
#   1. average R1M_Usd per date,
#   2. turn each average into a growth factor (1 + return),
#   3. compound the growth factors into an equity curve starting from 1.
df_monthly_returns = (
    df_screen
    .query("prediction_hard == 1")
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(growth_factor=lambda frame: 1 + frame["R1M_Usd"])
    .assign(equity_curve=lambda frame: frame["growth_factor"].cumprod())
)
df_monthly_returns

Unnamed: 0,date,R1M_Usd,growth_factor,equity_curve
0,2013-12-31,-0.032255,0.967745,0.967745
1,2014-01-31,0.040904,1.040904,1.007329
2,2014-02-28,0.010195,1.010195,1.017599
3,2014-03-31,0.000505,1.000505,1.018112
4,2014-04-30,0.014474,1.014474,1.032849
5,2014-05-31,0.03733,1.03733,1.071405
6,2014-06-30,-0.050275,0.949725,1.017541
7,2014-07-31,0.046506,1.046506,1.064863
8,2014-08-31,-0.056308,0.943692,1.004903
9,2014-09-30,0.072662,1.072662,1.077921


In [None]:
# Distinct dates present in the screen, in ascending order; one rebalance per date.
rebalance_dates = sorted(df_screen["date"].unique())
rebalance_dates

['2013-12-31',
 '2014-01-31',
 '2014-02-28',
 '2014-03-31',
 '2014-04-30',
 '2014-05-31',
 '2014-06-30',
 '2014-07-31',
 '2014-08-31',
 '2014-09-30',
 '2014-10-31',
 '2014-11-30',
 '2014-12-31',
 '2015-01-31',
 '2015-02-28',
 '2015-03-31',
 '2015-04-30',
 '2015-05-31',
 '2015-06-30',
 '2015-07-31',
 '2015-08-31',
 '2015-09-30',
 '2015-10-31',
 '2015-11-30',
 '2015-12-31',
 '2016-01-31',
 '2016-02-29',
 '2016-03-31',
 '2016-04-30',
 '2016-05-31',
 '2016-06-30',
 '2016-07-31',
 '2016-08-31',
 '2016-09-30',
 '2016-10-31',
 '2016-11-30',
 '2016-12-31',
 '2017-01-31',
 '2017-02-28',
 '2017-03-31',
 '2017-04-30',
 '2017-05-31',
 '2017-06-30',
 '2017-07-31',
 '2017-08-31',
 '2017-09-30',
 '2017-10-31',
 '2017-11-30',
 '2017-12-31',
 '2018-01-31',
 '2018-02-28',
 '2018-03-31',
 '2018-04-30',
 '2018-05-31',
 '2018-06-30',
 '2018-07-31',
 '2018-08-31',
 '2018-09-30',
 '2018-10-31',
 '2018-11-30']

In [None]:
# Number of rows kept per rebalance date: those with the highest predict_prob_1.
N_TOP_STOCKS = 200

# Split the screen by date once, instead of re-filtering the whole frame with
# a query for every rebalance date. Each group keeps the row order of df_screen.
screen_by_date = df_screen.groupby("date")

# For each rebalance date, rank that date's rows by predict_prob_1 (highest
# first) and keep the top N_TOP_STOCKS.
lst_trades = [
    screen_by_date.get_group(ix_date)
    .sort_values(by="predict_prob_1", ascending=False)
    .head(N_TOP_STOCKS)
    for ix_date in rebalance_dates
]

# Stack the per-date selections in rebalance-date order; the original row
# index is preserved.
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,prediction_hard,predict_prob_0,predict_prob_1
14897,2013-12-31,-0.143,1.0,0.470221,0.529779
13913,2013-12-31,-0.068,1.0,0.471442,0.528558
32761,2013-12-31,-0.022,1.0,0.472757,0.527243
56081,2013-12-31,-0.060,1.0,0.474221,0.525779
12870,2013-12-31,-0.077,1.0,0.475476,0.524524
...,...,...,...,...,...
206807,2018-11-30,-0.087,1.0,0.490301,0.509699
66467,2018-11-30,-0.072,1.0,0.490318,0.509682
75220,2018-11-30,0.015,1.0,0.490341,0.509659
130664,2018-11-30,-0.093,1.0,0.490345,0.509655


In [None]:
# Equal-weighted return of the selected trades per date.
# Note: this rebinds df_monthly_returns, replacing the frame computed earlier
# from the prediction_hard == 1 filter.
df_monthly_returns = df_trades.groupby("date", as_index=False)["R1M_Usd"].mean()

# Growth factor per period, then compounded into an equity curve starting from 1.
df_monthly_returns["growth_factor"] = 1 + df_monthly_returns["R1M_Usd"]
df_monthly_returns["equity_curve"] = df_monthly_returns["growth_factor"].cumprod()
df_monthly_returns

Unnamed: 0,date,R1M_Usd,growth_factor,equity_curve
0,2013-12-31,-0.0331,0.9669,0.9669
1,2014-01-31,0.041125,1.041125,1.006664
2,2014-02-28,0.021355,1.021355,1.028161
3,2014-03-31,-0.000135,0.999865,1.028022
4,2014-04-30,0.01706,1.01706,1.04556
5,2014-05-31,0.035565,1.035565,1.082746
6,2014-06-30,-0.05567,0.94433,1.022469
7,2014-07-31,0.04811,1.04811,1.07166
8,2014-08-31,-0.059985,0.940015,1.007377
9,2014-09-30,0.08133,1.08133,1.089307
