In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the dataset from the CSV file that sits in the sibling `data` directory.
df_ml = pd.read_csv(filepath_or_buffer="../data/data_ml.csv")

In [None]:
# Chronological split around `separation_date`:
#   * train: rows dated strictly before the separation date
#   * test:  rows dated on/after the separation date and strictly before 2018-12-31
# NOTE(review): the comparisons are against plain strings; if `date` is stored
# as ISO-formatted text this is a lexicographic comparison -- confirm the dtype.
separation_date = "2013-12-31"

is_train = df_ml["date"] < separation_date
is_test = (df_ml["date"] >= separation_date) & (df_ml["date"] < '2018-12-31')

# Independent copies so that columns added later do not touch `df_ml`.
df_train = df_ml[is_train].copy()
df_test = df_ml[is_test].copy()

In [None]:
#from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Columns that must not be fed to the model as predictors.
columns_to_drop = [
    # non-feature identifiers
    "stock_id",
    "date",
    # numerical labels
    "R1M_Usd",
    "R3M_Usd",
    "R6M_Usd",
    "R12M_Usd",
    # categorical labels
    "R1M_Usd_C",
    "R12M_Usd_C",
]

# Every remaining column of `df_ml`, in its original order, is used as a feature.
# (Like DataFrame.drop, Index.drop raises KeyError if a listed column is absent.)
features = df_ml.columns.drop(columns_to_drop).tolist()

In [None]:
# Training design matrix (feature columns only) and its target column.
X_train = df_train.loc[:, features]
y_train = df_train.loc[:, "R1M_Usd_C"]

In [None]:
# Test design matrix (same feature columns as training) and its target column.
X_test = df_test.loc[:, features]
y_test = df_test.loc[:, "R1M_Usd_C"]

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate a small gradient-boosted classifier on the training window,
# using scikit-learn's default fold scheme and the estimator's default scorer.
# NOTE(review): the default folds are not aware of the `date` ordering, so a
# fold may be validated on rows that pre-date some of its training rows --
# confirm this is acceptable or switch to a time-aware splitter.
model = XGBClassifier(random_state=0, n_jobs=-1, n_estimators=25)
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train)
cv_scores

array([0.49777767, 0.48143846, 0.47729683, 0.47214506, 0.50285368])

In [None]:
# Average score across the cross-validation folds.
cv_scores.mean()

0.4863023385019446

In [None]:
# Fresh, unfitted classifier: fixed seed, n_jobs=-1, 25 estimators.
model = XGBClassifier(random_state=0, n_jobs=-1, n_estimators=25)

In [None]:
# Fit the classifier on the full training window.
model.fit(X=X_train, y=y_train)

In [None]:
# In-sample score: evaluated on the very rows the model was fitted on,
# so it is not an estimate of out-of-sample performance.
model.score(X=X_train, y=y_train)

0.5833678468609526

In [None]:
# Score the test window once and attach the results to `df_test`.
# `predict_proba` is evaluated a single time and both probability columns are
# sliced from the same array, instead of re-running inference per column.
test_proba = model.predict_proba(X_test)

# Hard class label as returned by the classifier.
df_test["prediction_hard"] = model.predict(X_test)
# Columns 0 and 1 of the probability matrix (presumably the probabilities of
# class 0 and class 1, as the column names suggest -- confirm via model.classes_).
df_test["predict_prob_0"] = test_proba[:, 0]
df_test["predict_prob_1"] = test_proba[:, 1]

In [None]:
# Keep only the columns needed for stock screening: the date, the one-month
# return column and the three prediction columns.
screen_columns = [
    "date",
    "R1M_Usd",
    "prediction_hard",
    "predict_prob_0",
    "predict_prob_1",
]
df_screen = df_test.loc[:, screen_columns]
df_screen

Unnamed: 0,date,R1M_Usd,prediction_hard,predict_prob_0,predict_prob_1
3,2015-03-31,0.174,1,0.443008,0.556992
4,2015-04-30,-0.106,1,0.407426,0.592574
5,2015-05-31,-0.185,1,0.417241,0.582759
6,2015-06-30,-0.418,1,0.392114,0.607886
7,2015-07-31,0.575,1,0.392114,0.607886
...,...,...,...,...,...
269373,2018-07-31,0.028,1,0.479271,0.520729
269374,2018-08-31,-0.101,0,0.513952,0.486048
269375,2018-09-30,0.013,1,0.483383,0.516617
269376,2018-10-31,0.039,1,0.477990,0.522010


In [None]:
# Per-date mean of `R1M_Usd` over the rows whose hard prediction is 1,
# then the compounded curve of those means.
df_monthly_returns = (
    df_screen[df_screen["prediction_hard"] == 1]
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    # growth_factor = 1 + mean return of the date
    .assign(growth_factor=lambda frame: 1 + frame["R1M_Usd"])
    # equity_curve = running product of the growth factors
    .assign(equity_curve=lambda frame: frame["growth_factor"].cumprod())
)
df_monthly_returns

Unnamed: 0,date,R1M_Usd,growth_factor,equity_curve
0,2013-12-31,-0.032133,0.967867,0.967867
1,2014-01-31,0.040837,1.040837,1.007392
2,2014-02-28,0.012119,1.012119,1.0196
3,2014-03-31,0.002438,1.002438,1.022086
4,2014-04-30,0.010105,1.010105,1.032414
5,2014-05-31,0.035018,1.035018,1.068567
6,2014-06-30,-0.047033,0.952967,1.018309
7,2014-07-31,0.04623,1.04623,1.065385
8,2014-08-31,-0.051997,0.948003,1.009988
9,2014-09-30,0.06714,1.06714,1.077799


In [None]:
# Distinct dates present in the screening frame, in ascending order.
unique_dates = df_screen["date"].unique()
rebalance_dates = np.sort(unique_dates).tolist()
rebalance_dates

['2013-12-31',
 '2014-01-31',
 '2014-02-28',
 '2014-03-31',
 '2014-04-30',
 '2014-05-31',
 '2014-06-30',
 '2014-07-31',
 '2014-08-31',
 '2014-09-30',
 '2014-10-31',
 '2014-11-30',
 '2014-12-31',
 '2015-01-31',
 '2015-02-28',
 '2015-03-31',
 '2015-04-30',
 '2015-05-31',
 '2015-06-30',
 '2015-07-31',
 '2015-08-31',
 '2015-09-30',
 '2015-10-31',
 '2015-11-30',
 '2015-12-31',
 '2016-01-31',
 '2016-02-29',
 '2016-03-31',
 '2016-04-30',
 '2016-05-31',
 '2016-06-30',
 '2016-07-31',
 '2016-08-31',
 '2016-09-30',
 '2016-10-31',
 '2016-11-30',
 '2016-12-31',
 '2017-01-31',
 '2017-02-28',
 '2017-03-31',
 '2017-04-30',
 '2017-05-31',
 '2017-06-30',
 '2017-07-31',
 '2017-08-31',
 '2017-09-30',
 '2017-10-31',
 '2017-11-30',
 '2017-12-31',
 '2018-01-31',
 '2018-02-28',
 '2018-03-31',
 '2018-04-30',
 '2018-05-31',
 '2018-06-30',
 '2018-07-31',
 '2018-08-31',
 '2018-09-30',
 '2018-10-31',
 '2018-11-30']

In [None]:
# For every rebalance date, keep the (at most) 200 rows with the highest
# `predict_prob_1`, then stack the per-date selections into one frame.
lst_trades = []
for ix_date in rebalance_dates:
    on_this_date = df_screen["date"] == ix_date
    ranked = df_screen.loc[on_this_date].sort_values(by="predict_prob_1", ascending=False)
    df = ranked.head(200).copy()
    lst_trades.append(df)

df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,prediction_hard,predict_prob_0,predict_prob_1
14897,2013-12-31,-0.143,1,0.263827,0.736173
194816,2013-12-31,-0.084,1,0.311379,0.688621
12525,2013-12-31,-0.020,1,0.317741,0.682259
22751,2013-12-31,-0.078,1,0.345943,0.654057
138503,2013-12-31,0.081,1,0.349769,0.650231
...,...,...,...,...,...
222876,2018-11-30,-0.238,1,0.459861,0.540139
104148,2018-11-30,-0.115,1,0.459999,0.540001
243751,2018-11-30,0.008,1,0.460141,0.539859
237879,2018-11-30,-0.062,1,0.460433,0.539567


In [None]:
# Per-date mean of `R1M_Usd` over the selected trades, with equal weight
# given to every selected row, followed by the compounded curve.
mean_return_by_date = df_trades.groupby("date")[["R1M_Usd"]].mean()
df_monthly_returns = mean_return_by_date.reset_index()

# growth_factor = 1 + mean return; equity_curve = running product of growth factors.
df_monthly_returns["growth_factor"] = df_monthly_returns["R1M_Usd"].add(1)
df_monthly_returns["equity_curve"] = df_monthly_returns["growth_factor"].cumprod()
df_monthly_returns

Unnamed: 0,date,R1M_Usd,growth_factor,equity_curve
0,2013-12-31,-0.03466,0.96534,0.96534
1,2014-01-31,0.044315,1.044315,1.008119
2,2014-02-28,0.020605,1.020605,1.028891
3,2014-03-31,0.00073,1.00073,1.029642
4,2014-04-30,0.01428,1.01428,1.044346
5,2014-05-31,0.034585,1.034585,1.080464
6,2014-06-30,-0.041815,0.958185,1.035285
7,2014-07-31,0.047145,1.047145,1.084093
8,2014-08-31,-0.058425,0.941575,1.020755
9,2014-09-30,0.06554,1.06554,1.087655
