In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the modelling dataset from a path relative to the current working directory.
df_ml = pd.read_csv(filepath_or_buffer="../data/data_ml.csv")

In [None]:
# Chronological train/test split on the `date` column.
# Both bounds are plain strings, so the comparisons below are string comparisons
# (assumes `date` holds ISO-formatted YYYY-MM-DD text -- TODO confirm on load).
separation_date = "2013-12-31"  # first date that belongs to the test window
test_end_date = "2018-12-31"    # exclusive upper bound of the test window

# Training window: everything strictly before the separation date.
df_train = df_ml.query("date < @separation_date").copy()

# Test window: [separation_date, test_end_date). The comparisons are parenthesised
# and joined with `and` so the expression reads the same way it is evaluated.
df_test = df_ml.query(
    "(date >= @separation_date) and (date < @test_end_date)"
).copy()

In [None]:
#from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Columns of `df_ml` that must not be fed to the model as predictors.
columns_to_drop = [
    "stock_id", "date",  # non-feature identifiers
    "R1M_Usd", "R3M_Usd", "R6M_Usd", "R12M_Usd",  # numerical labels
    "R1M_Usd_C", "R12M_Usd_C",  # categorical labels
]

# Every remaining column is a feature. Dropping on the column Index (rather than
# on the DataFrame) avoids copying the whole frame just to read its labels, and
# still raises KeyError if any name in `columns_to_drop` is absent.
features = df_ml.columns.drop(columns_to_drop).tolist()

In [None]:
# Predictors and categorical 1-month label for the training window.
X_train = df_train.loc[:, features]
y_train = df_train.loc[:, "R1M_Usd_C"]

In [None]:
# Predictors and categorical 1-month label for the test window.
X_test = df_test.loc[:, features]
y_test = df_test.loc[:, "R1M_Usd_C"]

In [None]:
# Gradient-boosted classifier with a fixed seed so repeated fits are comparable.
model = XGBClassifier(
    n_estimators=100,
    n_jobs=-1,  # presumably "use all available cores" -- confirm in the XGBoost docs
    random_state=0,
)

In [None]:
# Fit on the training window only; the fitted estimator is the cell's output.
model.fit(X=X_train, y=y_train)

In [None]:
# Score on the SAME data the model was fitted on (in-sample), so this figure is
# optimistic and says nothing about out-of-sample performance.
model.score(X=X_train, y=y_train)

0.6608465073993636

In [None]:
# Attach the model's out-of-sample predictions to the test frame.
# `predict_proba` is evaluated once and both probability columns are sliced from
# the same array, instead of running inference twice over the full test set.
proba_test = model.predict_proba(X_test)

df_test["prediction_hard"] = model.predict(X_test)
# Columns 0 and 1 of the probability matrix; presumably these correspond to
# classes 0 and 1 -- confirm against `model.classes_`.
df_test["predict_prob_0"] = proba_test[:, 0]
df_test["predict_prob_1"] = proba_test[:, 1]

In [None]:
# Keep only what the backtest needs: the date, the realised return column and
# the three prediction columns added to `df_test`.
screen_columns = [
    "date",
    "R1M_Usd",
    "prediction_hard",
    "predict_prob_0",
    "predict_prob_1",
]
df_screen = df_test.loc[:, screen_columns]
df_screen

Unnamed: 0,date,R1M_Usd,prediction_hard,predict_prob_0,predict_prob_1
3,2015-03-31,0.174,1,0.343092,0.656908
4,2015-04-30,-0.106,1,0.297184,0.702816
5,2015-05-31,-0.185,1,0.343466,0.656534
6,2015-06-30,-0.418,1,0.311662,0.688338
7,2015-07-31,0.575,1,0.205437,0.794563
...,...,...,...,...,...
269373,2018-07-31,0.028,0,0.507529,0.492471
269374,2018-08-31,-0.101,1,0.421466,0.578534
269375,2018-09-30,0.013,1,0.483389,0.516611
269376,2018-10-31,0.039,1,0.466254,0.533746


In [None]:
# Portfolio of every row predicted as class 1: average `R1M_Usd` per date, then
# compound the per-date averages into an equity curve.
is_predicted_up = df_screen["prediction_hard"] == 1
df_monthly_returns = (
    df_screen.loc[is_predicted_up]
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(growth_factor=lambda d: 1 + d["R1M_Usd"])
    .assign(equity_curve=lambda d: d["growth_factor"].cumprod())
)
df_monthly_returns

Unnamed: 0,date,R1M_Usd,growth_factor,equity_curve
0,2013-12-31,-0.036466,0.963534,0.963534
1,2014-01-31,0.037867,1.037867,1.000019
2,2014-02-28,0.012742,1.012742,1.012762
3,2014-03-31,0.002109,1.002109,1.014898
4,2014-04-30,0.010552,1.010552,1.025607
5,2014-05-31,0.034552,1.034552,1.061044
6,2014-06-30,-0.046768,0.953232,1.011421
7,2014-07-31,0.044432,1.044432,1.05636
8,2014-08-31,-0.05068,0.94932,1.002823
9,2014-09-30,0.065972,1.065972,1.068981


In [None]:
# Distinct dates present in the screen, in ascending order, as a plain list.
unique_dates = df_screen["date"].unique()
rebalance_dates = list(np.sort(unique_dates))
rebalance_dates

['2013-12-31',
 '2014-01-31',
 '2014-02-28',
 '2014-03-31',
 '2014-04-30',
 '2014-05-31',
 '2014-06-30',
 '2014-07-31',
 '2014-08-31',
 '2014-09-30',
 '2014-10-31',
 '2014-11-30',
 '2014-12-31',
 '2015-01-31',
 '2015-02-28',
 '2015-03-31',
 '2015-04-30',
 '2015-05-31',
 '2015-06-30',
 '2015-07-31',
 '2015-08-31',
 '2015-09-30',
 '2015-10-31',
 '2015-11-30',
 '2015-12-31',
 '2016-01-31',
 '2016-02-29',
 '2016-03-31',
 '2016-04-30',
 '2016-05-31',
 '2016-06-30',
 '2016-07-31',
 '2016-08-31',
 '2016-09-30',
 '2016-10-31',
 '2016-11-30',
 '2016-12-31',
 '2017-01-31',
 '2017-02-28',
 '2017-03-31',
 '2017-04-30',
 '2017-05-31',
 '2017-06-30',
 '2017-07-31',
 '2017-08-31',
 '2017-09-30',
 '2017-10-31',
 '2017-11-30',
 '2017-12-31',
 '2018-01-31',
 '2018-02-28',
 '2018-03-31',
 '2018-04-30',
 '2018-05-31',
 '2018-06-30',
 '2018-07-31',
 '2018-08-31',
 '2018-09-30',
 '2018-10-31',
 '2018-11-30']

In [None]:
# For each rebalance date, keep the 200 rows with the highest class-1
# probability, then stack the per-date selections into one frame.
lst_trades = [
    df_screen.loc[df_screen["date"] == ix_date]
    .sort_values(by="predict_prob_1", ascending=False)
    .head(200)
    .copy()
    for ix_date in rebalance_dates
]
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,prediction_hard,predict_prob_0,predict_prob_1
194816,2013-12-31,-0.084,1,0.180710,0.819290
14897,2013-12-31,-0.143,1,0.198142,0.801858
58691,2013-12-31,0.122,1,0.227615,0.772385
6145,2013-12-31,-0.001,1,0.240694,0.759306
8114,2013-12-31,0.030,1,0.246319,0.753681
...,...,...,...,...,...
124267,2018-11-30,-0.066,1,0.434294,0.565706
266348,2018-11-30,-0.048,1,0.434369,0.565631
78824,2018-11-30,-0.099,1,0.434524,0.565476
54691,2018-11-30,-0.015,1,0.434831,0.565169


In [None]:
# Portfolio of the selected trades: average `R1M_Usd` per date, then compound
# the per-date averages into an equity curve.
# NOTE: this rebinds `df_monthly_returns`, replacing any earlier value of that name.
df_monthly_returns = (
    df_trades.groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(growth_factor=lambda d: 1 + d["R1M_Usd"])
    .assign(equity_curve=lambda d: d["growth_factor"].cumprod())
)
df_monthly_returns

Unnamed: 0,date,R1M_Usd,growth_factor,equity_curve
0,2013-12-31,-0.04831,0.95169,0.95169
1,2014-01-31,0.03706,1.03706,0.98696
2,2014-02-28,0.01357,1.01357,1.000353
3,2014-03-31,0.002625,1.002625,1.002979
4,2014-04-30,0.0069,1.0069,1.009899
5,2014-05-31,0.034625,1.034625,1.044867
6,2014-06-30,-0.038995,0.961005,1.004122
7,2014-07-31,0.04594,1.04594,1.050252
8,2014-08-31,-0.05368,0.94632,0.993874
9,2014-09-30,0.06675,1.06675,1.060215
