In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the modelling dataset from the CSV file under ../data.
df_ml = pd.read_csv(filepath_or_buffer="../data/data_ml.csv")

In [None]:
separation_date = "2013-12-31"

# Training sample: every row dated strictly before the separation date.
df_train = df_ml.loc[df_ml["date"] < separation_date].copy()

# Test sample: rows from the separation date (inclusive) up to, but not
# including, 2018-12-31.
df_test = df_ml.loc[
    (df_ml["date"] >= separation_date) & (df_ml["date"] < "2018-12-31")
].copy()

In [None]:
#from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Columns that are excluded from the predictor set.
columns_to_drop = [
    # non-feature identifiers
    "stock_id",
    "date",
    # numerical labels
    "R1M_Usd",
    "R3M_Usd",
    "R6M_Usd",
    "R12M_Usd",
    # categorical labels
    "R1M_Usd_C",
    "R12M_Usd_C",
]

# Every remaining column is treated as a feature; original column order is kept.
features = df_ml.columns.drop(columns_to_drop).tolist()

In [None]:
# Training predictors and the `R1M_Usd_C` categorical label used as target.
X_train, y_train = df_train.loc[:, features], df_train.loc[:, "R1M_Usd_C"]

In [None]:
# Test predictors and the matching `R1M_Usd_C` target, built the same way as
# the training pair.
X_test, y_test = df_test.loc[:, features], df_test.loc[:, "R1M_Usd_C"]

In [None]:
from sklearn.model_selection import cross_val_score

# Small boosted classifier (10 estimators, max depth 2) with a fixed seed.
model = XGBClassifier(
    n_estimators=10,
    max_depth=2,
    learning_rate=0.4,
    colsample_bytree=0.5,
    reg_alpha=0,
    random_state=0,
    n_jobs=-1,
)

# Cross-validate on the training sample only; `cv` and `scoring` are left at
# the cross_val_score defaults.
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train)
cv_scores

array([0.49752513, 0.51702106, 0.51742512, 0.47386232, 0.50282843])

In [None]:
# Average score across the cross-validation folds.
cv_scores.mean()

0.5017324107278145

In [None]:
# Fresh, unfitted estimator with the same hyperparameters as the one that was
# cross-validated.
model = XGBClassifier(
    n_estimators=10,
    max_depth=2,
    learning_rate=0.4,
    colsample_bytree=0.5,
    reg_alpha=0,
    random_state=0,
    n_jobs=-1,
)

In [None]:
# Fit the classifier on the full training sample.
model.fit(X=X_train, y=y_train)

In [None]:
# In-sample score of the fitted model (computed on the data it was trained on).
model.score(X=X_train, y=y_train)

0.5250265164907318

In [None]:
# Hard class labels predicted for the test sample.
df_test["prediction_hard"] = model.predict(X_test)

# Score X_test for class probabilities once and reuse the result for both
# columns, rather than running predict_proba twice on the same input.
class_probabilities = model.predict_proba(X_test)
df_test["predict_prob_0"] = class_probabilities[:, 0]  # probability of class 0
df_test["predict_prob_1"] = class_probabilities[:, 1]  # probability of class 1

In [None]:
# Narrow the test frame to the date, the `R1M_Usd` label and the model outputs.
df_screen = df_test.loc[
    :,
    ["date", "R1M_Usd", "prediction_hard", "predict_prob_0", "predict_prob_1"],
]
df_screen

Unnamed: 0,date,R1M_Usd,prediction_hard,predict_prob_0,predict_prob_1
3,2015-03-31,0.174,0,0.501636,0.498364
4,2015-04-30,-0.106,1,0.488307,0.511693
5,2015-05-31,-0.185,1,0.497493,0.502507
6,2015-06-30,-0.418,1,0.492273,0.507727
7,2015-07-31,0.575,1,0.492273,0.507727
...,...,...,...,...,...
269373,2018-07-31,0.028,0,0.525318,0.474682
269374,2018-08-31,-0.101,0,0.525318,0.474682
269375,2018-09-30,0.013,0,0.525238,0.474762
269376,2018-10-31,0.039,0,0.515893,0.484107


In [None]:
# Per-date mean `R1M_Usd` over the rows predicted as class 1, then compound
# (1 + mean) across dates to obtain the equity curve.
df_monthly_returns = (
    df_screen[df_screen["prediction_hard"] == 1]
    .groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(growth_factor=lambda monthly: 1 + monthly["R1M_Usd"])
    .assign(equity_curve=lambda monthly: monthly["growth_factor"].cumprod())
)
df_monthly_returns

Unnamed: 0,date,R1M_Usd,growth_factor,equity_curve
0,2013-12-31,-0.028535,0.971465,0.971465
1,2014-01-31,0.041095,1.041095,1.011387
2,2014-02-28,0.013767,1.013767,1.025311
3,2014-03-31,0.002358,1.002358,1.027728
4,2014-04-30,0.013183,1.013183,1.041277
5,2014-05-31,0.036366,1.036366,1.079144
6,2014-06-30,-0.047031,0.952969,1.028391
7,2014-07-31,0.045051,1.045051,1.07472
8,2014-08-31,-0.057966,0.942034,1.012423
9,2014-09-30,0.070338,1.070338,1.083635


In [None]:
# Distinct dates present in the screen, in ascending order.
rebalance_dates = sorted(df_screen["date"].unique())
rebalance_dates

['2013-12-31',
 '2014-01-31',
 '2014-02-28',
 '2014-03-31',
 '2014-04-30',
 '2014-05-31',
 '2014-06-30',
 '2014-07-31',
 '2014-08-31',
 '2014-09-30',
 '2014-10-31',
 '2014-11-30',
 '2014-12-31',
 '2015-01-31',
 '2015-02-28',
 '2015-03-31',
 '2015-04-30',
 '2015-05-31',
 '2015-06-30',
 '2015-07-31',
 '2015-08-31',
 '2015-09-30',
 '2015-10-31',
 '2015-11-30',
 '2015-12-31',
 '2016-01-31',
 '2016-02-29',
 '2016-03-31',
 '2016-04-30',
 '2016-05-31',
 '2016-06-30',
 '2016-07-31',
 '2016-08-31',
 '2016-09-30',
 '2016-10-31',
 '2016-11-30',
 '2016-12-31',
 '2017-01-31',
 '2017-02-28',
 '2017-03-31',
 '2017-04-30',
 '2017-05-31',
 '2017-06-30',
 '2017-07-31',
 '2017-08-31',
 '2017-09-30',
 '2017-10-31',
 '2017-11-30',
 '2017-12-31',
 '2018-01-31',
 '2018-02-28',
 '2018-03-31',
 '2018-04-30',
 '2018-05-31',
 '2018-06-30',
 '2018-07-31',
 '2018-08-31',
 '2018-09-30',
 '2018-10-31',
 '2018-11-30']

In [None]:
# Number of highest-`predict_prob_1` rows kept at each date.
top_n = 200

# Split the screen by date in one groupby pass (groups are yielded in ascending
# date order, rows inside a group keep their original order) instead of
# re-filtering the whole frame once per date. Within each date, rank by the
# class-1 probability and keep the top `top_n` rows.
lst_trades = [
    df_date.sort_values(by="predict_prob_1", ascending=False).head(top_n).copy()
    for _date, df_date in df_screen.groupby("date", sort=True)
]
df_trades = pd.concat(lst_trades)
df_trades

Unnamed: 0,date,R1M_Usd,prediction_hard,predict_prob_0,predict_prob_1
19068,2013-12-31,-0.047,1,0.437366,0.562634
14897,2013-12-31,-0.143,1,0.437366,0.562634
56081,2013-12-31,-0.060,1,0.437366,0.562634
19836,2013-12-31,-0.097,1,0.439028,0.560972
20416,2013-12-31,0.082,1,0.439028,0.560972
...,...,...,...,...,...
102192,2018-11-30,-0.136,1,0.485365,0.514635
98007,2018-11-30,-0.060,1,0.485365,0.514635
153015,2018-11-30,0.171,1,0.485365,0.514635
60946,2018-11-30,-0.078,1,0.485365,0.514635


In [None]:
# Per-date mean `R1M_Usd` of the selected trades, compounded into an equity
# curve. `equity_curve` reads the `growth_factor` column created just before it
# in the same assign call.
df_monthly_returns = (
    df_trades.groupby("date")[["R1M_Usd"]]
    .mean()
    .reset_index()
    .assign(
        growth_factor=lambda monthly: 1 + monthly["R1M_Usd"],
        equity_curve=lambda monthly: monthly["growth_factor"].cumprod(),
    )
)
df_monthly_returns

Unnamed: 0,date,R1M_Usd,growth_factor,equity_curve
0,2013-12-31,-0.03441,0.96559,0.96559
1,2014-01-31,0.040445,1.040445,1.004643
2,2014-02-28,0.01665,1.01665,1.021371
3,2014-03-31,0.003435,1.003435,1.024879
4,2014-04-30,0.01456,1.01456,1.039801
5,2014-05-31,0.031855,1.031855,1.072924
6,2014-06-30,-0.044625,0.955375,1.025045
7,2014-07-31,0.04786,1.04786,1.074104
8,2014-08-31,-0.06075,0.93925,1.008852
9,2014-09-30,0.086375,1.086375,1.095991
