## Importing Packages

In [None]:
import numpy as np
import pandas as pd

## Reading and Separating Data

In [None]:
# Load the full machine-learning dataset; every later cell derives from this frame.
df_ml = pd.read_csv(filepath_or_buffer="../data/data_ml.csv")

In [None]:
# Chronological split: train is everything strictly before the separation date,
# test runs from the separation date (inclusive) up to, but excluding, 2018-12-31.
# NOTE(review): `date` is compared against string literals — this presumes the
# column holds ISO-formatted (YYYY-MM-DD) values; confirm against the CSV.
separation_date = "2013-12-31"

df_train = df_ml.loc[df_ml["date"] < separation_date].copy()
df_test = df_ml.loc[
    (df_ml["date"] >= separation_date) & (df_ml["date"] < "2018-12-31")
].copy()

In [None]:
# Columns that must never be fed to the model as predictors.
columns_to_drop = [
    "stock_id", "date", # non-feature identifiers
    "R1M_Usd", "R3M_Usd", "R6M_Usd", "R12M_Usd", # numerical labels
    "R1M_Usd_C", "R12M_Usd_C" # categorical labels
]

# Every remaining column is treated as a feature. The drop is done on the column
# Index rather than on the DataFrame, so the full frame is not copied just to
# read its column names; a label missing from df_ml still raises KeyError.
features = list(df_ml.columns.drop(columns_to_drop))

In [None]:
# Training design matrix (all features) and the label column used as target.
X_train = df_train.loc[:, features]
y_train = df_train.loc[:, "R1M_Usd_C"]

In [None]:
# Held-out design matrix and target, built with the same columns as the training set.
X_test = df_test.loc[:, features]
y_test = df_test.loc[:, "R1M_Usd_C"]

In [None]:
# Feature subset read from a separate CSV; its "feature" column lists the
# column names to keep.
df_features_selected = pd.read_csv("../data/features_selected.csv")
features_selected = df_features_selected["feature"].tolist()

## Cross-Validation Fitting to All Features

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Untuned baseline: a small boosted ensemble with a fixed seed for repeatability.
model = XGBClassifier(
    n_estimators=25,
    n_jobs=-1,
    random_state=0,
)

# Cross-validate on the training set using every feature. No `cv` or `scoring`
# is passed, so scikit-learn's default splitter and the estimator's default
# score are used.
cv_scores_all_features = cross_val_score(estimator=model, X=X_train, y=y_train)
cv_scores_all_features

array([0.49777767, 0.48143846, 0.47729683, 0.47214506, 0.50285368])

In [None]:
# Average score across the cross-validation folds (all features).
np.mean(cv_scores_all_features)

0.4863023385019446

## Cross-Validation Fitting to Selected Features

In [None]:
# Same untuned configuration as the all-features run, so the two sets of
# cross-validation scores differ only in the columns supplied.
model = XGBClassifier(
    n_estimators=25,
    n_jobs=-1,
    random_state=0,
)

# Cross-validate on the training set restricted to the selected feature subset.
cv_scores_selected_features = cross_val_score(
    estimator=model,
    X=X_train[features_selected],
    y=y_train,
)
cv_scores_selected_features

array([0.50957119, 0.51785444, 0.51752614, 0.50133845, 0.50404061])

In [None]:
# Average score across the cross-validation folds (selected features).
np.mean(cv_scores_selected_features)

0.5100661649578262

## Final Fitting to All Features

In [None]:
# Fresh, unfitted estimator with the same untuned settings used for cross-validation.
model = XGBClassifier(n_estimators=25, n_jobs=-1, random_state=0)
# Fit on the whole training set with all features; being the last expression,
# the call's return value is what the cell displays.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out test rows (all-features model).
hard_predictions = model.predict(X_test)
hard_predictions

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# Run inference once and slice both class columns from the same result,
# instead of calling predict_proba twice on identical input.
class_probabilities = model.predict_proba(X_test)
prob_predict_0 = class_probabilities[:, 0]  # column 0 of the probability matrix
prob_predict_1 = class_probabilities[:, 1]  # column 1 of the probability matrix

In [None]:
# Collect the hard labels and both class probabilities side by side.
# No index is supplied here.
# NOTE(review): the rows carry no stock_id/date, so they presumably match
# df_test by position only — confirm this is what downstream readers expect.
df_inference_all_features = pd.DataFrame(
    data={
        "hard": hard_predictions,
        "probability_0": prob_predict_0,
        "probability_1": prob_predict_1,
    }
)
df_inference_all_features

Unnamed: 0,hard,probability_0,probability_1
0,1,0.443008,0.556992
1,1,0.407426,0.592574
2,1,0.417241,0.582759
3,1,0.392114,0.607886
4,1,0.392114,0.607886
...,...,...,...
70484,1,0.479271,0.520729
70485,0,0.513952,0.486048
70486,1,0.483383,0.516617
70487,1,0.477990,0.522010


In [None]:
# Persist the all-features predictions; the index is omitted from the file.
df_inference_all_features.to_csv(
    path_or_buf="../data/inference_xgboost_untuned_all_features.csv",
    index=False,
)

## Final Fitting to Selected Features

In [None]:
# Fresh, unfitted estimator with the same untuned settings; this rebinds `model`,
# replacing the all-features model fitted earlier.
model = XGBClassifier(n_estimators=25, n_jobs=-1, random_state=0)
# Fit on the whole training set restricted to the selected feature subset.
model.fit(X_train[features_selected], y_train)

In [None]:
# Hard class predictions for the test rows, using the same column subset the
# model was fitted on. This rebinds `hard_predictions` from the all-features run.
hard_predictions = model.predict(X_test[features_selected])
hard_predictions

array([0, 0, 0, ..., 1, 0, 1])

In [None]:
# Run inference once on the selected-feature columns and slice both class
# columns from the same result, instead of calling predict_proba twice.
class_probabilities = model.predict_proba(X_test[features_selected])
prob_predict_0 = class_probabilities[:, 0]  # column 0 of the probability matrix
prob_predict_1 = class_probabilities[:, 1]  # column 1 of the probability matrix

In [None]:
# Collect the selected-features model's hard labels and class probabilities.
# No index is supplied here.
# NOTE(review): the rows carry no stock_id/date, so they presumably match
# df_test by position only — confirm this is what downstream readers expect.
df_inference_selected_features = pd.DataFrame(
    data={
        "hard": hard_predictions,
        "probability_0": prob_predict_0,
        "probability_1": prob_predict_1,
    }
)
df_inference_selected_features

Unnamed: 0,hard,probability_0,probability_1
0,0,0.536871,0.463129
1,0,0.536871,0.463129
2,0,0.536871,0.463129
3,0,0.546819,0.453181
4,0,0.546819,0.453181
...,...,...,...
70484,1,0.430574,0.569426
70485,0,0.525031,0.474969
70486,1,0.473333,0.526667
70487,0,0.530257,0.469743


In [None]:
# Persist the selected-features predictions; the index is omitted from the file.
df_inference_selected_features.to_csv(
    path_or_buf="../data/inference_xgboost_untuned_selected_features.csv",
    index=False,
)