## Importing Packages

In [None]:
import numpy as np
import pandas as pd

## Reading and Separating Data

In [None]:
# Load the raw dataset; every later cell derives its frames from `df_ml`.
df_ml = pd.read_csv(filepath_or_buffer="../data/data_ml.csv")

In [None]:
# Chronological split: rows strictly before `separation_date` are used for
# training, rows from `separation_date` up to (but excluding) 2018-12-31 for testing.
# NOTE(review): assumes `date` compares chronologically against these
# ISO-formatted string bounds — confirm the column's dtype/format.
separation_date = "2013-12-31"

is_train_period = df_ml["date"] < separation_date
is_test_period = (df_ml["date"] >= separation_date) & (df_ml["date"] < "2018-12-31")

# `.copy()` gives each subset its own data, independent of `df_ml`.
df_train = df_ml.loc[is_train_period].copy()
df_test = df_ml.loc[is_test_period].copy()

In [None]:
# Columns that are not model inputs, grouped by the reason they are excluded.
identifier_columns = ["stock_id", "date"]  # non-feature identifiers
numerical_label_columns = ["R1M_Usd", "R3M_Usd", "R6M_Usd", "R12M_Usd"]  # numerical labels
categorical_label_columns = ["R1M_Usd_C", "R12M_Usd_C"]  # categorical labels

columns_to_drop = identifier_columns + numerical_label_columns + categorical_label_columns

# Everything left over is a feature. Dropping on the column index (rather than
# on the frame) still raises KeyError if an expected column is missing.
features = df_ml.columns.drop(columns_to_drop).tolist()

In [None]:
# Training inputs (all feature columns) and target (the `R1M_Usd_C` label).
X_train = df_train.loc[:, features]
y_train = df_train.loc[:, "R1M_Usd_C"]

In [None]:
# Test inputs and target, built exactly like the training pair.
X_test = df_test.loc[:, features]
y_test = df_test.loc[:, "R1M_Usd_C"]

In [None]:
# Names of the pre-selected features are stored in the `feature` column of a
# separate CSV; they are used below to subset `X_train` / `X_test`.
df_features_selected = pd.read_csv("../data/features_selected.csv")
features_selected = df_features_selected["feature"].tolist()

## Cross-Validation Fitting to All Features

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Untuned random forest evaluated by cross-validation on every feature.
# No `cv` or `scoring` is passed, so scikit-learn's defaults apply.
model = RandomForestClassifier(
    max_depth=5,
    random_state=0,  # fixed seed so the scores are reproducible
    n_jobs=-1,
)
cv_scores_all_features = cross_val_score(estimator=model, X=X_train, y=y_train)
cv_scores_all_features

array([0.49747462, 0.51133896, 0.51921814, 0.50729835, 0.50282843])

In [None]:
# Average score across the folds (all features).
np.mean(cv_scores_all_features)

0.5076316985706348

## Cross-Validation Fitting to Selected Features

In [None]:
# Same untuned random forest, cross-validated on the selected features only,
# so the scores are directly comparable with the all-features run.
X_train_selected = X_train[features_selected]
model = RandomForestClassifier(
    max_depth=5,
    random_state=0,  # fixed seed so the scores are reproducible
    n_jobs=-1,
)
cv_scores_selected_features = cross_val_score(estimator=model, X=X_train_selected, y=y_train)
cv_scores_selected_features

array([0.51487449, 0.51532906, 0.51947068, 0.51679378, 0.50351028])

In [None]:
# Average score across the folds (selected features).
np.mean(cv_scores_selected_features)

0.5139956563462802

## Final Fitting to All Features

In [None]:
# Fit the untuned forest on the whole training window, using every feature.
# `fit` returns the estimator itself, so it can be chained onto the constructor.
model = RandomForestClassifier(
    max_depth=5,
    random_state=0,
    n_jobs=-1,
).fit(X_train, y_train)
model

In [None]:
# Predicted class label for every test row (all-features model).
hard_predictions = model.predict(X=X_test)
hard_predictions

array([1., 1., 1., ..., 0., 0., 0.])

In [None]:
# Run inference once and slice both class columns from the same result;
# calling `predict_proba` separately for each column repeats the whole
# forest evaluation over the test set for no benefit.
class_probabilities = model.predict_proba(X_test)

# Column i of `predict_proba` corresponds to `model.classes_[i]`.
# NOTE(review): the `_0` / `_1` names assume `model.classes_` is [0, 1] — confirm.
prob_predict_0 = class_probabilities[:, 0]
prob_predict_1 = class_probabilities[:, 1]

In [None]:
# Collect the hard label and both class probabilities into one frame,
# one row per test observation, in the row order of `X_test`.
inference_columns = {
    "hard": hard_predictions,
    "probability_0": prob_predict_0,
    "probability_1": prob_predict_1,
}
df_inference_all_features = pd.DataFrame(data=inference_columns)
df_inference_all_features

Unnamed: 0,hard,probability_0,probability_1
0,1.0,0.499717,0.500283
1,1.0,0.493392,0.506608
2,1.0,0.497036,0.502964
3,1.0,0.491069,0.508931
4,1.0,0.485765,0.514235
...,...,...,...
70484,0.0,0.547788,0.452212
70485,0.0,0.547826,0.452174
70486,0.0,0.548498,0.451502
70487,0.0,0.541317,0.458683


In [None]:
# Persist the all-features predictions; the positional index is not written.
df_inference_all_features.to_csv(
    path_or_buf="../data/inference_random_forest_untuned_all_features.csv",
    index=False,
)

## Final Fitting to Selected Features

In [None]:
# Fit the untuned forest on the whole training window, restricted to the
# selected features. `fit` returns the estimator, so the call is chained.
model = RandomForestClassifier(
    max_depth=5,
    random_state=0,
    n_jobs=-1,
).fit(X_train[features_selected], y_train)
model

In [None]:
# Predicted class label for every test row (selected-features model); the test
# matrix is subset to the same columns the model was fitted on.
X_test_selected = X_test[features_selected]
hard_predictions = model.predict(X_test_selected)
hard_predictions

array([1., 1., 1., ..., 0., 0., 0.])

In [None]:
# Run inference once and slice both class columns from the same result;
# calling `predict_proba` separately for each column repeats both the column
# subsetting and the whole forest evaluation for no benefit.
class_probabilities = model.predict_proba(X_test[features_selected])

# Column i of `predict_proba` corresponds to `model.classes_[i]`.
# NOTE(review): the `_0` / `_1` names assume `model.classes_` is [0, 1] — confirm.
prob_predict_0 = class_probabilities[:, 0]
prob_predict_1 = class_probabilities[:, 1]

In [None]:
# Collect the hard label and both class probabilities into one frame,
# one row per test observation, in the row order of `X_test`.
inference_columns = {
    "hard": hard_predictions,
    "probability_0": prob_predict_0,
    "probability_1": prob_predict_1,
}
df_inference_selected_features = pd.DataFrame(data=inference_columns)
df_inference_selected_features

Unnamed: 0,hard,probability_0,probability_1
0,1.0,0.489014,0.510986
1,1.0,0.486188,0.513812
2,1.0,0.486254,0.513746
3,1.0,0.489839,0.510161
4,1.0,0.489839,0.510161
...,...,...,...
70484,0.0,0.548990,0.451010
70485,0.0,0.542726,0.457274
70486,0.0,0.539801,0.460199
70487,0.0,0.539634,0.460366


In [None]:
# Persist the selected-features predictions; the positional index is not written.
df_inference_selected_features.to_csv(
    path_or_buf="../data/inference_random_forest_untuned_selected_features.csv",
    index=False,
)