## Importing Packages

In [None]:
import numpy as np
import pandas as pd

## Reading and Separating Data

In [None]:
df_ml = pd.read_csv("../data/data_ml.csv")

In [None]:
# Chronological split on the "date" column: rows strictly before the
# separation date form the training set; rows from the separation date up to
# (but excluding) 2018-12-31 form the test set.
# NOTE(review): the comparison is against date strings, so this presumably
# relies on "date" being ISO-formatted (YYYY-MM-DD) — confirm.
separation_date = "2013-12-31"

df_train = df_ml[df_ml["date"] < separation_date].copy()
df_test = df_ml[
    (df_ml["date"] >= separation_date) & (df_ml["date"] < "2018-12-31")
].copy()

In [None]:
# Columns that are not predictors, grouped by the reason they are excluded.
identifier_columns = ["stock_id", "date"]  # non-feature identifiers
numerical_label_columns = ["R1M_Usd", "R3M_Usd", "R6M_Usd", "R12M_Usd"]
categorical_label_columns = ["R1M_Usd_C", "R12M_Usd_C"]

columns_to_drop = (
    identifier_columns + numerical_label_columns + categorical_label_columns
)

# Every remaining column of df_ml is treated as a feature. Index.drop raises
# a KeyError if any listed column is missing from df_ml.
features = df_ml.columns.drop(columns_to_drop).tolist()

In [None]:
# Training target (the categorical label "R1M_Usd_C") and feature matrix.
y_train = df_train["R1M_Usd_C"]
X_train = df_train.loc[:, features]

In [None]:
# Test target (the categorical label "R1M_Usd_C") and feature matrix.
y_test = df_test["R1M_Usd_C"]
X_test = df_test.loc[:, features]

In [None]:
# Read the selected feature names from the "feature" column of the CSV and
# keep them as a plain Python list for column indexing.
df_features_selected = pd.read_csv("../data/features_selected.csv")
features_selected = df_features_selected["feature"].tolist()

## Cross-Validating Logistic Regression on All Features

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validate a default-configured logistic regression on every feature.
# cv=5 spells out scikit-learn's default number of folds; the estimator's
# default scorer is used.
model = LogisticRegression()
cv_scores_all_features = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=5,
)
cv_scores_all_features

array([0.51434416, 0.52189505, 0.52598616, 0.51686954, 0.51346028])

In [None]:
# Average score across the cross-validation folds (all features).
np.mean(cv_scores_all_features)

0.5185110359109046

## Cross-Validating Logistic Regression on Selected Features

In [None]:
# Same cross-validation as for the full feature set, restricted to the
# columns listed in features_selected. cv=5 spells out scikit-learn's default.
model = LogisticRegression()
cv_scores_selected_features = cross_val_score(
    estimator=model,
    X=X_train.loc[:, features_selected],
    y=y_train,
    cv=5,
)
cv_scores_selected_features

array([0.51482398, 0.51984949, 0.52025355, 0.51899086, 0.51343502])

In [None]:
# Average score across the cross-validation folds (selected features).
np.mean(cv_scores_selected_features)

0.5174705793221881

## Final Fitting to All Features

In [None]:
# Fit a default logistic regression on the full training set with every
# feature. fit() returns the estimator itself, so it can be chained; the
# trailing expression keeps the fitted estimator as the cell's display value.
model = LogisticRegression().fit(X_train, y_train)
model

In [None]:
# Predicted class label for every row of the test set (all-features model).
hard_predictions = model.predict(X=X_test)
hard_predictions

array([1., 1., 1., ..., 0., 0., 0.])

In [None]:
# Run probability inference once and slice both class columns from the same
# matrix; the original called predict_proba twice on the full test set.
# predict_proba returns one column per class, ordered as in model.classes_.
# NOTE(review): the "_0"/"_1" suffixes assume model.classes_ is [0, 1] — confirm.
test_probabilities = model.predict_proba(X_test)
prob_predict_0 = test_probabilities[:, 0]
prob_predict_1 = test_probabilities[:, 1]

In [None]:
# Gather the hard labels and both class probabilities into one table with a
# fresh default integer index (all-features model).
inference_columns = dict(
    hard=hard_predictions,
    probability_0=prob_predict_0,
    probability_1=prob_predict_1,
)
df_inference_all_features = pd.DataFrame(inference_columns)
df_inference_all_features

Unnamed: 0,hard,probability_0,probability_1
0,1.0,0.470104,0.529896
1,1.0,0.470435,0.529565
2,1.0,0.470004,0.529996
3,1.0,0.459854,0.540146
4,1.0,0.457712,0.542288
...,...,...,...
70484,0.0,0.541795,0.458205
70485,0.0,0.540781,0.459219
70486,0.0,0.508696,0.491304
70487,0.0,0.506801,0.493199


In [None]:
# Persist the all-features predictions; the row index is not written.
df_inference_all_features.to_csv(
    path_or_buf="../data/inference_logistic_regression_all_features.csv",
    index=False,
)

## Final Fitting to Selected Features

In [None]:
# Fit a default logistic regression on the full training set, restricted to
# the columns in features_selected. fit() returns the estimator itself; the
# trailing expression keeps the fitted estimator as the cell's display value.
model = LogisticRegression().fit(X_train.loc[:, features_selected], y_train)
model

In [None]:
# Predicted class label for every row of the test set (selected-features model).
hard_predictions = model.predict(X=X_test.loc[:, features_selected])
hard_predictions

array([1., 1., 1., ..., 0., 0., 0.])

In [None]:
# Subset the test features and run probability inference once, then slice
# both class columns from the same matrix; the original repeated the column
# subsetting and the predict_proba call for each class.
# predict_proba returns one column per class, ordered as in model.classes_.
# NOTE(review): the "_0"/"_1" suffixes assume model.classes_ is [0, 1] — confirm.
selected_test_probabilities = model.predict_proba(X_test[features_selected])
prob_predict_0 = selected_test_probabilities[:, 0]
prob_predict_1 = selected_test_probabilities[:, 1]

In [None]:
# Gather the hard labels and both class probabilities into one table with a
# fresh default integer index (selected-features model).
inference_columns = dict(
    hard=hard_predictions,
    probability_0=prob_predict_0,
    probability_1=prob_predict_1,
)
df_inference_selected_features = pd.DataFrame(inference_columns)
df_inference_selected_features

Unnamed: 0,hard,probability_0,probability_1
0,1.0,0.469486,0.530514
1,1.0,0.469902,0.530098
2,1.0,0.468476,0.531524
3,1.0,0.475378,0.524622
4,1.0,0.471177,0.528823
...,...,...,...
70484,0.0,0.527576,0.472424
70485,0.0,0.531423,0.468577
70486,0.0,0.516458,0.483542
70487,0.0,0.509134,0.490866


In [None]:
# Persist the selected-features predictions; the row index is not written.
df_inference_selected_features.to_csv(
    path_or_buf="../data/inference_logistic_regression_selected_features.csv",
    index=False,
)