## Goal
Analyse spurious era to remove from training.

## 2. Methodology
Create two sets of data.

* Full

* Subsampled - Take 10% of data from each era.

### 2.1 Data exploration
* Look at data size by era
* Look at number of NAs by era
* Look at target value distribution by era

### 2.2 Model training
* Split the validation test into equal 2 parts. test1, test2.
* Subsample every 4th era, bin eras in era-bin-sz of 50 [1-50, 50-100] etc.
* Along the way I plan to save the feature importance values as this will be critical for the next exploration.
* With each bin in the train set, train a model and then predict against a single validation bin, ie., test1.
* I will remove then remove various proportions of era bins based on performance and select the best proportion to remove based on perf on test1.
* I will then report final metrics on test2.

## 3. Observations

### Num stocks by era
1. Feels like until era 75, the data is suspiciously small.
2. Starting era 150 though we have had almost a stable number of stocks.
3. Era 300-350, there is some interesting stuff happening here where the number of stocks has taken a nose dive.

### Num of NA%
Only Jerome has NAs



In [319]:
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))
import numpy as np
import datetime
from pprint import pprint, pformat
import utils as ut
import json
import os
import os.path
import pickle



In [248]:
TEST_MODE = True

if TEST_MODE:
    ML_TRACKING_SERVER_URI = "http://127.0.0.1:5000"
    AWS_CREDENTIALS_FILE = "~/.aws/personal_credentials"
    DATA_PATH = "./data/"
else:
    ML_TRACKING_SERVER_URI = "http://18.218.213.146:5500/"
    AWS_CREDENTIALS_FILE = "~/.aws/credentials"
    DATA_PATH = "/numerai/data/"
DATASET_S3_PATH = "s3://numerai-v1/dataset/v4.1/"
OUTPUT_S3_PATH = "s3://numerai-v1/experiments/"
FILES_TO_DOWNLOAD = [
    "train",
    "splits/test_p0-50",
    "splits/test_p50-100",
    #"splits/traintest_p0-100",
    "features.json",
]
DATE_COL = "date"




In [298]:
targets = [
    "target_nomi_v4_20",
    "target_jerome_v4_60",
    "target_ralph_v4_20",
    "target_tyler_v4_20",
    "target_victor_v4_20",
    "target_waldo_v4_20",
]
parameters_test = {
    "sample_every_nth": 1,
    "int8": True,
    "features": "small",
    "impute": True,
    "lgbm_setting": "sm_lgbm",
    "lgbm_params": {
        "n_estimators": 2000,
        "learning_rate": 0.01,
        "max_depth": 5,
        "num_leaves": 2 ** 5,
        "colsample_bytree": 0.1
    },
}    
parameters_full = {
    "sample_every_nth": 4,
    "int8": True,
    "features": None,
    "impute": True,
    "lgbm_setting": "lg_lgbm_10kest_every4_lr0.005_int8",
    "lgbm_params": {
        "n_estimators": 10_000,
        "learning_rate": 0.005,
        "max_depth": 6,
        "num_leaves": 2**6,
        "colsample_bytree": 0.1,
        "n_jobs": -1,
    },
}
if TEST_MODE:
    parameters = parameters_test.copy()
else:
    parameters = parameters_full.copy()
st_time = datetime.datetime.now().strftime("%Y-%m-%d_%Hh-%Mm")
parameters.update(
    {
        "ensembling": "average",
        "targets": targets,
        "cv": 2,
        "embargo": 12,
        "expt_name": f"era_analysis__{st_time}"
    }
)

log_dir = os.path.join(DATA_PATH, "experiments", parameters["expt_name"])
os.makedirs(log_dir, exist_ok=True)
print(f"Making log dir {log_dir}")
log = ut.Logger(root_dir=log_dir)
log.info(f"Era bin size {ERA_BIN_SZ}")
log.info(json.dumps(parameters, sort_keys=True, indent=4))

####################################
# ERA BIN STUFF
ERA_BIN_SZ = 50
ERA_BIN_COL = f"era_bin{ERA_BIN_SZ}"

# This fractions of era bins will be included for testing against an unseen
# portion of validation (test_p50-100).
# The era bins will first be sorted by the sharpe values of the models trained
# with them against another portion of validation data (test_p0-50).
ERA_BIN_PROPORTIONS = [0.8, 0.9, 0.95, 1.]
log.info(f"{ERA_BIN_COL=}, {ERA_BIN_PROPORTIONS=}")

Making log dir ./data/experiments/era_analysis__2023-04-20_16h-56m
[2023-04-20 16:56:46]  Era bin size 50
[2023-04-20 16:56:46]  {
    "cv": 2,
    "embargo": 12,
    "ensembling": "average",
    "expt_name": "era_analysis__2023-04-20_16h-56m",
    "features": "small",
    "impute": true,
    "int8": true,
    "lgbm_params": {
        "colsample_bytree": 0.1,
        "learning_rate": 0.01,
        "max_depth": 5,
        "n_estimators": 2000,
        "num_leaves": 32
    },
    "lgbm_setting": "sm_lgbm",
    "sample_every_nth": 1,
    "targets": [
        "target_nomi_v4_20",
        "target_jerome_v4_60",
        "target_ralph_v4_20",
        "target_tyler_v4_20",
        "target_victor_v4_20",
        "target_waldo_v4_20"
    ]
}
[2023-04-20 16:56:46]  ERA_BIN_COL='era_bin50', ERA_BIN_PROPORTIONS=[0.8, 0.9, 0.95, 1.0]


In [None]:
%load_ext autoreload
%autoreload 2

import warnings
import flatdict
import pandas as pd
import os
import os.path
import mlflow
import gc
import functools
import plotly
import plotly.express as px
import plotly.offline as pyo
import cufflinks as cf
import datetime

import nmr_utils 

from lightgbm import LGBMRegressor
from tqdm.notebook import tqdm

from nmr_utils import (
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
)

pyo.init_notebook_mode()
cf.go_offline()
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option("display.max_rows", 100)

DF = pd.DataFrame
mlflow.lightgbm.autolog()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [251]:
from mlflow.utils.autologging_utils import _logger

warnings.filterwarnings("ignore", message="Inferred schema contains integer column", module="mlflow.models.signature")
_logger.disabled = True

# Filter the setuptools UserWarning until we stop relying on distutils
warnings.filterwarnings(
    "ignore",
    message="Setuptools is replacing distutils.",
    category=UserWarning,
    module="_distutils_hack",
)



In [252]:
from importlib import reload  # Not needed in Python 2
import logging
reload(logging)
import logging
logging.basicConfig(level=logging.INFO)



## 1. Data loading
### 1.1 Download data

In [18]:
ALL_EXPT_ROOT_DIR = os.path.join(DATA_PATH, "experiments/")
EXPT_SAVE_ROOTDIR = os.path.join(DATA_PATH, "experiments/", parameters["expt_name"])
MODEL_DIR = os.path.join(EXPT_SAVE_ROOTDIR, "models/")
ARTIFACTS_DIR = os.path.join(EXPT_SAVE_ROOTDIR, "artifacts/")
for fld in (EXPT_SAVE_ROOTDIR, MODEL_DIR, ARTIFACTS_DIR):
    os.makedirs(fld, exist_ok=True)
    log.info(f"Making folder: {fld}")
PARAMS_FL = os.path.join(EXPT_SAVE_ROOTDIR, "params.json")
ut.save_json(obj=parameters, fl=PARAMS_FL)
log.info(f"saving fl: {PARAMS_FL}")

[2023-04-20 14:20:54]  Making folder: ./data/experiments/era_analysis__2023-04-20_14h-20m
[2023-04-20 14:20:54]  Making folder: ./data/experiments/era_analysis__2023-04-20_14h-20m/models/
[2023-04-20 14:20:54]  Making folder: ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/
[2023-04-20 14:20:54]  saving fl: ./data/experiments/era_analysis__2023-04-20_14h-20m/params.json


In [6]:
ut.download_fls(
    data_path=DATA_PATH,
    s3_path=DATASET_S3_PATH,
    s3_files=FILES_TO_DOWNLOAD,
    aws_credential_fl=AWS_CREDENTIALS_FILE,
    int8=parameters["int8"],
)

[2023-04-20 14:20:02]  Downloading: train_int8.parquet
[2023-04-20 14:20:02]  Loading aws credenitals from ~/.aws/personal_credentials...
[2023-04-20 14:20:02]  Would have downloaded s3://numerai-v1/dataset/v4.1/train_int8.parquet to ./data/train_int8.parquet. But ./data/train_int8.parquet exists. Will not download again ...
[2023-04-20 14:20:02]  Downloading: splits/test_p0-50_int8.parquet
[2023-04-20 14:20:02]  Loading aws credenitals from ~/.aws/personal_credentials...
[2023-04-20 14:20:02]  Would have downloaded s3://numerai-v1/dataset/v4.1/splits/test_p0-50_int8.parquet to ./data/test_p0-50_int8.parquet. But ./data/test_p0-50_int8.parquet exists. Will not download again ...
[2023-04-20 14:20:02]  Downloading: splits/test_p50-100_int8.parquet
[2023-04-20 14:20:02]  Loading aws credenitals from ~/.aws/personal_credentials...
[2023-04-20 14:20:02]  Would have downloaded s3://numerai-v1/dataset/v4.1/splits/test_p50-100_int8.parquet to ./data/test_p50-100_int8.parquet. But ./data/test_

### 1.2 Load the data into memory

In [7]:
read_cols = ut.build_cols_to_read(
    feature_json_fl=os.path.join(DATA_PATH, "features.json"),
    feature_set_name=parameters["features"],
)
features = [c for c in read_cols if c.startswith("feature")]
log.info(f"{len(read_cols)=}, {len(features)=}")

[2023-04-20 14:20:03]  len(read_cols)=63, len(features)=32


In [8]:
load_df = ut.get_df_loader(
    data_path=DATA_PATH,
    features=features,
    read_cols=read_cols,
    impute=parameters["impute"],
    int8=parameters["int8"],
    sample_every_nth=parameters["sample_every_nth"],
)



In [9]:
def print_df_info(df, name=""):
    log.info(f"{name} info: {df.shape=}, {df.era.nunique()=}")



In [10]:
def group_eras_into_bins(df, biz_sz=ERA_BIN_SZ, era_col=ERA_COL):
    """Groups eras into bins of biz_sz eras each."""
    eras = df[era_col].unique()
    era_bins = np.array_split(eras, len(eras) / biz_sz)
    era_bin_map = {era: i for i, era_bin in enumerate(era_bins, start=1) for era in era_bin}
    return df[era_col].map(era_bin_map)



In [11]:
int8_suf = "_int8" if parameters["int8"] else ""
df_map = {}
for ident, fl, name in [
    ("train", f"train{int8_suf}.parquet", "Train"),
    ("test1", f"test_p0-50{int8_suf}.parquet", "Test 1"),
    ("test2", f"test_p50-100{int8_suf}.parquet", "Test 2"),
]:
    df_map[ident] = df = load_df(fl_name=fl)
    df[ERA_BIN_COL] = group_eras_into_bins(df=df, biz_sz=ERA_BIN_SZ)
    era_date_map = ut.get_era_to_date(df.era.unique())
    df[DATE_COL] = df.era.map(era_date_map)
    print_df_info(df=df, name=name)

[2023-04-20 14:20:07]  Train info: df.shape=(2420521, 65), df.era.nunique()=574
[2023-04-20 14:20:09]  Test 1 info: df.shape=(1215787, 65), df.era.nunique()=241
[2023-04-20 14:20:11]  Test 2 info: df.shape=(1251330, 65), df.era.nunique()=242


In [12]:
train_df, test1_df, test2_df = df_map["train"], df_map["test1"], df_map["test2"]



## 2. Data exploration

In [14]:
mlflow.set_tracking_uri(ML_TRACKING_SERVER_URI)



In [15]:
train_tgt_df = train_df[[ERA_COL] + targets]
train_era_grp = train_tgt_df.groupby(ERA_COL)



### 2.1 Num stocks in each era

In [16]:
fig_era_cnt = train_era_grp.size().iplot(
    kind="bar",
    layout={
        "xaxis": {"title": "Era"},
        "yaxis": {"title": "Number of stocks"},
        "title": "Number of stocks in each era",
    },
    asFigure=True,
)
fig_era_cnt



In [17]:
fig_date_cnt = train_df.groupby(DATE_COL).size().iplot(
    kind="bar",
    layout={
        "xaxis": {"title": "Approx era date"},
        "yaxis": {"title": "Number of stocks"},
        "title": "Number of stocks in each era (approx date)",
    },
    asFigure=True,
)
fig_date_cnt



In [19]:
fig_era_cnt.write_html(os.path.join(ARTIFACTS_DIR, "era_stock_cnt.html"))
fig_date_cnt.write_html(os.path.join(ARTIFACTS_DIR, "era_date_cnt.html"))



### 2.2 Num of NAs in each era

In [327]:
all_targets = [c for c in train_df.columns if c.startswith('target')]
all_targets

['target',
 'target_nomi_v4_20',
 'target_nomi_v4_60',
 'target_tyler_v4_20',
 'target_tyler_v4_60',
 'target_victor_v4_20',
 'target_victor_v4_60',
 'target_ralph_v4_20',
 'target_ralph_v4_60',
 'target_waldo_v4_20',
 'target_waldo_v4_60',
 'target_jerome_v4_20',
 'target_jerome_v4_60',
 'target_janet_v4_20',
 'target_janet_v4_60',
 'target_ben_v4_20',
 'target_ben_v4_60',
 'target_alan_v4_20',
 'target_alan_v4_60',
 'target_paul_v4_20',
 'target_paul_v4_60',
 'target_george_v4_20',
 'target_george_v4_60',
 'target_william_v4_20',
 'target_william_v4_60',
 'target_arthur_v4_20',
 'target_arthur_v4_60',
 'target_thomas_v4_20',
 'target_thomas_v4_60']



In [None]:
fig_na_era = (
    train_df.groupby("era")
    .apply(lambda x: x[all_targets].isna().sum() / len(x) * 100.)
).iplot(
    kind="line",
    layout={
        "xaxis": {"title": "Era"},
        "yaxis": {"title": "Target NaN Percent"},
        "title": "Percent of NaN targets in each era",
    },
    asFigure=True,
)
fig_na_era

In [331]:
eranan_fl = os.path.join(ARTIFACTS_DIR, "era_nan_pct.html")
fig_na_era.write_html(eranan_fl)
log.info(f"Saving file {eranan_fl}")

[2023-04-20 17:22:15]  Saving file ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/era_nan_pct.html


In [29]:
analysis_expt_name = parameters["expt_name"] + "_data_exploration1"
try:
    expt_id = mlflow.create_experiment(name=analysis_expt_name)
except Exception:
    expt_id = mlflow.get_experiment_by_name(name=analysis_expt_name)
with mlflow.start_run(run_name="era-exploration", experiment_id=expt_id) as run:
    mlflow.log_params(params=parameters)
    mlflow.log_artifacts(local_dir=ARTIFACTS_DIR)



### 2.3 Target distribution by era

In [30]:
def erawise_target_value_percent(df, target) -> pd.DataFrame:
    """Returns a dataframe with the target value percents for each era."""
    value_counts = (
        df.groupby("era")[target]
        .value_counts(normalize=True)
        .rename("percent")
        .reset_index()
    )
    # Pivot the resulting dataframe so that the index is "era", the columns
    # are "target", and the values are "percent".
    return 100.0 * value_counts.pivot(index="era", columns=target, values="percent")



In [31]:
erawise_target_value_percent(train_df, target=targets[1])

target_jerome_v4_60,0.00,0.25,0.50,0.75,1.00
era,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0001,4.847310,20.116335,49.927290,20.067862,5.041202
0002,4.982206,20.017794,50.000000,19.973310,5.026690
0003,4.991394,20.180723,49.612737,20.309811,4.905336
0004,4.981164,20.050230,50.020929,19.924655,5.023022
0005,5.010438,20.000000,50.020877,20.041754,4.926931
...,...,...,...,...,...
0570,4.966471,19.991618,50.041911,19.991618,5.008382
0571,5.017771,19.966548,50.010454,19.987456,5.017771
0572,5.078699,19.937041,49.947534,20.020986,5.015740
0573,5.057712,19.979014,49.968520,19.979014,5.015740




#### Observation, seems like for all eras, the distribution of the target values is always the same (regardless of the target).

## 2. Model training

In [34]:
if TEST_MODE:
    frac = 0.01
    train_mdl_df = ut.sample_within_eras(train_df, frac=frac)
    print_df_info(train_mdl_df, name="Train small")
else:
    train_mdl_df = train_df
print_df_info(train_mdl_df, name="Data used for training")

[2023-04-20 14:26:28]  Data used for training info: df.shape=(24198, 65), df.era.nunique()=574


In [35]:
log.info("Ensuring that the data looks right!")
train_mdl_df[['era', 'era_bin50']].sample(5)

[2023-04-20 14:26:31]  Ensuring that the data looks right!


Unnamed: 0_level_0,Unnamed: 1_level_0,era,era_bin50
era,id,Unnamed: 2_level_1,Unnamed: 3_level_1
310,nc6c56e224c571b8,310,6
102,nd43e5f1a32d7691,102,2
99,nb6c546fa17e184c,99,2
423,n393bd5c1df36ff5,423,9
175,nd805ba0ed8ec466,175,4




In [39]:
mlflow.lightgbm.autolog()
bin_mdl_expt_id = mlflow.create_experiment(name=parameters["expt_name"] + "_erabin_models")



### 2.1 Build one model per era bin

In [36]:
def build_model_for_each_erabin(
    train_df,
    test_df: pd.DataFrame,
    era_bin_col,
    features: list[str],
    train_ident: str,
    target: str,
    model_rootdir: str,
    params: dict,
    expt_id: str,
):
    """Builds a model for each era bin."""
    # Build a model for each era bin.
    era_bins = train_df[era_bin_col].unique()
    bin_model_data_map = {}
    for era_bin in tqdm(era_bins, desc="Training model for era bin"):
        # Get the train and test data for this era bin.
        era_bin_train_df = train_df[train_df[era_bin_col] == era_bin]
        log.info(
            f"Building model for era bin: {era_bin}\n"
            f"Size of train set {len(era_bin_train_df):,}"
        )
        bin_model_data_map[era_bin] = ut.build_model_for_target(
            train_df=era_bin_train_df,
            test_df=test_df,
            features=features,
            train_ident=train_ident,
            target=target,
            model_rootdir=model_rootdir,
            params=params,
            expt_id=expt_id,
            suffix=f"_erabin{era_bin}_binsz{ERA_BIN_SZ}",
        )
    return bin_model_data_map



In [41]:
bin_model_data_map = build_model_for_each_erabin(
    train_df=train_mdl_df,
    test_df=test1_df,
    era_bin_col=ERA_BIN_COL,
    features=features,
    train_ident="train_int8",
    target=TARGET_COL,
    model_rootdir=MODEL_DIR,
    params=parameters,
    expt_id=bin_mdl_expt_id,
)
log.info(f"Target col {TARGET_COL}")

[2023-04-20 14:28:11]  Building model for era bin: 1
[2023-04-20 14:28:11]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin1_binsz50_52e08c'
[2023-04-20 14:28:11]  Creating new model...




[2023-04-20 14:28:17]  saving new model: train_int8_target_nomi_v4_20_erabin1_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin1_binsz50_52e08c'.
2023/04/20 14:28:30 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin1_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin1_binsz50_52e08c'.


[2023-04-20 14:28:30]  Logged model=train_int8_target_nomi_v4_20_erabin1_binsz50_52e08c, version=1
[2023-04-20 14:28:30]  Building model for era bin: 2
[2023-04-20 14:28:30]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin2_binsz50_52e08c'
[2023-04-20 14:28:30]  Creating new model...




[2023-04-20 14:28:35]  saving new model: train_int8_target_nomi_v4_20_erabin2_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin2_binsz50_52e08c'.
2023/04/20 14:28:48 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin2_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin2_binsz50_52e08c'.


[2023-04-20 14:28:48]  Logged model=train_int8_target_nomi_v4_20_erabin2_binsz50_52e08c, version=1
[2023-04-20 14:28:48]  Building model for era bin: 3
[2023-04-20 14:28:48]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin3_binsz50_52e08c'
[2023-04-20 14:28:48]  Creating new model...




[2023-04-20 14:28:53]  saving new model: train_int8_target_nomi_v4_20_erabin3_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin3_binsz50_52e08c'.
2023/04/20 14:29:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin3_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin3_binsz50_52e08c'.


[2023-04-20 14:29:07]  Logged model=train_int8_target_nomi_v4_20_erabin3_binsz50_52e08c, version=1
[2023-04-20 14:29:07]  Building model for era bin: 4
[2023-04-20 14:29:07]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin4_binsz50_52e08c'
[2023-04-20 14:29:07]  Creating new model...




[2023-04-20 14:29:12]  saving new model: train_int8_target_nomi_v4_20_erabin4_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin4_binsz50_52e08c'.
2023/04/20 14:29:25 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin4_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin4_binsz50_52e08c'.


[2023-04-20 14:29:25]  Logged model=train_int8_target_nomi_v4_20_erabin4_binsz50_52e08c, version=1
[2023-04-20 14:29:25]  Building model for era bin: 5
[2023-04-20 14:29:25]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin5_binsz50_52e08c'
[2023-04-20 14:29:25]  Creating new model...




[2023-04-20 14:29:30]  saving new model: train_int8_target_nomi_v4_20_erabin5_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin5_binsz50_52e08c'.
2023/04/20 14:29:44 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin5_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin5_binsz50_52e08c'.


[2023-04-20 14:29:44]  Logged model=train_int8_target_nomi_v4_20_erabin5_binsz50_52e08c, version=1
[2023-04-20 14:29:44]  Building model for era bin: 6
[2023-04-20 14:29:44]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin6_binsz50_52e08c'
[2023-04-20 14:29:44]  Creating new model...




[2023-04-20 14:29:48]  saving new model: train_int8_target_nomi_v4_20_erabin6_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin6_binsz50_52e08c'.
2023/04/20 14:30:02 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin6_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin6_binsz50_52e08c'.


[2023-04-20 14:30:02]  Logged model=train_int8_target_nomi_v4_20_erabin6_binsz50_52e08c, version=1
[2023-04-20 14:30:02]  Building model for era bin: 7
[2023-04-20 14:30:02]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin7_binsz50_52e08c'
[2023-04-20 14:30:02]  Creating new model...




[2023-04-20 14:30:07]  saving new model: train_int8_target_nomi_v4_20_erabin7_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin7_binsz50_52e08c'.
2023/04/20 14:30:19 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin7_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin7_binsz50_52e08c'.


[2023-04-20 14:30:19]  Logged model=train_int8_target_nomi_v4_20_erabin7_binsz50_52e08c, version=1
[2023-04-20 14:30:19]  Building model for era bin: 8
[2023-04-20 14:30:19]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin8_binsz50_52e08c'
[2023-04-20 14:30:19]  Creating new model...




[2023-04-20 14:30:24]  saving new model: train_int8_target_nomi_v4_20_erabin8_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin8_binsz50_52e08c'.
2023/04/20 14:30:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin8_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin8_binsz50_52e08c'.


[2023-04-20 14:30:37]  Logged model=train_int8_target_nomi_v4_20_erabin8_binsz50_52e08c, version=1
[2023-04-20 14:30:37]  Building model for era bin: 9
[2023-04-20 14:30:37]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin9_binsz50_52e08c'
[2023-04-20 14:30:37]  Creating new model...




[2023-04-20 14:30:42]  saving new model: train_int8_target_nomi_v4_20_erabin9_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin9_binsz50_52e08c'.
2023/04/20 14:30:55 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin9_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin9_binsz50_52e08c'.


[2023-04-20 14:30:55]  Logged model=train_int8_target_nomi_v4_20_erabin9_binsz50_52e08c, version=1
[2023-04-20 14:30:55]  Building model for era bin: 10
[2023-04-20 14:30:55]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin10_binsz50_52e08c'
[2023-04-20 14:30:55]  Creating new model...




[2023-04-20 14:31:00]  saving new model: train_int8_target_nomi_v4_20_erabin10_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin10_binsz50_52e08c'.
2023/04/20 14:31:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin10_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin10_binsz50_52e08c'.


[2023-04-20 14:31:14]  Logged model=train_int8_target_nomi_v4_20_erabin10_binsz50_52e08c, version=1
[2023-04-20 14:31:14]  Building model for era bin: 11
[2023-04-20 14:31:14]  Checking for existing model 'train_int8_target_nomi_v4_20_erabin11_binsz50_52e08c'
[2023-04-20 14:31:14]  Creating new model...




[2023-04-20 14:31:20]  saving new model: train_int8_target_nomi_v4_20_erabin11_binsz50_52e08c


Successfully registered model 'train_int8_target_nomi_v4_20_erabin11_binsz50_52e08c'.
2023/04/20 14:31:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabin11_binsz50_52e08c, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabin11_binsz50_52e08c'.


[2023-04-20 14:31:34]  Logged model=train_int8_target_nomi_v4_20_erabin11_binsz50_52e08c, version=1
[2023-04-20 14:31:34]  Target col target_nomi_v4_20


In [165]:
model_path = MODEL_DIR+"/train_int8"
mld_fls = os.listdir(model_path)
log.info(f"Created model files in {model_path} \nnum files = {len(mld_fls)}\n{mld_fls}")

[2023-04-20 15:34:24]  Created model files in ./data/experiments/era_analysis__2023-04-20_14h-20m/models//train_int8 
num files = 11
['train_int8_target_nomi_v4_20_erabin3_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin6_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin10_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin5_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin8_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin11_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin4_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin9_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin7_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin1_binsz50_52e08c.pkl', 'train_int8_target_nomi_v4_20_erabin2_binsz50_52e08c.pkl']


### 2.2 Look at the metrics for the trained erabin models

In [241]:
def to_metrics_df(model_data_map):
    res_df = pd.DataFrame(model_data_map).transpose()
    return pd.concat(res_df.loc[:, "validation_stats"].tolist(), axis=0)



In [242]:
eb_metrics_df = to_metrics_df(bin_model_data_map)
eb_metrics_df.index = eb_metrics_df.index.to_series().apply(lambda r: r[34:]+"_nomi")
eb_metrics_styled = ut.fmt_metrics_df(eb_metrics_df)
eb_metrics_styled

Unnamed: 0,mean,std,sharpe,max_drawdown,apy
erabin1_binsz50_nomi,-0.07%,1.92%,-3.80%,-45.41%,-438.49%
erabin2_binsz50_nomi,0.35%,1.72%,20.39%,-24.48%,1781.78%
erabin3_binsz50_nomi,0.25%,1.49%,17.09%,-20.46%,1267.32%
erabin4_binsz50_nomi,0.23%,1.57%,14.49%,-16.70%,1112.70%
erabin5_binsz50_nomi,1.00%,1.63%,61.29%,-13.96%,6174.78%
erabin6_binsz50_nomi,0.36%,1.76%,20.29%,-23.24%,1816.96%
erabin7_binsz50_nomi,0.87%,1.93%,44.94%,-22.61%,5130.24%
erabin8_binsz50_nomi,0.57%,1.71%,33.05%,-13.12%,3095.84%
erabin9_binsz50_nomi,0.41%,1.86%,22.20%,-20.53%,2139.88%
erabin10_binsz50_nomi,0.27%,1.60%,16.81%,-23.22%,1334.94%




In [117]:
comparison_dir = os.path.join(ARTIFACTS_DIR, "comparison")
os.makedirs(comparison_dir, exist_ok=True)
log.info(f"Making {comparison_dir}")

[2023-04-20 15:13:04]  Making ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison


In [118]:
eb_metrics_tbl_fl = os.path.join(comparison_dir, "erabin_metrics_tbl.html")
eb_metrics_styled.to_html(eb_metrics_tbl_fl)
log.info(f"Saving {eb_metrics_tbl_fl}")

[2023-04-20 15:13:06]  Saving ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/erabin_metrics_tbl.html


In [None]:
def iplot_ebmetrics(erabin_metrics_df, annotate_bin=True):
    # Create a line chart with two y-axes on either side
    subset = erabin_metrics_df[["sharpe", "mean"]]
    fig_metric = subset.iplot(kind='line', title='Metrics', secondary_y=['mean'], asFigure=True)
    for _bin, (ix, sharpe, mean) in enumerate(zip(subset.index, subset.sharpe, subset["mean"]), start=1):
        bin_pfx = f"bin={_bin}, " if annotate_bin else ""
        fig_metric.add_annotation(
            x=ix,
            y=sharpe,
            text=f"{bin_pfx}sharpe={round(sharpe,3)}",
            showarrow=True,
            arrowhead=0
        )
    # Display the chart
    pyo.iplot(fig_metric)
    return fig_metric
    
fig_eb_mdl_metric = iplot_ebmetrics(eb_metrics_df)

In [259]:
eb_metrics_iplot_fl = os.path.join(comparison_dir, "erabin_metrics_plot.html")
fig_eb_mdl_metric.write_html(eb_metrics_iplot_fl)
log.info(f"Saving {eb_metrics_iplot_fl}")

[2023-04-20 16:29:18]  Saving ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/erabin_metrics_plot.html


### 2.3 Filter out various proportions of era bins with bad sharpe and see if that improves performance

In [299]:
def get_erabin_from_name(name):
    # convert names like `erabin1_binsz50_nomi` -> 1 `erabin12_binsz50_nomi` -> 12
    return int(name.split("erabin")[1].split("_")[0])

eb_metrics_df["era_bin"] = eb_metrics_df.index.to_series().apply(get_erabin_from_name)



In [300]:
# sort the era bins in descending order of sharpe so that we use various proportions
# of them in training
sorted_erabins = eb_metrics_df.sort_values(by="sharpe", ascending=False).era_bin.to_list()
sorted_erabins

[5, 7, 8, 9, 11, 2, 6, 3, 10, 4, 1]



In [301]:
erabin_set_map = {
    ebp: sorted_erabins[:round(len(sorted_erabins)*ebp)]
    for ebp in ERA_BIN_PROPORTIONS    
}
log.info(f"These era bin sets will be used for training and compared against test2 \n{pformat(erabin_set_map)}")

[2023-04-20 16:57:03]  These era bin sets will be used for training and compared against test2 
{0.8: [5, 7, 8, 9, 11, 2, 6, 3, 10],
 0.9: [5, 7, 8, 9, 11, 2, 6, 3, 10, 4],
 0.95: [5, 7, 8, 9, 11, 2, 6, 3, 10, 4],
 1.0: [5, 7, 8, 9, 11, 2, 6, 3, 10, 4, 1]}


In [302]:
def build_model_for_erabinset(
    train_df,
    test_df: pd.DataFrame,
    era_bin_col: str,
    erabin_set_map: dict[float, list[int]],
    features: list[str],
    train_ident: str,
    target: str,
    model_rootdir: str,
    params: dict,
    expt_id: str,
):
    """Builds a model for each era bin set.
    
    :param erabin_sets: list of erabin to use.
    """
    binset_model_data_map = {}
    for binset_prop, erabin_set  in tqdm(erabin_set_map.items(), desc="Training model for each erabin_set"):
        # Get the train and test data for this era bin.
        eb_set_srted = erabin_set
        erabin_set = set(erabin_set)
        era_binset_train_df = train_df[train_df[era_bin_col].isin(erabin_set)]
        log.info(
            f"Building model for erabin set of prop {binset_prop} with eras {eb_set_srted}\n"
            f"Size of train set {len(era_binset_train_df):,}"
        )
        binset_model_data_map[binset_prop] = ut.build_model_for_target(
            train_df=era_binset_train_df,
            test_df=test_df,
            features=features,
            train_ident=train_ident,
            target=target,
            model_rootdir=model_rootdir,
            params=params,
            expt_id=expt_id,
            suffix=f"_erabinsetprop{binset_prop}_binsz{ERA_BIN_SZ}",
        )
    return binset_model_data_map



In [303]:
ebset_mdl_expt_id = mlflow.create_experiment(parameters["expt_name"]+"_erabinset_models")



In [304]:
binset_model_data_map = build_model_for_erabinset(
    train_df=train_mdl_df,
    test_df=test1_df,
    era_bin_col=ERA_BIN_COL,
    erabin_set_map=erabin_set_map,
    features=features,
    train_ident="train_int8",
    target=TARGET_COL,
    model_rootdir=MODEL_DIR,
    params=parameters,
    expt_id=ebset_mdl_expt_id,
)

Training model for each erabin_set:   0%|          | 0/4 [00:00<?, ?it/s]

[2023-04-20 16:57:09]  Building model for erabin set of prop 0.8 with eras [5, 7, 8, 9, 11, 2, 6, 3, 10]
Size of train set 20,457
[2023-04-20 16:57:09]  Checking for existing model 'train_int8_target_nomi_v4_20_erabinsetprop0.8_binsz50_57306f'
[2023-04-20 16:57:09]  Creating new model...
[2023-04-20 16:57:16]  saving new model: train_int8_target_nomi_v4_20_erabinsetprop0.8_binsz50_57306f
[2023-04-20 16:57:16]  Training time: 0.12 minutes


Successfully registered model 'train_int8_target_nomi_v4_20_erabinsetprop0.8_binsz50_57306f'.
2023/04/20 16:57:30 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabinsetprop0.8_binsz50_57306f, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabinsetprop0.8_binsz50_57306f'.


[2023-04-20 16:57:30]  Logged model=train_int8_target_nomi_v4_20_erabinsetprop0.8_binsz50_57306f, version=1
[2023-04-20 16:57:30]  Building model for erabin set of prop 0.9 with eras [5, 7, 8, 9, 11, 2, 6, 3, 10, 4]
Size of train set 22,694
[2023-04-20 16:57:30]  Checking for existing model 'train_int8_target_nomi_v4_20_erabinsetprop0.9_binsz50_57306f'
[2023-04-20 16:57:30]  Creating new model...
[2023-04-20 16:57:37]  saving new model: train_int8_target_nomi_v4_20_erabinsetprop0.9_binsz50_57306f
[2023-04-20 16:57:37]  Training time: 0.11 minutes


Successfully registered model 'train_int8_target_nomi_v4_20_erabinsetprop0.9_binsz50_57306f'.
2023/04/20 16:57:50 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabinsetprop0.9_binsz50_57306f, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabinsetprop0.9_binsz50_57306f'.


[2023-04-20 16:57:50]  Logged model=train_int8_target_nomi_v4_20_erabinsetprop0.9_binsz50_57306f, version=1
[2023-04-20 16:57:50]  Building model for erabin set of prop 0.95 with eras [5, 7, 8, 9, 11, 2, 6, 3, 10, 4]
Size of train set 22,694
[2023-04-20 16:57:50]  Checking for existing model 'train_int8_target_nomi_v4_20_erabinsetprop0.95_binsz50_57306f'
[2023-04-20 16:57:50]  Creating new model...
[2023-04-20 16:57:56]  saving new model: train_int8_target_nomi_v4_20_erabinsetprop0.95_binsz50_57306f
[2023-04-20 16:57:56]  Training time: 0.10 minutes


Successfully registered model 'train_int8_target_nomi_v4_20_erabinsetprop0.95_binsz50_57306f'.
2023/04/20 16:58:09 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabinsetprop0.95_binsz50_57306f, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabinsetprop0.95_binsz50_57306f'.


[2023-04-20 16:58:09]  Logged model=train_int8_target_nomi_v4_20_erabinsetprop0.95_binsz50_57306f, version=1
[2023-04-20 16:58:09]  Building model for erabin set of prop 1.0 with eras [5, 7, 8, 9, 11, 2, 6, 3, 10, 4, 1]
Size of train set 24,198
[2023-04-20 16:58:09]  Checking for existing model 'train_int8_target_nomi_v4_20_erabinsetprop1.0_binsz50_57306f'
[2023-04-20 16:58:09]  Creating new model...
[2023-04-20 16:58:15]  saving new model: train_int8_target_nomi_v4_20_erabinsetprop1.0_binsz50_57306f
[2023-04-20 16:58:15]  Training time: 0.10 minutes


Successfully registered model 'train_int8_target_nomi_v4_20_erabinsetprop1.0_binsz50_57306f'.
2023/04/20 16:58:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: train_int8_target_nomi_v4_20_erabinsetprop1.0_binsz50_57306f, version 1
Created version '1' of model 'train_int8_target_nomi_v4_20_erabinsetprop1.0_binsz50_57306f'.


[2023-04-20 16:58:28]  Logged model=train_int8_target_nomi_v4_20_erabinsetprop1.0_binsz50_57306f, version=1


In [305]:
ebset_metrics_df = to_metrics_df(binset_model_data_map)
ebset_metrics_df.index = ebset_metrics_df.index.to_series().apply(lambda r: r[34:])
ebset_metrics_styled = ut.fmt_metrics_df(ebset_metrics_df)
display(ebset_metrics_styled)
ebset_metrics_tbl_fl = os.path.join(comparison_dir, "erabinset_metrics_tbl.html")
ebset_metrics_styled.to_html(ebset_metrics_tbl_fl)
log.info(f"Saving {ebset_metrics_tbl_fl}")

Unnamed: 0,mean,std,sharpe,max_drawdown,apy
erabinsetprop0.8_binsz50,1.32%,2.01%,65.59%,-15.38%,8797.73%
erabinsetprop0.9_binsz50,1.33%,2.04%,65.08%,-14.73%,8880.18%
erabinsetprop0.95_binsz50,1.33%,2.04%,65.08%,-14.73%,8880.18%
erabinsetprop1.0_binsz50,1.24%,2.11%,58.65%,-15.81%,8069.60%


[2023-04-20 16:58:28]  Saving ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/erabinset_metrics_tbl.html


In [306]:
fig_ebset_metric = iplot_ebmetrics(ebset_metrics_df, annotate_bin=False)
ebset_metrics_iplot_fl = os.path.join(comparison_dir, "erabinset_metrics_plot.html")
fig_ebset_metric.write_html(ebset_metrics_iplot_fl)
log.info(f"Saving {ebset_metrics_iplot_fl}")

[2023-04-20 16:58:28]  Saving ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/erabinset_metrics_plot.html


### 2.4 Ensure we are not overfitting test1 by replotting metrics on test2 for the same erabinset_models

In [309]:
binset_prop_mdl_map = pd.DataFrame(binset_model_data_map).transpose().loc[:, "model"].to_dict()
log.info(f"binset_prop_mdl_map:\n{pformat(binset_prop_mdl_map)}")

[2023-04-20 16:58:45]  binset_prop_mdl_map:
{0.8: LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32),
 0.9: LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32),
 0.95: LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32),
 1.0: LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32)}


In [310]:
predn_cols = []
for prop, binset_mdl in tqdm(binset_prop_mdl_map.items(), desc="Predicting for each binset model"):
    # Predict with model
    #   add cross validation split to the prediction column name
    predn_col = f"{ut.get_pred_col(TARGET_COL)}_binset{prop}"
    predn_cols.append(predn_col)
    test2_df[predn_col] = ut.predict(
        pred_df=test2_df[features], model=binset_mdl, parameters=parameters
    )

Predicting for each binset model:   0%|          | 0/4 [00:00<?, ?it/s]



In [313]:
validation_stats = nmr_utils.validation_metrics(
    validation_data=test2_df,
    pred_cols=predn_cols,
    example_col=None,
    fast_mode=True,
    target_col=nmr_utils.TARGET_COL,
    include_mmc=False,
)
oos_styled = ut.fmt_metrics_df(validation_stats)
display(oos_styled)
tbl_fl = os.path.join(comparison_dir, "erabinset_outofsample_metrics_tbl.html")
oos_styled.to_html(tbl_fl)
log.info(f"Saving {tbl_fl}")

Unnamed: 0,mean,std,sharpe,max_drawdown,apy
pred_trained_on_target_nomi_v4_20_binset0.8,1.22%,2.43%,50.40%,-28.32%,7775.39%
pred_trained_on_target_nomi_v4_20_binset0.9,1.29%,2.50%,51.67%,-28.26%,8324.84%
pred_trained_on_target_nomi_v4_20_binset0.95,1.29%,2.50%,51.67%,-28.26%,8324.84%
pred_trained_on_target_nomi_v4_20_binset1.0,1.30%,2.60%,50.19%,-29.83%,8410.88%


[2023-04-20 17:03:57]  Saving ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/erabinset_outofsample_metrics_tbl.html


In [315]:
fig_oos = iplot_ebmetrics(ebset_metrics_df, annotate_bin=False)
oos_iplot_fl = os.path.join(comparison_dir, "erabinset_outofsample_metrics_plot.html")
fig_oos.write_html(oos_iplot_fl)
log.info(f"Saving {oos_iplot_fl}")

[2023-04-20 17:04:47]  Saving ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/erabinset_outofsample_metrics_plot.html


In [316]:
!open ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/erabinset_outofsample_metrics_plot.html



In [317]:
comparison_expt_id = mlflow.create_experiment(parameters["expt_name"] + "_comparisons")



### 2.5 Save the comparison plots and trained models to mlflow/s3

#### Save model_data_maps to comparison folder
These contain the trained models, the erabin or binset they belong to along with validation data.

In [322]:
bin_dt_mp_fl = os.path.join(comparison_dir, "bin_model_data_map.pkl")
log.info(f"Pickling bin_model_data_map to {bin_dt_mp_fl}")
ut.pickle_obj(obj=bin_model_data_map, fl=bin_dt_mp_fl)

binset_dt_mp_fl = os.path.join(comparison_dir, "binset_model_data_map.pkl")
log.info(f"Pickling binset_model_data_map to {binset_dt_mp_fl}")
ut.pickle_obj(obj=binset_model_data_map, fl=binset_dt_mp_fl)

[2023-04-20 17:16:36]  Pickling bin_model_data_map to ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/bin_model_data_map.pkl
[2023-04-20 17:16:37]  Pickling binset_model_data_map to ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/comparison/binset_model_data_map.pkl


In [323]:
with mlflow.start_run(run_name="compare_bin_and_binset", experiment_id=comparison_expt_id):
    mlflow.log_params(flatdict.FlatDict(parameters, delimiter="."))
    mlflow.log_artifacts(local_dir=comparison_dir)



## Upload relevant files to s3

In [324]:
import utils as ut



In [326]:
ut.upload_to_s3_recursively(
    dir_path=EXPT_SAVE_ROOTDIR,
    s3_path=OUTPUT_S3_PATH,
    aws_credential_fl=AWS_CREDENTIALS_FILE,
    aws_profile='default',
    dry_run=TEST_MODE,
)

[2023-04-20 17:18:04]  Loading aws credenitals from ~/.aws/personal_credentials...
[2023-04-20 17:18:04]  [DRYRUN] Would upload ./data/experiments/era_analysis__2023-04-20_14h-20m/params.json to s3://numerai-v1/experiments/params.json
[2023-04-20 17:18:04]  [DRYRUN] Would upload ./data/experiments/era_analysis__2023-04-20_14h-20m/log.txt to s3://numerai-v1/experiments/log.txt
[2023-04-20 17:18:04]  [DRYRUN] Would upload ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/era_stock_cnt.html to s3://numerai-v1/experiments/artifacts/era_stock_cnt.html
[2023-04-20 17:18:04]  [DRYRUN] Would upload ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/era_nan_pct.html to s3://numerai-v1/experiments/artifacts/era_nan_pct.html
[2023-04-20 17:18:04]  [DRYRUN] Would upload ./data/experiments/era_analysis__2023-04-20_14h-20m/artifacts/era_date_cnt.html to s3://numerai-v1/experiments/artifacts/era_date_cnt.html
[2023-04-20 17:18:04]  [DRYRUN] Would upload ./data/experiments/era_a