In [3]:
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))



In [32]:
TO_TRAIN = True
RUN_NAME = "single_target_medm_feats_every4th_int8"
TO_DOWNLOAD_DATA = True
TRAINING_PARAMS = {
    "neutralisation": 0.5,
    "sample_every4": True,
}
# small fast params
params_name = "sm_lgbm"
params = {"n_estimators": 5000,
          "learning_rate": 0.001,
          "max_depth": 6,
          "num_leaves": 2 ** 6,
          "colsample_bytree": 0.1}

# recommended params
# params_name = "lg_lgbm"
# params = {
#     "n_estimators": 20000,
#     "learning_rate": 0.001,
#     "max_depth": 6,
#     "num_leaves": 2**6,
#     "colsample_bytree": 0.1,
# }

# loop through all of our favorite targets and build models on each of them - one over training data, one over all available data
# for the train_data models, we'll then predict on validation data
# for the all_data models, we'll predict on live
targets = [
    "target_nomi_v4_20",
#     "target_jerome_v4_60",
#     "target_ralph_v4_20",
#     "target_tyler_v4_20",
#     "target_victor_v4_20",
#     "target_waldo_v4_20",
]



In [7]:
from lightgbm import LGBMRegressor
import pandas as pd
import dask.dataframe as dd
import gc
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os.path

from numerapi import NumerAPI
import mlflow
from utils import (
    save_model,
    load_model,
    neutralize,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL,
)



In [8]:
pwd

'/Users/vispers/work/numerai/numerai'



## Data loading

### 1. Download relevant datasets

In [9]:
feature_set_name = "medium"
data_fld_root = "../data"
dataset_name = "v4.1"
data_path = os.path.join(data_fld_root, dataset_name)



In [10]:
napi = NumerAPI()
current_round = napi.get_current_round()
print(f"Current round: {current_round}")

if not TO_DOWNLOAD_DATA:
    print("Not downloading data; assuming it exists already")
else:
    # Tournament data changes every week so we specify the round in their name. Training
    # and validation data only change periodically, so no need to download them every time.
    print("Downloading dataset files...")
    # we'll use the int8 in this example in order to save RAM.
    # if you remove the int8 suffix for each of these files, you'll get features between 0 and 1 as floats.
    # int_8 files are much smaller...
    # but are harder to work with because some packages don't like ints and the way NAs are encoded.

    # napi.download_dataset(f"{dataset_name}/train.parquet")
    # napi.download_dataset(f"{dataset_name}/validation.parquet")
    # napi.download_dataset(f"{dataset_name}/live.parquet", f"{dataset_name}/live_{current_round}.parquet")

    for fl in [
        "train_int8.parquet",
        "validation_int8.parquet", 
        "features.json",
        "validation_example_preds.parquet",
        "features.json",
    ]:
        print(f"{fl}")
        napi.download_dataset(
            os.path.join(dataset_name, fl),
            dest_path=os.path.join(data_path, fl)
        )

    napi.download_dataset(
        f"{dataset_name}/live_int8.parquet",
        os.path.join(data_path, f"{current_round}/live_int8.parquet"),
    )

Current round: 458
Downloading dataset files...
train_int8.parquet


2023-04-08 08:44:53,377 INFO numerapi.utils: target file already exists
2023-04-08 08:44:53,380 INFO numerapi.utils: download complete


validation_int8.parquet


2023-04-08 08:44:54,809 INFO numerapi.utils: target file already exists
2023-04-08 08:44:54,812 INFO numerapi.utils: download complete


features.json


2023-04-08 08:44:56,236 INFO numerapi.utils: target file already exists
2023-04-08 08:44:56,238 INFO numerapi.utils: download complete


validation_example_preds.parquet


2023-04-08 08:44:57,730 INFO numerapi.utils: target file already exists
2023-04-08 08:44:57,732 INFO numerapi.utils: download complete


features.json


2023-04-08 08:44:59,068 INFO numerapi.utils: target file already exists
2023-04-08 08:44:59,071 INFO numerapi.utils: download complete
2023-04-08 08:45:00,672 INFO numerapi.utils: target file already exists
2023-04-08 08:45:00,673 INFO numerapi.utils: download complete




### 2. Load up training and validation data

In [12]:
# read the feature metadata and get a feature set (or all the features)
with open(os.path.join(data_path, "features.json"), "r") as f:
    feature_metadata = json.load(f)

# features = feature_metadata["feature_sets"]["small"] # get the small feature set
features = feature_metadata["feature_sets"][feature_set_name]  # get the medium feature set
target_cols = feature_metadata["targets"]
# read in just those features along with era and target columns
read_columns = features + target_cols + [ERA_COL, DATA_TYPE_COL]



In [19]:
if not TO_TRAIN:
    print("Not loading training data")
else:
    print("Reading minimal training data")
    # note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
    # if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
    print("Reading training data ...")
    training_data = dd.read_parquet(os.path.join(data_path, "train_int8.parquet"), columns=read_columns)
    print("Reading validation data ...")
    validation_data = dd.read_parquet(os.path.join(data_path, "validation_int8.parquet"), columns=read_columns)

Reading minimal training data
Reading training data ...
Reading validation data ...


In [20]:
print("Reading live data ...")
live_data = dd.read_parquet(os.path.join(data_path, f"{current_round}/live_int8.parquet"), columns=read_columns)

Reading live data ...


In [21]:
if TO_TRAIN:
    display(training_data.head(2))
    display(validation_data.head(2))
display(live_data.head(2))

Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,...,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0,4,0,4,,0,1,4,0,3,...,0.25,0.0,0.333333,0.0,0.5,0.5,0.166667,0.0,1,train
n003bee128c2fcfc,4,2,2,2,,3,1,3,2,2,...,1.0,1.0,0.666667,0.666667,0.833333,0.666667,0.833333,0.666667,1,train


Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,...,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,0,0,1,0,3,1,0,0,1,2,...,0.5,0.5,0.666667,0.5,0.5,0.5,0.666667,0.5,575,validation
n001e1318d5072ac,1,4,3,1,3,1,4,4,3,1,...,0.0,0.5,0.166667,0.333333,0.333333,0.166667,0.166667,0.333333,575,validation


Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,...,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n001b3b98ab6bc91,0,0,1,4,1,0,2,2,1,0,...,,,,,,,,,X,live
n001eb0ac2f58573,1,0,2,4,2,4,3,3,2,1,...,,,,,,,,,X,live




### 3. Subsample training and validation data and store in single DF

In [22]:
if TO_TRAIN:
    # features = list(feature_metadata["feature_stats"].keys()) # get all the features
    # reduce the number of eras to every 4th era to speed things up... uncomment these lines to speed things up.
    if TRAINING_PARAMS["sample_every4"]:
        print("Subsampling every fourth era...")
        every_4th_era = set(training_data[ERA_COL].unique()[::4])
        training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]
        every_4th_era = set(validation_data[ERA_COL].unique()[::4])
        validation_data = validation_data[validation_data[ERA_COL].isin(every_4th_era)]


        print("Subsampling every fourth era...")
    # get all the data to possibly use for training
    all_data = dd.concat([training_data, validation_data])

    # save indices for easier data selection later
    training_index = training_data.index.compute()
    validation_index = validation_data.index.compute()
    all_index = all_data.index.compute()

    # delete training and validation data to save space
    del training_data
    del validation_data
    gc.collect()  # clear up memory

Subsampling every fourth era...
Subsampling every fourth era...


In [23]:
if TO_TRAIN:
    for name, ixes in [("all_ixes", all_index), ("train_ixes", training_index), ("val_ixes", validation_index)]:
        era_col = all_data.loc[ixes, ERA_COL]
        display(HTML(f"<h5>{name}</h5>"))
        print(f"{era_col.min()=}, {era_col.max()=}")
        display(era_col.describe().to_frame())

era_col.min()=dd.Scalar<series-..., type=str>, era_col.max()=dd.Scalar<series-..., type=str>


Unnamed: 0_level_0,era
npartitions=1,Unnamed: 1_level_1
,object
,...


era_col.min()=dd.Scalar<series-..., type=str>, era_col.max()=dd.Scalar<series-..., type=str>


Unnamed: 0_level_0,era
npartitions=1,Unnamed: 1_level_1
,object
,...


era_col.min()=dd.Scalar<series-..., type=str>, era_col.max()=dd.Scalar<series-..., type=str>


Unnamed: 0_level_0,era
npartitions=1,Unnamed: 1_level_1
,object
,...




### 4. Impute NAs with median values as int8 cannot handle NAs

In [38]:
print("cleaning up NAs in live data...")
live_data[features] = live_data[features].fillna(0.5)  # since live data is only one era, we need to use the median for all eras
live_data[features] = live_data[features].astype("int8")  # make sure change to float32 if using the non int8 data!
# Alternatively could convert nan columns to be floats and replace pd.NA with np.nan

if TO_TRAIN:
    print("cleaning up NAs in train and validation data...")
    # Int8 datatype has pd.NA which don't play nice with models.  We simply fill NA with median values here
    all_data[features] = all_data[features].fillna(0.5)
    all_data[features] = all_data[features].astype("int8")  # make sure change to float32 if using the non int8 data!

cleaning up NAs in live data...
cleaning up NAs in train and validation data...


## Training

In [None]:
if TO_TRAIN:
    mlflow.start_run(run_name=RUN_NAME)

### 1. Fast train the model with different targets

In [37]:
if not TO_TRAIN:
    print("Not training models...")
else:
    print("Training models and saving them...")
    mlflow.log_params(params)
    mlflow.log_param("targets", targets)
    mlflow.log_params(TRAINING_PARAMS)
    for target in tqdm(targets):
        prediction_col = f"{params_name}_{dataset_name}_{feature_set_name}_{target}"
        train_data_model_name = f"train_data_{prediction_col}"
        print(f"Checking for existing model '{train_data_model_name}'")
        train_model = load_model(train_data_model_name)
        if not train_model:
            print(f"model not found, creating new one")
            train_model = LGBMRegressor(**params)
            # train on all of train and save the model so we don't have to train next time
            target_train_index = (
                all_data.loc[training_index, target].dropna().index
            )  # make sure we only train on rows which have this target
            train_model.fit(
                all_data.loc[target_train_index, features],
                all_data.loc[target_train_index, target],
            )  # in case some of the targets are missing data
            print(f"saving new model: {train_data_model_name}")
            save_model(train_model, train_data_model_name)

        # predict on validation data
        all_data.loc[validation_index, prediction_col] = train_model.predict(
            all_data.loc[validation_index, features]
        )
        gc.collect()

Training models and saving them...


  0%|          | 0/1 [00:00<?, ?it/s]

Checking for existing model 'train_data_sm_lgbm_v4.1_medium_target_nomi_v4_20'


TypeError: Cannot interpret 'Int8Dtype()' as a data type



In [68]:
def get_pred_col_name(target, params_name, dataset_name, feature_set_name):
    return f"{params_name}_{dataset_name}_{feature_set_name}_{target}"



In [63]:
for target in tqdm(targets):
    prediction_col = get_pred_col_name(target, params_name, dataset_name, feature_set_name)
    # do the same thing for all data (for predicting on live)
    all_data_model_name = f"all_data_{prediction_col}"
    print(f"Checking for existing model '{all_data_model_name}'")
    all_data_model = load_model(all_data_model_name)
    if not all_data_model:
        print(f"model not found, creating new one")
        raise ValueError("Model is not trained and saved, switch to TO_TRAIN mode and train the model.")
    # predict on live data
    live_data[prediction_col] = all_data_model.predict(
        live_data[features].fillna(np.nan)
    )  # filling live data with nans makes us ignore those features if necessary
    gc.collect()

  0%|          | 0/6 [00:00<?, ?it/s]

Checking for existing model 'all_data_sm_lgbm_v4.1_medium_target_nomi_v4_20'
Checking for existing model 'all_data_sm_lgbm_v4.1_medium_target_jerome_v4_60'
Checking for existing model 'all_data_sm_lgbm_v4.1_medium_target_ralph_v4_20'
Checking for existing model 'all_data_sm_lgbm_v4.1_medium_target_tyler_v4_20'
Checking for existing model 'all_data_sm_lgbm_v4.1_medium_target_victor_v4_20'
Checking for existing model 'all_data_sm_lgbm_v4.1_medium_target_waldo_v4_20'


### 2. Equal weight the different targets and then neutralise 50% of the features

In [70]:
def get_raw_pred_cols():
    """These are the columns of the output dataframe where predicted values are stored"""
    return [
        get_pred_col_name(
            target=tgt,
            params_name=params_name,
            dataset_name=dataset_name,
            feature_set_name=feature_set_name,
        )
        for tgt in targets
    ]


def get_all_pred_cols():
    """These include the prediction columns in :meth:`get_raw_pred_cols` but
    also derived columns from neutralisation.
    """
    return (
        ["equal_weight", "half_neutral_equal_weight"]
        + get_pred_col_name(
            targets=targets,
            params_name=params_name,
            dataset_name=dataset_name,
            feature_set_name=feature_set_name,
        )
    )




In [71]:
model_to_submit = f"half_neutral_equal_weight"
if TO_TRAIN:
    data_w_ixes = [(live_data, live_data.index), (all_data, validation_index)]
else:
    data_w_ixes = [(live_data, live_data.index)]

for df, ixes in data_w_ixes:
    # make an ensemble
    df.loc[:, "equal_weight"] = df[get_raw_pred_cols()].mean(axis=1)
    # make a 50% feature neutral variation of the ensemble model
    df["half_neutral_equal_weight"] = neutralize(
        df=df.loc[ixes, :],
        columns=[f"equal_weight"],
        neutralizers=features,
        proportion=TRAINING_PARAMS["neutralisation"],
        normalize=True,
        era_col=ERA_COL,
        verbose=True,
    )

  df.loc[:, "equal_weight"] = df[get_raw_pred_cols()].mean(axis=1)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.01it/s]
  df["half_neutral_equal_weight"] = neutralize(




### 3. Make the predictions submission ready for numer.ai website

Convert regressed values to rank values

In [15]:
# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
if TO_TRAIN:
    print("rename best model to 'prediction' and rank from 0 to 1 to meet upload requirements")
    all_data.loc[validation_index, "prediction"] = all_data.loc[
        validation_index, model_to_submit
    ].rank(pct=True)
    all_data.loc[validation_index, "prediction"].to_csv(
        f"validation_predictions_{current_round}.csv"
    )
    validation_example_preds = pd.read_parquet(
        os.path.join(data_path, f"validation_example_preds.parquet"),
    )
    all_data.loc[validation_index, EXAMPLE_PREDS_COL] = validation_example_preds[
        "prediction"
    ]

  all_data.loc[validation_index, "prediction"] = all_data.loc[
  live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
  all_data.loc[validation_index, EXAMPLE_PREDS_COL] = validation_example_preds[




In [74]:
live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")

  live_data["prediction"] = live_data[model_to_submit].rank(pct=True)




In [75]:
pwd

'/Users/vispers/work/numerai/numerai'



In [76]:
ls

__init__.py                     mlruns/
__pycache__/                    modelling-sunshine.ipynb
analysis_and_tips.ipynb         modelling-v1.ipynb
data/                           modelling_utils.py
download_numerai_dataset.py     models/
example-model-advanced.ipynb    outputs/
live_predictions_455.csv        utils.py
live_predictions_458.csv        validation_predictions_455.csv
metrics.py


### 4. Print some metrics

In [16]:
if TO_TRAIN:
    # get some stats about each of our models to compare...
    # fast_mode=True so that we skip some of the stats that are slower to calculate
    validation_stats = validation_metrics(
        all_data.loc[validation_index, :],
        prediction_cols,
        example_col=EXAMPLE_PREDS_COL,
        fast_mode=True,
        target_col=TARGET_COL,
    )
    print(validation_stats[["mean", "sharpe"]].to_markdown())

    print(
        f"""
    Done! Next steps:
        1. Go to numer.ai/tournament (make sure you have an account)
        2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
        3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
    """
    )

|                                         |      mean |   sharpe |
|:----------------------------------------|----------:|---------:|
| sm_lgbm_v4.1_medium_target_nomi_v4_20   | 0.0257037 | 0.718577 |
| sm_lgbm_v4.1_medium_target_jerome_v4_60 | 0.0235539 | 0.840822 |
| sm_lgbm_v4.1_medium_target_ralph_v4_20  | 0.0264948 | 0.789863 |
| sm_lgbm_v4.1_medium_target_tyler_v4_20  | 0.0235424 | 0.678123 |
| sm_lgbm_v4.1_medium_target_victor_v4_20 | 0.0285276 | 0.942859 |
| sm_lgbm_v4.1_medium_target_waldo_v4_20  | 0.0259589 | 0.790595 |
| equal_weight                            | 0.0276816 | 0.812603 |
| half_neutral_equal_weight               | 0.0287806 | 0.940548 |

Done! Next steps:
    1. Go to numer.ai/tournament (make sure you have an account)
    2. Submit validation_predictions_455.csv to the diagnostics tool
    3. Submit tournament_predictions_455.csv to the "Upload Predictions" button



In [17]:
validation_stats

Unnamed: 0,mean,std,sharpe,max_drawdown,apy,mmc_mean,corr_plus_mmc_sharpe,corr_with_example_preds,exposure_dissimilarity_mean
sm_lgbm_v4.1_medium_target_nomi_v4_20,0.025704,0.03577,0.718577,-0.097479,233.218168,,,0.720733,-0.598389
sm_lgbm_v4.1_medium_target_jerome_v4_60,0.023554,0.028013,0.840822,-0.041424,204.415494,,,0.604949,-0.496763
sm_lgbm_v4.1_medium_target_ralph_v4_20,0.026495,0.033544,0.789863,-0.086138,247.215804,,,0.707534,-0.666071
sm_lgbm_v4.1_medium_target_tyler_v4_20,0.023542,0.034717,0.678123,-0.097209,201.233089,,,0.67407,-0.731769
sm_lgbm_v4.1_medium_target_victor_v4_20,0.028528,0.030256,0.942859,-0.057966,284.16718,,,0.76317,-0.219079
sm_lgbm_v4.1_medium_target_waldo_v4_20,0.025959,0.032835,0.790595,-0.095604,238.877693,,,0.699111,-0.726325
equal_weight,0.027682,0.034065,0.812603,-0.074339,267.003844,,,0.753576,-0.748392
half_neutral_equal_weight,0.028781,0.0306,0.940548,-0.051664,288.595137,,,0.827405,-0.267703




In [30]:
mlflow.active_run().info.run_id

'f3827ac21c4248189889f3a6c6a36576'



In [36]:
out_folder = f"outputs/{mlflow.active_run().info.run_id}/"
val_stats_html_path = os.path.join(out_folder, "metrics.html")
os.makedirs(out_folder, exist_ok=True)
validation_stats.to_html(buf=val_stats_html_path)
mlflow.log_artifact(local_path=val_stats_html_path)



#### Log metrics

In [18]:
for ix in validation_stats.index:
    for col in validation_stats.columns:
        mlflow.log_metric(f"{col}__{ix}", validation_stats.loc[ix, col])



In [19]:
validation_metrics(
    validation_data=all_data.loc[validation_index, :],
    pred_cols=[EXAMPLE_PREDS_COL],
    example_col=EXAMPLE_PREDS_COL,
    fast_mode=True,
    target_col=TARGET_COL,
)

Unnamed: 0,mean,std,sharpe,max_drawdown,apy,mmc_mean,corr_plus_mmc_sharpe,corr_with_example_preds,exposure_dissimilarity_mean
example_preds,0.032386,0.030466,1.063026,-0.044031,360.733121,,,1.0,0.0




In [22]:
validation_stats.to_html("validation_stats.html")



In [37]:
mlflow.end_run()

