In [2]:
from IPython.display import display, HTML, clear_output
# Inject a CSS rule that sets the `.container` element's width to 100%.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [33]:
# --- Run switches -----------------------------------------------------------
TO_TRAIN = True  # gates the training / validation cells further down
# NOTE(review): the label says "every4th" while `sample_every_nth` below is 32 —
# confirm the run name is still the intended one.
RUN_NAME = "all_targets_medm_feats_every4th_int8"
AS_INT8 = True  # forwarded to the download helper as `as_int8`
FEATURE_SET_NAME = "medium"  # feature set looked up in features.json
TO_DOWNLOAD_DATA = True

# --- Sampling / validation / neutralisation knobs ---------------------------
TRAINING_PARAMS = {
    "neutralisation": 0.5,
    "sample_every_nth": 32,
    "val_fracs": [0.2, 0.2],
}

# --- Model hyper-parameters --------------------------------------------------
# small fast params
params_name = "sm_lgbm"
params = dict(
    n_estimators=5000,
    learning_rate=0.001,
    max_depth=6,
    num_leaves=2**6,
    colsample_bytree=0.1,
)

# recommended params
# params_name = "lg_lgbm"
# params = {
#     "n_estimators": 20000,
#     "learning_rate": 0.001,
#     "max_depth": 6,
#     "num_leaves": 2**6,
#     "colsample_bytree": 0.1,
# }

# --- Targets ------------------------------------------------------------------
# One model is built per target on the training data and used to predict on
# validation data; the models fitted on all available data predict on live.
targets = [
    "target_nomi_v4_20",
    "target_jerome_v4_60",
    "target_ralph_v4_20",
    "target_tyler_v4_20",
    "target_victor_v4_20",
    "target_waldo_v4_20",
]



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
# Shell escape: print the current working directory (the relative data paths
# used below are resolved against it).
!pwd

'/Users/vispers/work/numerai/numerai'



## Data loading

### 1. Download relevant datasets

In [36]:
# Root folder for local data and the dataset version stored beneath it.
data_fld_root, dataset_name = "../data", "v4.1"
# Files for this dataset version live under <root>/<version>.
data_path = os.path.join(data_fld_root, dataset_name)



In [37]:
# Look up the current round; it is printed here and reused later in the
# names of the prediction files this notebook writes.
napi = NumerAPI()
current_round = napi.get_current_round()
print(f"Current round: {current_round}")

if TO_DOWNLOAD_DATA:
    # The helper returns a mapping of logical name -> local file path.
    downloaded_fl_map = ss_utils.download_data(
        dataset_name=dataset_name, data_path=data_path, as_int8=AS_INT8,
    )
else:
    print("Not downloading data; assuming it exists already")
    # The loading cell reads `downloaded_fl_map` in either mode, so rebuild it
    # here instead of leaving the name undefined.
    # NOTE(review): keys and layout mirror the mapping printed by
    # `ss_utils.download_data` for an int8 run; the non-int8 file names are
    # assumed to simply drop the `_int8` suffix — confirm against the helper.
    fl_suffix = "_int8" if AS_INT8 else ""
    downloaded_fl_map = {
        "train": os.path.join(data_path, f"train{fl_suffix}.parquet"),
        "test": os.path.join(data_path, f"validation{fl_suffix}.parquet"),
        "features_json": os.path.join(data_path, "features.json"),
        "val_example": os.path.join(data_path, "validation_example_preds.parquet"),
        "live": os.path.join(
            data_path, str(current_round), f"live{fl_suffix}.parquet"
        ),
    }
print(downloaded_fl_map)

Current round: 459
Current round: 459
Downloading dataset files...
Downloading v4.1/train_int8.parquet to ../data/v4.1/train_int8.parquet...


2023-04-11 06:24:47,119 INFO numerapi.utils: target file already exists
2023-04-11 06:24:47,120 INFO numerapi.utils: download complete


Downloading v4.1/validation_int8.parquet to ../data/v4.1/validation_int8.parquet...


2023-04-11 06:24:47,872 INFO numerapi.utils: target file already exists
2023-04-11 06:24:47,874 INFO numerapi.utils: download complete


Downloading v4.1/features.json to ../data/v4.1/features.json...


2023-04-11 06:24:48,586 INFO numerapi.utils: target file already exists
2023-04-11 06:24:48,588 INFO numerapi.utils: download complete


Downloading v4.1/validation_example_preds.parquet to ../data/v4.1/validation_example_preds.parquet...


2023-04-11 06:24:49,322 INFO numerapi.utils: target file already exists
2023-04-11 06:24:49,325 INFO numerapi.utils: download complete


Downloading v4.1/live_int8.parquet to ../data/v4.1/459/live_int8.parquet...


2023-04-11 06:24:50,072 INFO numerapi.utils: target file already exists
2023-04-11 06:24:50,075 INFO numerapi.utils: download complete


{'train': '../data/v4.1/train_int8.parquet', 'test': '../data/v4.1/validation_int8.parquet', 'features_json': '../data/v4.1/features.json', 'val_example': '../data/v4.1/validation_example_preds.parquet', 'live': '../data/v4.1/459/live_int8.parquet'}


### 2. Load the training and test data, and split the training data into train and validation splits.

In [29]:
# Load the downloaded files into DataFrames keyed by split name.
# `cast` only informs the type checker; it does not convert the value.
data_map = cast(
    dict[str, pd.DataFrame],
    ss_utils.load_downloaded_data(
        downloaded_fl_map=downloaded_fl_map,
        # Columns to read are derived from features.json for the chosen feature set.
        cols_to_read=ss_utils.build_cols_to_read(
            feature_json_fl=os.path.join(data_path, "features.json"),
            feature_set_name=FEATURE_SET_NAME,
        ),
        val_fracs=TRAINING_PARAMS["val_fracs"],
        to_train=TO_TRAIN,
        sample_every_nth_era=TRAINING_PARAMS["sample_every_nth"],
    ),
)  # this closing parenthesis for `cast(` was missing, making the cell a SyntaxError

Reading live data ...
Reading training data ...
Reading test data ...
Splitting into train and validations


In [40]:
# Preview every split: a heading, its row count and its first two rows.
for split, df in data_map.items():
    heading = HTML(f"<h5>Split: {split}</h5>")
    display(heading)
    n_rows = len(df)
    print(f"Number of rows: {n_rows:,}")
    display(df.head(2))

Number of rows: 42,565


Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,...,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0,4,0,4,,0,1,4,0,3,...,0.25,0.0,0.333333,0.0,0.5,0.5,0.166667,0.0,1,train
n003bee128c2fcfc,4,2,2,2,,3,1,3,2,2,...,1.0,1.0,0.666667,0.666667,0.833333,0.666667,0.833333,0.666667,1,train


Number of rows: 13,792


Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,...,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n0000a7873fbc168,3,1,3,1,0,0,0,4,3,1,...,0.5,0.5,0.5,0.5,0.333333,0.5,0.5,0.5,449,train
n00038990d1c4308,0,4,3,3,2,3,2,3,3,3,...,0.5,0.5,0.5,0.5,0.5,0.333333,0.5,0.5,449,train


Number of rows: 18,046


Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,...,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n0004b2cf9bd1d3b,3,1,4,1,3,2,3,3,4,1,...,0.5,0.5,0.5,0.5,0.333333,0.333333,0.333333,0.5,353,train
n00105becd525868,2,4,4,3,4,4,3,3,4,3,...,0.5,0.5,0.5,0.333333,0.333333,0.333333,0.5,0.5,353,train


Number of rows: 81,509


Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,...,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,0,0,1,0,3,1,0,0,1,2,...,0.5,0.5,0.666667,0.5,0.5,0.5,0.666667,0.5,575,validation
n001e1318d5072ac,1,4,3,1,3,1,4,4,3,1,...,0.0,0.5,0.166667,0.333333,0.333333,0.166667,0.166667,0.333333,575,validation


Number of rows: 5,173


Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,...,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000ccedd86fc94f,4,4,4,1,0,3,3,2,4,4,...,,,,,,,,,X,live
n00176ebf594813d,2,4,0,1,2,3,4,1,0,3,...,,,,,,,,,X,live




In [31]:
# Summarise the era column of each split: min/max plus the describe() table.
for split, df in data_map.items():
    # The variable name is echoed verbatim by the `=` f-string below.
    era_col = df[ERA_COL]
    heading = HTML(f"<h5>{split}</h5>")
    display(heading)
    print(f"{era_col.min()=}, {era_col.max()=}")
    era_summary = era_col.describe().to_frame()
    display(era_summary)

era_col.min()='0001', era_col.max()='0321'


Unnamed: 0,era
count,42565
unique,11
top,257
freq,4841


era_col.min()='0449', era_col.max()='0545'


Unnamed: 0,era
count,13792
unique,3
top,545
freq,4685


era_col.min()='0353', era_col.max()='0481'


Unnamed: 0,era
count,18046
unique,4
top,417
freq,4645


era_col.min()='0575', era_col.max()='1055'


Unnamed: 0,era
count,81509
unique,16
top,959
freq,5398


era_col.min()='X', era_col.max()='X'


Unnamed: 0,era
count,5173
unique,1
top,X
freq,5173




### 4. Impute NAs with median values, since the int8 dtype cannot represent NAs

In [None]:
# Feature dtype follows the same switch that selected the data files: int8 for
# the int8 files, float32 otherwise (as the original inline note asked for).
feature_dtype = "int8" if AS_INT8 else "float32"

print("cleaning up NAs in live data...")
# Column medians of `all_data`, computed once before anything is filled and
# reused for both frames below.
feature_medians = all_data[features].median(skipna=True)

# since live data is only one era, we need to use the median for all eras
live_data[features] = live_data[features].fillna(feature_medians).astype(feature_dtype)
# Alternatively could convert nan columns to be floats and replace pd.NA with np.nan

if TO_TRAIN:
    print("cleaning up NAs in train and validation data...")
    # Int8 datatype has pd.NA which don't play nice with models.  We simply fill NA with median values here
    all_data[features] = all_data[features].fillna(feature_medians).astype(feature_dtype)

## Training

In [None]:
# In training mode, open the MLflow run (named RUN_NAME) that the logging
# calls in later cells write to.
if TO_TRAIN:
    mlflow.start_run(run_name=RUN_NAME)

### 1. Fast train the model with different targets

In [None]:
# Train (or reload) one LightGBM model per target on the training rows, then
# store its predictions for the validation rows in a per-target column.
if not TO_TRAIN:
    print("Not training models...")
else:
    print("Training models and saving them...")
    # Record the run configuration in MLflow.
    mlflow.log_params(params)
    mlflow.log_param("targets", targets)
    mlflow.log_params(TRAINING_PARAMS)
    for target in tqdm(targets):
        # Uses the FEATURE_SET_NAME constant from the config cell; the
        # lowercase `feature_set_name` is not defined in any visible cell.
        prediction_col = f"{params_name}_{dataset_name}_{FEATURE_SET_NAME}_{target}"
        train_data_model_name = f"train_data_{prediction_col}"
        print(f"Checking for existing model '{train_data_model_name}'")
        train_model = load_model(train_data_model_name)
        if not train_model:
            print("model not found, creating new one")
            train_model = LGBMRegressor(**params)
            # train on all of train and save the model so we don't have to train next time
            target_train_index = (
                all_data.loc[training_index, target].dropna().index
            )  # make sure we only train on rows which have this target
            train_model.fit(
                all_data.loc[target_train_index, features],
                all_data.loc[target_train_index, target],
            )  # in case some of the targets are missing data
            print(f"saving new model: {train_data_model_name}")
            save_model(train_model, train_data_model_name)

        # predict on validation data
        all_data.loc[validation_index, prediction_col] = train_model.predict(
            all_data.loc[validation_index, features]
        )
        # Release temporaries between targets.
        gc.collect()

In [None]:
def get_pred_col_name(target, params_name, dataset_name, feature_set_name):
    """Build the name of the column that holds one model's predictions.

    The four parts are joined with underscores in the order
    ``params_name``, ``dataset_name``, ``feature_set_name``, ``target``.
    """
    return "{}_{}_{}_{}".format(params_name, dataset_name, feature_set_name, target)

In [None]:
# Predict on live data with the per-target models that were fitted on all data.
for target in tqdm(targets):
    # Uses the FEATURE_SET_NAME constant from the config cell; the lowercase
    # `feature_set_name` is not defined in any visible cell.
    prediction_col = get_pred_col_name(target, params_name, dataset_name, FEATURE_SET_NAME)
    # do the same thing for all data (for predicting on live)
    all_data_model_name = f"all_data_{prediction_col}"
    print(f"Checking for existing model '{all_data_model_name}'")
    all_data_model = load_model(all_data_model_name)
    if not all_data_model:
        # NOTE(review): no cell visible in this notebook saves an `all_data_*`
        # model (the training loop only saves `train_data_*`), so confirm where
        # these models are expected to come from.
        raise ValueError(
            f"Model '{all_data_model_name}' is not trained and saved, "
            "switch to TO_TRAIN mode and train the model."
        )
    # predict on live data
    live_data[prediction_col] = all_data_model.predict(
        live_data[features].fillna(np.nan)
    )  # filling live data with nans makes us ignore those features if necessary
    gc.collect()

### 2. Equal-weight the per-target predictions, then neutralise the ensemble 50% against the features

In [None]:
def get_raw_pred_cols():
    """Return the per-target prediction column names, in ``targets`` order.

    These are the columns of the output dataframe where predicted values are
    stored. Each name is built by :func:`get_pred_col_name` from the
    notebook-level ``params_name``, ``dataset_name`` and ``FEATURE_SET_NAME``.
    """
    return [
        get_pred_col_name(
            target=tgt,
            params_name=params_name,
            dataset_name=dataset_name,
            # Config-cell constant; lowercase `feature_set_name` is not
            # defined in any visible cell.
            feature_set_name=FEATURE_SET_NAME,
        )
        for tgt in targets
    ]


def get_all_pred_cols():
    """Return every prediction column: the derived ensemble columns followed
    by the per-target columns from :meth:`get_raw_pred_cols`.

    The derived columns are ``"equal_weight"`` and
    ``"half_neutral_equal_weight"``.
    """
    # The previous body passed `targets=` to get_pred_col_name (which has no
    # such parameter) and tried to add its str result to a list.
    return ["equal_weight", "half_neutral_equal_weight"] + get_raw_pred_cols()


In [None]:
# Column whose ranked values later become the submitted "prediction".
model_to_submit = "half_neutral_equal_weight"

# (frame, row index) pairs to process: live rows always, validation rows only
# in training mode.
data_w_ixes = [(live_data, live_data.index)]
if TO_TRAIN:
    data_w_ixes.append((all_data, validation_index))

for df, ixes in data_w_ixes:
    # make an ensemble: row-wise mean of the per-target prediction columns
    raw_preds = df[get_raw_pred_cols()]
    df.loc[:, "equal_weight"] = raw_preds.mean(axis=1)
    # make a 50% feature neutral variation of the ensemble model
    rows_to_neutralise = df.loc[ixes, :]
    df["half_neutral_equal_weight"] = neutralize(
        df=rows_to_neutralise,
        columns=["equal_weight"],
        neutralizers=features,
        proportion=TRAINING_PARAMS["neutralisation"],
        normalize=True,
        era_col=ERA_COL,
        verbose=True,
    )

### 3. Make the predictions submission ready for numer.ai website

Convert regressed values to rank values

In [None]:
# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
# Rank the chosen model's validation output into (0, 1], save it as CSV, and
# attach the example predictions for later comparison.
if TO_TRAIN:
    print("rename best model to 'prediction' and rank from 0 to 1 to meet upload requirements")
    ranked_validation = all_data.loc[validation_index, model_to_submit].rank(pct=True)
    all_data.loc[validation_index, "prediction"] = ranked_validation
    validation_csv_name = f"validation_predictions_{current_round}.csv"
    all_data.loc[validation_index, "prediction"].to_csv(validation_csv_name)

    example_preds_path = os.path.join(data_path, "validation_example_preds.parquet")
    validation_example_preds = pd.read_parquet(example_preds_path)
    example_predictions = validation_example_preds["prediction"]
    all_data.loc[validation_index, EXAMPLE_PREDS_COL] = example_predictions

In [None]:
# Rank the chosen model's live output as a percentile and write it to CSV.
live_csv_name = f"live_predictions_{current_round}.csv"
live_ranked = live_data[model_to_submit].rank(pct=True)
live_data["prediction"] = live_ranked
live_data["prediction"].to_csv(live_csv_name)

In [None]:
# Debugging aid: show the working directory the CSV files above were written to.
pwd

In [None]:
# Debugging aid: list the working directory to check the written files.
ls

### 4. Print some metrics

In [None]:
if TO_TRAIN:
    # Columns to score: the two ensemble columns plus every per-target column.
    # The previous `prediction_cols` name is not defined in any visible cell.
    pred_cols_to_score = ["equal_weight", "half_neutral_equal_weight"] + get_raw_pred_cols()
    # get some stats about each of our models to compare...
    # fast_mode=True so that we skip some of the stats that are slower to calculate
    validation_stats = validation_metrics(
        all_data.loc[validation_index, :],
        pred_cols_to_score,
        example_col=EXAMPLE_PREDS_COL,
        fast_mode=True,
        target_col=TARGET_COL,
    )
    print(validation_stats[["mean", "sharpe"]].to_markdown())

    # The file names below match the CSVs this notebook writes
    # (validation_predictions_* and live_predictions_*).
    print(
        f"""
    Done! Next steps:
        1. Go to numer.ai/tournament (make sure you have an account)
        2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
        3. Submit live_predictions_{current_round}.csv to the "Upload Predictions" button
    """
    )

In [None]:
# Display the full validation metrics table (only defined when TO_TRAIN is set).
validation_stats

In [None]:
# Display the id of the active MLflow run.
# NOTE(review): a run is only started by this notebook when TO_TRAIN is set.
mlflow.active_run().info.run_id

In [None]:
# `validation_stats` and the MLflow run are only created in training mode, so
# skip the export otherwise instead of failing on a missing name / run.
if TO_TRAIN:
    # Write the metrics table as HTML under outputs/<run id>/ and attach it
    # to the MLflow run as an artifact.
    out_folder = f"outputs/{mlflow.active_run().info.run_id}/"
    val_stats_html_path = os.path.join(out_folder, "metrics.html")
    os.makedirs(out_folder, exist_ok=True)
    validation_stats.to_html(buf=val_stats_html_path)
    mlflow.log_artifact(local_path=val_stats_html_path)

#### Log metrics

In [None]:
# `validation_stats` only exists in training mode; skip logging otherwise.
if TO_TRAIN:
    # One MLflow metric per table cell, named "<column>__<row label>".
    for ix in validation_stats.index:
        for col in validation_stats.columns:
            mlflow.log_metric(f"{col}__{ix}", validation_stats.loc[ix, col])

In [None]:
# Baseline: the same metrics computed for the example predictions themselves.
# Guarded because the example-prediction column is only attached to the
# validation rows in training mode; `display` keeps the table visible now that
# the call is no longer the cell's last bare expression.
if TO_TRAIN:
    display(
        validation_metrics(
            validation_data=all_data.loc[validation_index, :],
            pred_cols=[EXAMPLE_PREDS_COL],
            example_col=EXAMPLE_PREDS_COL,
            fast_mode=True,
            target_col=TARGET_COL,
        )
    )

In [None]:
# `validation_stats` only exists in training mode; skip the export otherwise.
if TO_TRAIN:
    # Also keep a copy of the metrics table in the working directory.
    validation_stats.to_html("validation_stats.html")

In [None]:
# End the active MLflow run (this notebook starts one in the Training section
# when TO_TRAIN is set).
mlflow.end_run()