## Tutorial - Experiment Tracking Using MLFlow on Numerai Dataset

This notebook is an example of how to:
- Train an XGBoost regressor
- Perform hyperparameter tuning using HyperOpt
- Track and log experiments (model parameters, metrics, and artifacts) using MLFlow

### Libraries

In [None]:
!pip install xgboost==1.7.5 mlflow hyperopt numerapi;

In [2]:
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib

# Numerai API client; the (commented-out) dataset download cell below calls
# napi.download_dataset(...).
from numerapi import NumerAPI
napi = NumerAPI()

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas/xgboost deprecation and copy warnings. Consider narrowing
# the filter to specific categories once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# # Download datasets
# napi.download_dataset("v4.2/features.json", "datasets/features.json")
# napi.download_dataset("v4.2/train_int8.parquet", "datasets/train_int8.parquet")

### Training Features

In [3]:
import json

# Parse the feature metadata; the context manager closes the file handle
# as soon as the JSON has been read.
with open("datasets/features.json") as metadata_file:
    feature_metadata = json.load(metadata_file)
feature_sets = feature_metadata["feature_sets"]

# Names of the feature sets (keys of `feature_sets`) whose features are used.
feature_set_list = ["all"]

# De-duplicate while keeping first-seen order. The result is a list rather
# than a set because it is used as a DataFrame column selector later on:
# pandas does not accept a set as an indexer, and set iteration order can
# differ between kernel sessions, which would shuffle the model's columns.
train_features = list(dict.fromkeys(
    feature
    for condition in feature_set_list
    for feature in feature_sets[condition]
))
print("# of training features:", len(train_features))

# of training features: 2132


### Training Data

**Note**: We'll only be using a small subset of the training data in this notebook to speed up training and save memory.

In [4]:
# Load the training data from the local parquet file (see the download cell
# above for how to fetch it).
train_df = pd.read_parquet("datasets/train_int8.parquet")
print(train_df.shape)
# Last expression of the cell: rendered by the notebook as a preview table.
train_df.head()

(2420521, 2191)


Unnamed: 0_level_0,era,data_type,feature_honoured_observational_balaamite,feature_polaroid_vadose_quinze,feature_untidy_withdrawn_bargeman,feature_genuine_kyphotic_trehala,feature_unenthralled_sportful_schoolhouse,feature_divulsive_explanatory_ideologue,feature_ichthyotic_roofed_yeshiva,feature_waggly_outlandish_carbonisation,...,target_jeremy_v4_20,target_jeremy_v4_60,target_teager_v4_20,target_teager_v4_60,target_agnes_v4_20,target_agnes_v4_60,target_claudia_v4_20,target_claudia_v4_60,target_rowan_v4_20,target_rowan_v4_60
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,1,train,4,2,4,4,0,0,4,4,...,0.25,0.25,0.5,0.75,0.25,0.0,0.5,0.5,0.5,0.75
n003bee128c2fcfc,1,train,2,4,1,3,0,3,2,3,...,0.75,1.0,1.0,0.75,1.0,1.0,1.0,0.75,1.0,0.75
n0048ac83aff7194,1,train,2,1,3,0,3,0,3,3,...,0.5,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25
n00691bec80d3e02,1,train,4,2,2,3,0,4,1,4,...,0.5,0.5,0.75,0.75,0.5,0.5,0.75,0.75,0.75,0.5
n00b8720a2fdc4f2,1,train,4,3,4,4,0,0,4,2,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


#### Quick Pre-Processing

In [5]:
# Convert the era label to an integer on the freshly loaded frame. Doing this
# before any filtering avoids assigning a column on a filtered slice
# (which triggers pandas' SettingWithCopyWarning).
train_df['era'] = train_df['era'].astype(int)

# Sample every other era (in order of first appearance)
sampled_eras = train_df['era'].unique()[::2]

# Build all row filters up front and apply them once, so only a single
# filtered copy of the (large) frame is materialised:
#   - keep only the sampled eras
#   - remove targets == 0.50
#   - keep eras >= 201
keep_mask = (
    train_df['era'].isin(sampled_eras)
    & (train_df['target'] != 0.50)
    & (train_df['era'] >= 201)
)
train_df = train_df[keep_mask]

print(train_df.shape)

(423864, 2191)


In [6]:
# Time-based split: eras before the cutoff are used for fitting, eras at or
# after the cutoff are held out for evaluation.
era_cutoff = 500
train_df_pre_cutoff = train_df.loc[lambda frame: frame['era'] < era_cutoff]
train_df_post_cutoff = train_df.loc[lambda frame: frame['era'] >= era_cutoff]

### Numerai Correlation
- We'll use MLFlow to track models with the highest Numerai correlation (primary tournament metric)

In [7]:
from scipy import stats
import numpy as np

def numerai_corr(preds, target):
    """Correlation between gaussianized, tail-weighted predictions and target.

    Args:
        preds: pandas Series of predictions (must support ``.rank`` and
            ``.count``).
        target: array-like of target values aligned positionally with
            ``preds``.

    Returns:
        Pearson correlation coefficient (float) between the transformed
        predictions and the transformed, mean-centered target.
    """
    def _signed_power(values):
        # Raise magnitudes to the power 1.5 while keeping the sign, which
        # gives more weight to values far from zero.
        return np.sign(values) * np.abs(values) ** 1.5

    # Rank predictions into the open interval (0, 1) ...
    uniform_ranks = (preds.rank(method="average").values - 0.5) / preds.count()
    # ... and map them through the inverse normal CDF.
    gaussianized = stats.norm.ppf(uniform_ranks)

    demeaned_target = target - target.mean()

    correlation_matrix = np.corrcoef(
        _signed_power(gaussianized), _signed_power(demeaned_target)
    )
    return correlation_matrix[0, 1]

In [8]:
# Create prediction DataFrame
# .copy() makes this an independent frame: a `prediction` column is added to
# it later, and writing into a plain slice of train_df_post_cutoff would
# trigger pandas' SettingWithCopyWarning.
prediction_df = train_df_post_cutoff[['era', 'target_cyrus_v4_20']].copy()
prediction_df.head(3)

Unnamed: 0_level_0,era,target_cyrus_v4_20
id,Unnamed: 1_level_1,Unnamed: 2_level_1
n0013d17441d91b3,501,0.0
n0022c46b7b17b49,501,0.25
n004dd2c9bbe12d5,501,0.75


### XGBoost Training + HyperOpt + MLFlow

In [9]:
# WARNING: this permanently deletes the local `mlruns` directory, i.e. every
# experiment and run previously logged there. Skip this cell to keep them.
!rm -rf mlruns # delete mlruns folder if needed

In [10]:
import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.xgboost

# Set an experiment name (need to use later to track results)
experiment_name = "numerai_test_run_0"
# set_experiment() activates the experiment and creates it first if it does
# not exist yet. A separate create_experiment() call is unnecessary and raises
# when the experiment already exists, which would break re-running this cell.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///notebooks/MLFlow%20Test/mlruns/426164893399017906', creation_time=1701288974927, experiment_id='426164893399017906', last_update_time=1701288974927, lifecycle_stage='active', name='numerai_test_run_0', tags={}>

#### Setup Hyperparameter Search Space for HyperOpt
- can add/remove hyperparameters as needed

In [11]:
# HyperOpt search space. Each label passed to hp.* matches its dictionary key.
# hp.quniform samples on a grid with step 1; the objective casts those values
# to int before handing them to XGBoost.
space = dict(
    max_depth=hp.quniform('max_depth', 3, 7, 1),
    learning_rate=hp.uniform('learning_rate', 0.005, 0.02),
    n_estimators=hp.quniform('n_estimators', 100, 1000, 1),
    subsample=hp.uniform('subsample', 0.7, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.7, 1),
)

def objective(space):
    """Train and evaluate one XGBoost configuration inside an MLflow run.

    Intended to be passed to HyperOpt's ``fmin`` as ``fn``.

    Args:
        space: dict holding one sampled value for each of ``max_depth``,
            ``learning_rate``, ``n_estimators``, ``subsample`` and
            ``colsample_bytree``.

    Returns:
        dict with ``loss`` (MSE on the post-cutoff data, the quantity HyperOpt
        minimises), ``corr`` (Numerai correlation on the same data) and
        ``status``.

    Side effects:
        Logs the sampled parameters and both metrics to a new MLflow run.
        Predictions are kept local; the global ``prediction_df`` is not
        modified.
    """
    ### IMPORTANT SET UP TARGET AND TRAINING FEATURES ###
    target = 'target_cyrus_v4_20'
    # Read the module-level feature names into a *differently named* local:
    # writing `train_features = train_features` here would make the name local
    # to the function and raise UnboundLocalError. list(...) also guarantees a
    # column selector pandas accepts (a set is rejected as an indexer).
    feature_cols = list(train_features)

    # Start MLFlow
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(space)

        model = xgb.XGBRegressor(
            # quniform yields floats (e.g. 6.0); XGBoost needs ints here.
            max_depth=int(space['max_depth']),
            learning_rate=space['learning_rate'],
            n_estimators=int(space['n_estimators']),
            subsample=space['subsample'],
            colsample_bytree=space['colsample_bytree'],
            tree_method='gpu_hist'
        )

        # train on pre-cutoff data
        model.fit(train_df_pre_cutoff[feature_cols], train_df_pre_cutoff[target])

        # predict and evaluate post-cutoff data
        holdout_target = train_df_post_cutoff[target]
        # numerai_corr needs a pandas Series (it calls .rank/.count).
        predictions = pd.Series(
            model.predict(train_df_post_cutoff[feature_cols]),
            index=train_df_post_cutoff.index,
        )
        mse = mean_squared_error(holdout_target, predictions)
        corr = numerai_corr(predictions, holdout_target)

        mlflow.log_metric("mse", mse)
        mlflow.log_metric("numerai_corr", corr)

    return {'loss': mse, 'corr': corr, 'status': STATUS_OK}

#### Begin training with HyperOpt + experiment tracking with MLFlow

In [12]:
# `trials` records the result dict returned by every evaluation of `objective`.
trials = Trials()
best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    # Seed the TPE sampler so the same sequence of hyperparameter candidates
    # is proposed on every Restart & Run All.
    rstate=np.random.default_rng(42),
)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

2023-11-29 20:16:14,951 INFO hyperopt.tpe: build_posterior_wrapper took 0.001969 seconds
2023-11-29 20:16:14,952 INFO hyperopt.tpe: TPE using 0 trials


  5%|▌         | 1/20 [01:15<23:50, 75.28s/trial, best loss: 0.09917254745960236]

2023-11-29 20:17:30,230 INFO hyperopt.tpe: build_posterior_wrapper took 0.000981 seconds
2023-11-29 20:17:30,231 INFO hyperopt.tpe: TPE using 1/1 trials with best loss 0.099173


 10%|█         | 2/20 [02:40<24:21, 81.21s/trial, best loss: 0.09917254745960236]

2023-11-29 20:18:55,599 INFO hyperopt.tpe: build_posterior_wrapper took 0.001200 seconds
2023-11-29 20:18:55,600 INFO hyperopt.tpe: TPE using 2/2 trials with best loss 0.099173


 15%|█▌        | 3/20 [03:40<20:16, 71.57s/trial, best loss: 0.09914801269769669]

2023-11-29 20:19:55,695 INFO hyperopt.tpe: build_posterior_wrapper took 0.001026 seconds
2023-11-29 20:19:55,695 INFO hyperopt.tpe: TPE using 3/3 trials with best loss 0.099148


 20%|██        | 4/20 [04:26<16:19, 61.25s/trial, best loss: 0.09914801269769669]

2023-11-29 20:20:41,112 INFO hyperopt.tpe: build_posterior_wrapper took 0.001049 seconds
2023-11-29 20:20:41,113 INFO hyperopt.tpe: TPE using 4/4 trials with best loss 0.099148


 25%|██▌       | 5/20 [04:53<12:13, 48.88s/trial, best loss: 0.09914801269769669]

2023-11-29 20:21:08,078 INFO hyperopt.tpe: build_posterior_wrapper took 0.001096 seconds
2023-11-29 20:21:08,079 INFO hyperopt.tpe: TPE using 5/5 trials with best loss 0.099148


 30%|███       | 6/20 [05:45<11:40, 50.05s/trial, best loss: 0.09914801269769669]

2023-11-29 20:22:00,388 INFO hyperopt.tpe: build_posterior_wrapper took 0.003616 seconds
2023-11-29 20:22:00,388 INFO hyperopt.tpe: TPE using 6/6 trials with best loss 0.099148


 35%|███▌      | 7/20 [06:14<09:23, 43.33s/trial, best loss: 0.09914801269769669]

2023-11-29 20:22:29,869 INFO hyperopt.tpe: build_posterior_wrapper took 0.001038 seconds
2023-11-29 20:22:29,869 INFO hyperopt.tpe: TPE using 7/7 trials with best loss 0.099148


 40%|████      | 8/20 [06:56<08:33, 42.76s/trial, best loss: 0.09914801269769669]

2023-11-29 20:23:11,424 INFO hyperopt.tpe: build_posterior_wrapper took 0.001090 seconds
2023-11-29 20:23:11,426 INFO hyperopt.tpe: TPE using 8/8 trials with best loss 0.099148


 45%|████▌     | 9/20 [08:34<10:59, 59.96s/trial, best loss: 0.09914801269769669]

2023-11-29 20:24:49,215 INFO hyperopt.tpe: build_posterior_wrapper took 0.001118 seconds
2023-11-29 20:24:49,215 INFO hyperopt.tpe: TPE using 9/9 trials with best loss 0.099148


 50%|█████     | 10/20 [09:05<08:32, 51.20s/trial, best loss: 0.09914801269769669]

2023-11-29 20:25:20,801 INFO hyperopt.tpe: build_posterior_wrapper took 0.003005 seconds
2023-11-29 20:25:20,801 INFO hyperopt.tpe: TPE using 10/10 trials with best loss 0.099148


 55%|█████▌    | 11/20 [10:11<08:21, 55.70s/trial, best loss: 0.09914801269769669]

2023-11-29 20:26:26,692 INFO hyperopt.tpe: build_posterior_wrapper took 0.001004 seconds
2023-11-29 20:26:26,692 INFO hyperopt.tpe: TPE using 11/11 trials with best loss 0.099148


 60%|██████    | 12/20 [11:29<08:18, 62.29s/trial, best loss: 0.09914801269769669]

2023-11-29 20:27:44,063 INFO hyperopt.tpe: build_posterior_wrapper took 0.001171 seconds
2023-11-29 20:27:44,063 INFO hyperopt.tpe: TPE using 12/12 trials with best loss 0.099148


 65%|██████▌   | 13/20 [12:46<07:48, 66.99s/trial, best loss: 0.09912297129631042]

2023-11-29 20:29:01,854 INFO hyperopt.tpe: build_posterior_wrapper took 0.001029 seconds
2023-11-29 20:29:01,855 INFO hyperopt.tpe: TPE using 13/13 trials with best loss 0.099123


 70%|███████   | 14/20 [14:31<07:49, 78.26s/trial, best loss: 0.09911134093999863]

2023-11-29 20:30:46,172 INFO hyperopt.tpe: build_posterior_wrapper took 0.001065 seconds
2023-11-29 20:30:46,172 INFO hyperopt.tpe: TPE using 14/14 trials with best loss 0.099111


 75%|███████▌  | 15/20 [15:17<05:43, 68.76s/trial, best loss: 0.09911134093999863]

2023-11-29 20:31:32,904 INFO hyperopt.tpe: build_posterior_wrapper took 0.001031 seconds
2023-11-29 20:31:32,904 INFO hyperopt.tpe: TPE using 15/15 trials with best loss 0.099111


 80%|████████  | 16/20 [16:44<04:56, 74.07s/trial, best loss: 0.0991022139787674] 

2023-11-29 20:32:59,305 INFO hyperopt.tpe: build_posterior_wrapper took 0.001118 seconds
2023-11-29 20:32:59,306 INFO hyperopt.tpe: TPE using 16/16 trials with best loss 0.099102


 85%|████████▌ | 17/20 [17:31<03:18, 66.11s/trial, best loss: 0.0991022139787674]

2023-11-29 20:33:46,920 INFO hyperopt.tpe: build_posterior_wrapper took 0.001023 seconds
2023-11-29 20:33:46,921 INFO hyperopt.tpe: TPE using 17/17 trials with best loss 0.099102


 90%|█████████ | 18/20 [18:39<02:12, 66.43s/trial, best loss: 0.0991022139787674]

2023-11-29 20:34:54,094 INFO hyperopt.tpe: build_posterior_wrapper took 0.001032 seconds
2023-11-29 20:34:54,095 INFO hyperopt.tpe: TPE using 18/18 trials with best loss 0.099102


 95%|█████████▌| 19/20 [19:39<01:04, 64.62s/trial, best loss: 0.0991022139787674]

2023-11-29 20:35:54,488 INFO hyperopt.tpe: build_posterior_wrapper took 0.001308 seconds
2023-11-29 20:35:54,488 INFO hyperopt.tpe: TPE using 19/19 trials with best loss 0.099102


100%|██████████| 20/20 [20:11<00:00, 60.56s/trial, best loss: 0.0991022139787674]


#### Once training is complete, we can search the current (or previous) experiments

In [13]:
# Search experiments
# With no arguments this lists the experiments in the current tracking store;
# the returned list is rendered as the cell output.
mlflow.search_experiments()

[<Experiment: artifact_location='file:///notebooks/MLFlow%20Test/mlruns/426164893399017906', creation_time=1701288974927, experiment_id='426164893399017906', last_update_time=1701288974927, lifecycle_stage='active', name='numerai_test_run_0', tags={}>,
 <Experiment: artifact_location='file:///notebooks/MLFlow%20Test/mlruns/0', creation_time=1701288974918, experiment_id='0', last_update_time=1701288974918, lifecycle_stage='active', name='Default', tags={}>]

#### Obtain the model results from an experiment and sort the models by highest Numerai correlation along with their hyperparameters

In [15]:
# Get experiment by name
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # Fail here with a clear message instead of passing experiment_ids=[None]
    # to search_runs and getting an obscure error (or an empty result).
    raise ValueError(
        f"MLflow experiment {experiment_name!r} not found; "
        "run the training cells first."
    )
experiment_id = experiment.experiment_id

# Or get experiment by latest run
# current_run = mlflow.active_run()
# experiment_id = current_run.info.experiment_id if current_run else None

# Get top runs: one row per run, best Numerai correlation first
runs = mlflow.search_runs(experiment_ids=[experiment_id])
top_runs = runs.sort_values(by='metrics.numerai_corr', ascending=False).head(10)

print("Top 10 Runs Based on Numerai Corr - MLFlow")
top_runs[[
    'run_id',
    'metrics.mse',
    'metrics.numerai_corr',
    'params.learning_rate',
    'params.colsample_bytree',
    'params.max_depth',
    'params.n_estimators',
    'params.subsample',
]]

Unnamed: 0,run_id,metrics.mse,metrics.numerai_corr,params.learning_rate,params.colsample_bytree,params.max_depth,params.n_estimators,params.subsample
11,7386b3eeac5943a99a2eb12c3ca6b7ca,0.099219,0.05534,0.0182602196846445,0.839006526386211,6.0,979.0,0.9981472462749852
6,d6a82a07c95b4f3c9d6146b220f44d6e,0.099111,0.054204,0.0090133403391199,0.8271417410062577,7.0,877.0,0.932797232868126
4,2c9496f25d2740cc8498b39b7cbc5fd8,0.099102,0.053856,0.0109376233555204,0.8976333077003177,6.0,819.0,0.7619005618077431
7,f1d3468e76c947588916d38375aeba7f,0.099123,0.052449,0.0096306946924212,0.8878067036822037,5.0,846.0,0.7701944350058726
9,79cec66dee70430ab776f4fe067b4f24,0.099149,0.05172,0.0077615888011414,0.8055985236893589,5.0,691.0,0.7937204861797146
17,f5dd17fe7e724bfd9717460db10ebb21,0.099148,0.051633,0.0193571628642451,0.7243890839103431,6.0,550.0,0.7808929451096948
19,0ce23661f54a4a9b977f5b4c1b6bc273,0.099173,0.049865,0.0055621561923038,0.7638649802405694,5.0,839.0,0.7671657167444388
14,df0625065fb54cb1ba813c542e034926,0.099172,0.049091,0.0073217287441847,0.8389710204431613,7.0,384.0,0.7684901694514159
16,d90fecc9b2e642118e5f638b2ad7ba6e,0.09916,0.049018,0.0167128323344858,0.7501022488372513,4.0,513.0,0.9578330136438512
3,a8749b5d10ca4eee9a66c85a6a85d54c,0.09918,0.047387,0.0145076527677063,0.754984021039177,4.0,497.0,0.8336856220929764


### Re-Create Top Model Based on Numerai Correlation
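- `max_depth` and `n_estimators` are logged as floats (e.g. `6.0`) and come back from MLflow as strings, so they need to be converted to integers
- if you changed the hyperparameter search space, add/remove the corresponding parameters here as well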

In [17]:
# Best run = first row, because top_runs is sorted by numerai_corr descending.
top_model_params = top_runs.iloc[0]

# MLflow returns logged params as strings (e.g. '6.0'), so cast every value
# back to the numeric type XGBoost expects: ints via float for the quantised
# parameters, plain floats for the rest.
max_depth = int(float(top_model_params['params.max_depth']))
n_estimators = int(float(top_model_params['params.n_estimators']))
learning_rate = float(top_model_params['params.learning_rate'])
colsample_bytree = float(top_model_params['params.colsample_bytree'])
subsample = float(top_model_params['params.subsample'])

model = xgb.XGBRegressor(
    learning_rate=learning_rate,
    colsample_bytree=colsample_bytree,
    max_depth=max_depth,
    n_estimators=n_estimators,
    subsample=subsample,
    tree_method='gpu_hist'
)

### IMPORTANT - SET UP TARGET AND TRAINING FEATURES ###
target = 'target_cyrus_v4_20'
# list(...) guarantees a column selector pandas accepts (a set is rejected).
feature_cols = list(train_features)

model.fit(train_df_pre_cutoff[feature_cols], train_df_pre_cutoff[target])

# assign() returns a new frame with the (re)computed `prediction` column,
# instead of writing into a frame that may be a slice of train_df_post_cutoff.
prediction_df = prediction_df.assign(
    prediction=model.predict(train_df_post_cutoff[feature_cols])
)
mse = mean_squared_error(train_df_post_cutoff[target], prediction_df['prediction'])
corr = numerai_corr(prediction_df['prediction'], train_df_post_cutoff[target])

print("MSE:", mse)
print("Numerai Corr:", corr)

# save model
model.save_model('xgboost_mlflow.json')

MSE: 0.099114954
Numerai Corr: 0.05336213965270191
