In [3]:
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

In [13]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import mlflow
import predict as predict_script
import deploy_model as dm

import sys, os
sys.path.append(os.path.abspath('..'))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import utils as ut

In [15]:
import numerapi
napi = numerapi.NumerAPI(verbosity="info")

In [16]:
import logging
logging.basicConfig(level=logging.INFO)

In [17]:
import os


TEST = True
TMP_DOWNLOADS_DIR = "./downloads/"
MODELS_DIR = "./models/"
AWS_CREDS_FL = "~/.aws/personal_credentials"
AWS_CREDS_LL_FL = "~/.aws/credentials"
RUN_COLS = ["run_id", "experiment_id", "artifact_uri", "metrics.sharpe", "params.target", "params.lgbm_params.num_leaves"]
EXPERIMENT_NAME = "all_data_v2"

os.makedirs(TMP_DOWNLOADS_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

## 1. Sunshine model trained on all data

In [7]:
SUNSHINE_MODEL_NAME = "albania"

### 1.1 Download the model

In [None]:
mlflow.set_tracking_uri("http://18.218.213.146:5500")

In [None]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
experiment_id

In [None]:
run_df = mlflow.search_runs(experiment_id)
run_df.columns

In [None]:
run_df[RUN_COLS]

In [None]:
ut.download_from_s3_recursively(
    s3_path=f"s3://numerai-v1/mlflow/{experiment_id}/",
    local_path=TMP_DOWNLOADS_DIR,
    aws_credential_fl=AWS_CREDS_FL,
    dry_run=True,
    flname="model.pkl",
)

In [None]:
if not TEST:
    ut.download_from_s3_recursively(
        s3_path=f"s3://numerai-v1/mlflow/{experiment_id}/",
        local_path=TMP_DOWNLOADS_DIR,
        aws_credential_fl=AWS_CREDS_FL,
        flname="model.pkl",
    )

In [None]:
def extract_target_name(model_path, suffix):
    # ./downloads/d2414f840f844f68a08b1650087cbf8b/artifacts/all_data_traintest_target_tyler_v4_20_all_data_ff2c9e/model.pkl -> tyler_v4_20
    return model_path[model_path.find("target")+7:].split("/")[0][:-len(suffix)-7]

extract_target_name("./downloads/d2414f840f844f68a08b1650087cbf8b/artifacts/all_data_traintest_target_tyler_v4_20_all_data_ff2c9e/model.pkl", suffix="_all_data")

In [None]:
# load the models up into a dict {"model_{target_name}": model} where target name
# is the name is extracted from the model's path.
# ./downloads/d2414f840f844f68a08b1650087cbf8b/artifacts/all_data_traintest_target_tyler_v4_20_all_data_ff2c9e/model.pkl -> tyler_v4_20
# Iterate over all files in this folder recursively
models = {}
for root, dirs, files in os.walk(TMP_DOWNLOADS_DIR):
    for file in files:
        if file != "model.pkl":
            continue
        model_path = os.path.join(root, file)
        print(model_path)
        target_name = extract_target_name(model_path, suffix="_all_data")
        models[f"model_{target_name}"] = ut.unpickle_obj(model_path)

### 1.2 Create the ensemble model and test it

In [None]:
ensemble_model = dm.MultiTargetNeutralModel(models=models)

In [None]:
predict_script.predict(napi=napi, wrapped_model=ensemble_model)

In [None]:
ss_fl = os.path.join(MODELS_DIR, SUNSHINE_MODEL_NAME+".pkl")
ut.pickle_obj(fl=ss_fl, obj=ensemble_model)
print(ss_fl)

In [None]:
ut.upload_to_s3_recursively(
    s3_path="s3://numerai-v1/deployed_models/",
    dir_path="./models/",
    aws_credential_fl=AWS_CREDS_FL,
    dry_run=False,
)

## 2. ArgentinaCV: model trained to optimise cyrus

Uses models trained on several temporal splits of the data. Average each target model's prediction. And combine the predictions of the models.
1. `cyrus_20` - wt 0.7
1. `ben_20` - 0.06
1. `waldo_20` - 0.06
1. `tyler_20` - 0.06
1. `victor_20` - 0.06
1. `nomi_20` - 0.06

In [8]:
from os.path import join

#### WARNING: THE PATH BELOW IS FOR A LAMBDALABS MC

In [9]:
PATH = "/home/ubuntu/"

In [10]:
cv_models_map = {}
for i, expt_nm in enumerate([
    "cv_tgts_vs_cyrus_2023-04-25_20h-20m",
    "cv_tgts_vs_cyrus_2023-04-25_22h-04m",
    "cv_tgts_vs_cyrus_2023-04-25_23h-15m",
]):
    local_fl = join(PATH, f"cv_models_metrics_{i}.pkl")
    ut.download_s3_file(
        s3_path=f"s3://numerai-v1/experiments/{expt_nm}/uploads/cv_models_metrics.pkl",
        local_path=local_fl,
        aws_credential_fl='~/.aws/credentials',
        aws_profile= 'default',
        dry_run=False,
    )
    cv_models_map.update(
        **ut.unpickle_obj(local_fl)
    )

[2023-05-02 04:05:47]  Loading aws credenitals from ~/.aws/credentials...
[2023-05-02 04:05:47]  Would have downloaded s3://numerai-v1/experiments/cv_tgts_vs_cyrus_2023-04-25_20h-20m/uploads/cv_models_metrics.pkl to /home/ubuntu/cv_models_metrics_0.pkl. But /home/ubuntu/cv_models_metrics_0.pkl exists. Will not download again ...
[2023-05-02 04:05:55]  Loading aws credenitals from ~/.aws/credentials...
[2023-05-02 04:05:55]  Would have downloaded s3://numerai-v1/experiments/cv_tgts_vs_cyrus_2023-04-25_22h-04m/uploads/cv_models_metrics.pkl to /home/ubuntu/cv_models_metrics_1.pkl. But /home/ubuntu/cv_models_metrics_1.pkl exists. Will not download again ...
[2023-05-02 04:06:03]  Loading aws credenitals from ~/.aws/credentials...
[2023-05-02 04:06:03]  Would have downloaded s3://numerai-v1/experiments/cv_tgts_vs_cyrus_2023-04-25_23h-15m/uploads/cv_models_metrics.pkl to /home/ubuntu/cv_models_metrics_2.pkl. But /home/ubuntu/cv_models_metrics_2.pkl exists. Will not download again ...


In [11]:
print(list(cv_models_map))

['target_nomi_v4_20', 'target_nomi_v4_60', 'target_tyler_v4_20', 'target_tyler_v4_60', 'target_victor_v4_20', 'target_victor_v4_60', 'target_ralph_v4_20', 'target_ralph_v4_60', 'target_waldo_v4_20', 'target_waldo_v4_60', 'target_jerome_v4_20', 'target_jerome_v4_60', 'target_janet_v4_20', 'target_janet_v4_60', 'target_ben_v4_20', 'target_ben_v4_60', 'target_alan_v4_20', 'target_alan_v4_60', 'target_paul_v4_20', 'target_paul_v4_60', 'target_george_v4_20', 'target_george_v4_60', 'target_william_v4_20', 'target_william_v4_60', 'target_arthur_v4_20', 'target_arthur_v4_60', 'target_thomas_v4_20', 'target_thomas_v4_60', 'target_cyrus_v4_20', 'target_cyrus_v4_60', 'target_caroline_v4_20', 'target_caroline_v4_60', 'target_sam_v4_20', 'target_sam_v4_60', 'target_xerxes_v4_20', 'target_xerxes_v4_60']


In [12]:
argentina_targets = [
    "target_cyrus_v4_20",
    "target_nomi_v4_20",
    "target_waldo_v4_20",
    "target_ben_v4_20",
    "target_victor_v4_20",
    "target_tyler_v4_20",
]
argentina_cv_models = {
    f"{tgt}_cv{cv}": mdl
    for tgt in argentina_targets
    for cv, mdl in enumerate(cv_models_map[tgt]["models"])
}

In [13]:
argentina_cv_models.keys()

dict_keys(['target_cyrus_v4_20_cv0', 'target_cyrus_v4_20_cv1', 'target_cyrus_v4_20_cv2', 'target_nomi_v4_20_cv0', 'target_nomi_v4_20_cv1', 'target_nomi_v4_20_cv2', 'target_waldo_v4_20_cv0', 'target_waldo_v4_20_cv1', 'target_waldo_v4_20_cv2', 'target_ben_v4_20_cv0', 'target_ben_v4_20_cv1', 'target_ben_v4_20_cv2', 'target_victor_v4_20_cv0', 'target_victor_v4_20_cv1', 'target_victor_v4_20_cv2', 'target_tyler_v4_20_cv0', 'target_tyler_v4_20_cv1', 'target_tyler_v4_20_cv2'])

In [15]:
from predict import argentina_ensemble

In [16]:
argentinacv_no_neutralisation = dm.MultiTargetNeutralModel(
    models=argentina_cv_models,
    neutralisation_cols=None,
    neutralisation_prop=None,
    ensembling_fn=argentina_ensemble,
)
pred_argcv_nontr_df = predict_script.predict(napi=napi, wrapped_model=argentinacv_no_neutralisation)
display(pred_argcv_nontr_df)
display(pred_argcv_nontr_df.describe())

[2023-05-02 04:07:10,377] INFO - reading prediction data
[2023-05-02 04:07:10,781] INFO - target file already exists
[2023-05-02 04:07:10,782] INFO - download complete
[2023-05-02 04:07:10,783] INFO - Downloaded live data to v4.1/live_474.parquet...
[2023-05-02 04:07:10,945] INFO - generating predictions


Predicting for each model: 100%|██████████| 18/18 [00:09<00:00,  2.00it/s]

[2023-05-02 04:07:19,986] INFO - Ensembling predictions with argentina_ensemble(): ['pred_target_cyrus_v4_20_cv0', 'pred_target_cyrus_v4_20_cv1', 'pred_target_cyrus_v4_20_cv2', 'pred_target_nomi_v4_20_cv0', 'pred_target_nomi_v4_20_cv1', 'pred_target_nomi_v4_20_cv2', 'pred_target_waldo_v4_20_cv0', 'pred_target_waldo_v4_20_cv1', 'pred_target_waldo_v4_20_cv2', 'pred_target_ben_v4_20_cv0', 'pred_target_ben_v4_20_cv1', 'pred_target_ben_v4_20_cv2', 'pred_target_victor_v4_20_cv0', 'pred_target_victor_v4_20_cv1', 'pred_target_victor_v4_20_cv2', 'pred_target_tyler_v4_20_cv0', 'pred_target_tyler_v4_20_cv1', 'pred_target_tyler_v4_20_cv2']
[2023-05-02 04:07:19,993] INFO - Taking the rank percent





Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n001d4bff193beba,0.318530
n001dcc72fca06e0,0.611425
n00244b892d7c135,0.408155
n003333964d4a093,0.154824
n00353bdc5018f63,0.829633
...,...
nffc9d6ca15bd9e8,0.120711
nffcdc07844390ea,0.032499
nffd7c7d4bbe2065,0.156036
nffe5e4e5af3c3dd,0.014534


Unnamed: 0,prediction
count,4954.0
mean,0.500101
std,0.288704
min,0.000202
25%,0.250151
50%,0.500101
75%,0.75005
max,1.0


In [17]:
argentinacv_ntr_50p = dm.MultiTargetNeutralModel(
    models=argentina_cv_models,
    neutralisation_cols="all",
    neutralisation_prop=0.5,
    ensembling_fn=argentina_ensemble,
)
pred_argcv_ntr50_df = predict_script.predict(napi=napi, wrapped_model=argentinacv_ntr_50p)
display(pred_argcv_ntr50_df)
display(pred_argcv_ntr50_df.describe())

[2023-05-02 04:07:20,019] INFO - reading prediction data
[2023-05-02 04:07:20,375] INFO - target file already exists
[2023-05-02 04:07:20,376] INFO - download complete
[2023-05-02 04:07:20,379] INFO - Downloaded live data to v4.1/live_474.parquet...
[2023-05-02 04:07:20,492] INFO - generating predictions


Predicting for each model: 100%|██████████| 18/18 [00:08<00:00,  2.01it/s]

[2023-05-02 04:07:29,450] INFO - Ensembling predictions with argentina_ensemble(): ['pred_target_cyrus_v4_20_cv0', 'pred_target_cyrus_v4_20_cv1', 'pred_target_cyrus_v4_20_cv2', 'pred_target_nomi_v4_20_cv0', 'pred_target_nomi_v4_20_cv1', 'pred_target_nomi_v4_20_cv2', 'pred_target_waldo_v4_20_cv0', 'pred_target_waldo_v4_20_cv1', 'pred_target_waldo_v4_20_cv2', 'pred_target_ben_v4_20_cv0', 'pred_target_ben_v4_20_cv1', 'pred_target_ben_v4_20_cv2', 'pred_target_victor_v4_20_cv0', 'pred_target_victor_v4_20_cv1', 'pred_target_victor_v4_20_cv2', 'pred_target_tyler_v4_20_cv0', 'pred_target_tyler_v4_20_cv1', 'pred_target_tyler_v4_20_cv2']
[2023-05-02 04:07:29,454] INFO - Neutralising the predictions



100%|██████████| 1/1 [00:08<00:00,  8.61s/it]

[2023-05-02 04:07:38,071] INFO - Taking the rank percent





Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n001d4bff193beba,0.293904
n001dcc72fca06e0,0.405531
n00244b892d7c135,0.426524
n003333964d4a093,0.067824
n00353bdc5018f63,0.861324
...,...
nffc9d6ca15bd9e8,0.138474
nffcdc07844390ea,0.036940
nffd7c7d4bbe2065,0.085386
nffe5e4e5af3c3dd,0.012919


Unnamed: 0,prediction
count,4954.0
mean,0.500101
std,0.288704
min,0.000202
25%,0.250151
50%,0.500101
75%,0.75005
max,1.0


#### Does neutralisation do anything?

In [18]:
(pred_argcv_ntr50_df==pred_argcv_nontr_df).mean()

prediction    0.000807
dtype: float64

### Pickle and save the neutralised and non neutralised models

In [19]:
argentinacv_nontr_fl = os.path.join(MODELS_DIR, "argentinacv_no_ntr.pkl")
ut.pickle_obj(fl=argentinacv_nontr_fl, obj=argentinacv_no_neutralisation)
print(argentinacv_nontr_fl)

argentinacv_ntr50p_fl = os.path.join(MODELS_DIR, "argentinacv_ntr50p.pkl")
ut.pickle_obj(fl=argentinacv_ntr50p_fl, obj=argentinacv_ntr_50p)
print(argentinacv_nontr_fl)

./models/argentinacv_no_ntr.pkl
./models/argentinacv_no_ntr.pkl


In [20]:
ut.upload_to_s3_recursively(
    s3_path="s3://numerai-v1/deployed_models/",
    dir_path="./models/",
    aws_credential_fl=AWS_CREDS_LL_FL,
    dry_run=False,
)

[2023-05-02 04:08:02]  Loading aws credenitals from ~/.aws/credentials...
[2023-05-02 04:08:02]  Uploading ./models/argentinacv_no_ntr.pkl to s3://numerai-v1/deployed_models/argentinacv_no_ntr.pkl
[2023-05-02 04:08:19]  Uploading ./models/argentinacv_ntr50p.pkl to s3://numerai-v1/deployed_models/argentinacv_ntr50p.pkl
[2023-05-02 04:08:37]  Uploading ./models/argentina_no_ntr.pkl to s3://numerai-v1/deployed_models/argentina_no_ntr.pkl


## 3. Argentina: model trained to optimise cyrus

Uses models trained on the entire daaset. Combine predictions with
1. `cyrus_20` - wt 0.7
1. `ben_20` - 0.06
1. `waldo_20` - 0.06
1. `tyler_20` - 0.06
1. `victor_20` - 0.06
1. `nomi_20` - 0.06

In [8]:
!aws s3 cp --recursive s3://numerai-v1/experiments/train_on_all_data_2023-05-01_16h-10m/models/ ./

download: s3://numerai-v1/experiments/train_on_all_data_2023-05-01_16h-10m/models/model_cyrus_v4_20.pkl.pkl to ./model_cyrus_v4_20.pkl.pkl
download: s3://numerai-v1/experiments/train_on_all_data_2023-05-01_16h-10m/models/model_nomi_v4_20.pkl.pkl to ./model_nomi_v4_20.pkl.pkl
download: s3://numerai-v1/experiments/train_on_all_data_2023-05-01_16h-10m/models/model_victor_v4_20.pkl.pkl to ./model_victor_v4_20.pkl.pkl
download: s3://numerai-v1/experiments/train_on_all_data_2023-05-01_16h-10m/models/model_ben_v4_20.pkl.pkl to ./model_ben_v4_20.pkl.pkl
download: s3://numerai-v1/experiments/train_on_all_data_2023-05-01_16h-10m/models/model_tyler_v4_20.pkl.pkl to ./model_tyler_v4_20.pkl.pkl
download: s3://numerai-v1/experiments/train_on_all_data_2023-05-01_16h-10m/models/model_waldo_v4_20.pkl.pkl to ./model_waldo_v4_20.pkl.pkl


In [21]:
argn_mdls = {
    tgt: ut.unpickle_obj(fl=f"./model_{tgt}_v4_20.pkl.pkl")
    for tgt in ["cyrus", "tyler", "ben", "waldo", "victor", "nomi"]
}

In [25]:
argentina_no_ntr = dm.MultiTargetNeutralModel(
    models=argn_mdls,
    neutralisation_cols=None,
    neutralisation_prop=None,
    ensembling_fn=argentina_ensemble,
)
pred_arg_nontr_df = predict_script.predict(napi=napi, wrapped_model=argentina_no_ntr)
display(pred_arg_nontr_df)
display(pred_arg_nontr_df.describe())

[2023-05-02 04:09:22,666] INFO - reading prediction data
[2023-05-02 04:09:23,026] INFO - target file already exists
[2023-05-02 04:09:23,027] INFO - download complete
[2023-05-02 04:09:23,030] INFO - Downloaded live data to v4.1/live_474.parquet...
[2023-05-02 04:09:23,174] INFO - generating predictions


Predicting for each model: 100%|██████████| 6/6 [00:07<00:00,  1.25s/it]

[2023-05-02 04:09:30,670] INFO - Ensembling predictions with argentina_ensemble(): ['pred_cyrus', 'pred_tyler', 'pred_ben', 'pred_waldo', 'pred_victor', 'pred_nomi']
[2023-05-02 04:09:30,674] INFO - Taking the rank percent





Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n001d4bff193beba,0.068228
n001dcc72fca06e0,0.265644
n00244b892d7c135,0.383932
n003333964d4a093,0.620509
n00353bdc5018f63,0.922285
...,...
nffc9d6ca15bd9e8,0.158660
nffcdc07844390ea,0.107590
nffd7c7d4bbe2065,0.054905
nffe5e4e5af3c3dd,0.435204


Unnamed: 0,prediction
count,4954.0
mean,0.500101
std,0.288704
min,0.000202
25%,0.250151
50%,0.500101
75%,0.75005
max,1.0


In [27]:
argentina_ntr_50p = dm.MultiTargetNeutralModel(
    models=argn_mdls,
    neutralisation_cols="all",
    neutralisation_prop=0.5,
    ensembling_fn=argentina_ensemble,
)
pred_arg_ntr50_df = predict_script.predict(napi=napi, wrapped_model=argentina_ntr_50p)
display(pred_arg_ntr50_df)
display(pred_arg_ntr50_df.describe())

[2023-05-02 04:09:41,357] INFO - reading prediction data
[2023-05-02 04:09:41,771] INFO - target file already exists
[2023-05-02 04:09:41,778] INFO - download complete
[2023-05-02 04:09:41,780] INFO - Downloaded live data to v4.1/live_474.parquet...
[2023-05-02 04:09:41,922] INFO - generating predictions


Predicting for each model: 100%|██████████| 6/6 [00:07<00:00,  1.26s/it]

[2023-05-02 04:09:49,488] INFO - Ensembling predictions with argentina_ensemble(): ['pred_cyrus', 'pred_tyler', 'pred_ben', 'pred_waldo', 'pred_victor', 'pred_nomi']
[2023-05-02 04:09:49,492] INFO - Neutralising the predictions



100%|██████████| 1/1 [00:09<00:00,  9.64s/it]

[2023-05-02 04:09:59,135] INFO - Taking the rank percent





Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n001d4bff193beba,0.106782
n001dcc72fca06e0,0.239806
n00244b892d7c135,0.462858
n003333964d4a093,0.473960
n00353bdc5018f63,0.811869
...,...
nffc9d6ca15bd9e8,0.177432
nffcdc07844390ea,0.108599
nffd7c7d4bbe2065,0.045418
nffe5e4e5af3c3dd,0.348203


Unnamed: 0,prediction
count,4954.0
mean,0.500101
std,0.288704
min,0.000202
25%,0.250151
50%,0.500101
75%,0.75005
max,1.0


In [28]:
(pred_arg_ntr50_df==pred_arg_nontr_df).mean()

prediction    0.00222
dtype: float64

In [30]:
argentina_nontr_fl = os.path.join(MODELS_DIR, "argentina_no_ntr.pkl")
ut.pickle_obj(fl=argentina_nontr_fl, obj=argentina_no_ntr)
print(argentina_nontr_fl)

argentina_ntr50p_fl = os.path.join(MODELS_DIR, "argentina_ntr50p.pkl")
ut.pickle_obj(fl=argentina_ntr50p_fl, obj=argentina_ntr_50p)
print(argentina_nontr_fl)

./models/argentina_no_ntr.pkl
./models/argentina_no_ntr.pkl


In [32]:
!aws s3 sync ./models/  s3://numerai-v1/deployed_models/

upload: models/argentina_no_ntr.pkl to s3://numerai-v1/deployed_models/argentina_no_ntr.pkl
upload: models/argentina_ntr50p.pkl to s3://numerai-v1/deployed_models/argentina_ntr50p.pkl


### Predicting

In [33]:
import pandas as pd
import pickle
import numerapi
import numpy as np

In [34]:
napi = numerapi.NumerAPI()

In [35]:
model = pickle.load(open("./models/argentina_no_ntr.pkl", "rb"))

In [36]:
!aws s3 cp --profile hfprod s3://numerai-data-gen/v3-staging/tournament/v4.1/encrypted/live_int8/20230525.parquet ./

download: s3://numerai-data-gen/v3-staging/tournament/v4.1/encrypted/live_int8/20230525.parquet to ./20230525.parquet


In [37]:
live_data = pd.read_parquet("./20230525.parquet")

In [38]:
feats = [c for c in live_data.columns if c.startswith("feature")]
live_data[feats] = live_data[feats].astype(np.int8)

In [39]:
df = model.predict(live_data)

Predicting for each model: 100%|███████| 6/6 [00:21<00:00,  3.64s/it]
2023-05-28 15:49:06,898 INFO deploy_model: Ensembling predictions with argentina_ensemble(): ['pred_cyrus', 'pred_tyler', 'pred_ben', 'pred_waldo', 'pred_victor', 'pred_nomi']
2023-05-28 15:49:06,904 INFO deploy_model: Taking the rank percent


In [40]:
df

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n000c055120eb41e,0.680842
n00270efaea14e80,0.164439
n0028328b39c018b,0.318145
n002d212adf91a75,0.964358
n003790fb6e38334,0.531794
...,...
nff96f4d02aab0ba,0.296476
nffa9ae4ab3e7db0,0.594573
nffc43f16748182f,0.875253
nfff1bb4e5ba9473,0.122114


In [41]:
print(df)

                  prediction
id                          
n000c055120eb41e    0.680842
n00270efaea14e80    0.164439
n0028328b39c018b    0.318145
n002d212adf91a75    0.964358
n003790fb6e38334    0.531794
...                      ...
nff96f4d02aab0ba    0.296476
nffa9ae4ab3e7db0    0.594573
nffc43f16748182f    0.875253
nfff1bb4e5ba9473    0.122114
nfff4b7214e4f4ba    0.848522

[4938 rows x 1 columns]


## Optional Sanity checks

In [84]:
import numerapi
from predict import argentina_ensemble
napi = numerapi.NumerAPI()

rnd = napi.get_current_round()
rnd

502

In [70]:
live_fl = f"v4.1/live_{rnd}.parquet"
napi.download_dataset("v4.1/live_int8.parquet", live_fl)

[2023-06-08 13:32:33,894] INFO - target file already exists
[2023-06-08 13:32:33,897] INFO - download complete


'v4.1/live_502.parquet'

In [45]:
def fmt_features(df, features, int8: bool, impute: bool):
    if impute:
        df.loc[:, features] = df.loc[:, features].fillna(
            df[features].median(skipna=True)
        )
    df["era"] = df["era"].astype(int)
    df.loc[:, features] = df.loc[:, features].astype(np.int8 if int8 else np.float32)
    return df

In [28]:
argn_mdls = {
    tgt: ut.unpickle_obj(fl=f"./model_{tgt}_v4_20.pkl.pkl")
    for tgt in ["cyrus", "tyler", "ben", "waldo", "victor", "nomi"]
}

In [77]:
features = argn_mdls["cyrus"].feature_name_
full_df = pd.read_parquet(
    live_fl,
    columns=features + ["era"],
)
full_df["era"] = rnd
df = fmt_features(
    df=full_df,
    features=features,
    int8=True,
    impute=True,
)
df_feat = df[features].astype(np.int8)

In [79]:
df_feat.head(2)

Unnamed: 0_level_0,feature_honoured_observational_balaamite,feature_polaroid_vadose_quinze,feature_untidy_withdrawn_bargeman,feature_genuine_kyphotic_trehala,feature_unenthralled_sportful_schoolhouse,feature_divulsive_explanatory_ideologue,feature_ichthyotic_roofed_yeshiva,feature_waggly_outlandish_carbonisation,feature_floriated_amish_sprite,feature_iconoclastic_parietal_agonist,...,feature_aged_phylacterical_pusey,feature_revisional_ablutionary_depression,feature_yokelish_metapsychological_lunt,feature_circumlunar_chaliced_seam,feature_squallier_prototypal_dammar,feature_cognate_elating_ravine,feature_ethiopian_carminative_retentivity,feature_alabamian_outlying_monitoring,feature_byzantine_festinate_mannose,feature_sleetier_sea_potamogeton
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000124edbee5931,3,2,0,4,0,3,1,0,2,2,...,0,0,0,0,0,0,3,3,0,0
n0006fd05e5c5171,4,4,4,4,0,4,4,1,2,0,...,4,4,4,4,4,4,4,4,4,4


In [80]:
print("Unwrapped cyrus")
pd.Series(argn_mdls["cyrus"].predict(df_feat)).rank(pct=True).to_frame()

Unwrapped cyrus


Unnamed: 0,0
0,0.840941
1,0.854737
2,0.102658
3,0.095354
4,0.730980
...,...
4924,0.936093
4925,0.480219
4926,0.683912
4927,0.726922


In [52]:
wrapped_cyrus = dm.MultiTargetNeutralModel(
    models={"cyrus": argn_mdls["cyrus"]},
    neutralisation_cols=None,
    neutralisation_prop=None,
)

In [81]:
lcl_cyrus_pred = wrapped_cyrus.predict(df_feat)
lcl_cyrus_pred

Predicting for each model: 100%|████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.94s/it]

[2023-06-08 13:35:29,510] INFO - Ensembling predictions with mean(): ['pred_cyrus']
[2023-06-08 13:35:29,515] INFO - Taking the rank percent





Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n000124edbee5931,0.840941
n0006fd05e5c5171,0.854737
n0008241720e02e0,0.102658
n002cecd72ff9453,0.095354
n0033b5ad4d2f9a0,0.730980
...,...
nffcf4bdcf971590,0.936093
nffd07e017f3def4,0.480219
nfff296ce15d1d13,0.683912
nfff505ecc1ec6ad,0.726922


In [85]:
wrapped_argn = dm.MultiTargetNeutralModel(
    models=argn_mdls,
    neutralisation_cols=None,
    neutralisation_prop=None,
    ensembling_fn=argentina_ensemble,
)
lcl_argn_pred = wrapped_argn.predict(df_feat)
lcl_argn_pred

Predicting for each model: 100%|████████████████████████████████████████████████████| 6/6 [00:30<00:00,  5.01s/it]

[2023-06-08 13:41:47,636] INFO - Ensembling predictions with argentina_ensemble(): ['pred_cyrus', 'pred_tyler', 'pred_ben', 'pred_waldo', 'pred_victor', 'pred_nomi']
[2023-06-08 13:41:47,649] INFO - Taking the rank percent





Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n000124edbee5931,0.840941
n0006fd05e5c5171,0.854737
n0008241720e02e0,0.102658
n002cecd72ff9453,0.095354
n0033b5ad4d2f9a0,0.730980
...,...
nffcf4bdcf971590,0.936093
nffd07e017f3def4,0.480219
nfff296ce15d1d13,0.683912
nfff505ecc1ec6ad,0.726922


In [86]:
lcl_cyrus_pred.equals(lcl_argn_pred)

True

In [87]:
predict_script.predict(napi=napi, wrapped_model=wrapped_argn)

[2023-06-08 13:44:13,425] INFO - reading prediction data
[2023-06-08 13:44:14,545] INFO - target file already exists
[2023-06-08 13:44:14,546] INFO - download complete
[2023-06-08 13:44:14,549] INFO - Downloaded live data to v4.1/live_502.parquet...
[2023-06-08 13:44:14,803] INFO - generating predictions


Predicting for each model: 100%|████████████████████████████████████████████████████| 6/6 [00:27<00:00,  4.53s/it]

[2023-06-08 13:44:42,008] INFO - Ensembling predictions with argentina_ensemble(): ['pred_cyrus', 'pred_tyler', 'pred_ben', 'pred_waldo', 'pred_victor', 'pred_nomi']
[2023-06-08 13:44:42,020] INFO - Taking the rank percent





Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n000124edbee5931,0.326638
n0006fd05e5c5171,0.900183
n0008241720e02e0,0.485291
n002cecd72ff9453,0.500304
n0033b5ad4d2f9a0,0.695882
...,...
nffcf4bdcf971590,0.727937
nffd07e017f3def4,0.434368
nfff296ce15d1d13,0.061676
nfff505ecc1ec6ad,0.132887
