In [1]:
import pandas as pd
import json
from numerapi import NumerAPI
import gc

In [2]:
napi = NumerAPI()
current_round = napi.get_current_round(tournament=8)
print("Current round: {}".format(current_round))

Current round: 299


In [3]:
print("Downloading dataset files...")
napi.download_dataset("numerai_training_data.parquet", "training_data.parquet")
napi.download_dataset("numerai_tournament_data.parquet", f"tournament_data_{current_round}.parquet")
napi.download_dataset("numerai_validation_data.parquet", f"validation_data.parquet")
napi.download_dataset("features.json", "features.json")

Downloading dataset files...


2022-01-19 00:46:23,716 INFO numerapi.utils: target file already exists
2022-01-19 00:46:23,717 INFO numerapi.utils: download complete
2022-01-19 00:46:25,464 INFO numerapi.utils: target file already exists
2022-01-19 00:46:25,465 INFO numerapi.utils: download complete
2022-01-19 00:46:27,085 INFO numerapi.utils: target file already exists
2022-01-19 00:46:27,086 INFO numerapi.utils: download complete
2022-01-19 00:46:28,788 INFO numerapi.utils: target file already exists
2022-01-19 00:46:28,789 INFO numerapi.utils: download complete


In [4]:
ERA_COL = "era"
TARGET_COL = "target_nomi_20"
DATA_TYPE_COL = "data_type"

print('Reading minimal training data')
# read the feature metadata amd get the "small" feature set
with open("features.json", "r") as f:
    feature_metadata = json.load(f)
features = feature_metadata["feature_sets"]["small"]
# read in just those features along with era and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

print(read_columns)

Reading minimal training data
['feature_agile_unrespited_gaucho', 'feature_antichristian_slangiest_idyllist', 'feature_apomictical_motorized_vaporisation', 'feature_assenting_darn_arthropod', 'feature_beery_somatologic_elimination', 'feature_bhutan_imagism_dolerite', 'feature_branched_dilatory_sunbelt', 'feature_buxom_curtained_sienna', 'feature_cambial_bigoted_bacterioid', 'feature_canalicular_peeling_lilienthal', 'feature_chuffier_analectic_conchiolin', 'feature_communicatory_unrecommended_velure', 'feature_crowning_frustrate_kampala', 'feature_exorbitant_myeloid_crinkle', 'feature_flintier_enslaved_borsch', 'feature_glare_factional_assessment', 'feature_grandmotherly_circumnavigable_homonymity', 'feature_gullable_sanguine_incongruity', 'feature_haziest_lifelike_horseback', 'feature_introvert_symphysial_assegai', 'feature_jerkwater_eustatic_electrocardiograph', 'feature_lofty_acceptable_challenge', 'feature_moralistic_heartier_typhoid', 'feature_planned_superimposed_bend', 'feature_s

In [5]:
# note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
# if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
training_data = pd.read_parquet('training_data.parquet', columns=read_columns)

training_data

Unnamed: 0_level_0,feature_agile_unrespited_gaucho,feature_antichristian_slangiest_idyllist,feature_apomictical_motorized_vaporisation,feature_assenting_darn_arthropod,feature_beery_somatologic_elimination,feature_bhutan_imagism_dolerite,feature_branched_dilatory_sunbelt,feature_buxom_curtained_sienna,feature_cambial_bigoted_bacterioid,feature_canalicular_peeling_lilienthal,...,feature_undivorced_unsatisfying_praetorium,feature_unforbidden_highbrow_kafir,feature_univalve_abdicant_distrail,feature_unsealed_suffixal_babar,feature_unvaried_social_bangkok,feature_unwonted_trusted_fixative,feature_winsome_irreproachable_milkfish,era,data_type,target_nomi_20
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0.75,1.00,0.50,0.25,1.00,0.00,0.50,0.50,0.00,0.75,...,0.50,0.00,0.00,0.50,0.25,1.00,0.75,0001,train,0.25
n003bee128c2fcfc,0.50,0.25,0.50,0.75,0.50,0.25,0.75,0.75,0.25,0.25,...,0.75,0.25,0.75,0.50,1.00,0.25,0.25,0001,train,0.75
n0048ac83aff7194,0.50,0.75,1.00,0.75,0.75,1.00,0.75,0.75,0.75,0.75,...,1.00,1.00,0.00,0.75,1.00,0.75,0.25,0001,train,0.50
n00691bec80d3e02,0.50,0.25,0.50,0.50,0.00,0.75,0.00,0.00,0.50,0.50,...,0.00,0.75,0.00,0.25,0.00,0.00,0.00,0001,train,0.75
n00b8720a2fdc4f2,0.00,1.00,0.25,1.00,0.00,0.00,0.25,0.25,0.75,0.25,...,0.50,0.00,0.00,0.00,0.50,0.50,0.00,0001,train,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffcc1dbdf2212e6,1.00,1.00,1.00,0.50,1.00,1.00,1.00,1.00,0.50,1.00,...,0.75,1.00,0.50,0.75,0.75,0.75,0.25,0574,train,0.75
nffd71b7f6a128df,0.00,0.50,0.25,0.25,1.00,0.50,0.00,0.25,0.75,0.00,...,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0574,train,0.00
nffde3b371d67394,0.75,1.00,1.00,1.00,0.00,1.00,1.00,1.00,0.50,1.00,...,0.00,0.25,0.50,0.75,0.00,0.75,0.50,0574,train,0.25
nfff1a1111b35e84,0.50,0.00,0.00,1.00,0.50,0.25,1.00,1.00,0.25,0.00,...,0.25,0.00,0.25,0.50,0.25,1.00,0.25,0574,train,0.50


In [6]:
gc.collect()

print('Reading minimal features of validation and tournament data...')
validation_data = pd.read_parquet('validation_data.parquet',
                                  columns=read_columns)
tournament_data = pd.read_parquet(f'tournament_data_{current_round}.parquet',
                                  columns=read_columns)
nans_per_col = tournament_data[tournament_data["data_type"] == "live"].isna().sum()

# check for nans and fill nans
if nans_per_col.any():
    total_rows = len(tournament_data[tournament_data["data_type"] == "live"])
    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    tournament_data.loc[:, features] = tournament_data.loc[:, features].fillna(0.5)
else:
    print("No nans in the features this week!")

Reading minimal features of validation and tournament data...
Number of nans per column this week: target_nomi_20    5350
dtype: int64
out of 5350 total rows
filling nans with 0.5


In [10]:
validation_data

Unnamed: 0_level_0,feature_agile_unrespited_gaucho,feature_antichristian_slangiest_idyllist,feature_apomictical_motorized_vaporisation,feature_assenting_darn_arthropod,feature_beery_somatologic_elimination,feature_bhutan_imagism_dolerite,feature_branched_dilatory_sunbelt,feature_buxom_curtained_sienna,feature_cambial_bigoted_bacterioid,feature_canalicular_peeling_lilienthal,...,feature_undivorced_unsatisfying_praetorium,feature_unforbidden_highbrow_kafir,feature_univalve_abdicant_distrail,feature_unsealed_suffixal_babar,feature_unvaried_social_bangkok,feature_unwonted_trusted_fixative,feature_winsome_irreproachable_milkfish,era,data_type,target_nomi_20
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000777698096000,0.00,0.75,0.25,0.00,0.50,0.25,0.00,0.00,0.00,0.50,...,0.50,0.50,0.75,0.00,0.50,0.50,1.00,0857,validation,0.25
n0009793a3b91c27,1.00,0.75,0.50,0.25,0.25,0.50,0.75,0.75,0.50,1.00,...,0.25,0.25,0.75,1.00,0.50,0.50,0.50,0857,validation,0.50
n00099ccd6698ab0,0.25,1.00,1.00,0.50,0.75,1.00,1.00,1.00,0.25,1.00,...,0.25,1.00,0.00,0.25,0.25,1.00,0.25,0857,validation,0.00
n0019e36bbb8702b,0.25,0.00,0.25,1.00,0.00,0.25,0.25,0.50,0.50,0.00,...,0.25,0.25,0.50,0.00,0.00,0.00,0.50,0857,validation,0.50
n0028cb874439df8,0.50,0.75,0.75,0.50,0.50,0.50,0.75,0.75,1.00,1.00,...,1.00,0.25,0.25,0.50,1.00,0.75,0.00,0857,validation,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffbe5152c321f92,0.00,0.50,0.00,0.50,0.75,0.00,0.00,0.00,1.00,0.25,...,0.00,0.50,0.00,0.00,0.00,0.00,0.50,0961,validation,0.50
nffc011b4baa54c3,0.25,0.75,0.50,0.50,0.00,1.00,0.75,0.25,0.50,1.00,...,1.00,0.75,0.75,0.25,1.00,0.25,0.50,0961,validation,0.50
nffc12b2a846ab4e,1.00,1.00,0.50,0.00,0.00,1.00,0.75,0.75,1.00,1.00,...,0.75,1.00,1.00,0.75,0.75,0.50,0.75,0961,validation,0.00
nffc3c5ab0235de0,1.00,1.00,0.25,1.00,0.75,1.00,0.75,0.75,0.50,0.25,...,1.00,1.00,0.75,1.00,1.00,1.00,1.00,0961,validation,0.50
