In [1]:
# Load libraries
import sys
sys.path.append( "/kaggle/input/my-ariel-library/" )
import ariel_support as ars # general support functions and the data loader
import ariel_gp as arg # The model
import pandas as pd
import numpy as np
import copy
import dill
import IPython

Next, we configure our data loader and model, as it will be used during training. Test set specific tweaks come later. The 'include_later_optimizations' option determines whether we get my original best submission (False) or include later learnings (True).

In [2]:
include_later_optimizations = True

# Configure loader
loader = ars.DataLoader()
loader.loader_options = ars.baseline_loader(include_later_optimization=include_later_optimizations)

# Configure model
model = arg.baseline_model(include_later_optimization=include_later_optimizations)

if include_later_optimizations:
    trained_model_filename = 'trained_model_optimized.pickle'
else:
    trained_model_filename = 'trained_model.pickle'

We're now ready to train our model. By default this is only done if we're not running in the Kaggle environment, but you can alter this in the first line below. If we don't train, we will get our trained model from the earlier stored "trained_model.pickle". 

With the default model, only two parameters are trained: the multiplication applied to all sigma's (trained_model.fudge_value), and the multiplication applied to the mean of every transit prediction (trained_model.model.bias). With the optimized model, the training actually does nothing...

The pickle file for the default model is quite large (~400 MB) because it caches some internal results, which speeds up inference if we do it on the training data later. These caches do not end up being used during inference on test data.

In [3]:
do_training = not ars.running_on_kaggle
untrained_model = copy.deepcopy(model)
if do_training:
    assert len(ars.test_planet_list)==1, "Cannot train when submitted"
    loader.planet_ids_to_load = ars.train_planet_list

    # Load training data and train model on it
    train_data = loader.load()
    model.train(train_data)
    
    pickle_data = dict()        
    pickle_data['untrained_model'] = untrained_model
    pickle_data['trained_model'] = model
    ars.pickle_save(ars.file_loc()+trained_model_filename, pickle_data)
pickle_data = ars.pickle_load(ars.file_loc()+trained_model_filename)
assert dill.dumps(untrained_model) == dill.dumps(pickle_data['untrained_model']), "trained_model.pickle is not consistent with configured model; perhaps you need to redo training above?"
trained_model = pickle_data['trained_model']

If desired we can tweak our model now, to account for differences between the train and test set.

In [4]:
# These values were found by hill climbing the public test set
if not include_later_optimizations:
    trained_model.fudge_value += 0.058
    trained_model.model.bias += -0.0015

Now that we have our trained_model - trained just now or imported - we're ready to apply it to the test data. If we've not actually been submitted, we instead use the first 5 training planets. We can't use the single test planet because some model variations don't work on a single planet.

In [5]:
%%time
# Load data
if len(ars.test_planet_list)==1:
    loader.load_train = True
    loader.planet_ids_to_load = ars.train_planet_list[:5]
else:
    loader.load_train = False
    loader.planet_ids_to_load = ars.test_planet_list    
loader.include_labels = False # don't try to load ground truth
test_data = loader.load()

# Do inference - the unused third output has the covariance matrices per planet
pred,sigma,_ = trained_model.infer(test_data)

CPU times: user 5min 10s, sys: 24.5 s, total: 5min 34s
Wall time: 2min 59s


Finally, we write the output to submission.csv. If we're not in submission mode we write it to the screen.

In [6]:
# Convert to correct output format
submission = pd.read_csv(ars.data_dir() + '/sample_submission.csv')
submission = submission[0:0]
for i in range(len(loader.planet_ids_to_load)):
    submission.loc[i] = np.concatenate(([loader.planet_ids_to_load[i]], pred[i], sigma[i]))
submission_csv=submission.copy().set_index("planet_id")
submission_csv[submission_csv<=0] = 1e-9

# Output
if len(ars.test_planet_list)>1:
    submission_csv.to_csv('submission.csv')
else:    
    IPython.display.display(submission_csv)
    print(submission_csv.to_numpy()[0,5], submission_csv.to_numpy()[0,500], ars.rms(sigma))
    # Expected for testing (with include_later_optimizations=False): 
    # Offline: 0.0011164775705400421 3.989177792434888e-05 3.563359762792441e-05
    # Kaggle:  0.001116477161465638 4.504426029385818e-05 3.567612829731749e-05
    # Expected for testing (with include_later_optimizations=True): 
    # Offline: 0.0011126207347777167 3.664632827623065e-05 3.197817532055578e-05
    # Kaggle:  0.0011126215985547824 4.1359960071418506e-05 3.2244350526727034e-05

Unnamed: 0_level_0,wl_1,wl_2,wl_3,wl_4,wl_5,wl_6,wl_7,wl_8,wl_9,wl_10,...,sigma_274,sigma_275,sigma_276,sigma_277,sigma_278,sigma_279,sigma_280,sigma_281,sigma_282,sigma_283
planet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
785834.0,0.001144,0.001113,0.001112,0.001111,0.001111,0.001113,0.001114,0.001117,0.00112,0.001123,...,5e-05,5e-05,5e-05,5.1e-05,5.2e-05,5.3e-05,5.4e-05,5.5e-05,5.5e-05,5.6e-05
14485303.0,0.001871,0.00182,0.001819,0.001818,0.001818,0.001818,0.00182,0.001823,0.001826,0.00183,...,3.4e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.4e-05,3.4e-05,3.4e-05,3.4e-05,3.5e-05
17002355.0,0.002794,0.002798,0.002798,0.002798,0.002799,0.002799,0.002798,0.002798,0.002798,0.002797,...,2.8e-05,2.8e-05,2.9e-05,2.9e-05,2.9e-05,2.9e-05,2.9e-05,2.9e-05,2.9e-05,2.9e-05
24135240.0,0.001358,0.00129,0.001286,0.001283,0.00128,0.001279,0.001279,0.001283,0.001291,0.0013,...,4.4e-05,4.4e-05,4.5e-05,4.5e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05
25070640.0,0.001953,0.001949,0.001949,0.001949,0.001949,0.001949,0.001949,0.001949,0.00195,0.00195,...,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05


0.0011126215985547824 4.1359960071418506e-05 3.2244350526727034e-05
