# Ariel data challenge model training

## 1. Notebook set up

In [1]:
# Standard library imports
import os
import random
import time
from pathlib import Path

# Silence tensorflow, except for errors
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Third party imports
import kagglehub
import tensorflow as tf

# Project imports
from ariel_data_preprocessing.data_preprocessing import DataProcessor

# Globals
REBUILD_DATA = False
WAVELENGTHS = 283 # Number of wavelength indicies
SAMPLE_SIZE = 883 # Number of captures per sample
SAMPLES = 10      # Number of sample to draw per planet
N_CPUS = 3
N_PLANETS = -1

2025-09-21 23:58:33.691836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758499113.717543     203 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758499113.724354     203 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## 2. Data preparation¶
### 2.1. Preprocess raw data

In [2]:
if Path('/kaggle/input/ariel-training-data/train.h5').is_file() and REBUILD_DATA is False:
    input_data_path = '/kaggle/input/ariel-training-data'
    print('Using training data from Kaggle datasets')

elif REBUILD_DATA is True:

    # Run the data preprocessing pipeline on the competition data
    start_time = time.time()
    data_processor = DataProcessor(
        input_data_path='/kaggle/input/ariel-data-challenge-2025',
        output_data_path='/kaggle/working',
        n_cpus=N_CPUS,
        n_planets=N_PLANETS,
        mode='train'
    )
    
    data_processor.run()
    end_time = (time.time() - start_time) / (60 * 60)

    # Upload the preprocessed data to Kaggle
    handle = 'gperdrizet/ariel_training_data'
    local_dataset_dir = '/kaggle/input/ariel-data-challenge-training'
    ignore_patterns = ['*.ipynb', '*.json', '*.html', '*.css']
    
    kagglehub.dataset_upload(
        handle,
        local_dataset_dir,
        ignore_patterns=ignore_patterns
    )

    input_data_path = '/kaggle/working'
    
    print(f'Finished data preprocessing in {end_time:.2f} hours')

Using training data from Kaggle datasets


## 2.2. Initalize data generators

In [3]:
data_preprocessor = DataProcessor(
    input_data_path='/kaggle/input/ariel-data-challenge-2025',
    output_data_path='/kaggle/input/ariel-training-data',
    mode='train',
)

data_preprocessor.initialize_data_generators(
    sample_size=SAMPLE_SIZE,
    validation=False
)

I0000 00:00:1758499118.173589     203 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


## 3. CNN

### 3.1. Model definition

In [8]:
sample_size = 372
batch_size = 4
steps = 431
learning_rate = 0.0007103203515277739
l_one = 0.9381346432258663
l_two = 0.36282682418942663
cnn_layers = 3
first_filter_set = 73
second_filter_set = 34
third_filter_set = 48
first_filter_size = 2
second_filter_size = 5
third_filter_size = 3
dense_units = 104
beta_one=0.72
beta_two=0.93
amsgrad=True
weight_decay=0.016
use_ema=True

# Long training run
epochs = 10

total_ksteps = int((epochs * steps) / 1000)
model_save_file = f'/kaggle/working/ariel_cnn-8.4M-{total_ksteps}ksteps.keras'

In [10]:
def compile_model(
        samples: int=sample_size,
        wavelengths: int=wavelengths,
        learning_rate: float=learning_rate,
        l1: float=l_one,
        l2: float=l_two,
        first_filter_set: int=first_filter_set,
        second_filter_set: int=second_filter_set,
        third_filter_set: int=third_filter_set,
        first_filter_size: int=first_filter_size,
        second_filter_size: int=second_filter_size,
        third_filter_size: int=third_filter_size,
        dense_units: int=dense_units,
        beta_one: float=beta_one,
        beta_two: float=beta_two,
        amsgrad: bool=amsgrad,
        weight_decay: float=weight_decay,
        use_ema: bool=use_ema
) -> tf.keras.Model:

    '''Builds the convolutional neural network regression model'''

    # Set-up the L1L2 for the dense layers
    regularizer = tf.keras.regularizers.L1L2(l1=l1, l2=l2)

    # Define the model layers in order
    model = tf.keras.Sequential([
        tf.keras.layers.Input((samples,wavelengths,1)),
        tf.keras.layers.Conv2D(
            first_filter_set,
            first_filter_size,
            padding='same',
            activation='relu',
        ),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(
            second_filter_set,
            second_filter_size,
            padding='same',
            activation='relu',
        ),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(
            third_filter_set,
            third_filter_size,
            padding='same',
            activation='relu',
        ),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(
            dense_units,
            kernel_regularizer=regularizer,
            activation='relu',
        ),
        tf.keras.layers.Dense(wavelengths, activation='linear')
    ])

    # Define the optimizer
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=learning_rate,
        beta_1=beta_one,
        beta_2=beta_two,
        amsgrad=amsgrad,
        weight_decay=weight_decay,
        use_ema=use_ema
    )

    # Compile the model, specifying the type of loss to use during training 
    # and any extra metrics to evaluate
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.MeanSquaredError(name='MSE'),
        metrics=[
            tf.keras.metrics.RootMeanSquaredError(name='RMSE')
        ]
    )

    return model

In [11]:
model = compile_model()
model.summary()

### 3.2. Model training

In [12]:
if Path(model_save_file).exists() and Path(training_results_save_file).exists():

    print(f'Found existing model for {total_ksteps} ksteps, skipping training.')

    # Load the existing model
    model = tf.keras.models.load_model(model_save_file)

    # Load existing training results
    with open(training_results_save_file, 'rb') as input_file:
        training_results = pickle.load(input_file)

else:

  print(f'Training model for {total_ksteps} ksteps')
  start_time = time.time()

  training_results = model.fit(
    data_preprocessor.training.batch(batch_size),
    epochs=epochs,
    steps_per_epoch=steps,
    verbose=1
  )

  print(f'Training complete in {(time.time() - start_time)/60:.1f} minutes')
  model.save(model_save_file)

  with open(training_results_save_file, 'wb') as output_file:
      pickle.dump(training_results, output_file)

Training model for 4 ksteps
Epoch 1/10


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_2" is incompatible with the layer: expected axis -1 of input shape to have value 77280, but received input with shape (None, 184800)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 883, 283), dtype=float32)
  • training=True
  • mask=None

In [None]:
# Set-up a 1x2 figure for accuracy and binary cross-entropy
fig, axs=plt.subplots(1,2, figsize=(12,4))

# Add the main title
fig.suptitle('CNN training curves', size='large')

# Plot training loss
axs[0].set_title('Training loss (mean squared error)')
axs[0].plot(np.array(training_results.history['loss']), alpha=0.5, label='Training')
axs[0].set_xlabel('Epoch')
axs[0].set_ylabel('loss')
# axs[0].set_ylim(21, 25)
# axs[0].set_yscale('log')

# Plot training RMSE
axs[1].set_title('Root mean squared error')
axs[1].plot(training_results.history['RMSE'], alpha=0.5, label='Training')
axs[1].set_xlabel('Epoch')
axs[1].set_ylabel('RMSE')
# axs[2].set_ylim(top=0.014)
axs[1].set_yscale('log')

# Show the plot
fig.tight_layout()