# Ariel data challenge submission

## 1. Notebook setup

In [3]:
# Standard library imports
import random
from functools import partial

# Third party imports
import h5py
import numpy as np
import tensorflow as tf

# Project imports
from ariel_data_preprocessing.data_preprocessing import DataProcessor

# Globals

# Kaggle file system locations
INPUT_DIRECTORY = '/kaggle/input/ariel-data-challenge-2025'
WORKING_DIRECTORY = '/kaggle/working'

# Preprocessing run configuration
MODE = 'train'
DATA_FILE = 'train.h5'
N_PLANETS = 100
N_CPUS = 3

# Number of wavelength indices
WAVELENGTHS = 283

# Number of captures per sample
SAMPLE_SIZE = 883

# Number of samples to draw per planet
SAMPLES = 10

## 2. Data preparation

### 2.1. Preprocess the raw data

In [2]:
# Gather the preprocessing settings in one place, then build and run the processor
processor_settings = {
    'input_data_path': INPUT_DIRECTORY,
    'output_data_path': WORKING_DIRECTORY,
    'output_file': DATA_FILE,
    'n_cpus': N_CPUS,
    'downsample_fgs': True,
    'n_planets': N_PLANETS,
    'mode': MODE,
}

data_processor = DataProcessor(**processor_settings)
data_processor.run()

### 2.2. Planet IDs

In [None]:
# List the top-level keys of the preprocessed HDF5 file; these are used as planet IDs
processed_data_path = f'{WORKING_DIRECTORY}/{DATA_FILE}'

with h5py.File(processed_data_path, 'r') as hdf:
    planet_ids = list(hdf.keys())

print(f'Found {len(planet_ids)} planets.')

### 2.3. Data generator

In [4]:
def prediction_data_loader(planet_ids: list, data_file: str, sample_size: int = 100, n_samples: int = 10):
    '''Generator that yields randomly sampled signal frames for each planet.

    Cycles over planet_ids indefinitely. For each planet, the full 'signal'
    dataset is read and n_samples random subsets of sample_size frames are
    drawn from it. Frames within a subset are drawn without replacement and
    kept in their original order.

    Args:
        planet_ids (list): List of planet IDs to include in the generator.
        data_file (str): Path to the HDF5 file containing the data.
        sample_size (int, optional): Number of frames to draw for each sample. Defaults to 100.
        n_samples (int, optional): Number of samples to draw per planet. Defaults to 10.

    Yields:
        tuple: One-element tuple holding an array whose first two dimensions are
            (n_samples, sample_size); the remaining dimensions are those of the
            planet's signal after the frame axis.

    Raises:
        ValueError: If sample_size is larger than the number of frames in a
            planet's signal (raised by random.sample).
    '''

    with h5py.File(data_file, 'r') as hdf:

        while True:

            for planet_id in planet_ids:

                signal = hdf[planet_id]['signal'][:]
                frame_indices = range(signal.shape[0])

                # Sort the drawn indices so each sample preserves the frame order
                samples = [
                    signal[sorted(random.sample(frame_indices, sample_size)), :]
                    for _ in range(n_samples)
                ]

                # Yield a one-element tuple to match a tuple-structured
                # output_signature in tf.data.Dataset.from_generator
                yield (np.array(samples),)

In [None]:
# Bind the loader arguments so that from_generator can call it with no arguments
prediction_data_generator = partial(
    prediction_data_loader,
    planet_ids=planet_ids,
    data_file=f'{WORKING_DIRECTORY}/{DATA_FILE}',
    sample_size=SAMPLE_SIZE,
    n_samples=SAMPLES
)

# Each element is a one-element tuple holding the sampled signals for one planet
# NOTE(review): dtype assumes the stored signal is float64 - confirm against the preprocessing output
prediction_dataset = tf.data.Dataset.from_generator(
    prediction_data_generator,
    output_signature=(
        tf.TensorSpec(shape=(SAMPLES, SAMPLE_SIZE, WAVELENGTHS), dtype=tf.float64),
    )
)

# The loader cycles over planet_ids indefinitely, so take exactly one element per planet
data = prediction_dataset.take(len(planet_ids))
signals = np.array([element[0].numpy() for element in data])
print(f'Signals shape: {signals.shape}')

## 3. Predictions

In [None]:
# NOTE(review): `model` is not defined in the visible cells - presumably a trained
# model loaded elsewhere; confirm before running.

# Run the model once per planet, passing that planet's signal samples as one batch
spectrum_predictions = np.array([
    model.predict(planet, batch_size=SAMPLES, verbose=0)
    for planet in signals
])

# Reduce over axis 1 to a mean and a standard deviation, then flatten the results to 1-D
spectrum_predictions_avg = spectrum_predictions.mean(axis=1).flatten()
spectrum_predictions_std = spectrum_predictions.std(axis=1).flatten()