# Ariel data challenge submission

## 1. Notebook setup

In [12]:
# Standard library imports
import os
import pickle
import time

# Third party imports
import numpy as np
import pandas as pd
import tensorflow as tf

# Project imports
from ariel_data_preprocessing.data_preprocessing import DataProcessor

# Run configuration. Supported values for `mode`:
#   'testing'    - relative project paths, limited to N_PLANETS planets
#   'submission' - Kaggle paths under /kaggle/input and /kaggle/working
mode = 'submission'

if mode == 'testing':
    # Expose only CUDA device 0 to this process.
    # NOTE(review): this is set after `import tensorflow` above; presumably it
    # still takes effect because no GPU work has happened yet - confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    INPUT_DIRECTORY = 'data/raw'
    OUTPUT_DIRECTORY = 'data/processed'
    REBUILD_DATA = True
    N_PLANETS = 10
    MODEL = 'data/models/ariel-cnn-8.1M-2ksteps-tf2.11.keras'

elif mode == 'submission':
    INPUT_DIRECTORY = '/kaggle/input/ariel-data-challenge-2025'
    OUTPUT_DIRECTORY = '/kaggle/working'
    REBUILD_DATA = True
    # NOTE(review): -1 presumably means "use every available planet" - confirm
    # against DataProcessor.
    N_PLANETS = -1
    MODEL = '/kaggle/input/ariel-cnn/ariel-cnn-8.1M-862ksteps.keras'

else:
    # Fail here rather than with a NameError several cells later when the
    # configuration constants turn out to be undefined.
    raise ValueError(
        f"Unknown mode {mode!r}: expected 'testing' or 'submission'"
    )

## 2. Data preparation

### 2.1. Preprocess the raw data

In [13]:
# Build the preprocessing pipeline from the configuration constants above.
# Nothing is run here; the `run()` call lives in its own cell.
data_processor = DataProcessor(
    # Where to read from and where to write to
    input_data_path=INPUT_DIRECTORY,
    output_data_path=OUTPUT_DIRECTORY,
    output_filename='test.h5',
    # What to process
    mode='test',
    n_planets=N_PLANETS,
    # How to process it
    downsample_fgs=True,
    n_cpus=4
)

In [14]:
if REBUILD_DATA:
    # Time the preprocessing run, then report its duration and planet count
    started_at = time.time()
    data_processor.run()
    elapsed_minutes = (time.time() - started_at) / 60

    print(f'Data preprocessing completed in {elapsed_minutes:.2f} minutes')
    print(f'Total test planets: {len(data_processor.planet_list)}')

Data preprocessing completed in 0.30 minutes
Total test planets: 1


### 2.2. Initialize data generator

In [15]:
# NOTE(review): these two values appear as the 2nd and 3rd dimensions of the
# `signals` array built in the next section (10 and 372) - presumably samples
# per planet and frames per sample; confirm against DataProcessor.
data_processor.initialize_data_generators(n_samples=10, sample_size=372)

### 2.3. Create dataset

In [16]:
# Take one dataset element per entry in the planet list
n_test_planets = len(data_processor.planet_list)
testing_data = data_processor.testing.take(n_test_planets)

# Convert each element to NumPy and stack them into a single array
planet_signals = []
for element in testing_data:
    planet_signals.append(element.numpy())

signals = np.array(planet_signals)

print(f'Signals shape: {signals.shape}')

Signals shape: (1, 10, 372, 283)


## 3. Predictions

In [17]:
model = tf.keras.models.load_model(MODEL)

# Run the model once per planet; each call predicts on all of that planet's samples
spectrum_predictions = np.array([
    model.predict(planet, batch_size=10, verbose=0)
    for planet in signals
])

# Reduce over axis 1 (the per-planet sample axis): the mean is the predicted
# spectrum, the standard deviation is the raw uncertainty estimate
spectrum_predictions_avg = spectrum_predictions.mean(axis=1)
spectrum_predictions_std = spectrum_predictions.std(axis=1)

print(f'Spectrum predictions shape: {spectrum_predictions.shape}')
print(f'Spectrum predictions avg shape: {spectrum_predictions_avg.shape}')
print(f'Spectrum predictions std shape: {spectrum_predictions_std.shape}')

Spectrum predictions shape: (1, 10, 283)
Spectrum predictions avg shape: (1, 283)
Spectrum predictions std shape: (1, 283)


## 4. Error correction

In [18]:
# SECURITY NOTE(review): pickle.load can execute arbitrary code while
# deserialising, so this file must come from a dataset you control.
# NOTE(review): the path is hard-coded to the Kaggle input directory and does
# not follow `mode`, so a local 'testing' run needs the file at this location.
with open('/kaggle/input/ariel-error-correction-factors/error_correction_factors.pkl', 'rb') as input_file:
    error_correction_factors = pickle.load(input_file)

# Scale each planet's prediction standard deviation by the correction factors.
# Only the standard deviations are needed here; the averaged spectra are not
# part of this calculation.
scaled_errors = np.array([
    planet_error * error_correction_factors
    for planet_error in spectrum_predictions_std
])

## 5. Submission file

In [19]:
# Column layout: wl_1 ... wl_283 followed by sigma_1 ... sigma_283
wavelength_ids = range(1, 284)
col_names = (
    [f'wl_{i}' for i in wavelength_ids]
    + [f'sigma_{i}' for i in wavelength_ids]
)

# Place the averaged spectra and the scaled errors side by side, one row per planet
submission = np.concatenate((spectrum_predictions_avg, scaled_errors), axis=1)
submission_df = pd.DataFrame(submission, columns=col_names)

# Planet identifiers go in the first column, stored as integers
submission_df.insert(0, 'planet_id', data_processor.planet_list)
submission_df['planet_id'] = submission_df['planet_id'].astype(int)

submission_df.to_csv('submission.csv', index=False)