# Ariel data challenge submission

## 1. Notebook setup

In [None]:
# Set notebook root to project root
from helper_functions import set_project_root
set_project_root()

# Standard library imports
import os
import time

# Third party imports
import numpy as np
import pandas as pd
import tensorflow as tf

# Project imports
from ariel_data_preprocessing.data_preprocessing import DataProcessor

# Run configuration. `mode` selects the set of paths and options used by the
# rest of the notebook; 'testing' is the only mode defined in this cell.
mode = 'testing'

if mode == 'testing':
    # Expose only GPU 0 to TensorFlow.
    # NOTE(review): this is set after `import tensorflow`; it presumably still
    # takes effect because no GPU has been initialised yet - confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    INPUT_DIRECTORY = 'data/raw'          # passed to DataProcessor as input_data_path
    OUTPUT_DIRECTORY = 'data/processed'   # passed to DataProcessor as output_data_path
    REBUILD_DATA = True                   # when True, the preprocessing step is re-run
    N_PLANETS = 10                        # passed to DataProcessor as n_planets
    MODEL = 'data/models/ariel-cnn-8.1M-2ksteps-tf2.11.keras'  # saved Keras model to load
else:
    # Fail here with a clear message instead of a NameError in a later cell
    # when one of the constants above turns out to be undefined.
    raise ValueError(f"Unknown mode {mode!r}; expected 'testing'")

Working directory: /mnt/arkk/kaggle/ariel-data-challenge


## 2. Data preparation

### 2.1. Preprocess the raw data

In [2]:
# Collect the preprocessing options in one place, then hand them to
# DataProcessor as keyword arguments.
processor_options = {
    'input_data_path': INPUT_DIRECTORY,
    'output_data_path': OUTPUT_DIRECTORY,
    'output_filename': 'test.h5',
    'n_cpus': 4,
    'downsample_fgs': True,
    'n_planets': N_PLANETS,
    'mode': 'test',
}

data_processor = DataProcessor(**processor_options)

In [3]:
# Re-run the preprocessing step only when requested in the configuration.
if REBUILD_DATA:
    # perf_counter() is monotonic, so the measured duration is not affected
    # by system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    data_processor.run()
    end_time = time.perf_counter()

    print(f'Data preprocessing completed in {(end_time - start_time)/60:.2f} minutes')

Data preprocessing completed in 1.25 minutes


In [4]:
# Report how many planets the processor has in its planet list.
n_test_planets = len(data_processor.planet_list)
print(f'Total test planets: {n_test_planets}')

Total test planets: 10


### 2.2. Initialize data generator

In [5]:
# Generator settings.
# NOTE(review): judging by the signals shape printed further down,
# n_samples and sample_size appear to become the 2nd and 3rd axes of each
# planet's data - confirm against DataProcessor.
generator_settings = {'sample_size': 372, 'n_samples': 10}

data_processor.initialize_data_generators(**generator_settings)

2025-09-22 18:30:05.180365: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6669 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1070, pci bus id: 0000:07:00.0, compute capability: 6.1


### 2.3. Create dataset

In [6]:
# Take as many elements from the testing dataset as there are planets.
n_planets_to_take = len(data_processor.planet_list)
testing_data = data_processor.testing.take(n_planets_to_take)

# Convert every element to NumPy, then combine them into a single array.
planet_arrays = []

for element in testing_data:
    planet_arrays.append(element.numpy())

signals = np.array(planet_arrays)

print(f'Signals shape: {signals.shape}')

Signals shape: (10, 10, 372, 283)


## 3. Predictions

In [7]:
# Load the saved Keras model named in the configuration.
model = tf.keras.models.load_model(MODEL)

# Run the model once per planet; each entry of `signals` holds that planet's
# samples, so every predict() call returns one prediction per sample.
spectrum_predictions = np.array([
    model.predict(planet_signals, batch_size=10, verbose=0)
    for planet_signals in signals
])

# Reduce over axis 1 (the per-planet samples) to get a mean and a spread
# for every predicted spectrum value.
spectrum_predictions_avg = spectrum_predictions.mean(axis=1)
spectrum_predictions_std = spectrum_predictions.std(axis=1)

print(f'Spectrum predictions shape: {spectrum_predictions.shape}')
print(f'Spectrum predictions avg shape: {spectrum_predictions_avg.shape}')
print(f'Spectrum predictions std shape: {spectrum_predictions_std.shape}')

2025-09-22 18:30:07.106924: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8200


Spectrum predictions shape: (10, 10, 283)
Spectrum predictions avg shape: (10, 283)
Spectrum predictions std shape: (10, 283)


## 4. Submission file

In [None]:
# Number of predicted spectrum values per planet (columns of the mean array).
n_wavelengths = spectrum_predictions_avg.shape[1]

# Per planet: the mean spectrum followed by its standard deviation.
submission = np.concatenate(
    (spectrum_predictions_avg, spectrum_predictions_std),
    axis=1
)

# Name the columns and the index so the CSV is written with a header row.
# NOTE(review): the planet_id / wl_<i> / sigma_<i> naming follows the
# competition's sample submission as far as I recall - verify against
# sample_submission.csv before submitting.
column_names = (
    [f'wl_{i}' for i in range(1, n_wavelengths + 1)]
    + [f'sigma_{i}' for i in range(1, n_wavelengths + 1)]
)

submission_df = pd.DataFrame(
    submission,
    index=pd.Index(data_processor.planet_list, name='planet_id'),
    columns=column_names
)

# The header is written by default; the named index becomes the first column.
submission_df.to_csv('data/submission.csv')

print(f'Submission shape: {submission.shape}')

Submission shape: (10, 566)
