<a href="https://colab.research.google.com/github/pvt-16/kaggle-ariel-data-challenge/blob/master/ariel_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of `<directory>:<percent-encoded download URL>` entries
# (a single entry here). The URL's query string carries
# X-Goog-Date=20240908T183502Z and X-Goog-Expires=259200, so the link is
# presumably only valid for that window; the download loop reports HTTP
# errors as "likely expired".
DATA_SOURCE_MAPPING = 'ariel-data-challenge-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F70367%2F9188054%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240908%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240908T183502Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dc6f49456c76029dd52cb428803a9d665a4d85070c39915cc16e0381d7abbbcdaffc5112ccfe340ba7af87cd806f50303879b34b98ce8e5cfa8bd3cb9d8f54e8410ac65bce76c1757ac14fd37111e4d06984b3de63b735771ca035d0ef6b4526d7ae210576e0da003858a6ca288be9713e2798ea7a4c93996bce4638181f96faa8cde13bb6d90b8ec7c9f4903f7e039db5abf088934a17fb0909444cf58b16eb481ebbfb85a74fc4a8689c2a7c000f8dfe67173f9ebe44a82050b38a6614844e23e58325a35273a58b0ca367b87c26e256ba237467d5a197155b48e6b4434390ec9abbab220f4c22c6af9c996b444c9a9b3e3bfb00d10ebbae82c2d686867097d'

# Directory under which each data source is extracted.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced in the visible cells - confirm it is still needed.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
# Each entry of DATA_SOURCE_MAPPING has the form
# `<directory>:<percent-encoded download URL>`; a failed download is reported
# and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal ':' inside the URL part can
    # never break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            content_length = fileres.headers['content-length']
            # The header may be missing or malformed; in that case only the
            # running byte count is shown instead of a progress bar.
            try:
                total_length = int(content_length)
            except (TypeError, ValueError):
                total_length = 0
            print(f'Downloading {directory}, {content_length} bytes compressed')

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()

            # Push buffered bytes out and rewind so the archive readers see
            # the complete download from its first byte.
            tfile.flush()
            tfile.seek(0)

            # NOTE(review): members are extracted without path filtering;
            # acceptable only while the archive source is trusted.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the open handle (no reopen by name) and bind the
                # archive to its own name so the `tarfile` module is not shadowed.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dask.dataframe as dd


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

In [None]:
# Root folder of the competition data; later cells build file paths from it.
input_path = '/kaggle/input/ariel-data-challenge-2024'

In [None]:
# Load the training labels CSV from the competition input folder.
labels_df = pd.read_csv(f'{input_path}/train_labels.csv')

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) per numeric column.
labels_df.describe()

Unnamed: 0,planet_id,wl_1,wl_2,wl_3,wl_4,wl_5,wl_6,wl_7,wl_8,wl_9,...,wl_274,wl_275,wl_276,wl_277,wl_278,wl_279,wl_280,wl_281,wl_282,wl_283
count,673.0,673.0,673.0,673.0,673.0,673.0,673.0,673.0,673.0,673.0,...,673.0,673.0,673.0,673.0,673.0,673.0,673.0,673.0,673.0,673.0
mean,2131767000.0,0.002486,0.002516,0.00251,0.002505,0.002507,0.002499,0.002499,0.002508,0.002505,...,0.002513,0.002513,0.002512,0.002512,0.002512,0.002513,0.002513,0.002512,0.002511,0.00251
std,1200253000.0,0.00172,0.00172,0.001717,0.001717,0.001715,0.001713,0.001714,0.001716,0.001716,...,0.001721,0.001721,0.001721,0.001721,0.001721,0.001722,0.001722,0.001722,0.001722,0.001721
min,785834.0,0.000396,0.000396,0.000396,0.000396,0.000396,0.000396,0.000396,0.000396,0.000396,...,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397
25%,1121250000.0,0.001183,0.001206,0.001206,0.001206,0.001206,0.001206,0.001206,0.001206,0.001206,...,0.001207,0.001207,0.001206,0.001206,0.001206,0.001206,0.001206,0.001206,0.001206,0.001206
50%,2042817000.0,0.001866,0.001905,0.001896,0.001894,0.001896,0.001894,0.001886,0.001896,0.001895,...,0.001897,0.001897,0.001896,0.001896,0.001896,0.001897,0.001897,0.001897,0.001897,0.001896
75%,3101987000.0,0.003527,0.003533,0.00353,0.003529,0.003528,0.003527,0.003529,0.003532,0.003532,...,0.003529,0.003529,0.003529,0.003529,0.003529,0.003529,0.003529,0.003529,0.003529,0.003528
max,4286134000.0,0.007097,0.00708,0.007077,0.007072,0.007071,0.007069,0.00707,0.007072,0.007071,...,0.007233,0.007227,0.007222,0.007223,0.007231,0.007238,0.007239,0.007233,0.007224,0.007217


In [None]:
# Name of the planet identifier column in the labels table.
PLANET_ID = 'planet_id'

In [None]:
# Display the labels ordered by planet id; this returns a sorted copy and
# leaves labels_df itself unchanged.
labels_df.sort_values(PLANET_ID)

Unnamed: 0,planet_id,wl_1,wl_2,wl_3,wl_4,wl_5,wl_6,wl_7,wl_8,wl_9,...,wl_274,wl_275,wl_276,wl_277,wl_278,wl_279,wl_280,wl_281,wl_282,wl_283
0,785834,0.001086,0.001137,0.001131,0.001124,0.001138,0.001131,0.001123,0.001127,0.001120,...,0.001075,0.001076,0.001076,0.001076,0.001074,0.001073,0.001072,0.001073,0.001073,0.001072
1,14485303,0.001835,0.001835,0.001834,0.001833,0.001833,0.001833,0.001833,0.001834,0.001834,...,0.001893,0.001892,0.001892,0.001891,0.001891,0.001891,0.001890,0.001890,0.001889,0.001888
2,17002355,0.002792,0.002814,0.002808,0.002804,0.002809,0.002805,0.002802,0.002805,0.002801,...,0.002784,0.002783,0.002783,0.002783,0.002783,0.002784,0.002784,0.002785,0.002785,0.002784
3,24135240,0.001294,0.001308,0.001308,0.001306,0.001306,0.001303,0.001306,0.001314,0.001314,...,0.001405,0.001404,0.001403,0.001402,0.001401,0.001400,0.001399,0.001397,0.001395,0.001393
4,25070640,0.001987,0.001987,0.001987,0.001987,0.001987,0.001987,0.001987,0.001987,0.001987,...,0.001988,0.001988,0.001988,0.001988,0.001988,0.001988,0.001988,0.001988,0.001988,0.001988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668,4249337798,0.000790,0.000828,0.000825,0.000822,0.000830,0.000826,0.000822,0.000823,0.000820,...,0.000798,0.000799,0.000799,0.000799,0.000798,0.000796,0.000796,0.000796,0.000796,0.000796
669,4257395405,0.004063,0.004094,0.004090,0.004094,0.004083,0.004069,0.004076,0.004097,0.004100,...,0.004042,0.004042,0.004041,0.004041,0.004041,0.004041,0.004041,0.004040,0.004040,0.004039
670,4266129805,0.006566,0.006569,0.006568,0.006568,0.006567,0.006567,0.006568,0.006570,0.006569,...,0.006566,0.006566,0.006566,0.006566,0.006566,0.006566,0.006566,0.006566,0.006566,0.006566
671,4273166473,0.005647,0.005700,0.005687,0.005676,0.005687,0.005676,0.005667,0.005673,0.005663,...,0.005603,0.005603,0.005603,0.005602,0.005602,0.005604,0.005606,0.005607,0.005607,0.005605


In [None]:
# planet_id is a numeric column, so the selector has to be an int as well:
# comparing the column against the string '1011759019' matches no row and
# produces an empty frame.
sel_planet = 1011759019
labels_df[labels_df[PLANET_ID] == sel_planet]

Unnamed: 0,planet_id,wl_1,wl_2,wl_3,wl_4,wl_5,wl_6,wl_7,wl_8,wl_9,...,wl_274,wl_275,wl_276,wl_277,wl_278,wl_279,wl_280,wl_281,wl_282,wl_283


In [None]:
# Calibration file layout: [train/test]/[planet_id]/[AIRS-CH0/FGS1]_calibration/flat.parquet

Unnamed: 0,planet_id,wl_1,wl_2,wl_3,wl_4,wl_5,wl_6,wl_7,wl_8,wl_9,...,wl_274,wl_275,wl_276,wl_277,wl_278,wl_279,wl_280,wl_281,wl_282,wl_283


In [None]:
# Anchor the path at input_path: the bare relative path 'train/...' is
# resolved against the notebook's working directory and fails with
# FileNotFoundError.
# NOTE(review): assumes train/<planet_id>/ sits directly under input_path - confirm.
pd.read_parquet(f'{input_path}/train/{sel_planet}/AIRS-CH0_signal.parquet')

FileNotFoundError: [Errno 2] No such file or directory: 'train/1011759019/AIRS-CH0_signal.parquet'