In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from tqdm import tqdm
import random, os
import itertools
from astropy.stats import sigma_clip

In [2]:
# Per-planet ADC table, indexed by planet_id; its gain/offset columns are
# looked up per sensor during preprocessing.
adc_info_csv = '/kaggle/input/ariel-data-challenge-2024/test_adc_info.csv'
test_adc_info = pd.read_csv(adc_info_csv, index_col='planet_id')

# Axis metadata shipped alongside the competition data.
axis_info_parquet = '/kaggle/input/ariel-data-challenge-2024/axis_info.parquet'
axis_info = pd.read_parquet(axis_info_parquet)

In [3]:
def apply_linear_corr(linear_corr, clean_signal):
    """Apply a per-pixel polynomial correction to ``clean_signal`` in place.

    Parameters
    ----------
    linear_corr : ndarray, shape (n_coeffs, X, Y)
        Polynomial coefficients for every pixel, stored lowest order first
        (index 0 is the constant term).
    clean_signal : ndarray, shape (T, X, Y)
        Signal cube. It is overwritten with the corrected values.

    Returns
    -------
    ndarray
        The same ``clean_signal`` object that was passed in.
    """
    # np.poly1d wants the highest-order coefficient first, hence the reversal.
    coeffs_high_first = np.flip(linear_corr, axis=0)
    n_rows, n_cols = clean_signal.shape[1], clean_signal.shape[2]
    for row in range(n_rows):
        for col in range(n_cols):
            pixel_poly = np.poly1d(coeffs_high_first[:, row, col])
            # Evaluate the pixel's polynomial on its whole time series at once.
            clean_signal[:, row, col] = pixel_poly(clean_signal[:, row, col])
    return clean_signal

def clean_dark(signal, dark, dt):
    """Subtract the ``dt``-scaled dark frame from every frame of ``signal``.

    Parameters
    ----------
    signal : ndarray, shape (T, X, Y)
        Signal cube. It is modified in place.
    dark : ndarray, shape (X, Y)
        Dark frame shared by all time steps.
    dt : ndarray, shape (T,)
        Per-frame factor the dark frame is multiplied by before subtraction.

    Returns
    -------
    ndarray
        The same ``signal`` object that was passed in.
    """
    # Broadcasting (X, Y) against (T, 1, 1) gives the per-frame dark
    # contribution directly; there is no need to first materialise T copies
    # of the dark frame with np.tile.
    signal -= np.asarray(dark) * dt[:, np.newaxis, np.newaxis]
    return signal

def preproc(dataset, adc_info, sensor, binning = 15,
            data_root = '/kaggle/input/ariel-data-challenge-2024'):
    """Calibrate one sensor's raw frames for every planet and bin them in time.

    For each planet in ``adc_info.index`` the raw signal and its calibration
    frames are read from ``{data_root}/{dataset}/{planet_id}/`` and processed:

    1. apply the ADC gain and offset (``signal / gain + offset``);
    2. flag hot pixels by 5-sigma clipping the dark frame;
    3. for AIRS-CH0 only, crop detector columns ``39:321``;
    4. clip negative values to zero and apply the per-pixel polynomial
       correction (``apply_linear_corr``);
    5. subtract the dark frame scaled by a per-frame ``dt`` (``clean_dark``);
    6. divide by the flat field, with dead and hot pixels set to NaN;
    7. NaN-mean over axis 1 (all 32*32 pixels for FGS1, the 32 rows for
       AIRS-CH0);
    8. difference consecutive frame pairs (odd-indexed minus even-indexed);
    9. average every ``binning`` consecutive differences.

    Parameters
    ----------
    dataset : str
        Sub-directory of ``data_root`` holding the per-planet folders
        (e.g. ``'test'``).
    adc_info : pandas.DataFrame
        Indexed by planet id; must contain ``{sensor}_adc_gain`` and
        ``{sensor}_adc_offset`` columns.
    sensor : str
        ``"AIRS-CH0"`` or ``"FGS1"``.
    binning : int, default 15
        Number of consecutive frame-pair differences averaged into one bin.
    data_root : str, default '/kaggle/input/ariel-data-challenge-2024'
        Directory that contains ``dataset``.

    Returns
    -------
    ndarray
        Shape ``(n_planets, n_frames // binning // 2, 282)`` for AIRS-CH0 and
        ``(n_planets, n_frames // binning // 2, 1)`` for FGS1.

    Raises
    ------
    KeyError
        If ``sensor`` is not one of the two supported names or the ADC
        columns are missing from ``adc_info``.
    """
    cut_inf, cut_sup = 39, 321
    # Per sensor: [raw cube shape, flat-field shape after any column crop].
    sensor_sizes_dict = {"AIRS-CH0":[[11250, 32, 356], [1, 32, cut_sup-cut_inf]], "FGS1":[[135000, 32, 32], [1, 32, 32]]}
    # Per sensor: shape of one planet's binned output before the FGS1 reshape.
    binned_dict = {"AIRS-CH0":[11250 // binning // 2, 282], "FGS1":[135000 // binning // 2]}
    linear_corr_dict = {"AIRS-CH0":(6, 32, 356), "FGS1":(6, 32, 32)}

    raw_shape, flat_shape = sensor_sizes_dict[sensor]
    planet_ids = adc_info.index
    # Column lookups do not depend on the planet, so do them once.
    gains = adc_info[f'{sensor}_adc_gain'].values
    offsets = adc_info[f'{sensor}_adc_offset'].values

    feats = []
    for i, planet_id in tqdm(list(enumerate(planet_ids))):
        planet_dir = f'{data_root}/{dataset}/{planet_id}'
        calib_dir = f'{planet_dir}/{sensor}_calibration'

        signal = pd.read_parquet(f'{planet_dir}/{sensor}_signal.parquet').to_numpy()
        dark_frame = pd.read_parquet(f'{calib_dir}/dark.parquet', engine='pyarrow').to_numpy()
        dead_frame = pd.read_parquet(f'{calib_dir}/dead.parquet', engine='pyarrow').to_numpy()
        flat_frame = pd.read_parquet(f'{calib_dir}/flat.parquet', engine='pyarrow').to_numpy()
        linear_corr = pd.read_parquet(f'{calib_dir}/linear_corr.parquet').values.astype(np.float64).reshape(linear_corr_dict[sensor])

        signal = signal.reshape(raw_shape)
        signal = signal / gains[i] + offsets[i]

        # Pixels whose dark value is a 5-sigma outlier are treated as hot.
        hot = sigma_clip(
            dark_frame, sigma=5, maxiters=5
        ).mask

        if sensor != "FGS1":
            signal = signal[:, :, cut_inf:cut_sup] #11250 * 32 * 282
            # Alternative source for dt (not used here):
            # axis_info['AIRS-CH0-integration_time'].dropna().values
            dt = np.ones(len(signal))*0.1
            dt[1::2] += 4.5 #@bilzard idea
            # Crop every calibration frame to the same column window.
            linear_corr = linear_corr[:, :, cut_inf:cut_sup]
            dark_frame = dark_frame[:, cut_inf:cut_sup]
            dead_frame = dead_frame[:, cut_inf:cut_sup]
            flat_frame = flat_frame[:, cut_inf:cut_sup]
            hot = hot[:, cut_inf:cut_sup]
        else:
            dt = np.ones(len(signal))*0.1
            dt[1::2] += 0.1

        signal = signal.clip(0) #@graySnow idea
        linear_corr_signal = apply_linear_corr(linear_corr, signal)
        signal = clean_dark(linear_corr_signal, dark_frame, dt)

        # NaN-out unusable pixels in the flat so nanmean below skips them.
        flat = flat_frame.reshape(flat_shape)
        flat[dead_frame.reshape(flat_shape)] = np.nan
        flat[hot.reshape(flat_shape)] = np.nan
        signal = signal / flat

        if sensor == "FGS1":
            # Flatten the 32x32 detector so axis 1 spans every pixel.
            signal = signal.reshape((raw_shape[0], raw_shape[1]*raw_shape[2]))

        mean_signal = np.nanmean(signal, axis=1) # mean over the 32*32(FGS1) or 32(CH0) pixels
        # Odd-indexed frame minus the preceding even-indexed frame.
        cds_signal = (mean_signal[1::2] - mean_signal[0::2])

        binned = np.zeros((binned_dict[sensor]))
        for j in range(cds_signal.shape[0] // binning):
            binned[j] = cds_signal[j*binning:j*binning+binning].mean(axis=0)

        if sensor == "FGS1":
            # Give FGS1 a trailing channel axis of length 1, matching AIRS-CH0's rank.
            binned = binned.reshape((binned.shape[0],1))

        feats.append(binned)

    return np.stack(feats)

In [4]:
# FGS1 has 135000 raw frames against 11250 for AIRS-CH0 (12x as many), so a
# 12x larger bin (180 vs 15) leaves both sensors with the same number of bins.
pre_test_FGS = preproc(dataset='test', adc_info=test_adc_info, sensor="FGS1", binning=30 * 6)
pre_test_AIRS = preproc(dataset='test', adc_info=test_adc_info, sensor="AIRS-CH0", binning=15)

100%|██████████| 1/1 [00:06<00:00,  6.31s/it]
100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


In [5]:
# Put the single FGS1 channel in front of the AIRS-CH0 channels on the last
# axis, then reverse that axis (so the FGS1 channel ends up last).
pre_test = np.flip(np.concatenate((pre_test_FGS, pre_test_AIRS), axis=2), axis=2)

In [6]:
def phase_detector(signal):
    """Find the two indices that bracket the central section of ``signal``.

    Two index ranges are scanned with a 20-sample sliding window, looking for
    the window whose ``max - min`` is largest (presumably the transit ingress
    and egress):

    * starts 50..149 -> ``phase1`` is that window's start + 20 + 5;
    * starts 200..299 -> ``phase2`` is that window's start - 5.

    Ties keep the earliest window. If no window in a range has a strictly
    positive spread (e.g. a constant signal), the corresponding value is
    ``None``.

    Parameters
    ----------
    signal : ndarray, 1-D
        Series long enough that every scanned window is non-empty.

    Returns
    -------
    tuple
        ``(phase1, phase2)``; each is an ``int`` or ``None``.
    """
    window, margin = 20, 5

    def _largest_spread_start(first, last):
        # Start index in [first, last) of the window with the largest max-min,
        # or None when every window has zero (or NaN) spread.
        best_start, best_spread = None, 0
        for start in range(first, last):
            chunk = signal[start:start + window]
            spread = chunk.max() - chunk.min()
            if spread > best_spread:
                best_start, best_spread = start, spread
        return best_start

    first_start = _largest_spread_start(50, 150)
    second_start = _largest_spread_start(200, 300)

    # Step past the first window and stop short of the second one.
    phase1 = None if first_start is None else first_start + window + margin
    phase2 = None if second_start is None else second_start - margin
    return phase1, phase2

In [7]:
test = pre_test.copy()
in_transit, out_of_transit = [], []

for i in range(len(test_adc_info)):
    # Detect the phase boundaries on the channel-averaged curve.
    # NOTE(review): `1:` drops channel 0 of the already-reversed array, which
    # is not the FGS1 channel — confirm this is the intended exclusion.
    p1, p2 = phase_detector(pre_test[i, :, 1:].mean(axis=1))

    # Rows at least 40 bins outside [p1, p2); 375 is the hard-coded bin count.
    baseline_rows = list(range(p1 - 40)) + list(range(p2 + 40, 375))
    central_level = pre_test[i, p1:p2].mean(axis=0)
    baseline_level = pre_test[i, baseline_rows].mean(axis=0)

    # Offset by the central mean, scale by the baseline mean, then x1000.
    test[i] = (test[i] - central_level) / baseline_level * 1000.0

    in_transit.append(test[i, p1:p2, :])
    out_of_transit.append([test[i, :p1, :], test[i, p2:, :]])

In [8]:
all_predictions = []
for i in range(len(test_adc_info)):
    before, after = out_of_transit[i]
    during = in_transit[i]
    # Per channel: unweighted average of the three segment means.
    estimated_points = np.array([
        (np.mean(before[:, j]) + np.mean(after[:, j]) + np.mean(during[:, j])) / 3
        for j in range(283)
    ])
    # Undo the x1000 scaling, average over channels, apply the 0.598 factor.
    estimated_mean = np.mean(estimated_points / 1000) / 0.598
    all_predictions.append(estimated_mean)

In [9]:
# One scalar estimate per planet, copied across all 283 output columns.
test_preds = np.repeat(np.array(all_predictions)[:, np.newaxis], 283, axis=1)
# The same constant uncertainty for every planet and column.
test_sigmas = np.full(test_preds.shape, 0.000249)

### Making submission

In [10]:
# The sample submission supplies the column names (its first column is skipped).
ss = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/sample_submission.csv')

preds = np.clip(test_preds, 0, None)  # predictions are floored at zero
sigmas = test_sigmas
submission = pd.DataFrame(
    np.concatenate((preds, sigmas), axis=1),
    index=test_adc_info.index,
    columns=ss.columns[1:],
)
submission.to_csv('submission.csv')

In [11]:
# Show the submission frame as this cell's output (one row per planet_id).
submission

Unnamed: 0_level_0,wl_1,wl_2,wl_3,wl_4,wl_5,wl_6,wl_7,wl_8,wl_9,wl_10,...,sigma_274,sigma_275,sigma_276,sigma_277,sigma_278,sigma_279,sigma_280,sigma_281,sigma_282,sigma_283
planet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
499191466,0.002656,0.002656,0.002656,0.002656,0.002656,0.002656,0.002656,0.002656,0.002656,0.002656,...,0.000249,0.000249,0.000249,0.000249,0.000249,0.000249,0.000249,0.000249,0.000249,0.000249
