In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import multiprocessing
import concurrent

from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_squared_error

import sklearn.model_selection
import itertools
import sklearn.linear_model
import sklearn.metrics

import kego.plotting.axes_utils
import kego.plotting.timeseries
import kego.plotting.utils_plotting

import kego.plotting

In [None]:
FOLDER_COMPETITION = os.environ["PATH_EFOLDER"] + "ariel-data-challenge-2024/"
# FOLDER_COMPETITION = "/kaggle/input/"
!ls $FOLDER_COMPETITION

In [None]:
# NOTE(review): absolute, machine-specific path; this listing fails on any
# machine where the directory does not exist. Consider deriving it from a
# configurable data folder instead.
!ls /home/kristian/Projects/kego/data/ariel

In [None]:
# Read the four competition tables in one pass; each one is indexed by its
# "planet_id" column. The order of file names matches the order of the
# variables on the left-hand side.
train_adc_info, train_labels, test_adc_info, sample_submission = (
    pd.read_csv(FOLDER_COMPETITION + csv_name, index_col="planet_id")
    for csv_name in (
        "train_adc_info.csv",
        "train_labels.csv",
        "test_adc_info.csv",
        "sample_submission.csv",
    )
)

In [None]:
# List the files stored under the train/2193939147 folder of the competition data.
!ls $FOLDER_COMPETITION/train/2193939147

In [None]:
# Display the label table loaded from train_labels.csv (indexed by planet_id).
train_labels

In [None]:
# Wavelength table; its first row is used below as the index of the transposed labels.
wavelengths = pd.read_csv(FOLDER_COMPETITION + "wavelengths.csv")

### Plot spectrum on wavelength and equidistant scales

In [None]:
# Transpose the labels so that rows are spectral channels (indexed by the
# first row of the wavelength table) and columns are planets.
train_labels_t = train_labels.T
train_labels_t.index = wavelengths.values[0]
train_labels_t.index.name = "wavelength"

# Spectrum of the first planet, drawn against two x-axes: the wavelengths
# themselves and an evenly spaced grid scaled to the largest wavelength.
n_channels = train_labels_t.shape[0]
first_spectrum = train_labels_t.iloc[:, 0]
equidistant_x = np.arange(n_channels) / n_channels * np.max(train_labels_t.index)

figure, axes = kego.plotting.figures.create_figure_axes()
axes.plot(train_labels_t.index, first_spectrum, label="wavelengths")
axes.plot(equidistant_x, first_spectrum, label="range")
axes.legend()

In [None]:
# Normalised histogram of every label value, pooled over all planets and
# label columns, using the explicit figure/axes interface.
hist_fig, hist_ax = plt.subplots(figsize=(6, 2))
hist_ax.hist(train_labels.to_numpy().ravel(), bins=20, density=True, color="olive")
hist_ax.set_title("Histogram of the planets' sizes (regression targets)", fontsize=18)
hist_ax.set_xlabel(r"Planet's size $(\frac{r}{R})^2$", fontsize=14)
hist_ax.set_ylabel("Density", fontsize=14)
hist_ax.set_xlim(0, 0.008)
plt.show()

In [None]:
!ls $FOLDER_COMPETITION/train/$planet_id/AIRS-CH0_calibration

In [None]:
planet_id = 14485303
f_signal = pd.read_parquet(
    FOLDER_COMPETITION + f"train/{planet_id}/FGS1_signal.parquet"
)
a_signal = pd.read_parquet(
    FOLDER_COMPETITION + f"train/{planet_id}/AIRS-CH0_signal.parquet"
)
a_cal_dark = pd.read_parquet(
    FOLDER_COMPETITION + f"train/{planet_id}/AIRS-CH0_calibration/dark.parquet"
)
display(f_signal.head(2))
display(a_signal.head(2))

In [None]:
# FGS1 light curve: average every row of f_signal, then subtract each
# even-indexed row mean from the odd-indexed one that follows it.
mean_signal = f_signal.to_numpy().mean(axis=1)
net_signal = mean_signal[1::2] - mean_signal[::2]

# Moving average over `window` samples, computed from differences of the
# running sum instead of an explicit convolution.
window = 800
cum_signal = np.cumsum(net_signal)
smooth_signal = (cum_signal[window:] - cum_signal[:-window]) / window

# Raw curve on top, smoothed curve below, sharing the x-axis.
_, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
ax1.plot(net_signal, label="raw net signal")
ax2.plot(smooth_signal, color="c", label="smoothened net signal")
ax1.legend()
ax2.legend()
ax2.set_xlabel("time")
plt.suptitle("FGS1 light curve", y=0.96)

plt.show()

In [None]:
# Show the first AIRS-CH0 frame: the flat table is viewed as 11250 frames of
# 32 x 356 values and frame 0 is rendered as an image.
plt.imshow(a_signal.to_numpy().reshape(11250, 32, 356)[0])

In [None]:
# Show the first FGS1 frame: the flat table is viewed as 135000 frames of
# 32 x 32 values and frame 0 is rendered as an image.
plt.imshow(f_signal.to_numpy().reshape(135000, 32, 32)[0])

In [None]:
# AIRS-CH0 light curve, computed like the FGS1 one above but from a_signal and
# with a shorter smoothing window.
mean_signal = a_signal.values.mean(axis=1)  # one mean value per row of a_signal
# Each odd-indexed row mean minus the even-indexed one preceding it.
net_signal = mean_signal[1::2] - mean_signal[0::2]
cum_signal = net_signal.cumsum()
window = 100
# Moving average over `window` samples via differences of the running sum.
smooth_signal = (cum_signal[window:] - cum_signal[:-window]) / window

_, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(net_signal, label="raw net signal")
ax1.legend()
ax2.plot(smooth_signal, color="c", label="smoothened net signal")
ax2.legend()
ax2.set_xlabel("time")
# Title corrected: this cell plots the AIRS-CH0 signal, not FGS1.
plt.suptitle("AIRS-CH0 light curve", y=0.96)

plt.show()

In [None]:
# Single-planet walk-through of the FGS1 preprocessing for row `i` of the
# ADC info table.
adc_info = train_adc_info
planet_ids = adc_info.index

dataset = "train"
i = 1  # positional row in adc_info
# Take the planet from the same row as the gain read below, instead of relying
# on a planet_id left over from an earlier cell; this keeps signal and gain
# consistent and makes the cell independent of execution order.
planet_id = planet_ids[i]

f_signal = pd.read_parquet(
    FOLDER_COMPETITION + f"{dataset}/{planet_id}/FGS1_signal.parquet"
)
mean_signal = f_signal.values.mean(axis=1)  # mean over the 32*32 pixels
# Each odd-indexed row mean minus the even-indexed one preceding it.
net_signal = mean_signal[1::2] - mean_signal[0::2]
gain = adc_info.FGS1_adc_gain.values[i]

In [None]:
# Out-of-fold evaluation of a RidgeCV baseline on the training labels.
# NOTE(review): `train` (the feature matrix) is not defined in any cell above;
# it must be built before this cell can run on a fresh kernel.
model = RidgeCV()
train_labels_sel = train_labels.iloc[: train_adc_info.shape[0]]
oof_pred = cross_val_predict(model, train, train_labels_sel)

print(f"# R2 score: {r2_score(train_labels_sel, oof_pred):.3f}")
# Mean of the per-target RMSEs. This equals the former
# mean_squared_error(..., squared=False) result, but also works on
# scikit-learn releases where the `squared` argument has been removed.
sigma_pred = np.sqrt(
    mean_squared_error(train_labels_sel, oof_pred, multioutput="raw_values")
).mean()
print(f"# Root mean squared error: {sigma_pred:.6f}")

# Scatter of predictions against truth for one target column, coloured by star.
col = 1
plt.scatter(
    oof_pred[:, col],
    train_labels_sel.iloc[:, col],
    s=15,
    # One colour per plotted planet: slice by the number of label rows. The
    # previous slice used train_labels_t.shape[0], which is the number of
    # wavelength channels, so the colour array did not match the points.
    # Rows are matched by position, as in the label selection above.
    c=train_adc_info["star"].iloc[: train_labels_sel.shape[0]],
)
plt.gca().set_aspect("equal")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title("Comparing y_true and y_pred")
plt.show()