# NeurIPS : notebook by pragnyanramtha

In [1]:
# ===================================================================
# 1.1: Project Setup and Dependencies
# File: setup.py
# ===================================================================
import os
import logging
import torch
import warnings

def setup_project_environment():
    """
    Prepares the working directory and process-wide settings for the project.

    Steps, all performed through side effects:
      1. Creates the project directory tree relative to the current
         working directory (existing directories are left untouched).
      2. Prints the ``pip install`` command for the dependencies. Nothing
         is installed by this function; the command must be run manually.
      3. Reports whether CUDA is available to PyTorch.
      4. Configures root logging to write INFO-and-above records to
         ``project_log.log`` (the file is truncated on each configuration).
      5. Silences ``FutureWarning`` for the rest of the process.

    Returns:
        None.
    """
    # --- Directory Creation ---
    print("Creating project directories...")
    # 'src/feature_engineering' is included because the feature-engineering
    # code of this project is labelled as living in that package.
    directories = ['data', 'notebooks', 'src/data_processing',
                   'src/feature_engineering', 'src/models',
                   'src/evaluation', 'src/utils', 'results/models',
                   'results/submissions']
    for directory in directories:
        os.makedirs(directory, exist_ok=True)

    # --- Package Installation ---
    # Installing from inside the function would be a surprising side effect,
    # so the command is only displayed for the user to run in a terminal.
    print("\n---")
    print("Run the following command in your terminal to install dependencies:")
    pip_install_command = ("pip install pandas numpy scikit-learn xgboost catboost lightgbm "
                           "tabpfn torch torchvision torchaudio matplotlib seaborn pyarrow fastparquet")
    print(f"$ {pip_install_command}")
    print("---\n")

    # --- CUDA Configuration ---
    print("Checking for CUDA support...")
    is_cuda_available = torch.cuda.is_available()
    print(f"CUDA Available: {is_cuda_available}")
    if not is_cuda_available:
        print("WARNING: CUDA not found. Training will be on CPU.")

    # --- Logging and Warnings Configuration ---
    print("Configuring logging...")
    # NOTE: logging.basicConfig does nothing if the root logger already has
    # handlers (e.g. when this function is called a second time).
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s',
                        filename='project_log.log',
                        filemode='w')

    # Suppress common warnings for cleaner output
    warnings.filterwarnings('ignore', category=FutureWarning)

    print("\nProject setup complete.")
    logging.info("Project environment set up successfully.")

# Execute the setup when this cell runs: directories and project_log.log are created relative to the current working directory
setup_project_environment()

Creating project directories...

---
Run the following command in your terminal to install dependencies:
$ pip install pandas numpy scikit-learn xgboost catboost lightgbm tabpfn torch torchvision torchaudio matplotlib seaborn pyarrow fastparquet
---

Checking for CUDA support...
CUDA Available: False
Configuring logging...

Project setup complete.


## Core Data Loading and Preprocessing

This section contains the functions responsible for interacting with the raw data. We begin by creating a robust data loader that handles the large parquet files, performs the critical ADC conversion to restore the data's dynamic range, and includes error handling. We then implement functions to apply the various calibration frames, such as dark subtraction and flat-field correction, to clean the instrumental signatures from the signal.

In [None]:
# ===================================================================
# 2.1, 2.2 & 2.3: Data Loading, Calibration and Multi-Visit Combination
# File: src/data_processing/loader.py
# ===================================================================
import pandas as pd
import numpy as np
from pathlib import Path

# --- 2.1: Data Loading Utilities ---

def load_adc_info(data_path):
    """
    Reads the ADC conversion parameters from ``adc_info.csv``.

    Args:
        data_path: Directory (str or Path) that contains ``adc_info.csv``.

    Returns:
        The first row of the CSV as a pandas Series indexed by column name.
    """
    adc_table = pd.read_csv(Path(data_path) / 'adc_info.csv')
    # Only the first row is used; any further rows are ignored.
    first_row = adc_table.iloc[0]
    return first_row

def load_signal_data(file_path, adc_params, instrument):
    """
    Loads one signal parquet file, reshapes it into image frames and
    applies the ADC conversion.

    Args:
        file_path: Path of the parquet file to read.
        adc_params: Mapping that provides the 'gain' and 'offset' values.
        instrument: 'FGS1' (32x32 frames) or 'AIRS-CH0' (32x356 frames).

    Returns:
        A float64 array of shape (n_frames, height, width) computed as
        ``raw / gain + offset``, or None if anything goes wrong (the
        failure is written to the log instead of being raised).
    """
    try:
        raw_values = pd.read_parquet(file_path).to_numpy()

        # Frame geometry (height, width) for each supported instrument
        frame_shapes = {'FGS1': (32, 32), 'AIRS-CH0': (32, 356)}
        if instrument not in frame_shapes:
            raise ValueError("Unknown instrument")
        frames = raw_values.reshape(-1, *frame_shapes[instrument])

        # ADC conversion: divide by the gain, then shift by the offset
        adc_gain = adc_params['gain']
        adc_offset = adc_params['offset']
        converted = (frames / adc_gain + adc_offset).astype(np.float64)

        logging.info(f"Successfully loaded and converted {file_path}")
        return converted

    except Exception as e:
        # Best-effort loader: callers treat None as "skip this file".
        logging.error(f"Failed to load or process {file_path}: {e}")
        return None

# --- 2.2: Calibration Data Processing ---

def load_calibration_files(planet_path, instrument, visit):
    """
    Collects the calibration frames available for one instrument and visit.

    The frames are looked up in ``<planet_path>/<instrument>_calibration_<visit>/``
    under the names dark, flat, dead, linear_corr and read (``.parquet``).

    Args:
        planet_path: Directory of the planet (str or Path).
        instrument: Instrument name used in the directory name.
        visit: Visit identifier used in the directory name.

    Returns:
        Dict mapping each calibration type whose file exists to its contents
        as a NumPy array. Missing files are silently skipped, so the dict
        may be empty.
    """
    calib_dir = Path(planet_path) / f"{instrument}_calibration_{visit}"
    candidates = (
        (calib_type, calib_dir / f"{calib_type}.parquet")
        for calib_type in ('dark', 'flat', 'dead', 'linear_corr', 'read')
    )
    return {
        calib_type: pd.read_parquet(parquet_file).to_numpy()
        for calib_type, parquet_file in candidates
        if parquet_file.exists()
    }

def apply_calibrations(signal_data, calib_data):
    """
    Applies a simplified calibration pipeline (dark subtraction followed by
    flat-field division).

    Args:
        signal_data: Array of signal frames. The calibration frames must be
            broadcastable against it.
        calib_data: Dict of calibration arrays. Only the optional 'dark' and
            'flat' entries are used; other keys are ignored.

    Returns:
        The calibrated signal. When neither 'dark' nor 'flat' is present the
        input array itself is returned unchanged.

    Neither ``signal_data`` nor the arrays in ``calib_data`` are modified.
    """
    # This is a simplified example. A real pipeline would be more complex.
    processed_signal = signal_data
    if 'dark' in calib_data:
        processed_signal = processed_signal - calib_data['dark']
    if 'flat' in calib_data:
        flat = np.asarray(calib_data['flat'])
        # Avoid division by zero. np.where builds a new array, so the
        # caller's flat frame keeps its zeros (an in-place assignment would
        # silently corrupt the shared calibration data).
        safe_flat = np.where(flat == 0, 1, flat)
        processed_signal = processed_signal / safe_flat

    # Dead pixel masking could be applied here by setting values to NaN or interpolating
    return processed_signal

# --- 2.3: Multi-Visit Combination ---

def process_all_planet_visits(planet_path, adc_params):
    """
    Loads, calibrates and stacks every visit of a single planet.

    For each instrument ('FGS1' and 'AIRS-CH0') the files matching
    ``<instrument>_signal_*.parquet`` are processed in sorted path order.
    The visit id is the text after the last underscore of the file stem and
    selects the matching calibration directory. Files that fail to load are
    skipped.

    Args:
        planet_path: Directory of the planet (str or Path).
        adc_params: Mapping with the ADC 'gain' and 'offset'.

    Returns:
        Dict with keys 'FGS1' and 'AIRS-CH0'. Each value is the calibrated
        visits concatenated along axis 0, or an empty array when no visit
        could be loaded for that instrument.
    """
    planet_dir = Path(planet_path)
    instruments = ('FGS1', 'AIRS-CH0')
    visits_by_instrument = {instrument: [] for instrument in instruments}

    # Phase 1: load and calibrate every visit of every instrument.
    for instrument in instruments:
        for signal_file in sorted(planet_dir.glob(f'{instrument}_signal_*.parquet')):
            raw_frames = load_signal_data(signal_file, adc_params, instrument)
            if raw_frames is None:
                continue

            visit_id = signal_file.stem.split('_')[-1]
            calibration = load_calibration_files(planet_dir, instrument, visit_id)
            visits_by_instrument[instrument].append(
                apply_calibrations(raw_frames, calibration)
            )

    # Phase 2: combine visits by concatenating along the time axis.
    return {
        instrument: np.concatenate(chunks, axis=0) if chunks else np.array([])
        for instrument, chunks in visits_by_instrument.items()
    }

## Comprehensive Feature Engineering Pipeline

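With the data loaded and cleaned, we now focus on feature engineering. The strategy is to reduce the dimensionality of the vast time-series data into a compact and informative feature vector. We sum every pixel of each frame (a crude, full-frame form of aperture photometry) to create 1D light curves, extract basic statistical features (mean, standard deviation, minimum and maximum) from these light curves, and combine them with the stellar parameters to form the final input for our models. Note that `StandardScaler` is imported in this cell but the functions below do not scale anything, so the stellar parameters must be scaled separately before training if scaled inputs are required.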

In [None]:
# ===================================================================
# 3.1, 3.2 & 3.3: Feature Engineering
# File: src/feature_engineering/builder.py
# ===================================================================
from sklearn.preprocessing import StandardScaler

def create_light_curve(signal_data):
    """
    Collapses 3D signal data into a 1D light curve.

    Every value of a frame is added together, giving one total per entry
    along the first axis. This is a basic form of photometry.

    Args:
        signal_data: NumPy array; only 3-dimensional input is processed.

    Returns:
        1D array of per-frame sums, or an empty array when the input is not
        3-dimensional.
    """
    if signal_data.ndim == 3:
        # Axes 1 and 2 are the spatial dimensions (height and width)
        return np.sum(signal_data, axis=(1, 2))
    return np.array([])

def extract_temporal_features(light_curve):
    """
    Summarises a light curve with four basic statistics.

    Args:
        light_curve: NumPy array of flux values.

    Returns:
        Dict with the keys 'mean', 'std', 'min' and 'max'. All four values
        are 0 when the light curve is empty.
    """
    if light_curve.size == 0:
        return dict.fromkeys(('mean', 'std', 'min', 'max'), 0)

    statistics = (
        ('mean', np.mean),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    return {label: reducer(light_curve) for label, reducer in statistics}

def get_stellar_features(planet_id, star_info_df):
    """
    Looks up the stellar parameters of one planet.

    Args:
        planet_id: Value to match against the 'planet_id' column.
        star_info_df: DataFrame that contains a 'planet_id' column.

    Returns:
        The first matching row as a pandas Series. An IndexError is raised
        by pandas when no row matches.
    """
    is_requested_planet = star_info_df['planet_id'] == planet_id
    matching_rows = star_info_df[is_requested_planet]
    return matching_rows.iloc[0]

def build_feature_vector(planet_id, all_calibrated_data, star_info_df):
    """
    Assembles the flat feature dict of one planet.

    Args:
        planet_id: Identifier of the planet.
        all_calibrated_data: Mapping with the calibrated signal arrays under
            the keys 'FGS1' and 'AIRS-CH0'.
        star_info_df: DataFrame of stellar parameters with a 'planet_id'
            column.

    Returns:
        Dict containing 'planet_id', the light-curve statistics of each
        instrument under ``<instrument>_<statistic>`` keys, and every column
        of the planet's stellar row. Stellar columns are merged last, so they
        overwrite any earlier key with the same name.
    """
    features = {'planet_id': planet_id}

    # Temporal features from light curves
    for instrument in ('FGS1', 'AIRS-CH0'):
        curve_stats = extract_temporal_features(
            create_light_curve(all_calibrated_data[instrument])
        )
        features.update(
            {f'{instrument}_{key}': val for key, val in curve_stats.items()}
        )

    # Stellar features
    stellar_row = get_stellar_features(planet_id, star_info_df)
    features.update(stellar_row.to_dict())

    return features

## Baseline Models and Evaluation

This section establishes our modeling and evaluation framework. We implement a Gaussian Log-Likelihood (GLL) score modeled on the competition metric, applying the heavy FGS1 channel weight to the first target column. We then create a single function that trains a simple baseline model (Ridge regression wrapped in a multi-output regressor), holds out 20% of the data with a standard train-test split, and scores the held-out predictions with the weighted GLL.

In [None]:
# ===================================================================
# 4.1 & 4.2: Baseline Models and Evaluation
# File: src/models/baseline.py and src/evaluation/metrics.py
# ===================================================================
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
import sklearn.multioutput

# --- 4.2: Evaluation Framework ---

def gll_score_single(y_true, y_pred_mean, y_pred_unc):
    """
    Calculates the Gaussian log-likelihood of the true value(s) under the
    predicted normal distribution(s).

    Args:
        y_true: True value(s), scalar or array.
        y_pred_mean: Predicted mean(s).
        y_pred_unc: Predicted standard deviation(s).

    Returns:
        ``-0.5 * (log(2*pi) + log(unc**2) + (y_true - mean)**2 / unc**2)``,
        evaluated element-wise.
    """
    variance = y_pred_unc**2
    squared_error = (y_true - y_pred_mean)**2
    return -0.5 * (np.log(2 * np.pi) + np.log(variance) + squared_error / variance)

def calculate_weighted_gll(y_true, y_pred_mean, y_pred_unc, fgs1_weight=57.846):
    """
    Calculates the weighted sum of Gaussian log-likelihoods over all
    predictions.

    Args:
        y_true: 2D array-like of true values; the second axis indexes the
            target columns.
        y_pred_mean: Predicted means, broadcastable against ``y_true``.
        y_pred_unc: Predicted standard deviations, broadcastable against
            ``y_true``.
        fgs1_weight: Weight applied to the first column (the FGS1 channel);
            every other column has weight 1.

    Returns:
        The sum of all weighted log-likelihood values.
    """
    # Accept lists / DataFrames by converting everything to NumPy arrays
    truth, pred_mean, pred_unc = (
        np.asarray(values) for values in (y_true, y_pred_mean, y_pred_unc)
    )

    log_likelihoods = gll_score_single(truth, pred_mean, pred_unc)

    # One weight per target column; the first column is FGS1
    column_weights = np.ones(truth.shape[1])
    column_weights[0] = fgs1_weight

    return np.sum(log_likelihoods * column_weights)


# --- 4.1: Simple Regression Baselines ---

def train_and_evaluate_baseline(X, y):
    """
    Fits a Ridge baseline and scores it on a held-out split with the
    weighted GLL.

    Args:
        X: Feature matrix.
        y: Target matrix (one column per predicted output).

    Returns:
        Tuple ``(model, score)``: the fitted multi-output Ridge model and its
        weighted GLL on the validation split.
    """
    # Hold out 20% of the samples for validation (fixed seed for repeatability)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training on {X_train.shape[0]} samples, validating on {X_val.shape[0]} samples.")

    # One Ridge regressor per target column, fitted through MultiOutputRegressor
    baseline = sklearn.multioutput.MultiOutputRegressor(Ridge(alpha=1.0))
    baseline.fit(X_train, y_train)

    val_predictions = baseline.predict(X_val)

    # Uncertainty estimate: the per-column standard deviation of the training
    # residuals, used as a constant uncertainty for every validation sample.
    residual_spread = np.std(y_train - baseline.predict(X_train), axis=0)

    score = calculate_weighted_gll(y_val, val_predictions, residual_spread)
    print(f"Validation Weighted GLL Score: {score:.4f}")

    return baseline, score