# NeurIPS: notebook by pragnyanramtha

In [None]:
# ===================================================================
# 1.1: Project Setup and Dependencies
# File: setup.py
# ===================================================================
import os
import logging
import torch
import warnings

def setup_project_environment():
    """
    Prepare the working directory and runtime settings for the project.

    Steps, in order:
      1. Create the project directory tree relative to the current working
         directory (existing directories are left alone).
      2. Print the pip command that installs the dependencies. The command
         is only displayed; nothing is installed by this function.
      3. Report whether PyTorch can see a CUDA device.
      4. Configure root logging at INFO level to ``project_log.log`` (opened
         in write mode) and ignore ``FutureWarning``.
    """
    # --- Directory Creation ---
    print("Creating project directories...")
    project_dirs = (
        'data',
        'notebooks',
        'src/data_processing',
        'src/models',
        'src/evaluation',
        'src/utils',
        'results/models',
        'results/submissions',
    )
    for project_dir in project_dirs:
        os.makedirs(project_dir, exist_ok=True)

    # --- Package Installation ---
    # Installation is left to the user; we only show the command to run.
    packages = (
        "pandas", "numpy", "scikit-learn", "xgboost", "catboost", "lightgbm",
        "tabpfn", "torch", "torchvision", "torchaudio", "matplotlib",
        "seaborn", "pyarrow", "fastparquet",
    )
    install_hint = "pip install " + " ".join(packages)
    print("\n---")
    print("Run the following command in your terminal to install dependencies:")
    print(f"$ {install_hint}")
    print("---\n")

    # --- CUDA Configuration ---
    print("Checking for CUDA support...")
    has_cuda = torch.cuda.is_available()
    print(f"CUDA Available: {has_cuda}")
    if not has_cuda:
        print("WARNING: CUDA not found. Training will be on CPU.")

    # --- Logging and Warnings Configuration ---
    print("Configuring logging...")
    log_settings = {
        'level': logging.INFO,
        'format': '%(asctime)s - %(levelname)s - %(message)s',
        'filename': 'project_log.log',
        'filemode': 'w',
    }
    logging.basicConfig(**log_settings)

    # Keep notebook output readable by hiding FutureWarning noise.
    warnings.filterwarnings('ignore', category=FutureWarning)

    print("\nProject setup complete.")
    logging.info("Project environment set up successfully.")

# Execute the setup when this cell runs: creates the project directories
# relative to the current working directory, prints the dependency install
# command, reports CUDA availability, and configures file logging.
setup_project_environment()

## Core Data Loading and Preprocessing

This section contains the functions responsible for interacting with the raw data. We begin by creating a robust data loader that handles the large parquet files, performs the critical ADC conversion to restore the data's dynamic range, and includes error handling. We then implement functions to apply the various calibration frames, such as dark subtraction and flat-field correction, to clean the instrumental signatures from the signal.

In [None]:
# ===================================================================
# 2.1, 2.2 & 2.3: Data Loading, Calibration, and Multi-Visit Combination
# File: src/data_processing/loader.py
# ===================================================================
import pandas as pd
import numpy as np
from pathlib import Path

# --- 2.1: Data Loading Utilities ---

def load_adc_info(data_path):
    """Return the first row of ``adc_info.csv`` found under ``data_path``.

    The row is returned as a pandas Series indexed by the CSV column names.
    """
    adc_file = Path(data_path) / 'adc_info.csv'
    adc_table = pd.read_csv(adc_file)
    first_row = adc_table.iloc[0]
    return first_row

def load_signal_data(file_path, adc_params, instrument):
    """Read one signal parquet file and undo the ADC encoding.

    The table is flattened to a NumPy array and reshaped to
    ``(-1, 32, 32)`` for ``'FGS1'`` or ``(-1, 32, 356)`` for ``'AIRS-CH0'``.
    The result is converted with ``raw / gain + offset`` using
    ``adc_params['gain']`` and ``adc_params['offset']`` and cast to float64.

    Returns:
        The converted float64 array, or ``None`` when anything goes wrong
        (unreadable file, unknown instrument, bad shape, ...). Failures are
        logged rather than raised.
    """
    # Per-frame (rows, cols) layout for each supported instrument.
    frame_shapes = {'FGS1': (32, 32), 'AIRS-CH0': (32, 356)}

    try:
        samples = pd.read_parquet(file_path).to_numpy()

        if instrument not in frame_shapes:
            raise ValueError("Unknown instrument")
        frames = samples.reshape(-1, *frame_shapes[instrument])

        # Apply ADC conversion
        gain, offset = adc_params['gain'], adc_params['offset']
        converted = frames / gain + offset
        converted = converted.astype(np.float64)
    except Exception as e:
        logging.error(f"Failed to load or process {file_path}: {e}")
        return None

    logging.info(f"Successfully loaded and converted {file_path}")
    return converted

# --- 2.2: Calibration Data Processing ---

def load_calibration_files(planet_path, instrument, visit):
    """Collect the calibration frames available for one instrument visit.

    Looks in ``<planet_path>/<instrument>_calibration_<visit>/`` for
    ``dark``, ``flat``, ``dead``, ``linear_corr`` and ``read`` parquet files.

    Returns:
        A dict mapping each calibration name whose file exists to its
        contents as a NumPy array; missing files are simply left out.
    """
    calib_dir = Path(planet_path) / f"{instrument}_calibration_{visit}"
    calib_names = ('dark', 'flat', 'dead', 'linear_corr', 'read')

    # Lazy pairs so each file is checked and read in the listed order.
    candidates = ((name, calib_dir / f"{name}.parquet") for name in calib_names)
    return {
        name: pd.read_parquet(parquet_file).to_numpy()
        for name, parquet_file in candidates
        if parquet_file.exists()
    }

def apply_calibrations(signal_data, calib_data):
    """Apply a simplified dark-subtraction and flat-field correction.

    This is a simplified example; a real pipeline would be more complex.
    Only the ``'dark'`` and ``'flat'`` entries of ``calib_data`` are used;
    any other keys are ignored.

    Args:
        signal_data: Array of signal values. The calibration frames must be
            broadcastable against it.
        calib_data: Mapping of calibration name to array. Neither the
            mapping nor the arrays it holds are modified.

    Returns:
        The calibrated signal. When neither ``'dark'`` nor ``'flat'`` is
        present, ``signal_data`` itself is returned unchanged.
    """
    processed_signal = signal_data
    if 'dark' in calib_data:
        processed_signal = processed_signal - calib_data['dark']
    if 'flat' in calib_data:
        # Work on a private copy: patching the caller's flat frame in place
        # would silently alter shared calibration data and fails outright on
        # read-only arrays.
        flat = np.array(calib_data['flat'])
        # Avoid division by zero
        flat[flat == 0] = 1
        processed_signal = processed_signal / flat

    # Dead pixel masking could be applied here by setting values to NaN or interpolating
    return processed_signal

# --- 2.3: Multi-Visit Combination ---

def process_all_planet_visits(planet_path, adc_params):
    """
    Load, calibrate and stack every visit of a planet for both instruments.

    For each of ``'FGS1'`` and ``'AIRS-CH0'``, every file matching
    ``<instrument>_signal_*.parquet`` under ``planet_path`` is loaded in
    sorted filename order, calibrated with the calibration set whose visit
    id is the last ``_``-separated token of the file stem, and collected.
    Files that fail to load are skipped.

    Returns:
        A dict keyed by instrument name. Each value is the visits
        concatenated along axis 0, or an empty array when no visit was
        loaded for that instrument.
    """
    planet_dir = Path(planet_path)
    instruments = ('FGS1', 'AIRS-CH0')
    visits_by_instrument = {name: [] for name in instruments}

    for name in instruments:
        signal_files = sorted(planet_dir.glob(f'{name}_signal_*.parquet'))

        for signal_file in signal_files:
            visit_id = signal_file.stem.split('_')[-1]

            raw_visit = load_signal_data(signal_file, adc_params, name)
            if raw_visit is not None:
                # Calibration frames live in a per-visit folder next to the signal.
                visit_calib = load_calibration_files(planet_dir, name, visit_id)
                visits_by_instrument[name].append(
                    apply_calibrations(raw_visit, visit_calib)
                )

    # Stack visits along the first axis once all instruments are loaded.
    return {
        name: np.concatenate(visits, axis=0) if visits else np.array([])
        for name, visits in visits_by_instrument.items()
    }