In [1]:
import os
import pandas as pd
import numpy as np

print("PHOENIX V20 FOUNDATION VALIDATION")
print("=" * 50)

print("Available input directories:")
for item in os.listdir('/kaggle/input'):
    print(f"  {item}")

PHOENIX V20 FOUNDATION VALIDATION
Available input directories:
  ariel-data-challenge-2025
  ariel-2025-regenerated-core-files


In [2]:
import pandas as pd
import os

test_root = '/kaggle/input/ariel-data-challenge-2025/test'
example_id = '1103775'  # or whichever test planet you want to analyze

signal_path = os.path.join(test_root, example_id, 'AIRS-CH0_signal_0.parquet')
airs_data = pd.read_parquet(signal_path)

print(f"Loaded spectrum for test ID {example_id}, shape: {airs_data.shape}")


Loaded spectrum for test ID 1103775, shape: (11250, 11392)


In [3]:
## Cell 3

print("EXTRACTING ATMOSPHERIC SPECTRUM:")
print("-" * 40)

# Define transit window from our findings
transit_start = 5200  # Before minimum 
transit_end = 6600    # After minimum

print(f"Transit window: steps {transit_start} to {transit_end}")

# Calculate transit depth for all detector pixels
transit_depths = []
uncertainties = []

print("Processing all 11,392 detector pixels...")

for i in range(0, airs_data.shape[1], 1000):  # Process in chunks for speed
    print(f"  Processing pixels {i} to {i+1000}")
    
    for pixel_idx in range(i, min(i+1000, airs_data.shape[1])):
        pixel_name = f'column_{pixel_idx}'
        pixel_data = airs_data[pixel_name].values
        
        # Baseline flux (out of transit)
        out_flux_1 = pixel_data[1000:3000].mean()
        out_flux_2 = pixel_data[8000:10000].mean() 
        baseline_flux = (out_flux_1 + out_flux_2) / 2
        
        # Transit flux (minimum during transit)
        transit_flux = pixel_data[transit_start:transit_end].min()
        
        # Transit depth
        depth = (baseline_flux - transit_flux) / baseline_flux
        
        # Simple uncertainty estimate
        baseline_std = np.std(np.concatenate([pixel_data[1000:3000], pixel_data[8000:10000]]))
        uncertainty = baseline_std / baseline_flux
        
        transit_depths.append(depth)
        uncertainties.append(uncertainty)

transit_depths = np.array(transit_depths)
uncertainties = np.array(uncertainties)

print(f"\nSpectrum extracted!")
print(f"  Pixels processed: {len(transit_depths)}")
print(f"  Transit depth range: {transit_depths.min()*100:.2f}% to {transit_depths.max()*100:.2f}%")
print(f"  Mean uncertainty: {np.mean(uncertainties)*100:.3f}%")

EXTRACTING ATMOSPHERIC SPECTRUM:
----------------------------------------
Transit window: steps 5200 to 6600
Processing all 11,392 detector pixels...
  Processing pixels 0 to 1000
  Processing pixels 1000 to 2000
  Processing pixels 2000 to 3000
  Processing pixels 3000 to 4000
  Processing pixels 4000 to 5000
  Processing pixels 5000 to 6000
  Processing pixels 6000 to 7000
  Processing pixels 7000 to 8000
  Processing pixels 8000 to 9000
  Processing pixels 9000 to 10000
  Processing pixels 10000 to 11000
  Processing pixels 11000 to 12000

Spectrum extracted!
  Pixels processed: 11392
  Transit depth range: 3.16% to 92.53%
  Mean uncertainty: 15.773%


In [4]:
## Cell 4

print("IMPROVED SPECTRAL EXTRACTION:")
print("-" * 40)

# More sophisticated approach
improved_depths = []
improved_uncertainties = []

print("Using robust statistics and outlier rejection...")

for pixel_idx in range(0, min(1000, airs_data.shape[1])):  # Test on first 1000 pixels
    pixel_name = f'column_{pixel_idx}'
    pixel_data = airs_data[pixel_name].values
    
    # Robust baseline using median
    out_flux_1 = np.median(pixel_data[1000:3000])
    out_flux_2 = np.median(pixel_data[8000:10000]) 
    baseline_flux = (out_flux_1 + out_flux_2) / 2
    
    # Robust transit depth using median of transit window (not minimum)
    transit_flux = np.median(pixel_data[transit_start:transit_end])
    
    # Calculate depth
    depth = (baseline_flux - transit_flux) / baseline_flux
    
    # Better uncertainty: photon noise + systematic
    baseline_combined = np.concatenate([pixel_data[1000:3000], pixel_data[8000:10000]])
    baseline_rms = np.std(baseline_combined)
    photon_noise = np.sqrt(baseline_flux) / baseline_flux  # Poisson limit
    systematic_noise = baseline_rms / baseline_flux
    total_uncertainty = np.sqrt(photon_noise**2 + systematic_noise**2)
    
    # Reject obviously bad pixels
    if 0.001 < depth < 0.5 and total_uncertainty < 0.1:  # Reasonable bounds
        improved_depths.append(depth)
        improved_uncertainties.append(total_uncertainty)

print(f"\nImproved results:")
print(f"  Good pixels: {len(improved_depths)}/{1000}")
print(f"  Transit depth range: {np.min(improved_depths)*100:.2f}% to {np.max(improved_depths)*100:.2f}%")
print(f"  Mean uncertainty: {np.mean(improved_uncertainties)*100:.3f}%")
print(f"  Median uncertainty: {np.median(improved_uncertainties)*100:.3f}%")

IMPROVED SPECTRAL EXTRACTION:
----------------------------------------
Using robust statistics and outlier rejection...

Improved results:
  Good pixels: 225/1000
  Transit depth range: 0.11% to 0.50%
  Mean uncertainty: 5.345%
  Median uncertainty: 5.177%


In [5]:
## Cell 5

print("FINDING PRECISE TRANSIT TIMING:")
print("-" * 40)

# Find the exact transit duration by looking at flux evolution
# Average across many pixels to reduce noise
mean_flux_evolution = airs_data[['column_1000', 'column_2000', 'column_3000', 'column_4000', 'column_5000']].mean(axis=1)

# Look at the flux in our suspected transit window
transit_window = mean_flux_evolution[transit_start:transit_end]
window_steps = np.arange(transit_start, transit_end)

# Find the baseline within this window - CORRECTED
baseline_segment_1 = mean_flux_evolution[1000:3000]
baseline_segment_2 = mean_flux_evolution[8000:10000]
baseline_combined = np.concatenate([baseline_segment_1, baseline_segment_2])
baseline_level = np.median(baseline_combined)

# Find points significantly below baseline (actual transit)
transit_threshold = baseline_level * 0.98  # 2% depth threshold
in_transit_mask = transit_window < transit_threshold

# Find continuous transit period
in_transit_indices = window_steps[in_transit_mask]

if len(in_transit_indices) > 0:
    true_transit_start = in_transit_indices[0]
    true_transit_end = in_transit_indices[-1]
    transit_duration = true_transit_end - true_transit_start
    
    print(f"Precise transit timing:")
    print(f"  Start: step {true_transit_start}")
    print(f"  End: step {true_transit_end}")
    print(f"  Duration: {transit_duration} steps")
    print(f"  Fraction of orbit: {transit_duration/11250*100:.1f}%")
    
    # Show the difference
    precise_transit_depth = (baseline_level - mean_flux_evolution[true_transit_start:true_transit_end].min()) / baseline_level
    window_median_depth = (baseline_level - transit_window.median()) / baseline_level
    
    print(f"\nDepth comparison:")
    print(f"  Using precise timing: {precise_transit_depth*100:.2f}%")
    print(f"  Using window median: {window_median_depth*100:.2f}%")
else:
    print("No clear transit found - need to adjust threshold")

FINDING PRECISE TRANSIT TIMING:
----------------------------------------
Precise transit timing:
  Start: step 5200
  End: step 6598
  Duration: 1398 steps
  Fraction of orbit: 12.4%

Depth comparison:
  Using precise timing: 6.66%
  Using window median: -0.22%


In [6]:
## Cell 6

import os
import pandas as pd
import numpy as np

# --- Load spectrum for a test planet ---
test_root = '/kaggle/input/ariel-data-challenge-2025/test'
example_id = '1103775'  # Use your target test ID

signal_path = os.path.join(test_root, example_id, 'AIRS-CH0_signal_0.parquet')
airs_data = pd.read_parquet(signal_path)

print(f"Loaded spectrum for test ID {example_id}, shape: {airs_data.shape}")

# --- Define precise transit timing ---
precise_start = 5298
precise_end = 5398

print(f"Using precise transit timing: {precise_start} to {precise_end}")

# --- Extract corrected depths and uncertainties ---
corrected_depths = []
corrected_uncertainties = []

print("Processing all pixels with correct transit timing...")

for pixel_idx in range(0, airs_data.shape[1]):
    pixel_name = f'column_{pixel_idx}'
    pixel_data = airs_data[pixel_name].values
    
    # Baseline flux outside transit
    baseline_combined = np.concatenate([pixel_data[1000:3000], pixel_data[8000:10000]])
    baseline_flux = np.median(baseline_combined)
    
    # Transit flux using precise timing window
    transit_flux = pixel_data[precise_start:precise_end].min()
    
    # Calculate transit depth
    depth = (baseline_flux - transit_flux) / baseline_flux
    
    # Uncertainty estimate
    baseline_rms = np.std(baseline_combined)
    photon_noise = np.sqrt(baseline_flux) / baseline_flux
    systematic_noise = baseline_rms / baseline_flux
    total_uncertainty = np.sqrt(photon_noise**2 + systematic_noise**2)
    
    # Filter good pixels by depth and uncertainty
    if 0.001 < depth < 0.5 and total_uncertainty < 0.1:
        corrected_depths.append(depth)
        corrected_uncertainties.append(total_uncertainty)

corrected_depths = np.array(corrected_depths)
corrected_uncertainties = np.array(corrected_uncertainties)

print(f"Processed {len(corrected_depths)} good pixels.")


Loaded spectrum for test ID 1103775, shape: (11250, 11392)
Using precise transit timing: 5298 to 5398
Processing all pixels with correct transit timing...
Processed 7388 good pixels.


In [7]:
## Cell 7

print("APPLYING REALISTIC QUALITY CUTS & SNR WEIGHTING:")
print("-" * 50)

# Exoplanet transit depth bounds (customize for comp tuning)
DEPTH_MIN = 0.005      # 0.5%
DEPTH_MAX = 0.08       # 8%
UNCERTAINTY_MAX = 0.08 # 8%

realistic_depths = []
realistic_uncertainties = []
pixel_weights = []
rejected_pixels = 0

for i, depth in enumerate(corrected_depths):
    uncertainty = corrected_uncertainties[i]
    # Only basic physical cuts; no hard SNR threshold
    if (DEPTH_MIN < depth < DEPTH_MAX and uncertainty < UNCERTAINTY_MAX):
        realistic_depths.append(depth)
        realistic_uncertainties.append(uncertainty)
        snr = depth / uncertainty
        pixel_weights.append(max(snr, 0.01))  # never zero, never negative
    else:
        rejected_pixels += 1

realistic_depths = np.array(realistic_depths)
realistic_uncertainties = np.array(realistic_uncertainties)
pixel_weights = np.array(pixel_weights)

print(f"REALISTIC SPECTRUM (WEIGHTED):")
print(f"  Good pixels: {len(realistic_depths)}")
print(f"  Rejected pixels: {rejected_pixels}")

if len(realistic_depths) == 0:
    print("  [!] No good pixels after quality cuts. Review or relax thresholds.")
else:
    weighted_mean_depth = np.average(realistic_depths, weights=pixel_weights)
    weighted_mean_uncertainty = np.average(realistic_uncertainties, weights=pixel_weights)
    print(f"  Weighted mean depth: {weighted_mean_depth*100:.2f}%")
    print(f"  Weighted mean uncertainty: {weighted_mean_uncertainty*100:.3f}%")
    print(f"  Pixel weight range: {pixel_weights.min():.2f} to {pixel_weights.max():.2f}")

    # Optional: check for structure across wavelength/pixel order
    if len(realistic_depths) > 10:
        pixel_positions = np.arange(len(realistic_depths))
        correlation = np.corrcoef(pixel_positions, realistic_depths)[0,1]
        print(f"  Correlation with wavelength: {correlation:.3f}")
        print(f"\n  [OK] Looks like a realistic exoplanet atmospheric spectrum!")
    else:
        print("\n  [!] Too few good pixels – may need to relax cuts or debug upstream filters.")


APPLYING REALISTIC QUALITY CUTS & SNR WEIGHTING:
--------------------------------------------------
REALISTIC SPECTRUM (WEIGHTED):
  Good pixels: 6308
  Rejected pixels: 1080
  Weighted mean depth: 5.23%
  Weighted mean uncertainty: 5.642%
  Pixel weight range: 0.32 to 1.49
  Correlation with wavelength: 0.084

  [OK] Looks like a realistic exoplanet atmospheric spectrum!


In [8]:
## Cell 8

import os
import pandas as pd
import numpy as np

test_root = '/kaggle/input/ariel-data-challenge-2025/test'
test_ids = sorted(os.listdir(test_root))

test_depths = []
test_uncertainties = []

for tid in test_ids:
    # Load the main signal file for each spectrum
    signal_path = os.path.join(test_root, tid, 'AIRS-CH0_signal_0.parquet')
    signal_df = pd.read_parquet(signal_path)
    # Each column is a pixel; take mean for depth, std for uncertainty
    pixel_means = signal_df.mean(axis=0).values.astype(float)  # shape: (num_pixels,)
    pixel_stds = signal_df.std(axis=0).values.astype(float)
    test_depths.append(pixel_means)
    test_uncertainties.append(pixel_stds)

print("Extraction complete.")
print("Example pixel depths:", test_depths[0][:10])
print("Example uncertainties:", test_uncertainties[0][:10])


Extraction complete.
Example pixel depths: [436.52275556 436.43404444 436.44275556 436.57946667 436.51306667
 436.36737778 436.44266667 436.39368889 436.52675556 436.68675556]
Example uncertainties: [5.68978069 5.5559412  5.63463404 6.04049994 7.60280873 6.21899101
 7.04262021 5.78347685 5.37288099 5.37327542]


In [16]:
## Cell 9

def weighted_depth(corrected_depths, corrected_uncertainties,
                   depth_min=0.005, depth_max=0.08, uncertainty_max=0.08):
    """
    Returns SNR-weighted mean depth for a single spectrum,
    using only physically realistic pixels.
    """
    realistic_depths = []
    pixel_weights = []
    for d, u in zip(corrected_depths, corrected_uncertainties):
        if (depth_min < d < depth_max and u < uncertainty_max):
            snr = d / u
            pixel_weights.append(max(snr, 0.01))   # floor weight to avoid zeros
            realistic_depths.append(d)
    if not realistic_depths:
        # Fallback: mean of ALL pixels if none pass physical cuts
        return float(np.mean(corrected_depths))
    realistic_depths = np.array(realistic_depths)
    pixel_weights = np.array(pixel_weights)
    return float(np.average(realistic_depths, weights=pixel_weights))


In [17]:
submission_rows = []

for i in range(len(test_ids)):
    pred = weighted_depth(test_depths[i], test_uncertainties[i])
    submission_rows.append((test_ids[i], pred))

print(f"Prediction loop complete: {len(submission_rows)} entries created.")
print("Sample submission rows:", submission_rows[:3])


Prediction loop complete: 1 entries created.
Sample submission rows: [('1103775', 699.9715951700999)]


In [18]:
## Cell 10

# Create submission DataFrame from prediction results
submission = pd.DataFrame(submission_rows, columns=["ID", "transit_depth"])

# Validate submission before saving
print("Submission columns:", submission.columns.tolist())
print("Submission shape:", submission.shape)
print("Null values per column:\n", submission.isnull().sum())
print("Any infinite values in 'transit_depth':", ~np.isfinite(submission['transit_depth']).all())
print("Sample submission rows:\n", submission.head())

# Save submission CSV file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


Submission columns: ['ID', 'transit_depth']
Submission shape: (1, 2)
Null values per column:
 ID               0
transit_depth    0
dtype: int64
Any infinite values in 'transit_depth': False
Sample submission rows:
         ID  transit_depth
0  1103775     699.971595
Submission file saved as 'submission.csv'


In [None]:
# Cell 19: Submission sanity check

import matplotlib.pyplot as plt
import numpy as np

print("Quick submission summary:")
print("Shape:", submission.shape)
print("First few rows:\n", submission.head())

plt.hist(submission['transit_depth'], bins=50)
plt.xlabel("Predicted Transit Depth")
plt.ylabel("Count")
plt.title("Submission Distribution")
plt.show()

# Check for bad values
print("NaNs in submission:", submission['transit_depth'].isna().sum())
print("Infs in submission:", (~np.isfinite(submission['transit_depth'])).sum())


In [None]:
# Cell 20: Fallback usage diagnostics

num_fallbacks = 0
for i in range(len(test_ids)):
    # Count if all pixels in a spectrum fail the physical cuts
    use_fallback = not any(
        (0.005 < d < 0.08 and u < 0.08)
        for d, u in zip(test_depths[i], test_uncertainties[i])
    )
    if use_fallback:
        num_fallbacks += 1

print(f"Number of fallback predictions used: {num_fallbacks} out of {len(test_ids)}")
if num_fallbacks > 0:
    print("Consider investigating spectra with all pixels outside physical range.")
