In [1]:
import time
from ML4transients.data_access import DatasetLoader
from ML4transients.training import PytorchDataset

# Benchmark cell: time each stage of going from the on-disk dataset to
# ready-to-use train/val PyTorch datasets.
#
# Durations are measured with time.perf_counter(), a monotonic high-resolution
# clock, rather than time.time(): the wall clock can jump (NTP sync, manual
# changes), which would corrupt a measurement lasting many minutes.

# Stage 1: open the dataset. The original note describes this as index-only
# loading; DatasetLoader's internals are not visible from this cell.
print("Loading dataset...")
t0 = time.perf_counter()
dataset = DatasetLoader('/sps/lsst/groups/transients/HSC/fouchez/raphael/UDEEP_COSMOS2')
print(f"Dataset loaded in {time.perf_counter() - t0:.2f} seconds")

# Stage 2: build the splits (random_state=42 presumably pins the partition so
# reruns are comparable — confirm against PytorchDataset.create_splits).
# NOTE(review): the message below claims only labels are loaded, but the
# recorded run of this cell printed "Loading ... cutouts..." for every split
# and this stage took ~1000 s. Confirm whether create_splits is meant to be
# lazy; if not, the message is misleading.
print("\nCreating PyTorch dataset (loads only labels)...")
t1 = time.perf_counter()
datasets = PytorchDataset.create_splits(dataset, random_state=42)
print(f"split created in {time.perf_counter() - t1:.2f} seconds")

# Stage 3: pick the validation split out of the container returned above.
# This times only the lookup; what gets printed is the dataset object itself
# (its string form), not an individual sample label, despite the "label"
# wording in the message.
print("\ncreating val...")
t2 = time.perf_counter()
val_dataset = datasets['val']
print(f"Val label: {val_dataset} (retrieved in {time.perf_counter() - t2:.4f} seconds)")

# Stage 4: same lookup and summary print for the training split.
print("\ncreating train...")
t3 = time.perf_counter()
train_dataset = datasets['train']
print(f"train label: {train_dataset} (retrieved in {time.perf_counter() - t3:.4f} seconds)")

Loading dataset...
Dataset loaded in 0.13 seconds

Creating PyTorch dataset (loads only labels)...
Building sample index...
Creating splits from 8490713 samples...
Loading 5943498 cutouts...
Loading 849072 cutouts...
Loading 1698143 cutouts...
split created in 1024.36 seconds

creating val...
Val label: PytorchDataset(849072 samples)
  Image shape: (849072, 30, 30)
  Labels: 0 injected, 849072 real (retrieved in 0.0027 seconds)

creating train...
train label: PytorchDataset(5943498 samples)
  Image shape: (5943498, 30, 30)
  Labels: 0 injected, 5943498 real (retrieved in 0.0156 seconds)
