In [None]:
import sys
import os
import numpy as np

# Make the repository root (the parent of the current working directory)
# importable so the `core` package below resolves. This assumes the notebook
# is launched from a sub-directory of the repository.
PROJECT_ROOT = os.path.dirname(os.getcwd())
# Only insert when it is not already first: re-running this cell then leaves
# sys.path untouched instead of stacking duplicate entries, while the
# directory still keeps top import priority.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)

from core.RootPreprocessor import RootPreprocessor
from core.DataLoader import DataPreprocessor

## Step 1: Preprocess ROOT Files

Process ROOT files and save to NPZ format for fast loading.

In [None]:
# Configure paths (adjust to your data)
root_file = "/path/to/your/data.root"  # Single file
# OR
root_dir = "/path/to/your/data/"  # Directory with multiple ROOT files

output_npz = "preprocessed_data.npz"
tree_name = "reco"

# The preprocessing calls below are left commented out because the paths above
# are placeholders. Uncomment exactly one option plus the save call once the
# paths point at real data.

# Option 1: Process single file
# preprocessor = RootPreprocessor()
# preprocessor.process_root_file(root_file, tree_name, max_events=10000)

# Option 2: Process directory (all ROOT files)
# preprocessor = RootPreprocessor()
# preprocessor.process_root_directory(
#     root_dir,
#     tree_name,
#     pattern="*.root",
#     merge=True,  # Merge into single dataset
#     max_events_per_file=None
# )

# Save to NPZ
# preprocessor.save_to_npz(output_npz)

# Report what is actually on disk instead of unconditionally claiming success:
# with everything above commented out nothing has been written, and the next
# cell would fail on a missing file.
if os.path.exists(output_npz):
    print(f"Preprocessed data found at {output_npz}")
else:
    print(
        f"{output_npz} not found. Uncomment Option 1 or Option 2 and the "
        "save_to_npz call above, then re-run this cell."
    )

## Step 2: Load with DataLoader

The DataLoader module's `DataPreprocessor` class automatically detects the RootPreprocessor format and converts it to the structure needed for ML training.

In [None]:
# Build the loader directly from the NPZ file; no explicit config is passed in
data_loader = DataPreprocessor.from_npz(output_npz)

# Fetch the configuration object from the loader and print a one-shot summary
data_config = data_loader.get_data_config()
print(
    "\nData Configuration:",
    f"  Events: {data_loader.data_length}",
    f"  Leptons: {data_config.NUM_LEPTONS}",
    f"  Max jets: {data_config.max_jets}",
    f"  Jet features: {data_config.jet_features}",
    f"  Lepton features: {data_config.lepton_features}",
    f"  Has regression targets: {data_config.has_regression_targets}",
    f"  Has event weights: {data_config.has_event_weight}",
    sep="\n",
)

## Step 3: Access Features

The data is now in the standard format for ML training.

In [None]:
# Pull the three main arrays out of the loader's feature dictionary
jet_features, lepton_features, assignment_labels = (
    data_loader.feature_data[name] for name in ('jet', 'lepton', 'assignment_labels')
)

# Expected layouts:
#   jets              -> (n_events, max_jets, n_jet_features)
#   leptons           -> (n_events, 2, n_lepton_features)
#   assignment labels -> (n_events, max_jets, 2)
print(
    "Feature shapes:",
    f"  Jets: {jet_features.shape}",
    f"  Leptons: {lepton_features.shape}",
    f"  Assignment labels: {assignment_labels.shape}",
    sep="\n",
)

# Individual columns can also be fetched by particle type and feature name
jet_pt = data_loader.get_feature_data('jet', 'pt')
lepton_eta = data_loader.get_feature_data('lepton', 'eta')

print(
    f"\nJet pT shape: {jet_pt.shape}",
    f"Lepton eta shape: {lepton_eta.shape}",
    sep="\n",
)

# Regression targets are optional, so only touch them when the key is present
if 'regression_targets' in data_loader.feature_data:
    regression_targets = data_loader.feature_data['regression_targets']
    # Expected layout: (n_events, 2, 3) - 2 neutrinos, px/py/pz
    print(f"Regression targets: {regression_targets.shape}")

## Step 4: Split Data for Training

Split into training and testing sets.

In [None]:
# Hold out 20% of the events for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, y_train, X_test, y_test = data_loader.split_data(test_size=0.2, random_state=42)

# Shapes of the training partition
print(
    "Training set:",
    f"  Jets: {X_train['jet'].shape}",
    f"  Leptons: {X_train['lepton'].shape}",
    f"  Assignment labels: {y_train['assignment_labels'].shape}",
    sep="\n",
)

# Shapes of the held-out partition
print(
    "\nTest set:",
    f"  Jets: {X_test['jet'].shape}",
    f"  Leptons: {X_test['lepton'].shape}",
    f"  Assignment labels: {y_test['assignment_labels'].shape}",
    sep="\n",
)

## Step 5: Visualize Features

Quick visualization of particle kinematics.

In [None]:
import matplotlib.pyplot as plt

# Sentinel value filtered out of the jet array as padding before plotting
JET_PADDING_VALUE = -999.0

# Look the columns up by name instead of by position so the plots stay correct
# if the feature order in the NPZ file differs from pt-first / eta-second.
jet_pt_flat = np.ravel(data_loader.get_feature_data('jet', 'pt'))  # All jets from all events
jet_pt_flat = jet_pt_flat[jet_pt_flat != JET_PADDING_VALUE]  # Remove padding
lepton_eta_flat = np.ravel(data_loader.get_feature_data('lepton', 'eta'))

# Only reserve a third panel when per-event jet counts are available;
# otherwise the figure would end with an empty slot.
has_jet_counts = 'N_jets' in data_loader.feature_data
n_panels = 3 if has_jet_counts else 2
fig, axes = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4))

# Panel 1: jet pT distribution (log y-axis)
axes[0].hist(jet_pt_flat, bins=50, alpha=0.7, edgecolor='black')
axes[0].set_xlabel('Jet pT [GeV]')
axes[0].set_ylabel('Count')
axes[0].set_title('Jet Transverse Momentum')
axes[0].set_yscale('log')

# Panel 2: lepton eta distribution
axes[1].hist(lepton_eta_flat, bins=50, alpha=0.7, color='orange', edgecolor='black')
# Mathtext keeps the Greek letter independent of the file's text encoding
axes[1].set_xlabel(r'Lepton $\eta$')
axes[1].set_ylabel('Count')
axes[1].set_title('Lepton Pseudorapidity')

# Panel 3: number of jets per event, one integer-wide bin per multiplicity
if has_jet_counts:
    n_jets = data_loader.feature_data['N_jets']
    # +2 so the last bin edge lies one past the maximum and the max gets its own bin
    multiplicity_bins = range(int(n_jets.min()), int(n_jets.max()) + 2)
    axes[2].hist(n_jets, bins=multiplicity_bins, alpha=0.7, color='green', edgecolor='black')
    axes[2].set_xlabel('Number of Jets')
    axes[2].set_ylabel('Count')
    axes[2].set_title('Jet Multiplicity')

fig.tight_layout()
plt.show()

## Key Features

### Automatic Format Detection
The DataLoader automatically detects whether the NPZ file is in:
- **Flat format** (from RootPreprocessor): Keys like `lep_pt`, `ordered_jet_eta`
- **Structured format** (from DataPreprocessor): Keys like `lepton`, `jet`

### Conversion Logic
When loading flat format, DataLoader:
1. Detects flat format by checking for keys like `lep_pt`, `ordered_jet_pt`
2. Groups features by particle type: `lep_pt`, `lep_eta` → `lepton` array
3. Builds truth labels from `event_lepton_truth_idx` and `ordered_event_jet_truth_idx`
4. Constructs regression targets from neutrino momentum components
5. Infers DataConfig from array shapes and feature names

### Performance Benefits
- **10-100x faster** than ROOT file loading
- Fully vectorized preprocessing (no event loops)
- Pure Python (no C++ compilation needed)
- Compressed NPZ format saves disk space

## Complete Example: Single File

Here's a complete minimal example:

In [None]:
# The whole workflow in one place. The calls are left commented out because
# "data.root" is a placeholder and `model` is not defined in this notebook;
# copy them into your own script and fill in real paths.

# Preprocess
# preprocessor = RootPreprocessor()
# preprocessor.process_root_file("data.root", "reco")
# preprocessor.save_to_npz("data.npz")

# Load for ML
# data_loader = DataPreprocessor.from_npz("data.npz")
# X_train, y_train, X_test, y_test = data_loader.split_data()

# Train your model
# model.fit(X_train, y_train, validation_data=(X_test, y_test))

# The emoji had been corrupted by a wrong text decoding; restored here
print("That's it! 🎉")