In [1]:
import sys
sys.path.insert(0, '..')
from core.RootPreprocessor import RootPreprocessor, PreprocessorConfig, preprocess_root_file, preprocess_root_directory
from core import DataPreprocessor, LoadConfig
import numpy as np

In [None]:
# Example: Preprocess to NPZ format
# The input directory and the output archive are named up front so the call
# itself stays short and the paths are easy to swap.
nominal_input_dir = "../../root_data/mc202507/user.kbehr.410472.PhPy8EG.DAOD_PHYS.e6348_s3681_r13145_r13146_p6697.250627-v1_output/"
nominal_output_file = "../../DATA/nominal.npz"

data = preprocess_root_directory(
    input_dir=nominal_input_dir,
    output_file=nominal_output_file,
    tree_name="reco",
    verbose=True,
    save_initial_parton_info=True,
)

Found 20 files in ../../root_data/mc202507/user.kbehr.410472.PhPy8EG.DAOD_PHYS.e6348_s3681_r13145_r13146_p6697.250627-v1_output/. Starting processing...


Processing file 1 of 20...

Processing file: ../../root_data/mc202507/user.kbehr.410472.PhPy8EG.DAOD_PHYS.e6348_s3681_r13145_r13146_p6697.250627-v1_output/user.kbehr.45417421._000010.output.root
Processing ROOT file: ../../root_data/mc202507/user.kbehr.410472.PhPy8EG.DAOD_PHYS.e6348_s3681_r13145_r13146_p6697.250627-v1_output/user.kbehr.45417421._000010.output.root
Total events in file: 2361172
Events passing pre-selection: 1369920
Processing complete.
Processing file 2 of 20...

Processing file: ../../root_data/mc202507/user.kbehr.410472.PhPy8EG.DAOD_PHYS.e6348_s3681_r13145_r13146_p6697.250627-v1_output/user.kbehr.45417421._000019.output.root
Processing ROOT file: ../../root_data/mc202507/user.kbehr.410472.PhPy8EG.DAOD_PHYS.e6348_s3681_r13145_r13146_p6697.250627-v1_output/user.kbehr.45417421._000019.output.root
Total events in file: 

## Method 1: Simple Function Interface

The quickest way to preprocess ROOT files is a single function call (here, one call processes every ROOT file in a directory):

In [None]:
# Example: Preprocess to NPZ format
# Same call as for any other sample; only the input directory and the output
# archive differ, so they are named separately from the call.
toponium_input_dir = "../../root_data/user.kbehr.802380.Py8EG_Toponium_2L.DAOD_PHYS.e8562_s4231_r13145_r13146_p6697.250918-v1_output"
toponium_output_file = "../../DATA/toponium.npz"

data = preprocess_root_directory(
    input_dir=toponium_input_dir,
    output_file=toponium_output_file,
    tree_name="reco",
    verbose=True,
    save_initial_parton_info=True,
)

In [None]:
# Print the shape of every array stored in the archive and collect the distinct
# first-axis lengths; a single entry in `length` means all arrays agree on
# their first dimension.
first_axis_lengths = set()

# np.load() on an .npz returns a lazy NpzFile that keeps the archive open and
# reads an array from it on every `loaded[key]` access. The context manager
# closes the file, and each array is read exactly once.
with np.load("output.npz") as loaded:
    for key in loaded.files:
        array = loaded[key]
        print(key, array.shape)
        # 0-d (scalar) entries have no first axis; skip them instead of
        # raising IndexError on shape[0].
        if array.ndim:
            first_axis_lengths.add(array.shape[0])

# Sorted so the displayed result is deterministic.
length = sorted(first_axis_lengths)
length

## Method 2: Using PreprocessorConfig Class

For more control over the preprocessing:

In [None]:
# Configure preprocessing
config = PreprocessorConfig(
    input_path="path/to/input.root",
    output_path="path/to/output.npz",
    tree_name="reco",
    output_format="npz",
    save_nu_flows=True,
    save_initial_parton_info=True,
    verbose=True
)

# Run preprocessing
preprocessor = RootPreprocessor(config)
preprocessor.process()

# Access processed data
data = preprocessor.get_processed_data()

# Read the counters once so the summary lines below stay consistent.
n_processed = preprocessor.n_events_processed
n_passed = preprocessor.n_events_passed

print(f"Events processed: {n_processed}")
print(f"Events passed selection: {n_passed}")
# Guard the ratio: with zero processed events the division would raise
# ZeroDivisionError and abort the cell.
if n_processed:
    print(f"Selection efficiency: {n_passed/n_processed:.2%}")
else:
    print("Selection efficiency: n/a (no events processed)")

## Method 3: Processing Multiple ROOT Files from a Directory

You can process entire directories containing multiple ROOT files:

In [None]:
# NOTE(review): the destination is passed as `output_file=` here, matching the
# other preprocess_root_directory calls in this notebook (one of which has
# captured output). The remaining keywords (`output_format`, `merge`,
# `pattern`, `save_nu_flows`) are not used by those calls -- confirm them
# against the function's signature.

# Option A: Merge all files into a single output
merged_data = preprocess_root_directory(
    input_dir="path/to/root_files_directory/",
    output_file="merged_output.npz",
    tree_name="reco",
    output_format="npz",
    merge=True,  # Combine all files
    pattern="*.root",  # Process all .root files
    save_nu_flows=True,
    verbose=True
)

print(f"Merged {len(merged_data['lep_pt'])} events from multiple files")

# Option B: Process files separately
preprocess_root_directory(
    input_dir="path/to/root_files_directory/",
    output_file="output_directory/",
    tree_name="reco",
    output_format="npz",
    merge=False,  # Keep files separate
    pattern="data_*.root",  # Custom pattern
    save_nu_flows=True,
    verbose=True
)

print("Each ROOT file processed separately")

## Inspecting Preprocessed Data

In [None]:
# Lepton features: overall shape plus the first event
lepton_pt = data['lep_pt']
print("Lepton pT shape:", lepton_pt.shape)
print("First event lepton pT:", lepton_pt[0])

# Jet features: overall shape plus the first event
jet_pt = data['ordered_jet_pt']
print("\nJet pT shape:", jet_pt.shape)
print("First event jet pT:", jet_pt[0])

# Derived features
mass_l1j = data['m_l1j']
print("\nInvariant mass l1-j shape:", mass_l1j.shape)
delta_r_l1l2 = data['dR_l1l2']
print("Delta R l1-l2:", delta_r_l1l2[:5])

# Truth information (first five entries of each array)
truth_ttbar_mass = data['truth_ttbar_mass']
print("\nTruth ttbar mass:", truth_ttbar_mass[:5])
truth_top_mass = data['truth_top_mass']
print("Truth top mass:", truth_top_mass[:5])

## Integration with DataPreprocessor

The preprocessed NPZ files can be loaded directly into the ML pipeline:

In [None]:
# Names of the arrays to read from the NPZ file, grouped by input type
jet_feature_names = ["ordered_jet_pt", "ordered_jet_eta", "ordered_jet_phi", "ordered_jet_e"]
lepton_feature_names = ["lep_pt", "lep_eta", "lep_phi", "lep_e"]
met_feature_names = ["met_met", "met_phi"]

# Load configuration
load_config = LoadConfig(
    jet_features=jet_feature_names,
    lepton_features=lepton_feature_names,
    met_features=met_feature_names,
    jet_truth_label="ordered_event_jet_truth_idx",
    lepton_truth_label="event_lepton_truth_idx",
    max_jets=6,
    NUM_LEPTONS=2,
)

# Build the preprocessor from the configuration and load the NPZ file
data_preprocessor = DataPreprocessor(load_config)
data_config = data_preprocessor.load_from_npz("output.npz")

n_loaded = data_preprocessor.data_length
print(f"Loaded {n_loaded} events")

In [None]:
# Display the keys available in the preprocessor's `feature_data` mapping.
data_preprocessor.feature_data.keys()

## Saving DataPreprocessor Output

After further processing with DataPreprocessor, you can save the result:

In [None]:
# After loading and processing data, write the preprocessor's contents to a
# new NPZ file. The save options are collected first so they are easy to tweak.
ml_output_file = "processed_for_ml.npz"
save_options = {"include_labels": True, "compress": True}

data_preprocessor.save_to_npz(ml_output_file, **save_options)

print("Saved processed data for ML training")

## Performance Comparison

Let's compare ROOT vs NPZ I/O performance:

In [None]:
import time

import uproot

# Time NPZ loading.
# np.load() on an .npz archive is lazy: arrays are only read when accessed.
# Every array is therefore materialised inside the timed region so the
# measurement covers the same work as the ROOT read below (all data in memory).
# perf_counter() is used because it is the clock intended for interval timing.
start = time.perf_counter()
with np.load("path/to/output.npz") as npz_file:
    data_npz = {key: npz_file[key] for key in npz_file.files}
npz_time = time.perf_counter() - start

print(f"NPZ loading time: {npz_time:.3f} seconds")

# Time ROOT loading (for comparison)
start = time.perf_counter()
with uproot.open("path/to/output.root") as file:
    tree = file["reco"]
    data_root = tree.arrays(library="np")
root_time = time.perf_counter() - start

print(f"ROOT loading time: {root_time:.3f} seconds")
# Guard the ratio so a zero-length NPZ measurement cannot abort the cell.
if npz_time > 0:
    print(f"Speedup: {root_time/npz_time:.1f}x faster with NPZ!")
else:
    print("Speedup: n/a (NPZ load time below timer resolution)")

## Summary

The Python preprocessing pipeline provides:

- ✅ **No C++ compilation** - works immediately
- ✅ **Identical functionality** to C++ implementation
- ✅ **NPZ format** - 10-100x faster I/O
- ✅ **Better integration** with Python ML pipeline
- ✅ **Directory processing** - handle multiple files at once
- ✅ **Easier to modify** and extend

## Command-Line Usage Examples

### Single File
```bash
python scripts/preprocess_root.py input.root output.npz --format npz --nu-flows
```

### Directory with Merge (all files combined into one)
```bash
python scripts/preprocess_root.py input_dir/ merged_output.npz --format npz --merge
```

### Directory without Merge (separate output files)
```bash
python scripts/preprocess_root.py input_dir/ output_dir/ --format npz --no-merge
```

### Custom File Pattern
```bash
python scripts/preprocess_root.py input_dir/ output.npz --pattern "data_*.root" --merge
```