# Pi0 Decay Test Files Analysis

This notebook analyzes the completed CNN autoencoder training results to identify and extract the list of NPY files in the dataset that were not used in training and are therefore available for testing. This is crucial for understanding which data files can be used for further anomaly detection analysis.

## Training Context
- **Model**: CNN Autoencoder for Pi0 decay anomaly detection
- **Training completed**: 60 epochs
- **Results location**: `/nevis/riverside/data/sc5303/models/cnn_autoencoder_results_20250807_111552/`
- **Test data preserved**: `test_events.pkl` (213MB), `test_events_info.json` (69KB)
- **Dataset**: 483 NPY files containing physics event data

## 1. Import Required Libraries

In [None]:
# Modules used by the cells below: filesystem/path handling, JSON and pickle
# deserialization, timestamps, and NumPy for inspecting array data.
# NOTE(review): `os` and `pd` are not referenced in the cells visible in this
# notebook — confirm nothing else relies on them before pruning.
import os
import numpy as np
import pandas as pd
import json
import pickle
from pathlib import Path
from datetime import datetime
import glob

# Confirms that the import cell ran to completion on the current kernel.
print("Libraries imported successfully!")

## 2. Define File Paths and Directories

In [None]:
# Directory holding the artifacts written by the completed training run
results_dir = Path("/nevis/riverside/data/sc5303/models/cnn_autoencoder_results_20250807_111552")

# Directory containing the original NPY event files
data_dir = Path("/nevis/riverside/data/sc5303/data/pi0_decay")

# Test-split artifacts stored inside the results directory
test_info_file = results_dir / "test_events_info.json"
test_events_file = results_dir / "test_events.pkl"

# Output file for the unused NPY files list.
# The name matches the file that the save step of this notebook actually writes
# into the working directory, so the path reported below is the one that will
# exist after a full run (previously it pointed at a file that was never written).
output_file = Path.cwd() / "unused_npy_files.txt"

print(f"Results directory: {results_dir}")
print(f"Data directory: {data_dir}")
print(f"Test info file: {test_info_file}")
print(f"Output file: {output_file}")

# Report (without raising) whether each required input is present, so a
# missing path is visible here before the loading cells fail on it.
print("\nFile existence check:")
existence_checks = [
    ("Test info file", test_info_file),
    ("Test events file", test_events_file),
    ("Data directory", data_dir),
]
for label, checked_path in existence_checks:
    print(f"{label} exists: {checked_path.exists()}")

## 3. Load Test Data Information

Load the test events information to understand which files were used in training and which event indices were selected for testing.

In [None]:
# Load the test-split metadata saved next to the trained model.
# The file is decoded explicitly as UTF-8 rather than with the platform's
# locale default, so the result does not depend on the machine's locale.
with open(test_info_file, 'r', encoding='utf-8') as f:
    test_info = json.load(f)

# Extract test event indices and files used
test_indices = test_info['test_indices']
files_used = test_info['files_used']
test_split_seed = test_info['test_split_seed']

print(f"Test split seed: {test_split_seed}")
print(f"Number of test events: {len(test_indices)}")
print(f"Number of files used in training: {len(files_used)}")

print(f"\nFirst 10 test event indices: {test_indices[:10]}")
print(f"First 5 files used in training: {files_used[:5]}")

# Create a set of training files for faster lookup.
# Each entry is reduced to its bare filename because the dataset scan in this
# notebook compares against bare filenames; an entry stored with a directory
# prefix would otherwise never match. Entries that are already bare filenames
# are left unchanged by this normalization.
training_files_set = {Path(entry).name for entry in files_used}

## 4. Scan Original Dataset for All NPY Files

Find all NPY files in the original dataset directory to get the complete list of available files.

In [None]:
# Find all NPY files in the data directory.
# glob returns matches in arbitrary order, so sort them to make the listing
# and the "first 5" sample below reproducible from run to run.
npy_pattern = str(data_dir / "*.npy")
all_npy_files = sorted(glob.glob(npy_pattern))

# Extract just the filenames (not full paths)
all_npy_filenames = [Path(f).name for f in all_npy_files]

print(f"Total NPY files found in dataset: {len(all_npy_filenames)}")
print(f"First 5 NPY files: {all_npy_filenames[:5]}")

# Create a set for faster lookup
all_files_set = set(all_npy_filenames)

# Verify that training files are subset of all files
training_files_in_dataset = training_files_set.intersection(all_files_set)
missing_training_files = training_files_set - all_files_set

print(f"\nTraining files found in dataset: {len(training_files_in_dataset)}")
print(f"Training files missing from dataset: {len(missing_training_files)}")

if missing_training_files:
    # Sets have no stable iteration order; sort so the reported sample is
    # the same on every run.
    print(f"Missing training files: {sorted(missing_training_files)[:5]}")

## 5. Identify Files Not Used in Training (Available for Testing)

Compare the complete dataset with training files to identify files that were not used in training and are available for testing/validation.

In [None]:
# Find files that were NOT used in training (available for testing):
# everything found in the dataset directory minus the training file set.
unused_files = all_files_set - training_files_set
unused_files_list = sorted(unused_files)

unused_count = len(unused_files_list)
training_count = len(training_files_set)
total_count = len(all_files_set)

print(f"Files NOT used in training (available for testing): {unused_count}")
print(f"Files used in training: {training_count}")
print(f"Total files in dataset: {total_count}")

# Verify the math. The sum only equals the dataset size when every training
# file is present in the dataset directory, so explain a failed check.
print("\nVerification:")
print(f"Used + Unused = {training_count} + {unused_count} = {training_count + unused_count}")
print(f"Total files = {total_count}")
print(f"Math checks out: {training_count + unused_count == total_count}")

training_not_in_dataset = len(training_files_set - all_files_set)
if training_not_in_dataset:
    print(f"  Note: {training_not_in_dataset} training file(s) were not found in the dataset directory")

# Show some examples of unused files
print("\nFirst 10 unused files:")
for position, filename in enumerate(unused_files_list[:10], 1):
    print(f"  {position}. {filename}")

# Number the tail by its true 1-based position in the sorted list. Deriving
# the start from the actual slice length keeps the numbering correct when
# fewer than 10 files are unused (a fixed "len - 9" offset would go <= 0).
last_files = unused_files_list[-10:]
first_position = unused_count - len(last_files) + 1
print("\nLast 10 unused files:")
for position, filename in enumerate(last_files, first_position):
    print(f"  {position}. {filename}")

## 6. Load Test Events Data for Detailed Analysis

Load the actual test events data to understand the structure and content of the test dataset.

In [None]:
# Deserialize the saved test events.
# NOTE: pickle.load can execute code embedded in the file, so this should only
# be pointed at a results directory whose contents are trusted.
with test_events_file.open('rb') as handle:
    test_events = pickle.load(handle)

has_shape = hasattr(test_events, 'shape')

print(f"Test events data type: {type(test_events)}")
print(f"Test events shape: {test_events.shape if has_shape else 'N/A'}")

# Extra statistics are only reported for NumPy arrays
if isinstance(test_events, np.ndarray):
    size_mb = test_events.nbytes / (1024*1024)
    print(f"Test events dtype: {test_events.dtype}")
    print(f"Test events size: {test_events.size}")
    print(f"Memory usage: {size_mb:.2f} MB")

    # List each axis extent for arrays with more than one dimension
    if test_events.ndim > 1:
        print("Shape breakdown:")
        for axis, extent in enumerate(test_events.shape):
            print(f"  Dimension {axis}: {extent}")

# Compare the number of recorded test indices with the leading dimension of
# the loaded object (only possible when it exposes a shape)
print("\nData relationship:")
print(f"Number of test indices: {len(test_indices)}")

if has_shape:
    leading_dim = test_events.shape[0]
    counts_match = len(test_indices) == leading_dim
else:
    leading_dim = 'N/A'
    counts_match = 'Unknown'

print(f"Test events first dimension: {leading_dim}")
print(f"Do they match? {counts_match}")

## 7. Save Results to Files

Save the list of unused NPY files and comprehensive analysis results for future reference.

In [None]:
# --- 1. Plain-text list of files that were not part of training -------------
unused_files_txt = Path.cwd() / "unused_npy_files.txt"
with unused_files_txt.open('w') as listing:
    # Commented header block followed by a blank line, then one name per line
    listing.write(
        "# NPY Files Not Used in Training - Available for Testing\n"
        f"# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
        f"# Total unused files: {len(unused_files_list)}\n"
        f"# Training files: {len(training_files_set)}\n"
        f"# Total dataset files: {len(all_files_set)}\n\n"
    )
    listing.writelines(f"{name}\n" for name in unused_files_list)

print(f"Unused files list saved to: {unused_files_txt}")

# --- 2. Plain-text list of the training files, for comparison ---------------
training_files_sorted = sorted(training_files_set)
training_files_txt = Path.cwd() / "training_npy_files.txt"
with training_files_txt.open('w') as listing:
    listing.write(
        "# NPY Files Used in Training\n"
        f"# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
        f"# Total training files: {len(training_files_set)}\n\n"
    )
    listing.writelines(f"{name}\n" for name in training_files_sorted)

print(f"Training files list saved to: {training_files_txt}")

# --- 3. Everything gathered so far, bundled into a single JSON document -----
analysis_results = {
    'analysis_date': datetime.now().isoformat(),
    'total_dataset_files': len(all_files_set),
    'training_files_count': len(training_files_set),
    'unused_files_count': len(unused_files_list),
    'test_events_count': len(test_indices),
    'test_split_seed': test_split_seed,
    'training_files': training_files_sorted,
    'unused_files': unused_files_list,
    'test_indices': test_indices,
}

results_json = Path.cwd() / "pi0_decay_analysis_results.json"
with results_json.open('w') as json_out:
    json.dump(analysis_results, json_out, indent=2)

print(f"Comprehensive analysis saved to: {results_json}")

## 8. Summary and Final Statistics

Display comprehensive statistics and summary of the analysis results.

In [None]:
def _percent_of(part, whole):
    """Return `part` as a percentage of `whole`; 0.0 when `whole` is zero.

    Guards the summary against a ZeroDivisionError when the dataset scan
    found no files.
    """
    return part / whole * 100 if whole else 0.0


# Counts reported in the summary
total_file_count = len(all_files_set)
training_file_count = len(training_files_set)
unused_file_count = len(unused_files_list)

banner = "=" * 60

# Create comprehensive summary
print(banner)
print("         PI0 DECAY CNN AUTOENCODER - TEST FILES ANALYSIS")
print(banner)

print("\nDATASET OVERVIEW:")
print(f"  Total NPY files in dataset: {total_file_count}")
print(f"  Files used for training: {training_file_count} ({_percent_of(training_file_count, total_file_count):.1f}%)")
print(f"  Files available for testing: {unused_file_count} ({_percent_of(unused_file_count, total_file_count):.1f}%)")

print("\nTRAINING DETAILS:")
print(f"  Test split seed: {test_split_seed}")
print(f"  Number of test events: {len(test_indices)}")
# Size of the pickled test events on disk, in MiB
print(f"  Test events file size: {test_events_file.stat().st_size / (1024*1024):.1f} MB")

print("\nFILE STATISTICS:")
print("  Unused files (first 20):")
for position, filename in enumerate(unused_files_list[:20], 1):
    print(f"    {position:2d}. {filename}")

# Indicate how many entries were left out of the preview above
if unused_file_count > 20:
    print(f"    ... and {unused_file_count - 20} more files")

print("\nOUTPUT FILES GENERATED:")
print(f"  1. {unused_files_txt.name} - List of {unused_file_count} unused NPY files")
print(f"  2. {training_files_txt.name} - List of {training_file_count} training NPY files")
print(f"  3. {results_json.name} - Comprehensive analysis in JSON format")

print("\nNEXT STEPS:")
print("  • Use unused NPY files for additional testing/validation")
print("  • Implement anomaly detection on test dataset")
print("  • Compare results with sparse graph autoencoder")
print("  • Analyze model performance on unseen data")

print(banner)
print("                        ANALYSIS COMPLETE")
print(banner)