# PE File Preprocessing Validation

---
---

# Preprocessing Pipeline Validation Functions
---

In [None]:
def validate_angular_embedding_output(angular_embedding_data, labels):
    """
    Validate the output of the BODMAS pipeline.

    Only the first 10 samples are inspected. For each of them, every target
    section must be present and carry 'X1' and 'X2' sequences of length 15.
    For sections not flagged as missing, every value other than the -1
    placeholder must lie in [0, π/2].

    Args:
        angular_embedding_data: Sliceable sequence of per-sample mappings,
            keyed by section name, each holding 'X1', 'X2' and optionally
            a 'missing' flag (treated as False when absent).
        labels: Labels accompanying the samples. Not inspected here; kept
            so existing callers keep working.

    Returns:
        dict with keys 'total_samples', 'sections_checked',
        'validation_passed' and 'issues' (list of human-readable strings).
        'validation_passed' is True only when no issue was recorded.
    """
    print("=== BODMAS Pipeline Validation ===\n")

    max_samples_checked = 10
    expected_length = 15
    upper_bound = np.pi / 2

    validation_results = {
        'total_samples': len(angular_embedding_data),
        'sections_checked': [".text", ".data", ".rdata", ".rsrc", ".reloc"],
        'validation_passed': True,
        'issues': []
    }
    issues = validation_results['issues']

    # Check first 10 samples for validation
    for i, pe_data in enumerate(angular_embedding_data[:max_samples_checked]):
        for section in validation_results['sections_checked']:
            if section not in pe_data:
                issues.append(f"Sample {i}: Missing {section} key")
                continue

            section_data = pe_data[section]

            # Check X1 and X2 arrays
            if 'X1' not in section_data or 'X2' not in section_data:
                issues.append(f"Sample {i}, {section}: Missing X1 or X2")
                continue

            X1, X2 = section_data['X1'], section_data['X2']

            # Check array lengths
            if len(X1) != expected_length or len(X2) != expected_length:
                issues.append(
                    f"Sample {i}, {section}: X1 length {len(X1)}, X2 length {len(X2)} (expected 15 each)"
                )

            # Check angular range for non-missing sections. A section dict
            # without a 'missing' flag is treated as present rather than
            # raising KeyError.
            if section_data.get('missing', False):
                continue
            for name, values in (('X1', X1), ('X2', X2)):
                # -1 marks a placeholder entry and is exempt from the range check
                if not all(0 <= x <= upper_bound for x in values if x != -1):
                    issues.append(
                        f"Sample {i}, {section}: {name} values outside [0, π/2] range"
                    )

    # Any recorded issue (including out-of-range values) fails validation.
    validation_results['validation_passed'] = not issues

    # Print validation results
    print(f"Total samples validated: {validation_results['total_samples']}")
    print(f"Validation passed: {validation_results['validation_passed']}")

    if issues:
        print(f"\nIssues found ({len(issues)}):")
        for issue in issues:
            print(f"  • {issue}")

    return validation_results

# Test validation function
# validation_results = validate_angular_embedding_output(pipeline_results['angular_embedding_data'], pipeline_results['valid_labels'])

# Complete Example Usage
---

In [None]:
def demo_bodmas_pipeline():
    """
    Demonstrate the complete BODMAS preprocessing pipeline with example
    """
    print("=== BODMAS Complete Pipeline Demo ===\n")
    
    # Example PE file paths and labels (replace with your actual data)
    example_pe_files = [
        "sample_malware_1.exe",
        "sample_benign_1.exe", 
        "sample_malware_2.exe",
        "sample_benign_2.exe"
    ]
    example_labels = [1, 0, 1, 0]  # 1 = malware, 0 = benign
    
    print("Note: Replace example_pe_files with actual PE file paths")
    print("Expected workflow:")
    print("1. Stage-1: Extract sections and convert to 8×8 images")
    print("2. Stage-2: Train PCA30 models on section images")
    print("3. Stage-2: Transform to Angular Hybrid embedding")
    print("4. Ready for quantum circuit encoding\n")
    
    # Example of what the pipeline would do:
    print("Pipeline steps:")
    print("✓ parse_pe_file: Parse PE files with LIEF")
    print("✓ extract_target_sections: Extract 5 target sections")
    print("✓ process_sections_to_images: Convert to 8×8 images")
    print("✓ train_pca30_models: Train PCA30 for each section")
    print("✓ process_single_pe_to_angular_embedding: Transform to angular features")
    print("✓ Angular_Hybrid_4: Ready for quantum circuit encoding")
    
    # Show expected output structure
    print("\nExpected output structure for each PE file:")
    example_output = {
        '.text': {
            'X1': "15 angular features ∈ [0, π/2] for qubits [0,1,2,3]",
            'X2': "15 angular features ∈ [0, π/2] for qubits [4,5,6,7]",
            'encoding_type': 'Angular-Hybrid4',
            'missing': False
        },
        '.data': {
            'X1': "15 features of -1.0 (missing section)",
            'X2': "15 features of -1.0 (missing section)",
            'encoding_type': 'missing_section',
            'missing': True
        }
        # Similar structure for .rdata, .rsrc, .reloc
    }
    
    print(json.dumps(example_output, indent=2))
    
    print("\n=== To run the actual pipeline ===")
    print("1. Replace example_pe_files with actual file paths")
    print("2. Run: pipeline_results = complete_bodmas_pipeline(pe_files, labels)")
    print("3. Validate: validate_angular_embedding_output(pipeline_results['angular_embedding_data'], pipeline_results['valid_labels'])")
    print("4. Visualize: visualize_pca_variance(pipeline_results['pca_explained_variance'])")
    print("5. Save: save_angular_embedding_dataset(pipeline_results['angular_embedding_data'], pipeline_results['valid_labels'])")

# Run the demo: prints the pipeline walkthrough and the example output
# structure to stdout (the example file paths are placeholders only).
demo_bodmas_pipeline()