# PE File Preprocessing Validation

---
---

# Preprocessing Pipeline Validation Functions
---

In [None]:
def validate_angular_embedding_output(angular_embedding_data, labels):
    """
    Validate the structure and value ranges of angular-embedding output.

    Only the first 10 samples are inspected. For each of them, every expected
    PE section key must be present and must contain 'X1' and 'X2' sequences of
    length 15. For sections not flagged as missing, every X1/X2 value other
    than the -1 sentinel must lie in [0, pi/2].

    Args:
        angular_embedding_data: Sliceable sequence of per-sample mappings from
            section name to a mapping holding 'X1', 'X2' and (optionally)
            'missing'. A section without a 'missing' entry is treated as
            present, so its values are range-checked.
        labels: Accepted for call compatibility; not used by the checks.

    Returns:
        dict with keys 'total_samples' (length of the input),
        'samples_checked' (number of samples actually inspected),
        'sections_checked', 'validation_passed' (False whenever any issue was
        recorded) and 'issues' (list of human-readable messages).
    """
    print("=== BODMAS Pipeline Validation ===\n")

    samples_to_check = angular_embedding_data[:10]
    expected_length = 15
    max_angle = np.pi / 2  # hoisted: loop-invariant upper bound of the range

    validation_results = {
        'total_samples': len(angular_embedding_data),
        'samples_checked': len(samples_to_check),
        'sections_checked': [".text", ".data", ".rdata", ".rsrc", ".reloc"],
        'validation_passed': True,
        'issues': []
    }

    def record_issue(message):
        # Every recorded issue fails validation, so the flag can never
        # disagree with the contents of the issue list.
        validation_results['issues'].append(message)
        validation_results['validation_passed'] = False

    for i, pe_data in enumerate(samples_to_check):
        for section in validation_results['sections_checked']:
            if section not in pe_data:
                record_issue(f"Sample {i}: Missing {section} key")
                continue

            section_data = pe_data[section]

            if 'X1' not in section_data or 'X2' not in section_data:
                record_issue(f"Sample {i}, {section}: Missing X1 or X2")
                continue

            X1, X2 = section_data['X1'], section_data['X2']

            if len(X1) != expected_length or len(X2) != expected_length:
                record_issue(
                    f"Sample {i}, {section}: X1 length {len(X1)}, X2 length {len(X2)} (expected 15 each)"
                )

            # Sections flagged as missing are skipped; an absent flag is
            # treated as "not missing" instead of raising KeyError.
            if section_data.get('missing', False):
                continue

            for name, values in (('X1', X1), ('X2', X2)):
                # -1 is skipped as a sentinel and is not range-checked.
                if not all(0 <= x <= max_angle for x in values if x != -1):
                    record_issue(
                        f"Sample {i}, {section}: {name} values outside [0, π/2] range"
                    )

    # Print validation results
    print(f"Total samples validated: {validation_results['total_samples']}")
    print(f"Samples inspected: {validation_results['samples_checked']}")
    print(f"Validation passed: {validation_results['validation_passed']}")

    if validation_results['issues']:
        print(f"\nIssues found ({len(validation_results['issues'])}):")
        for issue in validation_results['issues']:
            print(f"  • {issue}")

    return validation_results

# Test validation function
# validation_results = validate_angular_embedding_output(pipeline_results['angular_embedding_data'], pipeline_results['valid_labels'])