## PE file Preprocessing Pipeline

---
---

## Step-1: PE file Parsing using LIEF
---


In [None]:
import lief

# Parse the PE file using LIEF
binary = lief.PE.parse("malware_sample.exe")

# Stop here when parsing failed: the statements below read from `binary`.
if not binary:
    print("Failed to parse PE file")
    # Raise SystemExit directly instead of calling the bare `exit` helper:
    # `exit` is injected by the `site` module for interactive shells and is
    # not meant for use in programs. The exit status stays 1.
    raise SystemExit(1)

# Display basic PE information
print(f"Number of sections: {len(binary.sections)}")
print(f"Entry point: 0x{binary.optional_header.addressof_entrypoint:x}")


## Step-2: Target Section Identification
---


In [None]:
# Define the five target sections from the paper
target_sections = [".text", ".data", ".rdata", ".rsrc", ".reloc"]

# Maps each target section name to its content, or to None when the
# binary has no section with that name.
section_data = {}

for target_section in target_sections:
    # First section whose name (null bytes stripped from both ends) equals
    # the target; the generator stops scanning at the first hit.
    matched = next(
        (
            candidate
            for candidate in binary.sections
            if candidate.name.strip('\x00') == target_section
        ),
        None,
    )

    # Missing section: record None (the message notes the -1 score from the paper)
    if matched is None:
        section_data[target_section] = None
        print(f"Section {target_section} not found - will receive -1 score")
        continue

    # Store the section content and report its length
    content = matched.content
    section_data[target_section] = content
    print(f"Found {target_section}: {len(content)} bytes")


## Step-3: Conversion of Section bytes to 8x8 Grayscale Image
---


In [None]:
import numpy as np
from PIL import Image

def bytes_to_8x8_grayscale_image(byte_content, section_name):
    """
    Build an 8x8 grayscale image from the leading bytes of a section.

    The first 64 values of ``byte_content`` are used as pixels, filled
    row by row; shorter input is padded with zeros up to 64 values.

    Args:
        byte_content: Section content as a sliceable sequence of byte
            values, or None when the section is missing.
        section_name: Name of the section; not used by this function.

    Returns:
        None when ``byte_content`` is None; otherwise a tuple
        ``(image_array, pil_image)`` holding the 8x8 ``uint8`` NumPy array
        and a PIL image in mode 'L' created from that array.
    """
    # A missing section has no image
    if byte_content is None:
        return None

    side = 8
    pixel_count = side * side  # 64 pixels in total

    # Keep at most the first 64 values, then zero-pad whatever is missing
    pixels = list(byte_content[:pixel_count])
    pixels.extend([0] * (pixel_count - len(pixels)))

    grid = np.array(pixels, dtype=np.uint8).reshape(side, side)

    # 'L' is PIL's 8-bit grayscale mode
    return grid, Image.fromarray(grid, mode='L')




## Step-4: Processing each section's grayscale image
---


In [None]:
# Build one image record per section; sections without content map to None
section_images = {}

for section_name, content in section_data.items():
    # No content for this section: store None and move on
    if content is None:
        section_images[section_name] = None
        continue

    img_array, pil_img = bytes_to_8x8_grayscale_image(content, section_name)
    record = {
        'array': img_array,
        'pil': pil_img,
        'size': len(content)
    }
    section_images[section_name] = record

    # Save the image for visualization; dots are removed from the section
    # name to form the file name
    output_name = f"{section_name.replace('.', '')}_section.png"
    pil_img.save(output_name)
    print(f"Saved {section_name} as 8x8 grayscale image")
