## PE File Preprocessing Pipeline

---
---

STAGE-1
---

---

In [None]:
# Install required packages
!pip install lief pillow numpy pandas scikit-learn matplotlib pennylane

# Import necessary libraries
import lief
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import json
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

print("All dependencies installed and imported successfully!")

## Step-1: PE file Parsing using LIEF
---


In [None]:
def parse_pe_file(pe_file_path):
    """
    Load a PE file through LIEF and print a short summary of it.

    The path is echoed first. On success the section count and the entry
    point (formatted as hex) are printed and the parsed LIEF binary object
    is returned.

    Returns None when LIEF hands back a falsy result, or when any exception
    is raised while parsing or summarising; in the latter case the error is
    printed rather than propagated.
    """
    print(f"Parsing PE file: {pe_file_path}")

    try:
        parsed = lief.PE.parse(pe_file_path)

        if parsed:
            print("Successfully parsed PE file")

            section_count = len(parsed.sections)
            print(f"Number of sections: {section_count}")

            entry_point = parsed.optional_header.addressof_entrypoint
            print(f"Entry point: 0x{entry_point:x}")

            return parsed

        # Falsy parse result: report it and signal failure to the caller.
        print("Failed to parse PE file")
        return None

    except Exception as err:
        # Best-effort helper: any failure is reported and mapped to None.
        print(f"LIEF parsing error: {err}")
        return None

# Test the function
# binary = parse_pe_file("malware_sample.exe")

## Step-2: Target Section Identification
---


In [None]:
def extract_target_sections(binary):
    """
    Collect the content of the five target sections of a parsed PE binary.

    `binary` must expose an iterable `sections` whose items have a `name`
    and a `content` attribute. Section names are compared after stripping
    NUL characters from both ends, and the first section matching a target
    name is the one used.

    Returns a dict keyed by ".text", ".data", ".rdata", ".rsrc" and ".reloc"
    (in that order). Each value is the matching section's content, or None
    when the binary has no section with that name. A status line is printed
    for every target.
    """
    wanted_names = (".text", ".data", ".rdata", ".rsrc", ".reloc")
    extracted = {}

    for wanted in wanted_names:
        # First section whose cleaned name equals the target, if any.
        match = next(
            (sec for sec in binary.sections if sec.name.strip('\x00') == wanted),
            None,
        )

        if match is None:
            # Missing sections are recorded as None (scored -1 downstream).
            extracted[wanted] = None
            print(f"Section {wanted} not found - will receive -1 score")
            continue

        payload = match.content
        extracted[wanted] = payload
        print(f"Found {wanted}: {len(payload)} bytes")

    return extracted

# Test the function
# section_data = extract_target_sections(binary)

## Step-3: Conversion of Section bytes to 8x8 Grayscale Image
---


In [None]:
def bytes_to_8x8_grayscale_image(byte_content, section_name):
    """
    Turn the leading bytes of a section into an 8x8 grayscale image.

    Only the first 64 byte values are used; shorter content is padded with
    zeros up to 64 values. `section_name` is accepted but not used here.

    Returns None when `byte_content` is None (missing section). Otherwise
    returns a tuple `(image_array, pil_image)`: an 8x8 uint8 numpy array and
    the PIL image built from it in mode 'L'.
    """
    # Missing section: nothing to render.
    if byte_content is None:
        return None

    side = 8
    pixel_count = side * side  # 64 values fill the 8x8 grid

    shortfall = pixel_count - len(byte_content)
    if shortfall > 0:
        # Too few bytes: keep them all and zero-fill the remainder.
        values = list(byte_content) + [0] * shortfall
    else:
        # Enough bytes: keep only the leading 64.
        values = list(byte_content[:pixel_count])

    grid = np.array(values, dtype=np.uint8).reshape(side, side)

    # 'L' selects 8-bit grayscale for the PIL image.
    picture = Image.fromarray(grid, mode='L')

    return grid, picture

# Test the function
# img_array, pil_img = bytes_to_8x8_grayscale_image(section_data['.text'], '.text')

## Step-4: Processing Each Section's Grayscale Image
---


In [None]:
# Build one image record per target section.
# Expects `section_data` (section name -> content or None) to already exist,
# e.g. from extract_target_sections(binary).
section_images = {}

for section_name, content in section_data.items():
    # Missing sections carry no image; keep the key with a None marker.
    if content is None:
        section_images[section_name] = None
        continue

    img_array, pil_img = bytes_to_8x8_grayscale_image(content, section_name)
    section_images[section_name] = {
        'array': img_array,
        'pil': pil_img,
        'size': len(content)
    }

    # Write a PNG named after the section with the dots removed.
    pil_img.save(f"{section_name.replace('.', '')}_section.png")
    print(f"Saved {section_name} as 8x8 grayscale image")


## Step-5: Understanding Each Section's Significance
---


In [None]:
def explain_section_content(section_name, content_size):
    """
    Describe what a PE section typically holds, with its size appended.

    Returns a string of the form "<description> (<content_size> bytes)".
    Names other than the five known sections get the description
    "Unknown section".
    """
    descriptions = {
        ".text": "Executable code and CPU instructions",
        ".data": "Initialized global and static variables",
        ".rdata": "Read-only data like string literals and constants",
        ".rsrc": "Resources like icons, menus, strings - frequently exploited by malware",
        ".reloc": "Relocation information for loading at different memory addresses",
    }

    # Every message ends with the same size suffix, so add it once here.
    summary = descriptions.get(section_name, "Unknown section")
    return f"{summary} ({content_size} bytes)"

# Print a one-line summary for every target section.
# Expects `section_images` to map each section name to its record or None.
for section_name, section_info in section_images.items():
    # A None record marks a section that was absent from the binary.
    if section_info is None:
        print(f"{section_name}: Missing section - will receive -1 score")
        continue

    print(f"{section_name}: {explain_section_content(section_name, section_info['size'])}")
