In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
import torch
from torch_geometric.data import Data
import pickle
import math
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'torch_geometric'

In [2]:
def parse_bench_file(file_path):
    """
    Parse a BENCH netlist and convert it to a NetworkX directed graph.

    Every signal becomes a node and every gate operand contributes one edge
    ``operand -> gate output``. Lines are classified as follows:

    * blank lines and lines starting with ``#`` are skipped;
    * lines containing ``=`` are gate definitions (checked first, so a gate
      whose name merely starts with ``INPUT``/``OUTPUT`` is not mistaken for
      a declaration);
    * ``INPUT(...)`` / ``OUTPUT(...)`` lines are port declarations;
    * anything else is ignored.

    Args:
        file_path: Path to the .bench file

    Returns:
        NetworkX DiGraph whose nodes carry ``gate_type`` (upper-cased for
        gates) and the ``is_primary_input`` / ``is_primary_output`` flags.
        Operands that are referenced but never declared or defined exist
        only as attribute-less nodes created by ``add_edge``.

    Raises:
        ValueError: If a declaration or gate line has no ``(`` argument list.
    """
    G = nx.DiGraph()

    def _argument_text(expr, line_no):
        # Text between the first '(' and the next ')'.
        open_idx = expr.find('(')
        if open_idx == -1:
            raise ValueError(f"Malformed BENCH line {line_no}: {expr!r}")
        return expr[open_idx + 1:].split(')')[0]

    def _fill_default_flags(name):
        # Guarantee both flags exist without overwriting a flag that an
        # earlier INPUT/OUTPUT declaration already set to True.
        attrs = G.nodes[name]
        attrs.setdefault('is_primary_input', False)
        attrs.setdefault('is_primary_output', False)

    with open(file_path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()

            # Skip comments and empty lines
            if not line or line.startswith('#'):
                continue

            if '=' in line:
                # Gate definition: n42 = NOT(pi00) or po0 = AND(n68, n329)
                left, right = line.split('=', 1)
                output_signal = left.strip()
                gate_expr = right.strip()

                # Upper-case so 'and' and 'AND' are treated as the same gate.
                gate_type = gate_expr.split('(')[0].strip().upper()
                operands = [
                    name.strip()
                    for name in _argument_text(gate_expr, line_no).split(',')
                    if name.strip()  # constant gates such as VDD() have none
                ]

                # add_node on an existing node only updates the given
                # attributes, so an earlier OUTPUT flag is preserved.
                G.add_node(output_signal, gate_type=gate_type)
                _fill_default_flags(output_signal)

                # Add edges from inputs to this gate
                for operand in operands:
                    G.add_edge(operand, output_signal)

            elif line.startswith('INPUT'):
                # INPUT(pi00) -> pi00
                signal = _argument_text(line, line_no).strip()
                G.add_node(signal, gate_type='INPUT', is_primary_input=True)
                _fill_default_flags(signal)

            elif line.startswith('OUTPUT'):
                # OUTPUT(po0) -> po0
                signal = _argument_text(line, line_no).strip()
                G.add_node(signal, is_primary_output=True)
                # 'OUTPUT' is only a placeholder type: keep the real gate
                # type when the driver was defined before this declaration.
                G.nodes[signal].setdefault('gate_type', 'OUTPUT')
                _fill_default_flags(signal)

    return G

def get_nodes_from_influence_data(influence_df):
    """
    Return the node names listed in an influence table.

    The name column is the first of 'Input', ' Input', 'input', ' input'
    that exists in ``influence_df``. Its values are converted to ``str``.
    The column list and the first five names are printed for diagnostics.

    Returns:
        List of node-name strings, or an empty list (after printing an
        error) when none of the expected columns is present.
    """
    print(f"Analyzing influence DataFrame with columns: {influence_df.columns.tolist()}")

    # Accepted spellings, in priority order (with or without leading space)
    accepted_names = ('Input', ' Input', 'input', ' input')
    name_column = next(
        (name for name in accepted_names if name in influence_df.columns),
        None,
    )

    if name_column is None:
        print("ERROR: Could not find Input column")
        return []

    node_names = influence_df[name_column].astype(str).tolist()
    print(f"Found nodes in column '{name_column}': {node_names[:5]}...")  # Show first 5
    return node_names

def map_influence_nodes_to_circuit_nodes(influence_nodes, circuit_graph):
    """
    Resolve influence-file node names to names present in the circuit graph.

    A name of the form ``G<digits>`` is tried as ``pi<NN>`` (zero-padded to
    two digits), then ``pi<N>``, then unchanged. Any other name is tried
    unchanged. The first candidate found in ``circuit_graph.nodes()`` wins;
    names with no matching candidate are dropped.

    Returns:
        (valid_nodes, node_mapping): the resolved circuit names in input
        order, and a dict from each resolved influence name to its circuit
        name.
    """
    resolved = []
    lookup = {}

    for name in influence_nodes:
        if name.startswith('G') and name[1:].isdigit():
            index = int(name[1:])
            # G0 -> pi00 / pi0 / G0, in that order of preference
            candidates = (f'pi{index:02d}', f'pi{index}', name)
        else:
            candidates = (name,)

        hit = next(
            (option for option in candidates if option in circuit_graph.nodes()),
            None,
        )
        if hit is not None:
            resolved.append(hit)
            lookup[name] = hit

    print(f"Mapped {len(resolved)} nodes out of {len(influence_nodes)} influence nodes")

    return resolved, lookup

def create_hamming_weight_class(hamming_weight, input_count,
                                sparse_threshold=0.01, dense_threshold=0.06):
    """
    Create classification target based on Hamming weight ratio.

    The ratio is ``hamming_weight / 2**input_count``. It is compared with
    the thresholds in the log10 domain, so extreme values neither overflow
    nor silently fall back to the wrong class.

    Args:
        hamming_weight: Hamming weight of the slice (int or float).
        input_count: Number of inputs; the domain size is 2**input_count.
        sparse_threshold: Ratios below this are class 0. Default 0.01.
        dense_threshold: Ratios at or above this are class 2. Default 0.06.

    Returns:
        0 ("Ultra-Sparse"), 1 ("Sparse") or 2 ("Dense"). A zero, negative
        or NaN ``hamming_weight`` yields 0.

    Raises:
        ValueError: If the thresholds are not 0 < sparse <= dense.
    """
    if not 0 < sparse_threshold <= dense_threshold:
        raise ValueError("thresholds must satisfy 0 < sparse_threshold <= dense_threshold")

    # `not (x > 0)` is also True for NaN, which has no meaningful ratio.
    if not hamming_weight > 0:
        return 0

    try:
        log_ratio = math.log10(hamming_weight) - input_count * math.log10(2)
    except (ValueError, OverflowError):
        return 0  # Default to ultra-sparse for problematic cases

    if log_ratio < math.log10(sparse_threshold):
        return 0  # "Ultra-Sparse"
    if log_ratio < math.log10(dense_threshold):
        return 1  # "Sparse"
    return 2  # "Dense"
    
def create_enhanced_node_features(graph, valid_nodes, influence_df, node_mapping):
    """
    Build the per-node feature matrix for one slice.

    Each row has 13 values, in this order:
      * 9-way one-hot of the node's ``gate_type`` (all zeros if the type is
        missing or not in the known set);
      * in-degree and out-degree of the node in ``graph``;
      * influence value (0.0 when unavailable or unparseable);
      * 1 if the node is a value of ``node_mapping`` (a mapped influence
        node), else 0.

    Args:
        graph: Graph exposing ``nodes[name]``, ``in_degree(name)`` and
            ``out_degree(name)``.
        valid_nodes: Node names, in the row order of the result.
        influence_df: DataFrame with an 'Input' and an 'Influence' column
            (leading-space and lower-case spellings are accepted).
        node_mapping: Dict from influence-file name to circuit node name.

    Returns:
        ``np.float32`` array of shape ``(len(valid_nodes), 13)``; the shape
        is kept even when ``valid_nodes`` is empty.
    """
    gate_type_mapping = {
        'INPUT': 0, 'OUTPUT': 1, 'AND': 2, 'NAND': 3,
        'OR': 4, 'NOR': 5, 'NOT': 6, 'XOR': 7, 'XNOR': 8
    }
    feature_dim = len(gate_type_mapping) + 4

    # Column discovery is identical for every node, so do it once.
    input_col = next(
        (col for col in ['Input', ' Input', 'input', ' input']
         if col in influence_df.columns),
        None,
    )
    influence_col = next(
        (col for col in [' Influence', 'Influence', ' influence', 'influence']
         if col in influence_df.columns),
        None,
    )

    # Influence-file name -> raw influence cell; the first row wins.
    raw_influence = {}
    if input_col is not None and influence_col is not None:
        for name, value in zip(influence_df[input_col].astype(str),
                               influence_df[influence_col]):
            raw_influence.setdefault(name, value)

    # Circuit node -> influence-file name; the first mapping entry wins.
    influence_name_for = {}
    for inf_node, circ_node in node_mapping.items():
        influence_name_for.setdefault(circ_node, str(inf_node))

    node_features = []

    for circuit_node in valid_nodes:
        feature_vector = []

        # 1. Gate type one-hot
        gate_type = graph.nodes[circuit_node].get('gate_type', 'UNKNOWN')
        gate_type_onehot = [0] * len(gate_type_mapping)
        if gate_type in gate_type_mapping:
            gate_type_onehot[gate_type_mapping[gate_type]] = 1
        feature_vector.extend(gate_type_onehot)

        # 2. Basic structural features
        feature_vector.extend([graph.in_degree(circuit_node),
                               graph.out_degree(circuit_node)])

        # 3. Influence value (only mapped influence nodes can have one)
        is_mapped = circuit_node in influence_name_for
        influence_value = 0.0
        if is_mapped:
            influence_name = influence_name_for[circuit_node]
            if influence_name in raw_influence:
                try:
                    influence_value = float(raw_influence[influence_name])
                except (TypeError, ValueError):
                    influence_value = 0.0  # unparseable cell
        feature_vector.append(influence_value)

        # 4. Simple node type indicator
        feature_vector.append(1 if is_mapped else 0)

        node_features.append(feature_vector)

    # reshape keeps the column count for an empty node list.
    return np.array(node_features, dtype=np.float32).reshape(-1, feature_dim)


def prepare_slice_data_classification(circuit_graph, influence_file_path, hamming_weight, slice_label):
    """
    Create a PyTorch Geometric Data object for one slice.

    The slice's node set is the mapped influence nodes, all of their direct
    successors, and up to three successors of each of those. Node names are
    sorted before indexing, so row order in ``x`` and the indices in
    ``edge_index`` are the same on every run.

    Args:
        circuit_graph: Directed graph of the whole circuit.
        influence_file_path: CSV file with the slice's influence table.
        hamming_weight: Hamming weight of the slice.
        slice_label: Label stored on the result and used in log messages.

    Returns:
        ``Data`` with node features ``x``, a symmetric ``edge_index``, the
        class label ``y`` (0-dim long tensor) and bookkeeping attributes,
        or ``None`` when the file cannot be read or no influence node maps
        onto the circuit.
    """
    try:
        influence_df = pd.read_csv(influence_file_path)
        print(f"\nProcessing slice {slice_label}")
    except Exception as e:
        print(f"Error reading influence file {influence_file_path}: {e}")
        return None

    # Get influence nodes and map to circuit nodes
    influence_nodes = get_nodes_from_influence_data(influence_df)
    if not influence_nodes:
        return None

    primary_input_nodes, node_mapping = map_influence_nodes_to_circuit_nodes(influence_nodes, circuit_graph)

    if not primary_input_nodes:
        return None

    # Include extended context: primary inputs + connected gates
    extended_nodes = set(primary_input_nodes)

    for pi_node in primary_input_nodes:
        successors = list(circuit_graph.successors(pi_node))
        extended_nodes.update(successors)

        # Add second-hop neighbors (limited to avoid explosion)
        for succ in successors:
            second_hop = list(circuit_graph.successors(succ))
            extended_nodes.update(second_hop[:3])  # Limit to 3 per successor

    # Set iteration order for strings changes between interpreter runs
    # (hash randomisation); sorting makes the node indexing reproducible.
    valid_nodes = sorted(extended_nodes, key=str)

    # NOTE(review): this counts the mapped influence nodes. The saved run
    # reports ratios far above 1, which suggests it may be smaller than the
    # true input count; confirm against the ' Input count' column of
    # hamming_distance.csv.
    input_count = len(primary_input_nodes)

    print(f"Extended to {len(valid_nodes)} nodes (from {len(primary_input_nodes)} primary inputs)")

    # Create subgraph with extended node set
    slice_subgraph = circuit_graph.subgraph(valid_nodes).copy()
    print(f"Subgraph has {slice_subgraph.number_of_edges()} edges")

    # Create classification target
    class_label = create_hamming_weight_class(hamming_weight, input_count)

    # Create node mapping for consistent indexing
    node_to_idx = {node: idx for idx, node in enumerate(valid_nodes)}

    # Features are read from the full circuit graph, so degrees count
    # neighbours outside the slice as well.
    node_features = create_enhanced_node_features(circuit_graph, valid_nodes, influence_df, node_mapping)

    # Create edge index (undirected for GraphSAGE): each directed edge is
    # stored in both directions.
    edges = []
    for src, dst in slice_subgraph.edges():
        if src in node_to_idx and dst in node_to_idx:
            src_idx = node_to_idx[src]
            dst_idx = node_to_idx[dst]
            edges.append([src_idx, dst_idx])
            edges.append([dst_idx, src_idx])

    if edges:
        edge_index = np.array(edges).T
    else:
        # Keep the (2, 0) shape when the slice has no edges.
        edge_index = np.array([[], []], dtype=np.int64)

    # Create PyTorch Geometric Data object
    data = Data(
        x=torch.tensor(node_features, dtype=torch.float),
        edge_index=torch.tensor(edge_index, dtype=torch.long),
        y=torch.tensor(class_label, dtype=torch.long),  # Classification target
        num_nodes=len(valid_nodes),
        hamming_weight=hamming_weight,  # Keep original for analysis
        input_count=input_count,  # For ratio calculation
        slice_label=slice_label,
        circuit_name=""  # Will be set in process_circuit
    )

    return data

def _first_matching_column(columns, candidates):
    """Return the first column whose whitespace-stripped name equals a candidate, else None."""
    for candidate in candidates:
        for col in columns:
            if str(col).strip() == candidate:
                return col
    return None


def _find_influence_file(slice_label, influence_files):
    """
    Pick the influence file that belongs to ``slice_label``, else None.

    ``<label>.csv`` is preferred. Otherwise a file whose name contains the
    label is accepted, provided the occurrence is not part of a longer
    number: 'G3' must not match 'G39.csv', but '39' may match 'G39.csv'.
    """
    if not slice_label:
        return None

    exact_name = f'{slice_label}.csv'
    if exact_name in influence_files:
        return exact_name

    starts_with_digit = slice_label[0].isdigit()
    ends_with_digit = slice_label[-1].isdigit()

    for inf_file in influence_files:
        start = inf_file.find(slice_label)
        while start != -1:
            end = start + len(slice_label)
            digit_before = starts_with_digit and start > 0 and inf_file[start - 1].isdigit()
            digit_after = ends_with_digit and end < len(inf_file) and inf_file[end].isdigit()
            if not digit_before and not digit_after:
                return inf_file
            start = inf_file.find(slice_label, start + 1)

    return None


def process_circuit(circuit_dir):
    """
    Process all slices for a single circuit with classification targets.

    Expected layout of ``circuit_dir``:
      * one netlist ending in ``.bench`` or ``.txt`` (a name containing
        ``.bench`` is preferred over other ``.txt`` files);
      * ``hamming_distance.csv`` with a slice-label and a Hamming-weight
        column;
      * ``influence/`` with one CSV per slice.

    Every problem is reported with ``print``. A missing file, directory or
    column ends processing of the circuit; a bad row or slice is skipped.

    Args:
        circuit_dir: Directory of one circuit; its base name becomes
            ``circuit_name`` on each result.

    Returns:
        List of Data objects, one per successfully processed slice
        (possibly empty).
    """
    data_list = []
    circuit_name = os.path.basename(circuit_dir)

    # Find the bench file. Sort first because os.listdir order is
    # unspecified, then prefer real netlists over unrelated .txt files.
    bench_files = sorted(f for f in os.listdir(circuit_dir)
                         if f.endswith('.bench') or f.endswith('.txt'))

    if not bench_files:
        print(f"No bench file found in {circuit_dir}")
        return data_list

    def _bench_rank(file_name):
        if file_name.endswith('.bench'):
            return 0
        if '.bench' in file_name:
            return 1
        return 2

    bench_file = min(bench_files, key=_bench_rank)
    bench_path = os.path.join(circuit_dir, bench_file)
    print(f"Using bench file: {bench_file}")

    # Parse the complete circuit structure
    try:
        circuit_graph = parse_bench_file(bench_path)
        print(f"Parsed circuit with {circuit_graph.number_of_nodes()} nodes and {circuit_graph.number_of_edges()} edges")
    except Exception as e:
        print(f"Error parsing bench file {bench_path}: {e}")
        return data_list

    # Load hamming distance data
    hamming_path = os.path.join(circuit_dir, 'hamming_distance.csv')
    if not os.path.exists(hamming_path):
        print(f"Hamming distance file not found: {hamming_path}")
        return data_list

    try:
        hamming_df = pd.read_csv(hamming_path)
        print(f"Loaded hamming data with columns: {hamming_df.columns.tolist()}")
    except Exception as e:
        print(f"Error reading hamming distance file: {e}")
        return data_list

    # Find required columns (headers may carry padding such as ' Runtime')
    slice_label_columns = ['Slice label', 'slice_label', 'Slice_label', 'slice', 'Slice', 'label', 'Label']
    hamming_weight_columns = ['Hamming weight', 'hamming_weight', 'Hamming_weight', 'weight', 'Weight']

    slice_col = _first_matching_column(hamming_df.columns, slice_label_columns)
    hamming_col = _first_matching_column(hamming_df.columns, hamming_weight_columns)

    if slice_col is None or hamming_col is None:
        print(f"Required columns not found. Available columns: {hamming_df.columns.tolist()}")
        return data_list

    print(f"Using slice column: '{slice_col}' and hamming column: '{hamming_col}'")

    # Check influence directory
    influence_dir = os.path.join(circuit_dir, 'influence')
    if not os.path.exists(influence_dir):
        print(f"Influence directory not found: {influence_dir}")
        return data_list

    # Get all influence files
    influence_files = sorted(f for f in os.listdir(influence_dir) if f.endswith('.csv'))
    print(f"Found {len(influence_files)} influence files: {influence_files}")

    # Process each slice
    for _, row in hamming_df.iterrows():
        slice_label = str(row[slice_col]).strip()

        influence_file = _find_influence_file(slice_label, influence_files)
        if influence_file is None:
            print(f"No matching influence file found for slice '{slice_label}'")
            continue

        influence_file_path = os.path.join(influence_dir, influence_file)

        # Create data object for this slice. The float conversion is inside
        # the try so one unparseable weight skips only that row.
        try:
            hamming_weight = float(row[hamming_col])
            data = prepare_slice_data_classification(circuit_graph, influence_file_path, hamming_weight, slice_label)
            if data is not None:
                data.circuit_name = circuit_name
                data_list.append(data)
                print(f"✓ Processed slice '{slice_label}' with {data.num_nodes} nodes, class: {data.y.item()}")
            else:
                print(f"✗ Failed to process slice '{slice_label}'")
        except Exception as e:
            print(f"✗ Error processing slice '{slice_label}': {e}")

    return data_list

def load_all_circuit_data(circuits_root_dir):
    """
    Load data from all circuits in the root directory.

    Each immediate sub-directory of ``circuits_root_dir`` is handed to
    ``process_circuit``. Sub-directories are visited in sorted name order,
    so the order of the returned samples does not depend on the filesystem.

    Args:
        circuits_root_dir: Directory containing one folder per circuit.

    Returns:
        Flat list with the Data objects of every circuit. Empty (after a
        printed message) when the root directory does not exist.
    """
    all_data = []

    # Report and return empty, like the per-circuit loader does, instead of
    # letting os.listdir raise on a mistyped path.
    if not os.path.isdir(circuits_root_dir):
        print(f"Circuits root directory not found: {circuits_root_dir}")
        return all_data

    circuit_dirs = sorted(
        d for d in os.listdir(circuits_root_dir)
        if os.path.isdir(os.path.join(circuits_root_dir, d))
    )

    print(f"Found {len(circuit_dirs)} circuit directories: {circuit_dirs}")

    for circuit_name in circuit_dirs:
        circuit_path = os.path.join(circuits_root_dir, circuit_name)
        print(f"\n{'='*50}")
        print(f"Processing circuit: {circuit_name}")
        print(f"{'='*50}")

        circuit_data = process_circuit(circuit_path)
        all_data.extend(circuit_data)
        print(f"Added {len(circuit_data)} slices from {circuit_name}")

    print(f"\n{'='*50}")
    print(f"SUMMARY: Total data objects created: {len(all_data)}")

    return all_data

def analyze_classification_distribution(all_data):
    """
    Analyze the distribution of classification targets.

    Prints the sample count and percentage per class, followed by the
    minimum, maximum and mean of ``hamming_weight / 2**input_count`` over
    all samples. A sample whose ratio cannot be computed counts as 0.0.

    Args:
        all_data: Iterable of samples exposing ``y.item()``,
            ``hamming_weight`` and ``input_count``.

    Returns:
        Dict mapping class label to sample count; empty for empty input.
    """
    class_counts = {}
    ratios = []

    for data in all_data:
        class_label = data.y.item()
        class_counts[class_label] = class_counts.get(class_label, 0) + 1

        # Calculate the ratio through logs so large weights stay finite.
        try:
            log_hamming = math.log10(data.hamming_weight) if data.hamming_weight > 0 else -float('inf')
            log_total_space = data.input_count * math.log10(2)
            log_ratio = log_hamming - log_total_space
            ratio = 10 ** log_ratio if log_ratio > -float('inf') else 0.0
            ratios.append(ratio)
        except (ValueError, OverflowError, TypeError, AttributeError):
            ratios.append(0.0)

    print("\nClassification Distribution:")

    # min()/max() raise on an empty sequence; there is nothing to report.
    if not ratios:
        print("No samples to analyze.")
        return class_counts

    total = len(ratios)
    for class_id, count in sorted(class_counts.items()):
        percentage = count / total * 100
        print(f"Class {class_id}: {count} samples ({percentage:.1f}%)")

    print("\nRatio statistics:")
    print(f"Min ratio: {min(ratios):.6f}")
    print(f"Max ratio: {max(ratios):.6f}")
    print(f"Mean ratio: {np.mean(ratios):.6f}")

    return class_counts


In [18]:
# Root folder holding one sub-directory per circuit. Set the CIRCUITS_DIR
# environment variable to run on another machine; the default is the
# original author's local path.
circuits_dir = os.environ.get(
    "CIRCUITS_DIR",
    "c:\\Users\\sidharth\\Documents\\Projects\\boolean-ml\\data\\circuits",
)
all_data = load_all_circuit_data(circuits_dir)

Found 10 circuit directories: ['apex2', 'apex4', 'des', 'ex1010', 'ex5', 'i7', 'i8', 'i9', 'k2', 'seq']

Processing circuit: apex2
Using bench file: apex2.bench.txt
Parsed circuit with 649 nodes and 955 edges
Loaded hamming data with columns: ['Slice label', 'Hamming weight', ' Input count', ' HW to Domain size ratio', ' Runtime']
Using slice column: 'Slice label' and hamming column: 'Hamming weight'
Found 3 influence files: ['G39.csv', 'G40.csv', 'G41.csv']

Processing slice G39
Analyzing influence DataFrame with columns: ['Input', 'Influence', ' Runtime']
Found nodes in column 'Input': ['G0', 'G1', 'G2', 'G3', 'G4']...
Mapped 36 nodes out of 36 influence nodes
Extended to 273 nodes (from 36 primary inputs)
Subgraph has 325 edges
✓ Processed slice 'G39' with 273 nodes, class: 1

Processing slice G40
Analyzing influence DataFrame with columns: ['Input', ' Influence', ' Runtime']
Found nodes in column 'Input': ['G0', 'G1', 'G2', 'G3', 'G4']...
Mapped 36 nodes out of 36 influence nodes
E

In [19]:

# Persist the dataset and a small metadata summary, or report that nothing
# was loaded.
if not len(all_data) > 0:
    print("No data loaded. Please check your file paths and structure.")
else:
    # Per-class counts (also prints the distribution and ratio statistics)
    class_counts = analyze_classification_distribution(all_data)

    # Full list of Data objects
    with open('processed_circuit_data_classification.pkl', 'wb') as f:
        pickle.dump(all_data, f)

    # Summary describing the saved dataset
    metadata = {
        'total_samples': len(all_data),
        'num_classes': len(class_counts),
        'class_distribution': class_counts,
        'circuits': list(set(data.circuit_name for data in all_data)),
        'feature_dimensions': all_data[0].x.shape[1],
        'sample_info': {
            'num_nodes_range': (
                min(data.num_nodes for data in all_data),
                max(data.num_nodes for data in all_data),
            ),
            'input_count_range': (
                min(data.input_count for data in all_data),
                max(data.input_count for data in all_data),
            ),
        },
    }

    with open('classification_metadata.pkl', 'wb') as f:
        pickle.dump(metadata, f)

    print(f"\n✓ Saved {len(all_data)} processed data objects to 'processed_circuit_data_classification.pkl'")
    print("✓ Saved metadata to 'classification_metadata.pkl'")
    print(f"✓ Feature dimensions: {metadata['feature_dimensions']}")
    print(f"✓ Number of classes: {metadata['num_classes']}")
    print("Data preprocessing for classification complete!")


Classification Distribution:
Class 0: 46 samples (7.6%)
Class 1: 30 samples (5.0%)
Class 2: 526 samples (87.4%)

Ratio statistics:
Min ratio: 0.000458
Max ratio: 31904.000000
Mean ratio: 1155.878816

✓ Saved 602 processed data objects to 'processed_circuit_data_classification.pkl'
✓ Saved metadata to 'classification_metadata.pkl'
✓ Feature dimensions: 13
✓ Number of classes: 3
Data preprocessing for classification complete!
