#### A complete Python script that:

- Iterates over all Verilog files in verilog_benchmark_circuits/.
- For each file, parses it (string manipulation; no heavy synthesis tools required).
- Injects one Trojan (trigger + payload) chosen from a set of templates:
    - AND-trigger + XOR payload (classic Trust-Hub style).
    - Counter trigger + MUX payload.
    - FSM trigger + OR payload (optional advanced).

- Saves the trojanized netlists into:
    - Trojanized_ISCAS_EPFL_andxor/
    - Trojanized_ISCAS_EPFL_countermux/
    - Trojanized_ISCAS_EPFL_fsmor/
    
How this works: 
- Reads every .v file in verilog_benchmark_circuits/.
- Picks a victim net (heuristic: random LHS of an assign).
- Picks 3 random signals as trigger inputs.
- Injects Trojan code before endmodule.
- Saves into separate folders per trigger type:
    - Trojanized_ISCAS_EPFL_andxor/
    - Trojanized_ISCAS_EPFL_countermux/
    - Trojanized_ISCAS_EPFL_fsmor/

In [1]:
import os
import re
import random
from pathlib import Path

# Input and output dirs
SRC_DIR = Path("verilog_benchmark_circuits")
OUT_DIR = Path("Trojanized_ISCAS_EPFL")
OUT_DIR.mkdir(exist_ok=True)

# Trojan templates (Trust-Hub style simplified)
TEMPLATES = {
    "andxor": """
// ---------------- Trojan: AND-trigger + XOR payload ----------------
wire trg_and_{tid} = {trigger_expr};

// Payload: XOR victim with trigger
wire {victim}_troj_{tid} = {victim} ^ trg_and_{tid};
assign {victim} = {victim}_troj_{tid};
""",

    "countermux": """
// ---------------- Trojan: Counter-trigger + MUX payload ----------------
reg [7:0] troj_cnt_{tid};
always @(posedge clk or negedge rst_n) begin
    if (!rst_n) troj_cnt_{tid} <= 0;
    else        troj_cnt_{tid} <= troj_cnt_{tid} + 1'b1;
end

wire troj_trig_{tid} = (troj_cnt_{tid} == 8'hA7);

// Payload: MUX override when triggered
wire {victim}_troj_{tid} = troj_trig_{tid} ? 1'b0 : {victim};
assign {victim} = {victim}_troj_{tid};
""",

    "fsmor": """
// ---------------- Trojan: FSM-trigger + OR payload ----------------
reg [1:0] troj_state_{tid};
always @(posedge clk or negedge rst_n) begin
    if (!rst_n) troj_state_{tid} <= 2'b00;
    else case(troj_state_{tid})
        2'b00: troj_state_{tid} <= 2'b01;
        2'b01: troj_state_{tid} <= 2'b10;
        2'b10: troj_state_{tid} <= 2'b11;
        2'b11: troj_state_{tid} <= 2'b00;
    endcase
end

wire troj_trig_{tid} = (troj_state_{tid} == 2'b11);

// Payload: OR injection
wire {victim}_troj_{tid} = {victim} | troj_trig_{tid};
assign {victim} = {victim}_troj_{tid};
"""
}

# Utility: pick a victim net (heuristic = first assign)
def pick_victim_net(verilog_src):
    assigns = re.findall(r"assign\s+(\w+)\s*=", verilog_src)
    if not assigns:
        return "out"  # fallback
    return random.choice(assigns)

# Utility: pick random trigger signals
def pick_trigger_nets(verilog_src, k=3):
    wires = re.findall(r"wire\s+(\w+)", verilog_src)
    if len(wires) < k:
        return ["dummy"] * k
    return random.sample(wires, k)

# Trojan insertion
def inject_trojan(verilog_src, template_key, tid=0):
    victim = pick_victim_net(verilog_src)
    triggers = pick_trigger_nets(verilog_src, k=3)
    trigger_expr = " & ".join(triggers)

    trojan_code = TEMPLATES[template_key].format(
        tid=tid, victim=victim, trigger_expr=trigger_expr
    )

    # Insert Trojan before endmodule
    trojanized = verilog_src.replace("endmodule", trojan_code + "\nendmodule")
    return trojanized

# Main
def main():
    for vfile in SRC_DIR.glob("*.v"):
        with open(vfile, "r") as f:
            src = f.read()

        for tkey in TEMPLATES.keys():
            outdir = OUT_DIR.parent / f"Trojanized_ISCAS_EPFL_{tkey}"
            outdir.mkdir(exist_ok=True)

            troj_src = inject_trojan(src, tkey, tid=0)
            outpath = outdir / vfile.name.replace(".v", f"__trojan_{tkey}.v")

            with open(outpath, "w") as fout:
                fout.write(troj_src)

            print(f"[+] Trojan {tkey} injected in {vfile.name} -> {outpath}")

if __name__ == "__main__":
    main()


[+] Trojan andxor injected in Priority.v -> Trojanized_ISCAS_EPFL_andxor/Priority__trojan_andxor.v
[+] Trojan countermux injected in Priority.v -> Trojanized_ISCAS_EPFL_countermux/Priority__trojan_countermux.v
[+] Trojan fsmor injected in Priority.v -> Trojanized_ISCAS_EPFL_fsmor/Priority__trojan_fsmor.v
[+] Trojan andxor injected in adder.v -> Trojanized_ISCAS_EPFL_andxor/adder__trojan_andxor.v
[+] Trojan countermux injected in adder.v -> Trojanized_ISCAS_EPFL_countermux/adder__trojan_countermux.v
[+] Trojan fsmor injected in adder.v -> Trojanized_ISCAS_EPFL_fsmor/adder__trojan_fsmor.v
[+] Trojan andxor injected in arbiter.v -> Trojanized_ISCAS_EPFL_andxor/arbiter__trojan_andxor.v
[+] Trojan countermux injected in arbiter.v -> Trojanized_ISCAS_EPFL_countermux/arbiter__trojan_countermux.v
[+] Trojan fsmor injected in arbiter.v -> Trojanized_ISCAS_EPFL_fsmor/arbiter__trojan_fsmor.v
[+] Trojan andxor injected in bar.v -> Trojanized_ISCAS_EPFL_andxor/bar__trojan_andxor.v
[+] Trojan counte

In [2]:
import networkx as nx
import pandas as pd
import os
import re
from networkx.exception import NetworkXNoPath

# Gate-level primitives we care about
GATE_TYPES = ['and', 'or', 'nand', 'nor', 'xor', 'xnor', 'buf', 'not']

# Regex patterns
gate_pattern = re.compile(rf'\s*({"|".join(GATE_TYPES)})\s+(\w+)\s*\((.*)\);')
assign_pattern = re.compile(r"\s*assign\s+(\w+)\s*=\s*(.*);")
always_pattern = re.compile(r"\s*always\s*@")
fsm_pattern = re.compile(r"troj_state", re.IGNORECASE)
counter_pattern = re.compile(r"troj_cnt", re.IGNORECASE)

def parse_verilog_netlist(verilog_file, label="clean"):
    """
    Parses a Verilog netlist (gate-level + Trojan behavioral code).
    Returns the graph, gate dictionary, primary inputs, primary outputs, and label.
    """
    G = nx.DiGraph()
    gates = {}
    all_signals = set()
    output_signals = set()
    input_signals = set()

    with open(verilog_file, 'r') as f:
        lines = f.readlines()

    for count_line, line in enumerate(lines):
        line = line.strip()

        # 1. Match standard gate instantiations
        match = gate_pattern.match(line)
        if match:
            gate_type, gate_name, connection_str = match.groups()
            signals = [sig.strip() for sig in connection_str.split(',')]
            output_signal = signals[0]
            input_signals.update(signals[1:])

            all_signals.update(signals)
            output_signals.add(output_signal)

            gates[output_signal] = (gate_name, gate_type, signals[1:])
            G.add_node(gate_name, type=gate_type)
            continue

        # 2. Handle assigns as pseudo-gates
        match = assign_pattern.match(line)
        if match:
            lhs, rhs = match.groups()
            gates[lhs] = (f"assign_{lhs}", "assign", [rhs])
            G.add_node(f"assign_{lhs}", type="assign")
            continue

        # 3. Handle behavioral constructs (Trojan FSMs, counters, always)
        if always_pattern.match(line):
            node_name = f"always_blk_{count_line}"
            G.add_node(node_name, type="always")
            continue

        if fsm_pattern.search(line):
            node_name = f"fsm_trigger_{count_line}"
            G.add_node(node_name, type="fsm_trigger")
            continue

        if counter_pattern.search(line):
            node_name = f"counter_trigger_{count_line}"
            G.add_node(node_name, type="counter_trigger")
            continue

    primary_inputs = input_signals - output_signals
    primary_outputs = output_signals - input_signals

    # Add edges
    for output_signal, (gate_name, gate_type, inputs) in gates.items():
        for inp in inputs:
            if inp in gates:
                G.add_edge(gates[inp][0], gate_name)
            elif inp in primary_inputs:
                G.add_edge(inp, gate_name)

    # Add PI and PO nodes
    for pi in primary_inputs:
        G.add_node(pi, type="input")

    for po in primary_outputs:
        G.add_node(po, type="output")
        if po in gates:
            G.add_edge(gates[po][0], po)

    return G, gates, primary_inputs, primary_outputs, label


def extract_features(G, gates, primary_inputs, primary_outputs, circuit_name, label):
    """
    Extracts node-level features from the graph.
    """
    features = []
    degrees = dict(G.degree())

    centrality_deg = nx.degree_centrality(G)
    centrality_betw = nx.betweenness_centrality(G, k=min(100, len(G)))
    centrality_close = nx.closeness_centrality(G)
    clustering_coeff = nx.clustering(G)

    for node in G.nodes:
        node_type = G.nodes[node].get("type", "unknown")
        is_primary_input = 1 if node in primary_inputs else 0
        is_primary_output = 1 if node in primary_outputs else 0
        is_internal_node = 1 if not (is_primary_input or is_primary_output) else 0

        fan_in = G.in_degree(node)
        fan_out = G.out_degree(node)

        neighbors = list(G.neighbors(node))
        avg_fan_in_neighbors = sum(G.in_degree(n) for n in neighbors) / len(neighbors) if neighbors else 0
        avg_fan_out_neighbors = sum(G.out_degree(n) for n in neighbors) / len(neighbors) if neighbors else 0

        try:
            distance_to_outputs = min(nx.shortest_path_length(G, source=node, target=po)
                                      for po in primary_outputs if nx.has_path(G, node, po)) \
                                      if is_internal_node else 0
        except (NetworkXNoPath, ValueError):
            distance_to_outputs = -1

        is_key_gate = 1 if node_type in ['xor', 'xnor'] else 0

        features.append({
            'circuit_name': circuit_name,
            'node': node,
            'gate_type': node_type,
            'fan_in': fan_in,
            'fan_out': fan_out,
            'is_primary_input': is_primary_input,
            'is_primary_output': is_primary_output,
            'is_internal': is_internal_node,
            'is_key_gate': is_key_gate,
            'degree_centrality': centrality_deg.get(node, 0),
            'betweenness_centrality': centrality_betw.get(node, 0),
            'closeness_centrality': centrality_close.get(node, 0),
            'clustering_coefficient': clustering_coeff.get(node, 0),
            'avg_fan_in_neighbors': avg_fan_in_neighbors,
            'avg_fan_out_neighbors': avg_fan_out_neighbors,
            'dist_to_output': distance_to_outputs,
            'label': label,  # clean vs trojan
        })

    return pd.DataFrame(features)


def process_all_netlists(folder_path, label="clean", out_csv="features.csv"):
    """
    Process all netlists in a folder and save extracted features to CSV.
    """
    all_features = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".v"):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing {filename} [{label}]")

            try:
                G, gates, primary_inputs, primary_outputs, lbl = parse_verilog_netlist(file_path, label)
                features_df = extract_features(G, gates, primary_inputs, primary_outputs, filename, lbl)
                all_features.append(features_df)
            except Exception as e:
                print(f"⚠️ Skipping {filename} due to error: {e}")

    if all_features:
        final_df = pd.concat(all_features, ignore_index=True)
        final_df.to_csv(out_csv, index=False)
        print(f"✅ Features saved to {out_csv}")


if __name__ == "__main__":
    # Example usage: process clean & Trojan netlists
    process_all_netlists("verilog_benchmark_circuits", label="clean", out_csv="features_clean.csv")
    process_all_netlists("Trojanized_ISCAS_EPFL_andxor", label="trojan", out_csv="features_trojan_andxor.csv")
    process_all_netlists("Trojanized_ISCAS_EPFL_countermux", label="trojan", out_csv="features_trojan_countermux.csv")
    process_all_netlists("Trojanized_ISCAS_EPFL_fsmor", label="trojan", out_csv="features_trojan_fsmor.csv")


Processing Priority.v [clean]
Processing adder.v [clean]
Processing arbiter.v [clean]
Processing bar.v [clean]
Processing c1355.v [clean]
Processing c17.v [clean]
Processing c1908.v [clean]
Processing c2670.v [clean]
Processing c3540.v [clean]
Processing c432.v [clean]
Processing c499.v [clean]
Processing c5315.v [clean]
Processing c6288.v [clean]
Processing c7552.v [clean]
Processing c880.v [clean]
Processing cavlc.v [clean]
Processing ctrl.v [clean]
Processing dec.v [clean]
Processing i2c.v [clean]
Processing int2float.v [clean]
Processing max.v [clean]
Processing router.v [clean]
Processing sin.v [clean]
✅ Features saved to features_clean.csv
Processing Priority__trojan_andxor.v [trojan]
Processing adder__trojan_andxor.v [trojan]
Processing arbiter__trojan_andxor.v [trojan]
Processing bar__trojan_andxor.v [trojan]
Processing c1355__trojan_andxor.v [trojan]
Processing c17__trojan_andxor.v [trojan]
Processing c1908__trojan_andxor.v [trojan]
Processing c2670__trojan_andxor.v [trojan]
P

#### Mergining all the csvs into a single csv file for node, another for subgraph, and the thrid csv for subgraph classification.

In [3]:
import os
import re
import math
import json
import random
import networkx as nx
import pandas as pd
from typing import Tuple, Dict, Set, List
from networkx.exception import NetworkXNoPath

# -----------------------------
# Configuration
# -----------------------------
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Folders (adjust as needed)
CLEAN_DIR = "verilog_benchmark_circuits"
TROJ_DIRS = [
    "Trojanized_ISCAS_EPFL_andxor",
    "Trojanized_ISCAS_EPFL_countermux",
    "Trojanized_ISCAS_EPFL_fsmor",
]

OUT_DIR = "GNNDatasets"
os.makedirs(OUT_DIR, exist_ok=True)

# Subgraph extraction radius (hops)
K_HOPS = 2

# Betweenness sampling to keep things fast on large graphs
BETWEENNESS_SAMPLE_CAP = 500  # nodes
# If the graph is bigger than this, we sample k nodes for betweenness

# Primitive gates we map to nodes
GATE_TYPES = ['and', 'or', 'nand', 'nor', 'xor', 'xnor', 'buf', 'not']

# Regex patterns
gate_pattern = re.compile(rf'\s*({"|".join(GATE_TYPES)})\s+(\w+)\s*\((.*)\);')
assign_pattern = re.compile(r"\s*assign\s+(\w+)\s*=\s*(.*);")
always_pattern = re.compile(r"\s*always\s*@")

# We heuristically detect Trojan-related textual cues
FSM_HINT = re.compile(r"(troj_|_troj|trojstate|troj_state|state_)", re.IGNORECASE)
COUNTER_HINT = re.compile(r"(troj_cnt|trojcount|counter|cnt_)", re.IGNORECASE)

# Node types we treat as "trigger-like" regardless of folder (behavioral Trojans)
BASE_TRIGGER_NODE_TYPES = {"fsm_trigger", "counter_trigger", "always"}

# In the AND/XOR template set, XOR/XNOR often appear in triggers
ANDXOR_EXTRA_TRIGGERS = {"xor", "xnor"}


# -----------------------------
# Parsing & Graph Building
# -----------------------------
def parse_verilog_netlist(verilog_file: str,
                          folder_hint: str,
                          label: int) -> Tuple[nx.DiGraph, Dict[str, Tuple[str, str, List[str]]],
                                               Set[str], Set[str], int]:
    """
    Parse a mixed (gate-level + behavioral) Verilog file into a directed graph.
    Returns (graph, gates_dict, primary_inputs, primary_outputs, graph_label).
    """
    G = nx.DiGraph()
    gates = {}
    output_signals = set()
    input_signals = set()

    with open(verilog_file, "r", errors="ignore") as f:
        lines = f.readlines()

    for idx, line in enumerate(lines):
        s = line.strip()
        if not s or s.startswith("//"):
            continue

        # 1) Gate-level instantiation
        m = gate_pattern.match(s)
        if m:
            gate_type, gate_name, connection_str = m.groups()
            # Split signals inside "(...)" by comma, strip whitespace
            signals = [sig.strip() for sig in connection_str.split(',')]
            if not signals:
                continue
            out_sig = signals[0]
            in_sigs = signals[1:]

            input_signals.update(in_sigs)
            output_signals.add(out_sig)

            gates[out_sig] = (gate_name, gate_type, in_sigs)
            G.add_node(gate_name, type=gate_type)
            continue

        # 2) assign statements -> pseudo node
        m = assign_pattern.match(s)
        if m:
            lhs, rhs = m.groups()
            n = f"assign_{lhs}"
            gates[lhs] = (n, "assign", [rhs])
            G.add_node(n, type="assign")
            continue

        # 3) behavioral 'always' -> coarse node
        if always_pattern.match(s):
            n = f"always_blk_{idx}"
            G.add_node(n, type="always")
            continue

        # 4) heuristics for FSM/counter textual hints (often spread over multiple lines)
        if FSM_HINT.search(s):
            G.add_node(f"fsm_trigger_{idx}", type="fsm_trigger")
        if COUNTER_HINT.search(s):
            G.add_node(f"counter_trigger_{idx}", type="counter_trigger")

    # PI/PO estimation
    primary_inputs = input_signals - output_signals
    primary_outputs = output_signals - input_signals

    # Wire up gate graph edges (PIs feed into first gates)
    for out_sig, (gname, gtype, in_sigs) in gates.items():
        for inp in in_sigs:
            if inp in gates:
                src = gates[inp][0]
                G.add_edge(src, gname)
            elif inp in primary_inputs:
                G.add_edge(inp, gname)

    # Materialize PIs/POs as nodes
    for pi in primary_inputs:
        G.add_node(pi, type="input")
    for po in primary_outputs:
        G.add_node(po, type="output")
        if po in gates:
            G.add_edge(gates[po][0], po)

    # Attach graph-level label
    G.graph["label"] = int(label)
    G.graph["circuit_name"] = os.path.basename(verilog_file)
    G.graph["folder_hint"] = folder_hint
    return G, gates, primary_inputs, primary_outputs, label


# -----------------------------
# Feature Extraction
# -----------------------------
def safe_betweenness(G: nx.DiGraph) -> Dict[str, float]:
    n = G.number_of_nodes()
    if n <= BETWEENNESS_SAMPLE_CAP:
        return nx.betweenness_centrality(G, normalized=True)
    # Sample nodes for speed
    k = min(BETWEENNESS_SAMPLE_CAP, n)
    sample = random.sample(list(G.nodes()), k)
    return nx.betweenness_centrality(G, k=k, normalized=True, seed=RANDOM_SEED)


def compute_node_features(G: nx.DiGraph,
                          primary_inputs: Set[str],
                          primary_outputs: Set[str]) -> pd.DataFrame:
    deg_cent = nx.degree_centrality(G)
    bet_cent = safe_betweenness(G)
    clo_cent = nx.closeness_centrality(G)
    clust = nx.clustering(G.to_undirected(), nodes=None)

    rows = []
    for node in G.nodes():
        ntype = G.nodes[node].get("type", "unknown")
        is_pi = 1 if node in primary_inputs else 0
        is_po = 1 if node in primary_outputs else 0
        is_internal = 1 - max(is_pi, is_po)

        fan_in = G.in_degree(node)
        fan_out = G.out_degree(node)

        # distance to nearest PO (only for internal nodes)
        dist_to_po = 0
        if is_internal:
            dvals = []
            for po in primary_outputs:
                if nx.has_path(G, node, po):
                    try:
                        dvals.append(nx.shortest_path_length(G, node, po))
                    except Exception:
                        pass
            if dvals:
                dist_to_po = min(dvals)
            else:
                dist_to_po = -1  # unreachable

        # neighbors fanin/fanout averages
        neigh = list(G.neighbors(node))
        avg_fanin_neigh = sum(G.in_degree(n) for n in neigh) / len(neigh) if neigh else 0.0
        avg_fanout_neigh = sum(G.out_degree(n) for n in neigh) / len(neigh) if neigh else 0.0

        rows.append({
            "node": node,
            "gate_type": ntype,
            "fan_in": fan_in,
            "fan_out": fan_out,
            "is_primary_input": is_pi,
            "is_primary_output": is_po,
            "is_internal": is_internal,
            "degree_centrality": deg_cent.get(node, 0.0),
            "betweenness_centrality": bet_cent.get(node, 0.0),
            "closeness_centrality": clo_cent.get(node, 0.0),
            "clustering_coefficient": clust.get(node, 0.0),
            "avg_fan_in_neighbors": avg_fanin_neigh,
            "avg_fan_out_neighbors": avg_fanout_neigh,
            "dist_to_output": dist_to_po,
        })
    return pd.DataFrame(rows)


# -----------------------------
# Trojan Neighborhood Tagging
# -----------------------------
def trojan_node_types(folder_hint: str) -> Set[str]:
    t = set(BASE_TRIGGER_NODE_TYPES)
    if "andxor" in folder_hint.lower():
        t |= ANDXOR_EXTRA_TRIGGERS
    return t


def mark_trojan_neighborhood(G: nx.DiGraph, folder_hint: str, K: int = 2) -> Set[str]:
    """
    Return set of nodes considered in the Trojan neighborhood:
    nodes whose type is trigger-like, plus nodes within K hops (undirected) of any trigger node.
    """
    trigger_types = trojan_node_types(folder_hint)
    trigger_nodes = {n for n, d in G.nodes(data=True) if d.get("type") in trigger_types}

    if not trigger_nodes:
        return set()

    UG = G.to_undirected()
    marked = set(trigger_nodes)
    for t in trigger_nodes:
        # Gather nodes within K hops using BFS layers
        frontier = {t}
        visited = {t}
        for _ in range(K):
            nxt = set()
            for u in frontier:
                nxt |= set(UG.neighbors(u))
            nxt -= visited
            visited |= nxt
            frontier = nxt
        marked |= visited
    return marked


# -----------------------------
# Dataset Writers
# -----------------------------
def process_folder(folder: str, label: int,
                   node_rows: List[dict],
                   subgraph_rows: List[dict],
                   graph_rows: List[dict]) -> None:
    files = [f for f in os.listdir(folder) if f.endswith(".v")]
    for fname in files:
        fpath = os.path.join(folder, fname)
        try:
            G, gates, PIs, POs, glabel = parse_verilog_netlist(fpath, folder, label)
            if G.number_of_nodes() == 0:
                print(f"⚠️ Empty graph (skipping): {fpath}")
                continue

            # Node features
            nfeat = compute_node_features(G, PIs, POs)

            # Trojan neighborhood
            tnodes = set()
            if glabel == 1:
                tnodes = mark_trojan_neighborhood(G, folder, K_HOPS)

            # ----- Node-level rows -----
            for _, row in nfeat.iterrows():
                node_rows.append({
                    "circuit_name": G.graph.get("circuit_name", fname),
                    "folder": folder,
                    "node": row["node"],
                    "gate_type": row["gate_type"],
                    "label_graph": glabel,                    # 0 clean, 1 trojan
                    "label_node": 1 if row["node"] in tnodes else 0,
                    **{k: row[k] for k in [
                        "fan_in","fan_out","is_primary_input","is_primary_output","is_internal",
                        "degree_centrality","betweenness_centrality","closeness_centrality",
                        "clustering_coefficient","avg_fan_in_neighbors","avg_fan_out_neighbors",
                        "dist_to_output"
                    ]}
                })

            # ----- Subgraph-level rows -----
            # For Trojan designs: one subgraph per trigger neighborhood (centered at each trigger node)
            # For clean designs: sample |trigger-like|≈3 centers (PIs/POs/middle nodes) as negatives.
            UG = G.to_undirected()
            trigger_like = {n for n, d in G.nodes(data=True)
                            if d.get("type") in trojan_node_types(folder)}
            sub_id = 0
            if glabel == 1 and trigger_like:
                for center in trigger_like:
                    nodes_k = nx.ego_graph(UG, center, radius=K_HOPS).nodes()
                    sub_nodes = list(nodes_k)
                    # Aggregate simple stats
                    sub_n = len(sub_nodes)
                    sub_e = G.subgraph(sub_nodes).number_of_edges()
                    degs = [G.degree(n) for n in sub_nodes]
                    sub_avg_deg = sum(degs)/sub_n if sub_n else 0.0
                    sub_types = [G.nodes[n].get("type","unknown") for n in sub_nodes]
                    type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()

                    subgraph_rows.append({
                        "circuit_name": G.graph.get("circuit_name", fname),
                        "folder": folder,
                        "subgraph_id": f"{fname}__{sub_id}",
                        "center_node": center,
                        "label_subgraph": 1,
                        "num_nodes": sub_n,
                        "num_edges": sub_e,
                        "avg_degree": sub_avg_deg,
                        "type_hist_json": json.dumps(type_hist),
                    })
                    sub_id += 1
            else:
                # Clean: produce 3 negative subgraphs from diverse anchors (PI, PO, internal)
                anchors = []
                pis = list(PIs)[:3]
                pos = list(POs)[:3]
                internals = [n for n in G.nodes()
                             if n not in PIs and n not in POs]
                random.shuffle(internals)
                anchors.extend(pis)
                anchors.extend(pos)
                anchors.extend(internals[:max(0, 3-len(anchors))])
                if not anchors and internals:
                    anchors = internals[:3]
                anchors = anchors[:3]

                for center in anchors:
                    nodes_k = nx.ego_graph(UG, center, radius=K_HOPS).nodes()
                    sub_nodes = list(nodes_k)
                    sub_n = len(sub_nodes)
                    sub_e = G.subgraph(sub_nodes).number_of_edges()
                    degs = [G.degree(n) for n in sub_nodes]
                    sub_avg_deg = sum(degs)/sub_n if sub_n else 0.0
                    sub_types = [G.nodes[n].get("type","unknown") for n in sub_nodes]
                    type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()

                    subgraph_rows.append({
                        "circuit_name": G.graph.get("circuit_name", fname),
                        "folder": folder,
                        "subgraph_id": f"{fname}__{sub_id}",
                        "center_node": center,
                        "label_subgraph": 0,
                        "num_nodes": sub_n,
                        "num_edges": sub_e,
                        "avg_degree": sub_avg_deg,
                        "type_hist_json": json.dumps(type_hist),
                    })
                    sub_id += 1

            # ----- Graph-level rows -----
            # Global structural summary + label
            n = G.number_of_nodes()
            m = G.number_of_edges()
            avg_deg = (2*m / n) if n else 0.0
            in_deg = [G.in_degree(v) for v in G.nodes()]
            out_deg = [G.out_degree(v) for v in G.nodes()]
            avg_in = sum(in_deg)/n if n else 0.0
            avg_out = sum(out_deg)/n if n else 0.0
            frac_xor = len([1 for _, d in G.nodes(data=True) if d.get("type") == "xor"]) / n if n else 0.0
            frac_xnor = len([1 for _, d in G.nodes(data=True) if d.get("type") == "xnor"]) / n if n else 0.0
            frac_assign = len([1 for _, d in G.nodes(data=True) if d.get("type") == "assign"]) / n if n else 0.0
            frac_behav = len([1 for _, d in G.nodes(data=True)
                              if d.get("type") in {"always","fsm_trigger","counter_trigger"}]) / n if n else 0.0

            graph_rows.append({
                "circuit_name": G.graph.get("circuit_name", fname),
                "folder": folder,
                "label_graph": glabel,
                "num_nodes": n,
                "num_edges": m,
                "avg_degree": avg_deg,
                "avg_in_degree": avg_in,
                "avg_out_degree": avg_out,
                "frac_xor": frac_xor,
                "frac_xnor": frac_xnor,
                "frac_assign": frac_assign,
                "frac_behavioral": frac_behav,
            })

        except Exception as e:
            print(f"⚠️ Skipping {fpath} due to error: {e}")


# -----------------------------
# Main Orchestration
# -----------------------------
def main():
    node_rows, subgraph_rows, graph_rows = [], [], []

    # Clean first
    if os.path.isdir(CLEAN_DIR):
        print(f"Processing CLEAN: {CLEAN_DIR}")
        process_folder(CLEAN_DIR, label=0, node_rows=node_rows,
                       subgraph_rows=subgraph_rows, graph_rows=graph_rows)
    else:
        print(f"⚠️ Clean directory not found: {CLEAN_DIR}")

    # Trojanized sets
    for tdir in TROJ_DIRS:
        if os.path.isdir(tdir):
            print(f"Processing TROJAN: {tdir}")
            process_folder(tdir, label=1, node_rows=node_rows,
                           subgraph_rows=subgraph_rows, graph_rows=graph_rows)
        else:
            print(f"⚠️ Trojan directory not found: {tdir}")

    # Save CSVs
    if node_rows:
        pd.DataFrame(node_rows).to_csv(os.path.join(OUT_DIR, "node.csv"), index=False)
        print(f"✅ Saved node-level dataset → {os.path.join(OUT_DIR, 'node.csv')}")
    else:
        print("❌ No node rows generated.")

    if subgraph_rows:
        pd.DataFrame(subgraph_rows).to_csv(os.path.join(OUT_DIR, "subgraph.csv"), index=False)
        print(f"✅ Saved subgraph-level dataset → {os.path.join(OUT_DIR, 'subgraph.csv')}")
    else:
        print("❌ No subgraph rows generated.")

    if graph_rows:
        pd.DataFrame(graph_rows).to_csv(os.path.join(OUT_DIR, "graph.csv"), index=False)
        print(f"✅ Saved graph-level dataset → {os.path.join(OUT_DIR, 'graph.csv')}")
    else:
        print("❌ No graph rows generated.")


if __name__ == "__main__":
    main()


Processing CLEAN: verilog_benchmark_circuits


  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=Tr

Processing TROJAN: Trojanized_ISCAS_EPFL_andxor


  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=Tr

Processing TROJAN: Trojanized_ISCAS_EPFL_countermux


  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=Tr

Processing TROJAN: Trojanized_ISCAS_EPFL_fsmor


  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=True).to_dict()
  type_hist = pd.value_counts(pd.Series(sub_types), normalize=Tr

✅ Saved node-level dataset → GNNDatasets/node.csv
✅ Saved subgraph-level dataset → GNNDatasets/subgraph.csv
✅ Saved graph-level dataset → GNNDatasets/graph.csv


#### Parsing edges as well. 

In [8]:
# build_edges_datasets.py
import os, re, csv
import pandas as pd

# ---------------------------
# Utility: lightweight Verilog tokenizer helpers
# ---------------------------
RE_COMMENT_LINE = re.compile(r"//.*$")
RE_COMMENT_BLOCK = re.compile(r"/\*.*?\*/", re.S)
RE_WS = re.compile(r"\s+")
PRIM_GATES = {"and","or","nand","nor","xor","xnor","buf","not"}

# Common output port names in cell libraries
COMMON_OUT_PORTS = {"Y","Z","ZN","Q","QN","O","S","OUT"}

IDENT_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_\[\]\:]*")

def strip_comments(text: str) -> str:
    text = RE_COMMENT_BLOCK.sub("", text)
    text = "\n".join(RE_COMMENT_LINE.sub("", ln) for ln in text.splitlines())
    return text

def extract_identifiers(expr: str):
    # pull signal-like tokens from an expression (ignore numbers)
    ids = set(IDENT_RE.findall(expr))
    return {i for i in ids if not re.fullmatch(r"\d+[']?[bhd]?\w*", i)}

# ---------------------------
# Core parser
# ---------------------------
def parse_verilog_graph_edges(verilog_path):
    """
    Parse gate-level + simple behavioral Verilog to build a directed graph:
      - nodes: gate instances, pseudo-assigns, pseudo-DFFs, PI/PO nodes (signal names)
      - edges: driver_node -> consumer_node
    Returns: nodes_info(dict name->type), edges(list of (src,dst)), module_io(dict with 'inputs','outputs')
    """
    with open(verilog_path, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    code = strip_comments(raw)

    # Collect IO declarations
    inputs, outputs = set(), set()

    # Match module header port declarations too (optional)
    for m in re.finditer(r"\b(input|output|inout)\b([^;]*);", code):
        direction = m.group(1)
        decl = m.group(2)
        # remove ranges like [7:0]
        decl = re.sub(r"\[[^\]]+\]", " ", decl)
        # split by commas
        ids = [t.strip() for t in decl.replace("\n", " ").split(",")]
        ids = [IDENT_RE.search(x).group(0) for x in ids if IDENT_RE.search(x)]
        if direction == "input":
            inputs.update(ids)
        elif direction == "output":
            outputs.update(ids)
        else:
            # ignore inout for now or treat as both
            inputs.update(ids); outputs.update(ids)

    # Data structures
    nodes = {}            # node_name -> type (e.g., 'and','assign','dff','input','output', etc.)
    drives = {}           # signal -> driver node_name
    uses = {}             # node_name -> list of input signals (to be resolved to driver nodes later)
    signal_consumers = {} # signal -> set of consumer node_names

    def add_node(n, t):
        if n not in nodes:
            nodes[n] = t

    def add_consumer(sig, node):
        signal_consumers.setdefault(sig, set()).add(node)

    # ---------------------------
    # 1) Primitive gate instances: e.g., and U1 (out, a, b);
    # ---------------------------
    prim_pat = re.compile(rf"\b({'|'.join(PRIM_GATES)})\s+([A-Za-z_]\w*)\s*\(([^;]*)\)\s*;", re.I)
    for gtype, inst, conn in prim_pat.findall(code):
        gtype = gtype.lower()
        add_node(inst, gtype)
        # positional: first is output, rest inputs
        sigs = [s.strip() for s in conn.split(",") if s.strip()]
        if not sigs: 
            continue
        out_sig = sigs[0]
        in_sigs = sigs[1:]

        # record driver
        drives[out_sig] = inst
        # record inputs for this instance
        uses[inst] = in_sigs
        for s in in_sigs:
            add_consumer(s, inst)

    # ---------------------------
    # 2) Named-port cell instantiations: e.g., NAND2X1 U2 (.Y(out), .A(a), .B(b));
    # ---------------------------
    named_inst_pat = re.compile(r"\b([A-Za-z_]\w*)\s+([A-Za-z_]\w*)\s*\(([^;]*)\)\s*;", re.S)
    for cell, inst, blob in named_inst_pat.findall(code):
        # Skip those already parsed as primitive (avoid double)
        if inst in nodes:
            continue
        # collect .PORT(sig)
        ports = re.findall(r"\.\s*([A-Za-z_]\w*)\s*\(\s*([^\)]+)\s*\)", blob)
        if not ports:
            continue
        add_node(inst, cell.lower())
        # Decide output ports (heuristic)
        out_ports = [p for p,_ in ports if p.upper() in COMMON_OUT_PORTS]
        if out_ports:
            # assign first found output as the driver signal
            out_p = out_ports[0]
            out_sig = next(sig for p,sig in ports if p == out_p)
            out_sig = out_sig.strip()
            drives[out_sig] = inst
        # Inputs are all the other connected signals (non-output)
        in_sigs = [sig.strip() for p,sig in ports if p.upper() not in COMMON_OUT_PORTS]
        if in_sigs:
            uses[inst] = in_sigs
            for s in in_sigs:
                add_consumer(s, inst)

    # ---------------------------
    # 3) Continuous assigns: assign lhs = rhs;
    #    Create pseudo node ASSIGN_<lhs> -> drives lhs
    # ---------------------------
    for lhs, rhs in re.findall(r"\bassign\s+([A-Za-z_]\w*)\s*=\s*(.*?);", code):
        pn = f"ASSIGN_{lhs}"
        add_node(pn, "assign")
        drives[lhs] = pn
        # dependencies from identifiers in rhs
        rhs_ids = list(extract_identifiers(rhs))
        uses[pn] = rhs_ids
        for s in rhs_ids:
            add_consumer(s, pn)

    # ---------------------------
    # 4) Simple always blocks with nonblocking assignments: lhs <= rhs;
    #    Create pseudo DFF_lhs that drives lhs; inputs from rhs identifiers.
    # ---------------------------
    # Capture 'always ... begin ... end' blocks coarsely
    for block in re.findall(r"\balways\b.*?begin(.*?)end", code, flags=re.S|re.I):
        for lhs, rhs in re.findall(r"([A-Za-z_]\w*)\s*<=\s*(.*?);", block):
            pn = f"DFF_{lhs}"
            add_node(pn, "dff")
            drives[lhs] = pn
            rhs_ids = list(extract_identifiers(rhs))
            uses[pn] = rhs_ids
            for s in rhs_ids:
                add_consumer(s, pn)

    # ---------------------------
    # Build edges by resolving drivers for every consumer's input signals
    # ---------------------------
    edges = []

    # Create PI nodes for undriven signals that are consumed
    consumed_signals = set(signal_consumers.keys())
    for sig in consumed_signals:
        if sig not in drives:
            # mark as PI
            add_node(sig, "input")
            drives[sig] = sig  # self-node acts as source

    # Wire driver -> consumer for all inputs
    for consumer, in_sigs in uses.items():
        for sig in in_sigs:
            if sig in drives:
                src = drives[sig]
                edges.append((src, consumer))
            else:
                # Shouldn't happen after PI backfill, but keep safe
                add_node(sig, "input")
                edges.append((sig, consumer))

    # Wire drivers to primary outputs (PO nodes)
    for po in outputs:
        add_node(po, "output")
        if po in drives:
            edges.append((drives[po], po))
        else:
            # un-driven output → connect from itself (dangling), or skip
            # here we leave it as a standalone PO node
            pass

    # Ensure all declared inputs exist as nodes
    for pi in inputs:
        add_node(pi, "input")
        # (edges from PI to consumers already added above if consumed)

    module_io = {"inputs": inputs, "outputs": outputs}
    return nodes, edges, module_io

# ---------------------------
# Dataset writers
# ---------------------------
def edges_to_csv(edges, circuit_name, out_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "a", newline="") as f:
        w = csv.writer(f)
        # header if new file
        if f.tell() == 0:
            w.writerow(["circuit_name","src","dst"])
        for src, dst in edges:
            w.writerow([circuit_name, src, dst])

def process_folder_edges(folder_path, out_csv):
    # reset file
    if os.path.exists(out_csv):
        os.remove(out_csv)
    vfiles = [fn for fn in os.listdir(folder_path) if fn.endswith(".v")]
    if not vfiles:
        print(f"⚠️ No .v files in {folder_path}")
        return False
    for fn in vfiles:
        try:
            full = os.path.join(folder_path, fn)
            print(f"Processing {fn} ...")
            nodes, edges, io = parse_verilog_graph_edges(full)
            edges_to_csv(edges, fn, out_csv)
        except Exception as e:
            print(f"⚠️ Skipping {fn} due to error: {e}")
    print(f"✅ Saved edges → {out_csv}")
    return True

if __name__ == "__main__":
    os.makedirs("GNNDatasets", exist_ok=True)

    # 1) Node-level edges (clean netlists)
    clean_dir = "verilog_benchmark_circuits"
    node_edges_csv = "GNNDatasets/node_edges.csv"
    ok_node = process_folder_edges(clean_dir, node_edges_csv)

    # 2) Subgraph-level edges (Trojanized variants)
    sub_edges_andxor = "GNNDatasets/subgraph_edges_andxor.csv"
    sub_edges_counter = "GNNDatasets/subgraph_edges_countermux.csv"
    sub_edges_fsmor = "GNNDatasets/subgraph_edges_fsmor.csv"

    dir_andxor = "Trojanized_ISCAS_EPFL_andxor"
    dir_counter = "Trojanized_ISCAS_EPFL_countermux"
    dir_fsmor = "Trojanized_ISCAS_EPFL_fsmor"

    ok_andxor = process_folder_edges(dir_andxor, sub_edges_andxor)
    ok_counter = process_folder_edges(dir_counter, sub_edges_counter)
    ok_fsmor  = process_folder_edges(dir_fsmor, sub_edges_fsmor)

    # 3) Graph-level edges = merge everything that exists
    merge_list = []
    for p in [node_edges_csv, sub_edges_andxor, sub_edges_counter, sub_edges_fsmor]:
        if os.path.exists(p):
            merge_list.append(pd.read_csv(p))
    if merge_list:
        merged = pd.concat(merge_list, ignore_index=True)
        merged.to_csv("GNNDatasets/graph_edges.csv", index=False)
        print("✅ Saved merged graph-level edges → GNNDatasets/graph_edges.csv")
    else:
        print("❌ No edge files to merge for graph-level.")


Processing Priority.v ...
Processing adder.v ...
Processing arbiter.v ...
Processing bar.v ...
Processing c1355.v ...
Processing c17.v ...
Processing c1908.v ...
Processing c2670.v ...
Processing c3540.v ...
Processing c432.v ...
Processing c499.v ...
Processing c5315.v ...
Processing c6288.v ...
Processing c7552.v ...
Processing c880.v ...
Processing cavlc.v ...
Processing ctrl.v ...
Processing dec.v ...
Processing i2c.v ...
Processing int2float.v ...
Processing max.v ...
Processing router.v ...
Processing sin.v ...
✅ Saved edges → GNNDatasets/node_edges.csv
Processing Priority__trojan_andxor.v ...
Processing adder__trojan_andxor.v ...
Processing arbiter__trojan_andxor.v ...
Processing bar__trojan_andxor.v ...
Processing c1355__trojan_andxor.v ...
Processing c17__trojan_andxor.v ...
Processing c1908__trojan_andxor.v ...
Processing c2670__trojan_andxor.v ...
Processing c3540__trojan_andxor.v ...
Processing c432__trojan_andxor.v ...
Processing c499__trojan_andxor.v ...
Processing c5315_