## Validate Source Code Existence

In [1]:
from pathlib import Path
import shutil
import os
from loguru import logger

# Locations of the curated dataset, relative to this notebook.
FINDING_PATH = "../dataset-curated/findings"
VERIFY_PATH = "../dataset-curated/manual_verification"
CONTRACTS_PATH = "../dataset-curated/contracts"
CONTRACTS_RAW_PATH = "../dataset-curated/contracts-raw"


# Iterators over the JSON reports of each directory.
finding_path = Path(FINDING_PATH).glob("*.json")
verify_path = Path(VERIFY_PATH).glob("*.json")


# For every report in manual_verification, look for its twin in findings and
# for its source directories (<stem>.pdf-source / <stem>.md-source) in
# contracts. The removal calls are commented out, so this pass only logs what
# it finds; uncomment them to actually delete.
for verify_file in verify_path:
    finding_file = Path(FINDING_PATH, verify_file.name)
    if finding_file.exists():
        # finding_file.unlink()
        logger.info(f"Deleted finding file: {finding_file}")

    for source_suffix in (".pdf-source", ".md-source"):
        source_dir = Path(CONTRACTS_PATH, f"{verify_file.stem}{source_suffix}")
        # is_dir() is already False for a missing path.
        if source_dir.is_dir():
            # shutil.rmtree(source_dir)
            logger.info(f"Deleted source code directory: {source_dir}")

# Report how many finding files, manual-verification files and contract
# directories exist, and warn about every finding file that has no
# "<stem>.md-source" or "<stem>.pdf-source" directory under contracts.
finding_files = list(Path(FINDING_PATH).glob("*.json"))
verify_files = list(Path(VERIFY_PATH).glob("*.json"))
contracts_dirs = [d for d in Path(CONTRACTS_PATH).iterdir() if d.is_dir()]

# Build the name lookup once instead of re-creating a list for every finding.
contract_dir_names = {d.name for d in contracts_dirs}

for finding_file in finding_files:
    expected_names = (
        f"{finding_file.stem}.md-source",
        f"{finding_file.stem}.pdf-source",
    )
    if not any(name in contract_dir_names for name in expected_names):
        logger.warning(f"Finding file does not have corresponding source code: {finding_file}")

logger.info(f"Number of finding files: {len(finding_files)}")
logger.info(f"Number of manual verification files: {len(verify_files)}")
logger.info(f"Number of contract directories: {len(contracts_dirs)}")

2026-02-11 15:34:17.932 | INFO     | __main__:<module>:56 - Number of finding files: 209
2026-02-11 15:34:17.934 | INFO     | __main__:<module>:57 - Number of manual verification files: 0
2026-02-11 15:34:17.934 | INFO     | __main__:<module>:58 - Number of contract directories: 209


## Unify Finding Severity

In [None]:
import json
from pathlib import Path
from loguru import logger

finding_path = Path(FINDING_PATH).glob("*.json")

# Canonical severity label for every lowercase spelling handled here.
severity_mapping = {
    "critical": "Critical",
    "high": "High",
    "high risk": "High",
    "medium": "Medium",
    "low": "Low",
    "low risk": "Low",
    "informational": "Informational",
    "major": "Critical",
    "minor": "Low",
    "info": "Informational",
    "note": "Informational",
    "warning": "Informational",
    "gas": "Informational",
    "gas optimization": "Informational",
    "indeterminate": "Informational",
    "undetermined": "Informational",
    "note/information": "Informational",
    "non-critical": "Informational",
}

# Rewrite every finding file in place with its severities normalised.
for finding_file in finding_path:
    with open(finding_file, "r", encoding="utf-8") as f:
        finding_dict = json.load(f)
    if finding_dict.get("project_info", {}).get("project_path") == "n/a":
        logger.warning(f"Finding missing project_path: {finding_file}")
    for finding in finding_dict["findings"]:
        if "severity" not in finding:
            logger.warning(
                f"Finding missing severity: {finding_file} - {finding['id']}"
            )
            # Nothing to normalise; without this the lookup below raises KeyError.
            continue
        severity = finding["severity"]
        if severity is None:
            # A null severity is left untouched.
            continue
        # Non-string values cannot be looked up and are reported as unknown.
        normalized = (
            severity_mapping.get(severity.strip().lower())
            if isinstance(severity, str)
            else None
        )
        if normalized is None:
            logger.warning(
                f"Unknown severity level: {finding_file} - {finding['id']} - {finding['severity']}"
            )
        else:
            finding["severity"] = normalized
    with open(finding_file, "w", encoding="utf-8") as f:
        json.dump(finding_dict, f, indent=4)

## Validate File Existence

In [2]:
import json
from pathlib import Path
from loguru import logger

finding_path = Path(FINDING_PATH).glob("*.json")

# Count every file referenced by a finding, and separately those ".sol"
# references that do not exist under the report's project directory.
invalid_file_count = 0
total_file_count = 0
for finding_file in finding_path:
    with open(finding_file, "r", encoding="utf-8") as f:
        finding_dict = json.load(f)
    project_path = finding_dict.get("project_info", {}).get("project_path")
    if not isinstance(project_path, dict) or not project_path:
        # Covers the "n/a" placeholder as well as a missing or empty mapping;
        # there is no root directory to check the files against.
        logger.warning(f"Finding missing project_path: {finding_file}")
        continue
    # The first value of the mapping is used as the project directory.
    dataset_root_path = Path(next(iter(project_path.values())))
    for finding in finding_dict["findings"]:
        if "files" not in finding:
            logger.warning(
                f"Finding missing files field: {finding_file} - {finding['id']}"
            )
            continue
        files_list = finding["files"] or []

        for ff in files_list:
            rel_path = Path("../") / dataset_root_path / ff
            # Only missing Solidity files count as invalid.
            if rel_path.suffix == ".sol" and not rel_path.exists():
                invalid_file_count += 1
                logger.warning(
                    f"File in finding does not exist: {rel_path} - {finding['id']} - {ff}"
                )
            total_file_count += 1
print(f"Total files: {total_file_count}, Invalid files: {invalid_file_count}")



Total files: 3041, Invalid files: 57


## Generate Vulnerability-File Pairs

In [3]:
from typing import List, Dict, Union, Optional, Literal, Tuple,Set
from pydantic import BaseModel, RootModel, Field
from dataclasses import field, dataclass
import json

@dataclass
class ProjectInfo:
    """Metadata about the project a report refers to.

    Every field defaults to the placeholder string ``"n/a"``.
    """

    url: Union[str, int, List, None] = "n/a"
    commit_id: Union[str, int, List, None] = "n/a"
    address: Union[str, int, List, None] = "n/a"
    chain: Union[str, int, List, None] = "n/a"
    compiler_version: Union[str, List, None] = "n/a"
    audit_date: Union[str, int, List, None] = "n/a"
    project_path: Union[str, List, Dict, None] = "n/a"

    def is_empty(self) -> bool:
        """Return True when both ``url`` and ``address`` are ``"n/a"`` or both are falsy."""
        if self.url == "n/a" and self.address == "n/a":
            return True
        return not self.url and not self.address

    @staticmethod
    def _freeze(value):
        """Return a hashable stand-in for *value*, converting containers recursively."""
        if isinstance(value, dict):
            # A frozenset of items ignores insertion order, like dict equality.
            return frozenset(
                (key, ProjectInfo._freeze(item)) for key, item in value.items()
            )
        if isinstance(value, (list, tuple)):
            return tuple(ProjectInfo._freeze(item) for item in value)
        if isinstance(value, (set, frozenset)):
            return frozenset(ProjectInfo._freeze(item) for item in value)
        return value

    def __hash__(self) -> int:
        """Hash all fields; list and dict values are frozen first so they never raise.

        The hash follows the current field values, so an instance must not be
        mutated while it is stored in a set or used as a dict key.
        """
        return hash(
            tuple(
                self._freeze(value)
                for value in (
                    self.url,
                    self.commit_id,
                    self.address,
                    self.chain,
                    self.compiler_version,
                    self.audit_date,
                    self.project_path,
                )
            )
        )
    


@dataclass
class Finding:
    """A single finding of a report together with the files it refers to."""

    id: Union[str, int] = 0
    category: Dict = field(default_factory=dict)
    title: str = ""
    description: str = ""
    severity: Optional[str] = ""
    location: Union[str, int, List] = ""
    files: List[str] = field(default_factory=list)

    @staticmethod
    def _freeze(value):
        """Return a hashable stand-in for *value*, converting containers recursively."""
        if isinstance(value, dict):
            # A frozenset of items ignores insertion order, like dict equality.
            return frozenset(
                (key, Finding._freeze(item)) for key, item in value.items()
            )
        if isinstance(value, (list, tuple)):
            return tuple(Finding._freeze(item) for item in value)
        if isinstance(value, (set, frozenset)):
            return frozenset(Finding._freeze(item) for item in value)
        return value

    def __hash__(self) -> int:
        """Hash all fields; ``category`` (a dict) and list values are frozen first.

        Hashing the raw dict would always raise ``TypeError``. The hash follows
        the current field values, so an instance must not be mutated while it
        is stored in a set or used as a dict key.
        """
        return hash(
            tuple(
                self._freeze(value)
                for value in (
                    self.id,
                    self.category,
                    self.title,
                    self.description,
                    self.severity,
                    self.location,
                    self.files,
                )
            )
        )

class Report(BaseModel):
    """A report: its path, project metadata and list of findings."""

    path: str = ""
    # Defaults are declared with pydantic's Field, matching VulnerabilityFilePair.
    project_info: ProjectInfo = Field(default_factory=ProjectInfo)
    findings: List[Finding] = Field(default_factory=list)

    def append_finding(self, finding: Finding):
        """Append *finding* to this report's findings."""
        self.findings.append(finding)

    def __hash__(self):
        """Hash the path, the project info and the findings in order."""
        return hash((self.path, self.project_info, tuple(self.findings)))

class VulnerabilityFilePair(BaseModel):
    """A group of findings together with the contents of the files they affect."""

    vfp_id: str = ""  # Unique ID for the VulnerabilityFilePair, e.g., 'vfp_00001'
    project_name: str = ""
    findings: List[Finding] = Field(default_factory=list)
    affected_files: Dict[str, str] = Field(default_factory=dict)  # file key -> file content

    def __hash__(self):
        """Hash the id, project name, findings and the set of affected file keys.

        The keys are hashed as a frozenset because two dicts with the same
        entries compare equal whatever their insertion order; hashing them as
        an ordered tuple could give equal objects different hashes.
        """
        return hash(
            (
                self.vfp_id,
                self.project_name,
                tuple(self.findings),
                frozenset(self.affected_files),
            )
        )
    


In [None]:
# When True, process_reports keeps only findings whose severity is critical,
# high or medium, and the driver code below writes to VFP_VULN_DIR instead of
# VFP_DIR.
ONLY_VULN = False


def load_report(report_path: str) -> Report:
    """Load a report from a JSON file.

    Args:
        report_path: Path of the JSON file to read.

    Returns:
        The parsed content validated as a ``Report``.
    """
    import json

    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the result does not depend on the machine.
    with open(report_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return Report.model_validate(data)


# Extensions of the files that may be embedded in a VFP.
_VFP_SOURCE_EXTENSIONS = frozenset({".rs", ".ts", ".sol", ".toml", ".sh", ".json"})


def _resolve_finding_files(file_entries, dataset_path, project_root) -> Set[str]:
    """Return the absolute paths of the entries that exist and have an accepted extension."""
    resolved = set()
    for f_rel in file_entries:
        # Entries are taken as relative to <dataset_path>/<project_root>.
        try:
            abs_path = (Path(dataset_path) / project_root / f_rel).resolve()
        except (OSError, RuntimeError, TypeError, ValueError):
            print(
                f"Warning: Could not resolve file path {f_rel} in project {project_root}"
            )
            continue

        # Path.suffix only yields the last suffix (".json" for "x.pdf.json"),
        # so the compound extension has to be matched against the file name.
        if abs_path.name.lower().endswith(".pdf.json"):
            continue
        if abs_path.suffix.lower() not in _VFP_SOURCE_EXTENSIONS:
            continue
        if os.path.exists(abs_path):
            resolved.add(str(abs_path))
    return resolved


def _group_findings_by_shared_files(finding_files) -> List[List[int]]:
    """Split finding indices into connected components; findings sharing a file are linked."""
    from collections import defaultdict, deque

    file_to_findings = defaultdict(set)
    for f_idx, paths in finding_files.items():
        for f_path in paths:
            file_to_findings[f_path].add(f_idx)

    visited = set()
    components = []
    for start in finding_files:
        if start in visited:
            continue
        visited.add(start)
        queue = deque([start])
        members = []
        # Breadth-first search over "shares a file with" links.
        while queue:
            curr_idx = queue.popleft()
            members.append(curr_idx)
            for f_path in finding_files[curr_idx]:
                for linked_idx in file_to_findings[f_path]:
                    if linked_idx not in visited:
                        visited.add(linked_idx)
                        queue.append(linked_idx)
        # Sorted so findings keep their order of appearance in the report.
        components.append(sorted(members))
    return components


def _read_affected_files(file_paths, dataset_root) -> Dict[str, str]:
    """Read each file as UTF-8 and return a ``{file key: content}`` dict."""
    affected = {}
    # Sorted so that the same file wins the plain basename key on every run.
    for f_path in sorted(file_paths):
        try:
            with open(f_path, "r", encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Warning: Could not read file {f_path}: {e}")
            continue

        file_key = os.path.basename(f_path)
        # On a duplicate filename, use the path relative to the dataset root.
        if file_key in affected:
            try:
                file_key = str(Path(f_path).relative_to(dataset_root))
            except ValueError:
                # Not under the dataset root: fall back to the full path.
                file_key = f_path
        affected[file_key] = content
    return affected


def process_reports(
    input_path: Union[str, Path],
    dataset_path="../",
    output_vfp_dir="../dataset-curated/vfp",
) -> List[VulnerabilityFilePair]:
    """
    Process reports from a file or directory and generate VulnPairs.
    Aggregates findings based on overlapping files.
    Saves each VFP to output_vfp_dir as {vfp_id}.json

    Args:
        input_path: A single ``.json`` report, or a directory whose ``*.json``
            files are all loaded (in sorted order, so ids are reproducible).
        dataset_path: Directory that each report's project path is relative to.
        output_vfp_dir: Directory that receives the VFP files; created if missing.

    Returns:
        Every generated VulnerabilityFilePair, in id order.

    Findings without files, or whose files all fail to resolve to an existing
    file with an accepted extension, are dropped. When the module-level
    ``ONLY_VULN`` flag is true, only critical/high/medium findings are kept.
    Reports whose ``project_path`` is not a non-empty dict are skipped with a
    warning.
    """
    input_path = Path(input_path)
    output_vfp_dir = Path(output_vfp_dir)
    output_vfp_dir.mkdir(parents=True, exist_ok=True)

    # Absolute dataset root, used to build keys for duplicate filenames.
    dataset_path_abs = Path(dataset_path).resolve()

    if input_path.is_file():
        report_files = [input_path] if input_path.suffix == ".json" else []
    elif input_path.is_dir():
        report_files = sorted(input_path.glob("*.json"))
    else:
        report_files = []
    reports = [load_report(str(report_file)) for report_file in report_files]

    vuln_pairs = []
    global_vfp_counter = 1  # Global counter for VFP IDs

    for report in reports:
        project_path = report.project_info.project_path
        if not isinstance(project_path, dict) or not project_path:
            print(
                f"Warning: Report {report.path} has no usable project_path; skipping"
            )
            continue
        # The first value of the mapping is used as the project directory.
        project_root = Path(next(iter(project_path.values())))

        # 1. Keep the findings that point at at least one usable file.
        valid_findings = []
        finding_files: Dict[int, Set[str]] = {}
        for finding in report.findings:
            if not finding.files:
                continue

            if ONLY_VULN and (
                finding.severity is None
                or finding.severity.lower() not in ("critical", "high", "medium")
            ):
                continue

            resolved_files = _resolve_finding_files(
                finding.files, dataset_path, project_root
            )
            if not resolved_files:
                continue

            finding_files[len(valid_findings)] = resolved_files
            valid_findings.append(finding)

        # 2. Group findings that share a file, 3. build one VFP per group.
        for component in _group_findings_by_shared_files(finding_files):
            comp_files = set().union(*(finding_files[f_idx] for f_idx in component))
            affected_files_dict = _read_affected_files(comp_files, dataset_path_abs)

            vfp_id = f"vfp_{global_vfp_counter:05d}"
            global_vfp_counter += 1

            vp = VulnerabilityFilePair(
                vfp_id=vfp_id,
                project_name=Path(report.path).name,
                findings=[valid_findings[f_idx] for f_idx in component],
                affected_files=affected_files_dict,
            )
            vuln_pairs.append(vp)

            # A failed save is reported; the pair is still returned.
            vfp_output_path = output_vfp_dir / f"{vfp_id}.json"
            try:
                with open(vfp_output_path, "w", encoding="utf-8") as f:
                    json.dump(vp.model_dump(), f, indent=4, ensure_ascii=False)
            except (OSError, TypeError, ValueError) as e:
                print(f"Error saving VFP {vfp_id}: {e}")

    return vuln_pairs


VFP_DIR = "../flatten/vfp"
VFP_VULN_DIR = "../flatten/vfp-vuln"


def _purge_json_files(directory):
    """Delete each ``*.json`` directly in *directory*, yielding ``(path, error)`` per failure."""
    for stale in Path(directory).glob("*.json"):
        try:
            stale.unlink()
        except Exception as err:
            yield stale, err


# Clear the previous output of the selected mode, then regenerate it.
if ONLY_VULN:
    for file, e in _purge_json_files(VFP_VULN_DIR):
        logger.warning(f"Could not delete VFP vuln file {file}: {e}")
    process_reports(input_path=FINDING_PATH, output_vfp_dir=VFP_VULN_DIR)
else:
    for file, e in _purge_json_files(VFP_DIR):
        logger.warning(f"Could not delete VFP file {file}: {e}")
    process_reports(input_path=FINDING_PATH, output_vfp_dir=VFP_DIR)


print("Processing complete.")

Processing complete.


## Extract Only Solidity Files

In [None]:
import os
import shutil
from pathlib import Path
from loguru import logger
contracts_raw_dir = Path(CONTRACTS_RAW_PATH)
contracts_dir = Path(CONTRACTS_PATH)


# Mirror the raw contracts tree into the contracts directory, copying only
# the files whose name ends in ".sol".
for current_dir, _subdirs, filenames in os.walk(contracts_raw_dir):
    solidity_names = [name for name in filenames if name.endswith(".sol")]
    if not solidity_names:
        # Directories are only created when they receive at least one file.
        continue

    destination_dir = contracts_dir / os.path.relpath(current_dir, contracts_raw_dir)
    destination_dir.mkdir(parents=True, exist_ok=True)
    for name in solidity_names:
        shutil.copy2(Path(current_dir) / name, destination_dir / name)
            # logger.info(f"Copied {source_file} to {target_file}")
