### Analyse label structure

In [1]:
# This cell reads the JSON file and prints the first two levels of its structure.
import json

# Hard-coded absolute path to the labels JSON inspected by this cell; it must be
# adjusted when the notebook is run from a different directory layout.
file_path = "/workspaces/VulHunter/VulHunter/input2/dataset_1_vul_two_one_names_labels.json"

def print_two_level_structure(data, indent=0):
    """Print the top-level keys of ``data`` plus one level of structure under each.

    Args:
        data: Parsed JSON value. Only a dict is walked; any other type just
            prints a notice and returns.
        indent: Number of four-space indentation levels prefixed to every
            structure line (the non-dict notice is printed unindented).

    For each top-level key the key itself is printed followed by ``:``, then:
        * dict value  -> each sub-key on its own line;
        * list value  -> a ``[list]`` marker, followed by the keys of the
          first element when that element is a dict;
        * other value -> a ``[value]`` marker.

    Returns:
        None. All results are written to stdout.
    """
    if not isinstance(data, dict):
        print("Data is not a dictionary at the top level.")
        return

    indent_str = "    " * indent
    for key, value in data.items():
        # Print the top-level key
        print(f"{indent_str}{key}:")
        if isinstance(value, dict):
            for sub_key in value:
                print(f"{indent_str}    {sub_key}")
        elif isinstance(value, list):
            # Every list is labelled as a list, including an empty one; an
            # empty list used to fall through and be reported as "[value]".
            print(f"{indent_str}    [list]")
            # Only the first element is sampled to hint at the item schema.
            if value and isinstance(value[0], dict):
                for sub_key in value[0]:
                    print(f"{indent_str}        {sub_key}")
        else:
            print(f"{indent_str}    [value]")

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform's locale default.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Show each top-level key together with the names of its immediate subfields.
print_two_level_structure(data)

reentrancy-eth:
    train_labels
    train_names
    test_labels
    test_names
controlled-array-length:
    train_labels
    train_names
    test_labels
    test_names
suicidal:
    train_labels
    train_names
    test_labels
    test_names
controlled-delegatecall:
    train_labels
    train_names
    test_labels
    test_names
arbitrary-send:
    train_labels
    train_names
    test_labels
    test_names
tod:
    train_labels
    train_names
    test_labels
    test_names
uninitialized-state:
    train_labels
    train_names
    test_labels
    test_names
incorrect-equality:
    train_labels
    train_names
    test_labels
    test_names
integer-overflow:
    train_labels
    train_names
    test_labels
    test_names
unchecked-lowlevel:
    train_labels
    train_names
    test_labels
    test_names
tx-origin:
    train_labels
    train_names
    test_labels
    test_names
locked-ether:
    train_labels
    train_names
    test_labels
    test_names
unchecked-send:
    train_label

In [2]:
# This cell analyzes the "reentrancy-eth" subfield of the JSON,
# printing the number of entries in each of its subfields (assuming they are arrays).
import json

file_path = "/workspaces/VulHunter/VulHunter/input2/dataset_1_vul_two_one_names_labels.json"

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform's locale default.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Membership is tested with `in` (not .get) so that a key that is present but
# holds a non-dict value is reported as "not a dictionary" rather than "not found".
if "reentrancy-eth" not in data:
    print('The key "reentrancy-eth" was not found in the JSON data.')
elif not isinstance(data["reentrancy-eth"], dict):
    print('The "reentrancy-eth" subfield is not a dictionary.')
else:
    reentrancy_eth = data["reentrancy-eth"]
    # Report the length of every list-valued subfield; flag anything else.
    for subfield, value in reentrancy_eth.items():
        if isinstance(value, list):
            print(f'Subfield "{subfield}" contains {len(value)} entries.')
        else:
            print(f'Subfield "{subfield}" is not a list.')

Subfield "train_labels" contains 1018 entries.
Subfield "train_names" contains 1018 entries.
Subfield "test_labels" contains 255 entries.
Subfield "test_names" contains 255 entries.


In [3]:
# This cell reads the JSON file, extracts the "reentrancy-eth" subfield,
# and writes it to a new JSON file with the suffix _reentrancy.

import json
import os

input_file = "/workspaces/VulHunter/VulHunter/input2/dataset_1_vul_two_one_names_labels.json"

# Split off the extension instead of using str.replace: replace() would rewrite
# every ".json" occurrence in the path (e.g. inside a directory name), not just
# the trailing extension.
input_root, input_ext = os.path.splitext(input_file)
output_file = f"{input_root}_reentrancy{input_ext}"

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read/write does not depend on the platform's locale default.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

if "reentrancy-eth" in data:
    # Keep the original top-level key so the output has the same shape as the
    # input, just restricted to this one section.
    new_data = {"reentrancy-eth": data["reentrancy-eth"]}
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(new_data, f, indent=4)
    print(f'New JSON with reentrancy-eth subfield saved to: {output_file}')
else:
    print('The key "reentrancy-eth" was not found in the JSON data.')

New JSON with reentrancy-eth subfield saved to: /workspaces/VulHunter/VulHunter/input2/dataset_1_vul_two_one_names_labels_reentrancy.json


In [4]:
# This cell loads two JSON files:
#   keys_file:  holds the filtering keys in the "test_names" and "train_names" arrays
#               under its "reentrancy_eth" or "reentrancy-eth" subfield.
#   input_file: the JSON object to be filtered.
# It keeps only the top-level entries of input_file whose key appears in the union of
# "test_names" and "train_names", and writes the result to a new file with the
# suffix "_reentrancy.json".

import json
import os

# Paths for the JSON files
keys_file = "/workspaces/VulHunter/VulHunter/input2/dataset_1_vul_two_one_names_labels.json"
input_file = "/workspaces/VulHunter/VulHunter/input2/contract_bytecodes_list10.json"

# Split off the extension instead of using str.replace: replace() would rewrite
# every ".json" occurrence in the path (e.g. inside a directory name), not just
# the trailing extension.
input_root, input_ext = os.path.splitext(input_file)
output_file = f"{input_root}_reentrancy{input_ext}"

# Load the keys JSON file. JSON text is UTF-8 by specification, so the encoding
# is passed explicitly rather than left to the platform's locale default.
with open(keys_file, "r", encoding="utf-8") as f:
    keys_data = json.load(f)

# Try to get the filtering keys from "reentrancy_eth" or "reentrancy-eth"
# (underscore spelling is checked first; the first dict-valued match wins).
reentrancy_data = None
if "reentrancy_eth" in keys_data and isinstance(keys_data["reentrancy_eth"], dict):
    reentrancy_data = keys_data["reentrancy_eth"]
elif "reentrancy-eth" in keys_data and isinstance(keys_data["reentrancy-eth"], dict):
    reentrancy_data = keys_data["reentrancy-eth"]

# An empty dict is treated the same as a missing section.
if not reentrancy_data:
    print('No valid "reentrancy_eth" or "reentrancy-eth" subfield found in keys JSON.')
else:
    test_names = reentrancy_data.get("test_names", [])
    train_names = reentrancy_data.get("train_names", [])
    # Merge the two lists into a set for efficient lookup.
    filter_keys = set(test_names + train_names)

    # Load the input JSON file to be filtered
    with open(input_file, "r", encoding="utf-8") as f:
        input_data = json.load(f)

    # Filter the input JSON: keep only keys that are in filter_keys
    filtered_data = {key: value for key, value in input_data.items() if key in filter_keys}

    # Write the filtered data to the output JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, indent=4)

    print(f"Filtered JSON saved to: {output_file}")

Filtered JSON saved to: /workspaces/VulHunter/VulHunter/input2/contract_bytecodes_list10_reentrancy.json
