In [3]:
import os
import json
import hashlib
from collections import defaultdict

DATASET_DIR = "beetlebox_dataset"
CHUNKS_FILENAME = "chunks.json"


def _load_hashed_chunks(chunk_file):
    """Read one chunks.json file and hash every usable chunk in it.

    Args:
        chunk_file: Path to a JSON file expected to hold a list of objects,
            each carrying its text under the "chunk" key.

    Returns:
        A list of ``(sha256_hex_digest, chunk_dict)`` pairs in file order, or
        ``None`` when the file cannot be opened or parsed, or when its top-level
        value is not a list (an error line is printed in those cases). Entries
        that are not objects, or whose "chunk" value is missing, empty, or not
        a string, are skipped.
    """
    try:
        with open(chunk_file, "r", encoding="utf-8") as f:
            chunks = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading {chunk_file}: {e}")
        return None

    if not isinstance(chunks, list):
        print(f"Error reading {chunk_file}: expected a JSON list, got {type(chunks).__name__}")
        return None

    hashed = []
    for chunk in chunks:
        if not isinstance(chunk, dict):
            continue
        code = chunk.get("chunk")
        if not isinstance(code, str) or not code:
            continue
        digest = hashlib.sha256(code.encode("utf-8")).hexdigest()
        hashed.append((digest, chunk))
    return hashed


def _bug_chunk_files(project_path):
    """Return ``(bug_path, chunk_file)`` for every bug directory that has a chunks.json.

    Directory entries are visited in sorted order so that results do not depend
    on the arbitrary order in which ``os.listdir`` returns them.
    """
    found = []
    for bug in sorted(os.listdir(project_path)):
        bug_path = os.path.join(project_path, bug)
        chunk_file = os.path.join(bug_path, CHUNKS_FILENAME)
        if os.path.isdir(bug_path) and os.path.exists(chunk_file):
            found.append((bug_path, chunk_file))
    return found


def deduplicate_project(project_path):
    """Split a project's chunks into project-wide duplicates and per-bug uniques.

    A chunk is considered duplicated when the SHA-256 of its "chunk" text occurs
    more than once across all chunks.json files of the project (repeats inside a
    single file count as well).

    Writes:
        ``<project_path>/duplicated_chunks.json``: one representative chunk (the
            first one encountered) per duplicated hash.
        ``<bug_path>/unique_chunks.json``: for every bug whose chunks.json could
            be read, the chunks whose hash is not duplicated.

    Args:
        project_path: Directory whose sub-directories each may hold a chunks.json.

    Returns:
        The set of duplicated SHA-256 hex digests.
    """
    chunk_files = _bug_chunk_files(project_path)

    # First pass: count occurrences per hash and remember only the first chunk
    # seen for each hash, so memory stays at one chunk per distinct hash.
    occurrences = defaultdict(int)
    representatives = {}
    for _, chunk_file in chunk_files:
        hashed = _load_hashed_chunks(chunk_file)
        if hashed is None:
            continue
        for digest, chunk in hashed:
            occurrences[digest] += 1
            representatives.setdefault(digest, chunk)

    duplicated_hashes = {digest for digest, count in occurrences.items() if count > 1}

    # Dict insertion order keeps the representatives in first-seen order.
    duplicated_chunks = [
        chunk for digest, chunk in representatives.items() if digest in duplicated_hashes
    ]
    with open(os.path.join(project_path, "duplicated_chunks.json"), "w", encoding="utf-8") as f:
        json.dump(duplicated_chunks, f, indent=2, ensure_ascii=False)

    # Second pass: re-read each file (instead of caching everything in memory)
    # and keep only the chunks whose hash is not duplicated.
    for bug_path, chunk_file in chunk_files:
        hashed = _load_hashed_chunks(chunk_file)
        if hashed is None:
            # Unreadable input: do not emit a misleading empty unique_chunks.json.
            continue
        unique_chunks = [chunk for digest, chunk in hashed if digest not in duplicated_hashes]
        with open(os.path.join(bug_path, "unique_chunks.json"), "w", encoding="utf-8") as f:
            json.dump(unique_chunks, f, indent=2, ensure_ascii=False)

    return duplicated_hashes


# Notebook cells execute with __name__ == "__main__", so this still runs when the
# cell is executed, while the helpers above stay importable without side effects.
if __name__ == "__main__":
    projects = sorted(os.listdir(DATASET_DIR))

    for project in projects:
        project_path = os.path.join(DATASET_DIR, project)
        if not os.path.isdir(project_path):
            continue
        deduplicate_project(project_path)
        print(f"finished {project}")


finished cryptomator_cryptomator
finished fastify_fastify
finished onnx_onnx
finished facebook_react
finished liquibase_liquibase
finished ccxt_ccxt
finished SeleniumHQ_selenium
finished mozilla_pdf.js
finished electron_electron
finished ansible_ansible
finished protocolbuffers_protobuf
finished bitcoin_bitcoin
finished google_guava
finished apache_dubbo
finished nats-io_nats-server
finished OpenRefine_OpenRefine
finished langchain-ai_langchain
finished sveltejs_svelte


In [3]:
%%bash

# Bundle every unique_chunks.json and duplicated_chunks.json found under
# DATASET_DIR into a single gzip-compressed tarball.

# Abort on the first failing command so a failed find/tar is never followed by
# the "Archive created" message and a zero exit status.
set -euo pipefail

DATASET_DIR="beetlebox_dataset"
OUTPUT_ARCHIVE="beetlebox.tar.gz"

if [ ! -d "$DATASET_DIR" ]; then
    echo "Dataset directory not found: $DATASET_DIR" >&2
    exit 1
fi

# Keep the file list in a private temp file and remove it on every exit path.
FILE_LIST="$(mktemp)"
trap 'rm -f "$FILE_LIST"' EXIT

echo "Creating archive of unique_chunks.json and duplicated_chunks.json under $DATASET_DIR..."

# NUL-delimited output keeps paths containing spaces or newlines intact.
find "$DATASET_DIR" -type f \( -name 'unique_chunks.json' -o -name 'duplicated_chunks.json' \) -print0 > "$FILE_LIST"

# Check if any files were found
if [ ! -s "$FILE_LIST" ]; then
    echo "No unique_chunks.json or duplicated_chunks.json files found in $DATASET_DIR"
    exit 1
fi

# --null must precede -T so the list is read as NUL-terminated names.
# The dot is escaped so only a literal leading "./" is stripped from member names.
tar --null --transform 's|^\./||' -czf "$OUTPUT_ARCHIVE" -T "$FILE_LIST"

echo "Archive created: $OUTPUT_ARCHIVE"


Creating archive of unique_chunks.json and duplicated_chunks.json under beetlebox_dataset...
Archive created: beetlebox.tar.gz
