In [1]:
from scripts.bugsinpy_utils import *
from run_ast_old import extract_chunks, get_python_files, hash_file
import numpy
import os, json
from collections import defaultdict


### Extract code chunks for every bug's buggy commit (TODO: reuse unique/common files across bugs for more efficient chunking)

In [2]:
# For every bug of every project (except "ansible"), check out the buggy
# commit, chunk the Python files found under tmp/<project>, and store the
# chunks as dataset/<project>/<bug>/code_chunks.json.
projects = get_projects()
projects.remove("ansible")

for project in projects:
    clone_project(project)
    bugs = get_bugs(project)

    for bug in bugs:
        info = get_bug_info(project, bug)
        bug_id = f"{project}:{bug}"

        # Move the working tree to the buggy revision before listing its files.
        checkout_to_commit(project, info["buggy_commit_id"])
        python_files = get_python_files(f"tmp/{project}")

        # One output directory per bug; created on demand.
        out_dir = f"dataset/{project}/{bug}"
        os.makedirs(out_dir, exist_ok=True)

        chunks = extract_chunks(python_files)
        with open(f"{out_dir}/code_chunks.json", "w", encoding="utf-8") as out_file:
            json.dump(chunks, out_file, indent=2)

Previous HEAD position was fb9e72309a Merge pull request #17462 from anntzer/gtk_render_figure
HEAD is now at cdf9e30e4f Merge pull request #14610 from meeseeksmachine/auto-backport-of-pr-14579-on-v3.1.x
Processing files: 100%|██████████| 768/768 [00:01<00:00, 394.24file/s]
Previous HEAD position was cdf9e30e4f Merge pull request #14610 from meeseeksmachine/auto-backport-of-pr-14579-on-v3.1.x
HEAD is now at dbc35a9d62 Merge pull request #16683 from anntzer/gluespec
Processing files: 100%|██████████| 761/761 [00:01<00:00, 405.05file/s]
Previous HEAD position was dbc35a9d62 Merge pull request #16683 from anntzer/gluespec
HEAD is now at e92685a264 Merge pull request #15254 from anntzer/propagate-decorators
Processing files: 100%|██████████| 754/754 [00:01<00:00, 439.99file/s]
Previous HEAD position was e92685a264 Merge pull request #15254 from anntzer/propagate-decorators
HEAD is now at fb9e72309a Merge pull request #17462 from anntzer/gtk_render_figure
Processing files: 100%|██████████| 

In [None]:
### TODO: save the file hashes for later use (not implemented yet)

In [6]:
# The notebook's import cell only has `import numpy` (no alias); this cell
# uses the conventional `np` name, so bind it explicitly here.
import numpy as np

from scripts.embedding import BATCH_SIZE, embed


def _load_chunk_texts(project, bug):
    """Return the "code" field of every chunk stored for one bug.

    Reads dataset/<project>/<bug>/code_chunks.json. Returns a list of strings
    in file order, or None when that file does not exist.
    """
    chunk_path = f"dataset/{project}/{bug}/code_chunks.json"
    if not os.path.exists(chunk_path):
        return None
    with open(chunk_path, "r", encoding="utf-8") as f:
        return [chunk["code"] for chunk in json.load(f)]


projects = get_projects()
embedding_cache = {}   # chunk text -> embedding vector
texts_to_embed = []    # unique chunk texts, in first-seen order
text_to_indices = {}   # chunk text -> list of (project, bug) occurrences
bugs_with_chunks = []  # (project, bug) pairs that have a chunk file
bugs_without_chunks = []

# First pass: collect all unique texts and track their usage.
for project in projects:
    for bug in get_bugs(project):
        texts = _load_chunk_texts(project, bug)
        if texts is None:
            # The chunking cell excludes some projects (e.g. "ansible"), so a
            # bug may have no code_chunks.json; skip it instead of crashing.
            bugs_without_chunks.append(f"{project}:{bug}")
            continue

        bugs_with_chunks.append((project, bug))
        for text in texts:
            if text not in text_to_indices:
                text_to_indices[text] = []
                texts_to_embed.append(text)
            text_to_indices[text].append((project, bug))

if bugs_without_chunks:
    print(
        f"Skipping {len(bugs_without_chunks)} bugs without code_chunks.json "
        f"(first few: {bugs_without_chunks[:5]})"
    )

# Second pass: embed all unique texts in batch.
print(f"Embedding {len(texts_to_embed)} unique chunks...")
all_embeddings = embed(texts_to_embed, batch_size=BATCH_SIZE, show_progress_bar=True)

# Populate cache: all_embeddings[i] is the embedding of texts_to_embed[i].
for i, text in enumerate(texts_to_embed):
    embedding_cache[text] = all_embeddings[i]

# Final pass: write per-bug embedding files. Row k of embedding.npy belongs to
# chunk k of that bug's code_chunks.json. The chunk file is re-read here so
# only one bug's chunk list is held in memory at a time.
for project, bug in bugs_with_chunks:
    texts = _load_chunk_texts(project, bug)
    embeddings = np.array([embedding_cache[text] for text in texts])
    np.save(f"dataset/{project}/{bug}/embedding.npy", embeddings)


Batches:   0%|          | 0/70 [00:00<?, ?it/s]