# Chunking BugsInPy Dataset
## IMPORTANT: You need at least a T4 GPU and 6 GB of free space in your Google Drive
### Overall flow of the notebook:

1. Clone the project
2. Install dependencies
3. Chunk the projects and save the results
4. Embed each chunk and save it


### Importing

In [1]:
from scripts.bugsinpy_utils import *
from utils.run_ast_old import extract_chunks, get_python_files
import os, json


### Get unique files and common files for more efficient chunking

In [3]:

# Chunk every bug of every project at its buggy commit and store the result
# as dataset/<project>/<bug>/code_chunks.json.
projects = get_projects()
for project in projects:
    clone_project(project)
    bugs = get_bugs(project)
    for bug in bugs:
        info = get_bug_info(project, bug)
        # NOTE(review): the recorded output of this cell shows
        # "fatal: reference is not a tree" for these commit ids while the
        # processed file count stays constant, which suggests the checkout
        # fails and the same tree is chunked for every bug. Confirm how
        # checkout_to_commit reports failure and handle it here.
        checkout_to_commit(project, info["buggy_commit_id"])
        python_files = get_python_files(f"tmp/{project}")

        # Build the per-bug output directory once and reuse it for the file path.
        bug_dir = f"dataset/{project}/{bug}"
        os.makedirs(bug_dir, exist_ok=True)
        chunks = extract_chunks(python_files)
        with open(f"{bug_dir}/code_chunks.json", "w", encoding="utf-8") as f:
            json.dump(chunks, f, indent=2)

Cloning into 'ansible'...
fatal: reference is not a tree: 7c65ad11e2914bc9774abd37cdd1ac455f1c9433
Processing files: 100%|██████████| 582/582 [00:05<00:00, 115.47file/s]
fatal: reference is not a tree: d8dc76e134fa458690acbd70f0cb9a009dbb5e29
Processing files: 100%|██████████| 582/582 [00:01<00:00, 296.78file/s]
fatal: reference is not a tree: 5ad77fc7bb529d9733a17c1ef5d24a84b98f50d3
Processing files: 100%|██████████| 582/582 [00:01<00:00, 363.93file/s]
fatal: reference is not a tree: a168e73713f896b75487ce22306490de9ed2b3ce
Processing files: 100%|██████████| 582/582 [00:01<00:00, 368.39file/s]
fatal: reference is not a tree: a1ab093ddbd32f1002cbf6d6f184c7d0041d890d
Processing files: 100%|██████████| 582/582 [00:01<00:00, 331.19file/s]
fatal: reference is not a tree: 22d93d9496e08acac5a0063e641900efe22ef013
Processing files: 100%|██████████| 582/582 [00:01<00:00, 344.64file/s]
fatal: reference is not a tree: cd8ce16d4830782063692d897e57bd0af33ab5db
Processing files: 100%|██████████| 58

### Embedding each chunk
First, we load all the chunks and pick out the unique ones, so that each distinct chunk is embedded only once and no compute time is wasted on duplicates.

In [6]:
from scripts.embedding import BATCH_SIZE, embed
import numpy as np


def _read_chunk_texts(project, bug):
    """Return the "code" string of every chunk stored for one bug.

    Reads dataset/<project>/<bug>/code_chunks.json and closes the file
    before returning, so no handle stays open while the texts are processed.
    """
    with open(
        f"dataset/{project}/{bug}/code_chunks.json", "r", encoding="utf-8"
    ) as f:
        chunks = json.load(f)
    return [chunk["code"] for chunk in chunks]


projects = get_projects()
embedding_cache = {}  # chunk text -> its embedding (filled after embed())
texts_to_embed = []  # unique chunk texts, in first-seen order
text_to_indices = {}  # chunk text -> (project, bug) of every occurrence

# First pass: collect all unique texts and track their usage.
# embedding_cache is still empty at this point, so membership in
# text_to_indices is the only deduplication check that is needed.
for project in projects:
    bugs = get_bugs(project)
    for bug in bugs:
        for text in _read_chunk_texts(project, bug):
            if text not in text_to_indices:
                text_to_indices[text] = []
                texts_to_embed.append(text)
            text_to_indices[text].append((project, bug))

# Second pass: Embed all unique texts in batch
print(f"Embedding {len(texts_to_embed)} unique chunks...")
all_embeddings = embed(texts_to_embed, batch_size=BATCH_SIZE, show_progress_bar=True)

# Populate cache: the i-th embedding belongs to the i-th unique text.
for i, text in enumerate(texts_to_embed):
    embedding_cache[text] = all_embeddings[i]

# Final pass: write per-bug embedding files, one row per chunk in the same
# order as code_chunks.json. The bug directory already exists because
# code_chunks.json was just read from it.
for project in projects:
    bugs = get_bugs(project)
    for bug in bugs:
        texts = _read_chunk_texts(project, bug)
        embeddings = np.array([embedding_cache[text] for text in texts])
        np.save(f"dataset/{project}/{bug}/embedding.npy", embeddings)


Batches:   0%|          | 0/70 [00:00<?, ?it/s]