In [1]:
import os
# Expose only GPU index 3 to this process unless CUDA_VISIBLE_DEVICES is already
# set in the environment (setdefault leaves an existing value untouched).
# NOTE(review): this sits ahead of the retriever/model imports, presumably so it
# takes effect before anything initialises CUDA — confirm before reordering.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")
from BGERetriever_v3 import BGERetriever
from FlagEmbedding import BGEM3FlagModel


In [2]:
# Load the BGE-M3 embedding model, requesting fp16 weights on the CUDA device
# and normalized embeddings.
model = BGEM3FlagModel(
    "BAAI/bge-m3",
    device="cuda",
    use_fp16=True,
    normalize_embeddings=True,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [14]:
# Build the retriever from the prebuilt FAISS index and its side files.
# NOTE(review): cell 1 defaults CUDA_VISIBLE_DEVICES to "3", so use_all_gpus=True
# may only ever see a single device — confirm that this is intended.
retriever = BGERetriever(
    embedding_model=model,
    # FAISS index and its per-vector metadata (JSONL that carries the chunk text).
    index_path="/local00/student/shakya/bge_m3_faiss.index",
    metadata_path="/local00/student/shakya/chunk_metadata.jsonl",
    # Optional chunk texts; per the original author's note these help the reranker.
    chunk_texts_path="/local00/student/shakya/chunk_texts.json",
    # Example map, unchanged from earlier runs.
    example_map_path="data/openmathinstruct2/example_id_to_data.json",
    # Search settings.
    nprobe=64,
    use_all_gpus=True,
)

Loading FAISS index from /local00/student/shakya/bge_m3_faiss.index
Set IVF nprobe = 64
Index dimension: 1024
Index total vectors: 13972791
Loading chunk metadata from /local00/student/shakya/chunk_metadata.jsonl
Detected JSONL; loaded 13972791 entries
Loaded 13972791 chunk texts from /local00/student/shakya/chunk_texts.json
Loading example map from data/openmathinstruct2/example_id_to_data.json
Loaded 13972791 full examples


In [4]:
# Sanity check: print the retriever's summary (document/example counts, index
# dimension and size, device) to confirm everything loaded consistently.
print(retriever.get_stats())

{'num_documents': 13972791, 'num_examples': 13972791, 'index_dimension': 1024, 'index_total_vectors': 13972791, 'device': 'cuda'}


In [15]:
# Query text for the first retrieve-and-rerank run.
q = "Formula for calculating new value after a percentage increase"

In [16]:
# Retrieve and rerank for the query.
# NOTE(review): presumably top_k is the number of dense candidates fetched and
# top_k_final the number kept after reranking — confirm against BGERetriever.
hits = retriever.search_and_rerank(
    q,
    top_k=200,
    top_k_final=10,
)

Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.42it/s]
Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.74s/it]
Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.76it/s]
Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.03it/s]
Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.54it/s]
Compute Scores: 100%|█████████████████████████████████████████████████

In [20]:
# Print a short preview of each reranked hit: score, chunk id, and the first
# characters of the problem and solution.
for i, r in enumerate(hits, 1):
    print(f"\n=== Reranked {i} ===")
    # Prefer the rerank score and fall back to the dense score. Using .get for
    # the fallback too avoids the eager r["dense_score"] lookup, which raised
    # KeyError whenever that key was absent, even if a rerank score existed.
    print("Score:", r.get("rerank_score", r.get("dense_score")))
    print("Chunk ID:", r["chunk_id"])
    # Fall back from the full example text to the chunk-level fields, and to an
    # empty string so a missing or None field cannot break the slicing below.
    problem = r.get("full_problem") or r.get("problem") or ""
    solution = r.get("full_solution") or r.get("solution_chunk") or ""
    print("Full Problem:", problem[:250])
    # Only append the ellipsis when the solution was actually cut off.
    if len(solution) > 300:
        print("Full Solution:", solution[:300], "...")
    else:
        print("Full Solution:", solution)


=== Reranked 1 ===
Score: 0.7856127023696899
Chunk ID: 7546008_0
Full Problem: The value of a share increased from $120 to $150 in one month. What was the percentage increase in the value of the share?
Full Solution: The formula for percentage increase is $\left( \frac{\text{new value} - \text{old value}}{\text{old value}} \right) \times 100$.

In this case, the old value is $120 and the new value is $150.

Plugging these values into the formula:
\[ \left( \frac{150 - 120}{120} \right) \times 100 = \left( \frac{ ...

=== Reranked 2 ===
Score: 0.7711079120635986
Chunk ID: 9891553_0
Full Problem: The value of a share increased from $120 to $150 in one month. What was the percentage increase in the value of the share?
Full Solution: To find the percentage increase, first find the difference in value:
\[ \text{Increase} = \text{New Value} - \text{Old Value} = 150 - 120 = 30 \]

Then, calculate the percentage increase using the formula:
\[ \text{Percentage Increase} = \left( \frac{\text{In

In [10]:
import json
from pathlib import Path

In [11]:
# Input: the chunk file, one JSON object per line.
CHUNK_JSONL = "./data/openmathinstruct2/openmath_chunks.jsonl"

# Outputs: both derived files are written to the same local directory.
_OUT_DIR = "/local00/student/shakya"
OUT_META_JSONL = f"{_OUT_DIR}/chunk_metadata.jsonl"
OUT_TEXTS_JSON = f"{_OUT_DIR}/chunk_texts.json"

In [12]:
def main(chunk_jsonl=None, out_meta_jsonl=None, out_texts_json=None):
    """Convert the chunk JSONL into a metadata JSONL and a texts JSON array.

    Reads one JSON object per non-blank line of ``chunk_jsonl`` (expected keys:
    ``chunk_id``, ``example_id``, ``text``; missing keys fall back to empty
    values) and writes two files in a single streaming pass:

    * ``out_meta_jsonl``: one JSON object per input row with ``id``,
      ``chunk_id``, ``row_id``, ``text`` and four empty placeholder fields.
    * ``out_texts_json``: one JSON array holding every row's ``text``, in the
      same order as the metadata rows.

    ``id`` is the 0-based position of the row among the non-blank input lines.
    The original author uses it as the FAISS vector id; this assumes the index
    was built from the same rows in the same order (TODO confirm).

    Args:
        chunk_jsonl: Input JSONL path. Defaults to the notebook's ``CHUNK_JSONL``.
        out_meta_jsonl: Metadata output path. Defaults to ``OUT_META_JSONL``.
        out_texts_json: Texts output path. Defaults to ``OUT_TEXTS_JSON``.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        OSError: If any of the three files cannot be opened.

    Note:
        Both outputs are written incrementally, so a failure partway through
        leaves them incomplete.
    """
    # Resolve defaults at call time so the notebook constants may be defined
    # (or redefined) after this function.
    if chunk_jsonl is None:
        chunk_jsonl = CHUNK_JSONL
    if out_meta_jsonl is None:
        out_meta_jsonl = OUT_META_JSONL
    if out_texts_json is None:
        out_texts_json = OUT_TEXTS_JSON

    n = 0
    with open(chunk_jsonl, "r", encoding="utf-8") as fin, \
         open(out_meta_jsonl, "w", encoding="utf-8") as fmeta, \
         open(out_texts_json, "w", encoding="utf-8") as ftxt:
        # Stream the texts array element by element instead of collecting
        # millions of strings in a list; the ", " separator matches what
        # json.dump emits by default, so the file content is the same.
        ftxt.write("[")
        for line in fin:
            if not line.strip():
                # Tolerate blank lines rather than crashing in json.loads.
                continue
            o = json.loads(line)
            # Expect structure produced by the chunker:
            # {"chunk_id": "<eid>_<j>", "example_id": <eid>, "text": "..."}
            chunk_id = o.get("chunk_id", "")
            row_id = o.get("example_id", None)
            text = o.get("text", "")

            meta_obj = {
                # Running row count (not the raw line number) so ids stay
                # contiguous even when blank lines were skipped.
                "id": n,
                "chunk_id": chunk_id,
                "row_id": str(row_id) if row_id is not None else "",
                "text": text,
                # Placeholders that keep the schema consistent.
                "problem": "",
                "solution_chunk": "",
                "expected_answer": "",
                "problem_from": "",
            }
            fmeta.write(json.dumps(meta_obj) + "\n")

            if n:
                ftxt.write(", ")
            ftxt.write(json.dumps(text))
            n += 1
        ftxt.write("]")

    print(f"✅ Wrote {n} metadata rows to {out_meta_jsonl}")
    print(f"✅ Wrote {n} texts to {out_texts_json}")

In [13]:
# Run the conversion: writes the metadata JSONL and the texts JSON, then prints
# how many rows went into each.
main()

✅ Wrote 13972791 metadata rows to /local00/student/shakya/chunk_metadata.jsonl
✅ Wrote 13972791 texts to /local00/student/shakya/chunk_texts.json


In [21]:
# Second query: retrieve and rerank with the same settings as the first run.
new_question = "The Fundamental Theorem of Arithmetic"
hits = retriever.search_and_rerank(
    new_question,
    top_k=200,
    top_k_final=10,
)

Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.70it/s]
Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.91it/s]
Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.74it/s]
Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.11it/s]
Compute Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.18it/s]
Compute Scores: 100%|█████████████████████████████████████████████████

In [22]:
# Print a short preview of each reranked hit: score, chunk id, and the first
# characters of the problem and solution.
for i, r in enumerate(hits, 1):
    print(f"\n=== Reranked {i} ===")
    # Prefer the rerank score and fall back to the dense score. Using .get for
    # the fallback too avoids the eager r["dense_score"] lookup, which raised
    # KeyError whenever that key was absent, even if a rerank score existed.
    print("Score:", r.get("rerank_score", r.get("dense_score")))
    print("Chunk ID:", r["chunk_id"])
    # Fall back from the full example text to the chunk-level fields, and to an
    # empty string so a missing or None field cannot break the slicing below.
    problem = r.get("full_problem") or r.get("problem") or ""
    solution = r.get("full_solution") or r.get("solution_chunk") or ""
    print("Full Problem:", problem[:250])
    # Only append the ellipsis when the solution was actually cut off.
    if len(solution) > 300:
        print("Full Solution:", solution[:300], "...")
    else:
        print("Full Solution:", solution)


=== Reranked 1 ===
Score: 0.5531953573226929
Chunk ID: 7429448_0
Full Problem: Consider a sequence of angles
$$\arctan (\tan \alpha), \ \arctan (\tan 2 \alpha), \ \arctan (\tan 3 \alpha), \ \arctan (\tan t \alpha).$$
Find the least positive value of $t$ such that this sequence forms an arithmetic progression for some $\alpha$ w
Full Solution: To form an arithmetic progression, the difference between consecutive terms must be constant.

For the given sequence of angles, the terms are:
\[ \arctan (\tan \alpha), \ \arctan (\tan 2 \alpha), \ \arctan (\tan 3 \alpha), \ \arctan (\tan t \alpha) \]

The difference between the first and second te ...

=== Reranked 2 ===
Score: 0.5513421893119812
Chunk ID: 2892983_0
Full Problem: Let
\[\begin{array}{ll}
\mbox{$A=\{a_1, a_2, \cdots, a_n\}$}&\mbox{ and}\\
\mbox{$B=\{b_1, b_2, \cdots, b_n\}$}&
\end{array}\]
be two different subsets of the set of natural numbers, such that the arithmetic mean of all elements of $B$ is greater tha
Full Solution: ## 