In [17]:
import os
import pandas as pd
import glob 


DATA_DIR = os.path.expanduser("~/Desktop/CBS code/data")
PARENTS_FILE = os.path.join(DATA_DIR, "all_parents.csv")
MATCH_FILE = "trainset_reconstructed.csv"

In [18]:
# %pip install ragatouille

In [19]:
matches = pd.read_csv(MATCH_FILE).rename(columns={"%": "legacy_confidence"})
matches["legacy_confidence"] = pd.to_numeric(matches["legacy_confidence"], errors="coerce")

HIGH_CONF = 0.88
pos = matches[(matches["match"] == 1) & (matches["legacy_confidence"] >= HIGH_CONF)]

row = pos.iloc[0]
parent_id = int(row["parent_id"])
true_child_id = int(row["child_id"])

print("Parent ID:", parent_id)
print("True child ID (for evaluation):", true_child_id)

Parent ID: 674230
True child ID (for evaluation): 674464


In [20]:
parents = pd.read_csv(PARENTS_FILE)
parent_row = parents[parents["id"] == parent_id]

if parent_row.empty:
    raise ValueError("Parent not found in parents file")

# keep parent text compact (fast + works better)
parent_title = str(parent_row.iloc[0].get("title", ""))
parent_content = str(parent_row.iloc[0].get("content", ""))

parent_text = (parent_title + "\n" + parent_content).lower()
print("Parent text length:", len(parent_text))

Parent text length: 2098


In [21]:
all_files = glob.glob(os.path.join(DATA_DIR, "c_*.csv"))
child_files = [f for f in all_files if "_output" not in os.path.basename(f)]

print("Child files:", len(child_files))
print(child_files[:5])

Child files: 174190
['/Users/souadlaaziz/Desktop/CBS code/data/c_1298147.csv', '/Users/souadlaaziz/Desktop/CBS code/data/c_689286.csv', '/Users/souadlaaziz/Desktop/CBS code/data/c_932753.csv', '/Users/souadlaaziz/Desktop/CBS code/data/c_1280708.csv', '/Users/souadlaaziz/Desktop/CBS code/data/c_1347257.csv']


In [22]:
MAX_CHARS = 1500

def cut(t):
    return str(t)[:MAX_CHARS]

def read_child_text(fp):
    df = pd.read_csv(fp)

    # prefer title/content if they exist
    if "title" in df.columns and "content" in df.columns:
        t = str(df.iloc[0]["title"]) + "\n" + str(df.iloc[0]["content"])
        return cut(t).lower()

    # fallback: just take a small preview
    return cut(df.head(5).to_string(index=False)).lower()

corpus_ids = []
corpus_texts = []

# START SMALL FOR TESTING (change to None later)
LIMIT = 2000

for i, fp in enumerate(child_files[:LIMIT]):
    child_id = os.path.basename(fp).replace("c_", "").replace(".csv", "")
    corpus_ids.append(child_id)
    corpus_texts.append(read_child_text(fp))

print("Corpus size:", len(corpus_ids))
print("Example id:", corpus_ids[0])
print("Example text length:", len(corpus_texts[0]))

Corpus size: 2000
Example id: 1298147
Example text length: 1011


In [24]:
import typing_extensions as te, sys
print("python:", sys.executable)
print("file:", te.__file__)
print("has Sentinel:", hasattr(te, "Sentinel"))

python: /Users/souadlaaziz/opt/anaconda3/bin/python
file: /Users/souadlaaziz/opt/anaconda3/lib/python3.9/site-packages/typing_extensions.py
has Sentinel: False


In [23]:
from ragatouille import RAGPretrainedModel

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel


ImportError: cannot import name 'Sentinel' from 'typing_extensions' (/Users/souadlaaziz/opt/anaconda3/lib/python3.9/site-packages/typing_extensions.py)

In [None]:
INDEX_NAME = "colbert_children_index"

colbert = RAGPretrainedModel.from_pretrained("jinaai/jina-colbert-v2")

if not os.path.exists(INDEX_NAME):
    colbert.index(
        collection=corpus_texts,
        document_ids=[str(x) for x in corpus_ids],
        index_name=INDEX_NAME,
        max_document_length=256,   # keep manageable on Mac
        overwrite=True
    )
    print("Built new index:", INDEX_NAME)
else:
    colbert = RAGPretrainedModel.from_index(INDEX_NAME)
    print("Loaded existing index:", INDEX_NAME)

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel


In [None]:
TOP_K = 20
results = colbert.search(query=cut(parent_text), k=TOP_K)

# results has document_id + score
results[:5]

In [None]:
out = pd.DataFrame([
    {
        "parent_id": parent_id,
        "child_id": int(r["document_id"]),
        "score": float(r["score"]),
        "rank": i + 1
    }
    for i, r in enumerate(results)
])

out_file = f"model_d_colbert_parent_{parent_id}_top{TOP_K}.csv"
out.to_csv(out_file, index=False)
print("Saved:", out_file)

out.head(10)

In [None]:
ranked_ids = [int(r["document_id"]) for r in results]

print("True child:", true_child_id)
print("In top K?", true_child_id in ranked_ids)

if true_child_id in ranked_ids:
    print("Rank:", ranked_ids.index(true_child_id) + 1)