### ColBERTv2 Implementation using RAGatouille

>[RAGatouille](https://github.com/bclavie/RAGatouille)


In [None]:
# INDEXER:  https://python.langchain.com/docs/integrations/providers/ragatouille/
# RERANKER: https://python.langchain.com/docs/integrations/retrievers/ragatouille/

In [None]:
# Install the notebook's dependencies: ragatouille (ColBERTv2 wrapper), a pinned
# transformers release, and beir (used further down for the retrieval metrics).
# NOTE(review): ragatouille and beir are installed unpinned; the model-loading
# output below warns that RAGatouille 0.0.10 switches backend, so consider pinning.
!pip install -U ragatouille
!pip install transformers==4.49.0 # must be 4.49
!pip install beir

In [None]:
# Mount Drive and load the article CSV into the dataframe `df`

# 1. Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Find the path to your CSV file
# base_folder must keep its trailing slash: this cell and later cells build
# paths from it with plain string concatenation (base_folder + "...").
base_folder = '/content/drive/MyDrive/colab_notebooks/RAG/'
file_path = base_folder + 'articles_big.csv'


# 3. Load the CSV file with pandas
import pandas as pd
import os

# The code below reads the 'Index', 'Description' and 'Date published' columns of this frame.
df = pd.read_csv(file_path)

# Function to convert dataframe row to Langchain Document
from langchain.docstore.document import Document
import sentence_transformers

def row_to_document(row):
    """Convert one dataframe row into a LangChain ``Document``.

    Parameters:
        row: Mapping-like row (e.g. a pandas Series) that provides the
            'Index', 'Description' and 'Date published' fields.

    Returns:
        Document whose ``id`` is the stringified 'Index', whose
        ``page_content`` is the 'Description' text (an empty string when the
        description is missing) and whose metadata carries 'Index' and
        'Date published' unchanged.
    """
    # Named doc_id so the builtin id() is not shadowed.
    doc_id = row["Index"]
    page_content = row['Description'] # instead of 'Article text'
    if pd.isna(page_content):
        print("no article text", row["Index"], row["Description"])
        # A missing description is NaN (a float), which is not usable as
        # document text; fall back to an empty string so the Document is valid.
        page_content = ""
    metadata = {
        'Index': row['Index'],
        'Date published': row['Date published'],
    }
    # Document ids are looked up as strings later on, so store a string here.
    return Document(id=str(doc_id), page_content=page_content, metadata=metadata)

# Convert the dataframe to a list of Langchain Document objects
# (row_to_document is applied once per row because axis=1).
df_documents = df.apply(row_to_document, axis=1).tolist()
# `docs` is a second name for the same list; the indexing cell reads `docs`,
# while get_document_by_id reads `df_documents`.
docs = df_documents
print("Number of documents:", len(docs))

Mounted at /content/drive
Number of documents: 37949


In [None]:
from ragatouille import RAGPretrainedModel

# Load the pretrained "colbert-ir/colbertv2.0" checkpoint into the global `RAG`
# object that the indexing cell below uses.
# NOTE(review): judging by the recorded output this downloads the weights from
# the Hugging Face Hub on first use — confirm if offline use is required.
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


artifact.metadata: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler()


In [None]:
# cannot be run locally on private computer on windows

import os
import shutil

# Single source of truth for the index name, so the stale-index cleanup and
# RAG.index() below always refer to the same directory.
index_name = "faiss_index"
index_dir = f".ragatouille/colbert/indexes/{index_name}"

if os.path.exists(index_dir):
    print("Removing existing index...")
    # Remove the existing index if it exists
    shutil.rmtree(index_dir, ignore_errors=True)


batch_size = 32
# RAGatouille only supports raw documents, so we need to convert the Langchain Documents to raw strings
raw_docs = [doc.page_content for doc in docs]
ids = [str(doc.metadata["Index"]) for doc in docs]
metadatas = [doc.metadata for doc in docs]

RAG.index(
    overwrite_index=True,
    collection=raw_docs,
    document_ids = ids,
    document_metadatas = metadatas,
    index_name=index_name,
    #max_document_length=256, #180 max 512
    split_documents=False,
    bsize = batch_size, # batch size for indexing
    # NOTE(review): this was previously annotated as the default value; confirm
    # against the installed RAGatouille version before relying on that.
    use_faiss = True
)




### Save the Index for later use (indexing takes a long time)

In [None]:
# Define the source and destination paths
source_path = ".ragatouille/colbert/indexes/faiss_index"
destination_path = base_folder + "colbert_indices"

# Use the cp command to copy the directory
!cp -r "{source_path}" "{destination_path}"

print(f"Index copied from {source_path} to {destination_path}")

### Load the previously saved index instead of creating it again

In [None]:
from ragatouille import RAGPretrainedModel

# Folder name of the index that was previously copied to Google Drive under
# <base_folder>/colbert_indices/.
# Alternative: 'ragatouille_index' (PLAID) instead of 'faiss_index' (FAISS).
index_name = 'faiss_index'
RAG = RAGPretrainedModel.from_index(f"{base_folder}colbert_indices/{index_name}")


********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


artifact.metadata: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler()


We can then easily convert the model into a LangChain retriever, passing any keyword arguments we want (such as `k`) when creating it.

In [None]:
# Wrap the loaded ColBERT model as a LangChain retriever; this global
# `retriever` is what the evaluation cells below are called with.
# NOTE(review): k=10 is presumably the number of documents returned per query — confirm.
retriever = RAG.as_langchain_retriever(k=10)

# The rest of the evaluation is essentially the same as in evaluate_retrievers

In [None]:
# combine qrels from multiple topics into a single dictionary
# each topic is its own folder with a qrels.csv and a queries.csv file

from typing import Dict, Any
import csv

# Relative path; each topic subfolder is expected to hold a queries.csv and a
# qrels.csv file (those are the names combine_topics opens).
topics_folder = "topics"  # Folder containing topic subfolders with qrels and queries

def get_topic_folders(topics_folder):
    """Return the paths of all sub-directories of ``topics_folder``.

    Parameters:
        topics_folder: Directory whose immediate sub-directories are the topics.

    Returns:
        List of ``os.path.join(topics_folder, name)`` paths, one per
        sub-directory, sorted by name. Plain files are ignored.
    """
    import os

    # os.listdir returns entries in arbitrary order; sort so that the order of
    # topics (and therefore of the combined queries) is reproducible.
    candidate_paths = (
        os.path.join(topics_folder, name) for name in sorted(os.listdir(topics_folder))
    )
    return [path for path in candidate_paths if os.path.isdir(path)]

def load_queries(queries_csv):
    """Read a queries CSV into a ``{query_id: query}`` dictionary.

    The file must have a header row containing the 'query_id' and 'query'
    columns. If a query id occurs more than once, the last row wins.
    """
    with open(queries_csv, newline='') as handle:
        return {record['query_id']: record['query'] for record in csv.DictReader(handle)}

def load_qrels(qrels_csv):
    """Read relevance judgments from a header-less, comma-separated file.

    Each non-blank row must have exactly four fields:
    ``query_id, <ignored>, doc_id, relevance``.

    Parameters:
        qrels_csv: Path to the qrels file.

    Returns:
        Nested dictionary ``{query_id: {doc_id: relevance}}`` where ids are
        strings and relevance is an int.

    Raises:
        ValueError: If a non-blank row does not have four fields or its
            relevance is not an integer.
    """
    qrels = {}
    with open(qrels_csv, newline='') as f:
        for row in csv.reader(f, delimiter=','):
            # Blank lines (e.g. a trailing newline left by an editor) carry no
            # judgment; skip them instead of failing on the unpacking below.
            if not any(field.strip() for field in row):
                continue
            qid, _, doc_id, relevance = row
            qrels.setdefault(qid, {})[doc_id] = int(relevance)
    return qrels


# combine qrels and queries from multiple topics into a single dictionary
# the qid has to be unique across topics, so we prefix it with the folder name

def combine_topics(topic_folders: list[str]) -> tuple[Dict[str, str], Dict[str, Dict[str, int]]]:
    """Merge the queries and qrels of several topic folders.

    Every folder must contain a ``queries.csv`` and a ``qrels.csv`` file.
    Query ids restart in every topic, so each id is prefixed with the folder
    path (``f"{folder}_{qid}"``) to make it unique. Because the prefix is the
    path exactly as given, the resulting ids depend on how the folder path is
    written (including the path separator).

    Parameters:
        topic_folders: Paths of the topic folders to merge.

    Returns:
        Tuple ``(all_queries, all_qrels)`` where ``all_queries`` maps the
        unique query id to the query text and ``all_qrels`` maps the unique
        query id to ``{doc_id: relevance}``.
    """
    all_queries = {}
    all_qrels = {}
    for folder in topic_folders:
        print(f"Processing topic folder: {folder}")

        # Load queries and qrels for the current topic
        queries = load_queries(f"{folder}/queries.csv")
        qrels = load_qrels(f"{folder}/qrels.csv")

        for qid, query in queries.items():
            all_queries[f"{folder}_{qid}"] = query

        for qid, judged_docs in qrels.items():
            all_qrels.setdefault(f"{folder}_{qid}", {}).update(judged_docs)

    return all_queries, all_qrels

def save_combined_topics(all_queries: Dict[str, str], all_qrels: Dict[str, Dict[str, int]], output_dir: str = "."):
    """Write the combined queries and qrels to CSV files.

    Parameters:
        all_queries: Mapping of query id to query text.
        all_qrels: Mapping of query id to ``{doc_id: relevance}``.
        output_dir: Directory that receives the two files. Defaults to the
            current working directory; it is created if it does not exist.

    Writes:
        ``combined_queries.csv`` with a ``query_id,query`` header row, and
        ``combined_qrels.csv`` without a header, one
        ``query_id,0,doc_id,relevance`` row per judgment.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    # Save combined queries
    queries_csv = os.path.join(output_dir, "combined_queries.csv")
    with open(queries_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['query_id', 'query'])
        writer.writerows([qid, query] for qid, query in all_queries.items())

    # Save combined qrels
    qrels_csv = os.path.join(output_dir, "combined_qrels.csv")
    with open(qrels_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        for qid, judged_docs in all_qrels.items():
            # The constant '0' second column is the one load_qrels ignores.
            writer.writerows([qid, '0', doc_id, relevance] for doc_id, relevance in judged_docs.items())

#topics = get_topic_folders(topics_folder)
#all_queries, all_qrels = combine_topics(topics)
#save_combined_topics(all_queries, all_qrels)

In [None]:
def get_document_by_id(retriever, doc_id):
    """Look up a document by id, using whatever storage the retriever exposes.

    Lookup order:
      * retriever without a ``vectorstore`` attribute: scan the global
        ``df_documents`` list and compare ids as strings;
      * vector store with a ``docstore``: read its ``_dict`` if present,
        otherwise subscript it, trying ``doc_id`` first and ``str(doc_id)``
        second;
      * vector store with ``get_by_ids``: take the first returned document.

    Returns the document, or None (after printing a message) when nothing
    was found.
    """
    found = None
    if hasattr(retriever, "vectorstore"):
        store = retriever.vectorstore
        if hasattr(store, "docstore"):
            backing = store.docstore
            if hasattr(backing, "_dict"):
                # Keys may be stored either as given or as strings
                found = backing._dict.get(doc_id) or backing._dict.get(str(doc_id))
            else:
                for key in (doc_id, str(doc_id)):
                    try:
                        found = backing[key]
                        break
                    except KeyError:
                        continue
        elif hasattr(store, "get_by_ids"):
            matches = store.get_by_ids([doc_id])
            found = matches[0] if matches else None
    else:
        wanted = str(doc_id)
        for candidate in df_documents:  # global list of all documents
            if str(candidate.id) == wanted:
                found = candidate
                break

    if found is not None:
        return found
    print(f"Document with ID {doc_id} not found in the vector store.")
    return None

  # Evaluate the retriever using BEIR's evaluation framework
# based on manually defined queries, qrels and the retriever's results

from beir.retrieval.evaluation import EvaluateRetrieval
from typing import Dict, Any
import csv

print_length = 500  # how many characters of the document to print

def evaluate_langchain_retriever(retriever, queries_csv: str, qrels_csv: str, k_values=[1, 3, 5, 10], score_function=None, verbose=True ) -> Dict[str, Any]:
    """
    Evaluate a LangChain retriever using standard IR metrics (CSV version).

    Parameters:
        retriever: LangChain retriever; queried with ``retriever.invoke(query)``
            when no ``score_function`` is given.
        queries_csv: Path to a CSV with a header and 'query_id', 'query' columns.
        qrels_csv: Path to a header-less CSV with rows
            'query_id', '0', 'doc_id', 'relevance'.
        k_values: List of K values (e.g., [1, 3, 5, 10]). The largest value is
            used as the retrieval depth. The list is only read, never modified.
        score_function: Optional callable ``score_function(query, k=...)`` that
            returns documents, or (document, score) tuples, for a query.
        verbose: When True, print the judged and the retrieved documents for
            every query before the metric summary.

    Returns:
        Dict with the keys 'nDCG', 'Recall', 'Precision' and 'MAP', each holding
        the corresponding metric dictionary returned by the BEIR evaluator.
    """
    # --- Load queries and relevance judgments
    queries = load_queries(queries_csv)
    qrels = load_qrels(qrels_csv)

    # Use the largest cutoff as depth so the result does not depend on the
    # order in which k_values is written.
    max_k = max(k_values)

    # --- Retrieve for each query
    results = {}
    contents = {}
    for qid, query in queries.items():

        if score_function: # Use the provided score function to retrieve documents with scores
            docs = score_function(query, k=max_k)
        else: # Use the default retriever function to get documents
            docs = retriever.invoke(query)

        result_for_q = {}
        content_for_q = {}
        for idx, doc in enumerate(docs):
            if isinstance(doc, tuple):
                doc, score = doc
                # TODO: score must be descending, Euclidean distance is ascending
                # -> score = -score
            else:
                score = len(docs) - idx # no score available, e.g. for BM25Retriever

            doc_id = doc.metadata["Index"]

            # Only skip genuinely missing ids. A plain truthiness test would
            # also drop the document whose Index is 0.
            if doc_id is None or str(doc_id) == "":
                continue

            result_for_q[str(doc_id)] = float(score)
            content_for_q[str(doc_id)] = {
                "page_content": doc.page_content[:print_length],
                "scores": {
                    "combined": doc.metadata.get("combined_score", None),
                    "default": doc.metadata.get("default_score", None),
                    "temporal": doc.metadata.get("temporal_score", None),
                }
            }

        results[qid] = result_for_q
        contents[qid] = content_for_q

    # --- Print queries, qrels and results
    if verbose:
        for qid in queries:
            print(f"\nQuery ID: {qid}, Query: {queries[qid]}")
            qrels_for_q = qrels.get(qid, {})
            print("Qrels for query:")
            for doc_id, relevance in list(qrels_for_q.items())[:max_k]:
                qrel_doc = get_document_by_id(retriever, doc_id)
                if qrel_doc is not None:
                    print(f"  Doc ID: {doc_id}, Relevance: {relevance}  text: {qrel_doc.page_content[:print_length]}...")
                else:
                    print(f"  Doc ID: {doc_id}, Relevance: {relevance}  text: [Document not found]")
            print("Retrieved documents:")
            results_for_q = results.get(qid, {})
            content_for_q = contents.get(qid, {})

            # Show at most max_k retrieved documents
            for doc_id, score in list(results_for_q.items())[:max_k]:
                content = content_for_q[doc_id]
                scores = content.get("scores", {})
                if scores.get("combined") is not None:
                    print(f"  Doc ID: {doc_id}, Score: {score:.4f}, Combined: {scores['combined']:.4f} Default: {scores['default']:.4f}, Temporal: {scores['temporal']:.4f}  text: {content['page_content'][:print_length]}...")
                else:
                    print(f"  Doc ID: {doc_id}, Score: {score:.4f}  text: {content['page_content'][:print_length]}...")


    # --- Evaluate
    evaluator = EvaluateRetrieval(k_values=k_values)
    ndcg, _map, recall, precision = evaluator.evaluate(qrels, results, k_values=k_values)

    # --- Print summary
    for k in k_values:
        print(
            f"Top-{k}:  "
            f"nDCG={ndcg[f'NDCG@{k}']:.4f} | "
            f"Recall={recall[f'Recall@{k}']:.4f} | "
            f"Precision={precision[f'P@{k}']:.4f} | "
            f"MAP={_map[f'MAP@{k}']:.4f}"
        )
    return {"nDCG": ndcg, "Recall": recall, "Precision": precision, "MAP": _map}



In [None]:
# Score the ColBERT retriever on the combined query set stored in base_folder;
# no score function is passed, so documents are ranked by retrieval order.
results = evaluate_langchain_retriever(
    retriever=retriever,
    queries_csv=f"{base_folder}combined_queries.csv",
    qrels_csv=f"{base_folder}combined_qrels.csv",
    k_values=[1, 3, 5, 10],
    score_function=None,
)
print("Results for colbert retriever:", results)

  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
  return torch.cuda.amp.autocast() if self.activated


Query ID: topics\hurricanes_1, Query: Which hurricane was active in Central America on 17 Nov 2020?
Qrels for query:
  Doc ID: 503, Relevance: 1  text: Hurricane Iota slammed into an already storm-ravaged Nicaragua as a Category 4 storm November 17th.  The landfall was just 15 miles south of where Hurricane Eta hit two weeks earlier. Eta left thousands displaced and dozens killed in Central America....
  Doc ID: 11012, Relevance: 1  text: Iota is now a tropical storm, but the Tuesday afternoon downgrade will do nothing to minimize the devastation it brought when it barreled into a Central America still recovering from Eta on Tuesday....
  Doc ID: 11013, Relevance: 0  text: Central America's battle with Hurricane Eta could leave some countries scarred for generations....
  Doc ID: 14482, Relevance: 0  text: Hurricane Delta slammed Mexico and the U.S. Gulf Coast in the same southwestern part of Louisiana, where Hurricane Laura barreled ashore, leaving a trail of destruction six weeks ea

In [None]:
# Debug helper: list the local 'ragatouille_index' directory.
# NOTE(review): the indexing cell writes its index under 'faiss_index', not
# 'ragatouille_index', and the recorded output here is empty ("total 0") —
# confirm this is the directory that was meant to be inspected.
!ls -l .ragatouille/colbert/indexes/ragatouille_index

total 0


In [None]:
def compare_retrievers_on_groups(
    retrievers,
    queries_csv,
    qrels_csv,
    k_values=[1, 3, 5, 10],
    sort_by_difficulty=False,
):
    """
    Compare multiple retrievers on fixed groups of queries and plot a heatmap.

    The queries are taken in file order and assigned to 6 groups of 6 queries
    each. Group ``g`` consists of the queries at positions
    ``2g, 2g+1, 12+2g, 13+2g, 24+2g, 25+2g`` (0-based), i.e. two consecutive
    queries out of each run of 12.
    NOTE(review): this layout is hard-coded for a file with 36 queries,
    presumably 3 topics of 12 queries each — confirm against the query file.

    Args:
        retrievers: Dict mapping retriever names to (retriever, score_function) tuples
        queries_csv: Path to CSV with queries
        qrels_csv: Path to CSV with relevance judgments
        k_values: List of k values to evaluate; the largest one is used as the
            retrieval depth and as the nDCG cutoff shown in the heatmap
        sort_by_difficulty: Accepted for compatibility; currently has no effect

    Returns:
        Dict with
          'group_results': {group: {retriever_name: mean nDCG@k}},
          'group_avg_scores': {group: mean over retrievers},
          'per_retriever': {retriever_name: {'overall', 'per_query', 'retrieved_docs'}}

    Raises:
        IndexError: If the query file holds fewer than 36 queries.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns

    expected_queries = 36  # size of the hard-coded group layout below

    # Load queries and qrels
    queries = load_queries(queries_csv)
    qrels = load_qrels(qrels_csv)

    # Fail before the (slow) retrieval instead of afterwards while grouping.
    if len(queries) < expected_queries:
        raise IndexError(
            f"compare_retrievers_on_groups expects at least {expected_queries} queries, "
            f"got {len(queries)}"
        )

    top_k = max(k_values)  # Use the largest k for retrieval and visualization
    metric = 'nDCG'  # Primary metric

    # Store results for each retriever on each query
    retriever_query_results = {}
    evaluator = EvaluateRetrieval(k_values=k_values)

    # For each retriever, get results for each query
    for retriever_name, (retriever, score_function) in retrievers.items():
        print(f"Getting results for {retriever_name}...")

        # Store query results for this retriever
        query_results = {}
        all_retrieved_docs = {}

        for qid, query in queries.items():

            # Get retrieved documents
            if score_function:
                docs = score_function(query, k=top_k)
            else:
                docs = retriever.invoke(query, k=top_k)

            # Convert to format required by evaluator
            result_for_query = {}
            retrieved_docs_for_query = {}

            for idx, doc in enumerate(docs):
                if isinstance(doc, tuple):
                    doc, score = doc
                else:
                    # No score available: rank-based score, higher is better
                    score = len(docs) - idx

                doc_id = str(doc.metadata["Index"])

                if doc_id:
                    result_for_query[doc_id] = float(score)
                    retrieved_docs_for_query[doc_id] = {
                        "content": doc.page_content[:200],  # Truncate for display
                        "position": idx + 1,
                        "relevance": qrels.get(qid, {}).get(doc_id, 0)  # 0 if not relevant
                    }

            # Store results for this query
            query_results[qid] = result_for_query
            all_retrieved_docs[(retriever_name, qid)] = retrieved_docs_for_query

        # Evaluate retriever on all queries
        ndcg, _map, recall, precision = evaluator.evaluate(qrels, query_results, k_values=k_values)

        # Per-query metrics: evaluate each query on its own
        all_metrics = {}
        for qid in queries:
            # Create qrels and results dictionaries for just this query
            query_qrels = {qid: qrels.get(qid, {})}
            query_result = {qid: query_results.get(qid, {})}

            # Evaluate just this query
            try:
                q_ndcg, q_map, q_recall, q_precision = evaluator.evaluate(
                    query_qrels, query_result, k_values=k_values
                )

                all_metrics[qid] = {
                    'nDCG': q_ndcg,
                    'MAP': q_map,
                    'Recall': q_recall,
                    'Precision': q_precision
                }
            except Exception as e:
                print(f"    Error evaluating query {qid}: {e}")
                all_metrics[qid] = None

        # Store all results for this retriever
        retriever_query_results[retriever_name] = {
            'overall': {
                'nDCG': ndcg,
                'MAP': _map,
                'Recall': recall,
                'Precision': precision
            },
            'per_query': all_metrics,
            'retrieved_docs': all_retrieved_docs
        }

    # Create comparison visualizations
    print("\nGenerating comparison visualizations...")

    query_ids = list(queries.keys())
    retriever_names = list(retrievers.keys())

    # Assign queries to groups (see the docstring for the layout)
    def idfn(pos, grp):
        return query_ids[((pos + grp * 2) - 1) % expected_queries]

    grouped_query_ids = {}
    for i in range(0, 6):
        grouped_query_ids[i] = [idfn(1, i), idfn(2, i), idfn(13, i), idfn(14, i), idfn(25, i), idfn(26, i)]

    print(query_ids)
    print("grouped ids", grouped_query_ids)

    # Mean score per group for every retriever, plus the mean across retrievers
    group_avg_scores = {}
    group_results = {}
    for grp, qids in grouped_query_ids.items():

        group_results[grp] = {}
        for retriever_name in retriever_names:
            scores = []
            for qid in qids:
                per_query = retriever_query_results[retriever_name]['per_query'].get(qid)
                try:
                    score = per_query[metric][f'NDCG@{top_k}']
                except (KeyError, TypeError):
                    # TypeError: the per-query evaluation failed and stored None
                    print("unable to get score for retriever", retriever_name, "for query", qid)
                    continue
                scores.append(score)

            # Calculate average score for this retriever
            if scores:
                avg_score = sum(scores) / len(scores)
            else:
                print("no scores for group", grp, "retriever", retriever_name)
                avg_score = 0
            group_results[grp][retriever_name] = avg_score
            print(scores)
            # Print the already computed average; dividing again here would
            # raise ZeroDivisionError for a group without scores.
            print("group", grp, retriever_name, sum(scores), len(scores), avg_score)

        print("AVG GROUP", grp, sum(group_results[grp].values()), len(group_results[grp]))
        group_avg_scores[grp] = sum(group_results[grp].values()) / len(group_results[grp])

    # Create heatmap data matrix with extra column for average score
    heatmap_data = np.zeros((len(group_avg_scores), len(retriever_names) + 1))
    print(group_results)
    for row_idx, grp in enumerate(group_results):
        print(grp, group_results[grp])
        # Add individual retriever scores
        for j, retriever_name in enumerate(retriever_names):
            heatmap_data[row_idx, j] = group_results[grp].get(retriever_name, 0)

        # Add the average score in the last column
        heatmap_data[row_idx, -1] = group_avg_scores.get(grp, 0)

    # Create heatmap with combined score column
    plt.figure(figsize=(14, 10))

    # Create column labels with retrievers + "Average" for the last column
    col_labels = retriever_names + ["Average"]

    # One row per query group
    row_labels = [f"{grp}" for grp in group_results.keys()]

    # Create the heatmap
    sns.heatmap(
        heatmap_data,
        annot=True,
        fmt=".2f",
        cmap="YlGnBu",
        xticklabels=col_labels,
        yticklabels=row_labels
    )
    # Rows are query groups in group order; nothing is sorted by difficulty.
    plt.title(f'Retriever Performance by Query Group ({metric}@{top_k})')
    plt.tight_layout()
    plt.show()

    return {
        'group_results': group_results,
        'group_avg_scores': group_avg_scores,
        'per_retriever': retriever_query_results,
    }

# Retrievers to compare: display name -> (retriever, optional score function)
retrievers = {"Colbert": (retriever, None)}

kvals = [1, 3, 5, 10]
results = compare_retrievers_on_groups(
    retrievers=retrievers,
    queries_csv=f"{base_folder}combined_queries.csv",
    qrels_csv=f"{base_folder}combined_qrels.csv",
    k_values=kvals,
    sort_by_difficulty=True
)

# compare_retrievers_on_queries is not defined in any cell of this notebook, so
# an unguarded call raises NameError on a fresh kernel. Run the per-query
# comparison only when the function has been made available in the session.
if "compare_retrievers_on_queries" in globals():
    results = compare_retrievers_on_queries(
        retrievers=retrievers,
        queries_csv= base_folder + "combined_queries.csv",
        qrels_csv= base_folder + "combined_qrels.csv",
        k_values=kvals,
        sort_by_difficulty=False
    )
else:
    print("Skipping per-query comparison: compare_retrievers_on_queries is not defined in this notebook.")