In [2]:
import os
import datasets
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from lancedb.table import Table
from typing import Optional
from datetime import datetime, timezone
import cohere
from tqdm import tqdm
import json

def get_or_create_lancedb_table(db: Table, table_name: str, embedding_model: str = "embed-english-v3.0"):
    """Create or get existing LanceDB table with mining equipment schema"""
    
    if table_name in db.table_names():
        print(f"Table {table_name} already exists")
        return db.open_table(table_name)

    # Get Cohere embedding function
    func = get_registry().get("cohere").create(
        model=embedding_model,
        api_key=os.environ.get("COHERE_API_KEY")
    )

    class MiningDocumentChunk(LanceModel):
        chunk_id: str
        text: str = func.SourceField()  # Field to be embedded
        vector: Vector(func.ndims()) = func.VectorField()
        equipment_model: str
        equipment_type: str
        document_type: str
        section: Optional[str]
        created_at: datetime = datetime.now(timezone.utc)
    # Create table with our schema
    table = db.create_table(table_name, schema=MiningDocumentChunk, mode="overwrite")

    # Create full-text search index
    table.create_fts_index("text", replace=True)
    print(f"Created new table {table_name} with schema for mining equipment documents")
    
    return table

def ingest_chunks_to_table(table: Table, chunks: list[dict], batch_size: int = 32) -> Table:
    """Ingest document chunks into LanceDB table with batching"""
    
    formatted_chunks = []
    for i in tqdm(range(0, len(chunks), batch_size), desc="Processing chunks"):
        batch = chunks[i:i + batch_size]
        batch_formatted = [
            {
                "chunk_id": f"{chunk['equipment_model']}-{chunk['document_type']}-{id(chunk)}",
                "text": chunk['text'],
                "equipment_model": chunk['equipment_model'],
                "equipment_type": chunk['equipment_type'],
                "document_type": chunk['document_type'],
                "section": chunk.get('section'),
                "created_at": datetime.now(timezone.utc)
            }
            for chunk in batch
        ]
        # Add batch to table
        table.add(batch_formatted)
        formatted_chunks.extend(batch_formatted)

    print(f"{table.count_rows()} chunks ingested into the database")
    return table

In [3]:
# Create LanceDB Instance
db = lancedb.connect("./mining_equipment_db")

# Create tables with different Cohere embedding models
table = get_or_create_lancedb_table(   
    db, "mining_docs_embed_english_v3", "embed-english-v3.0"
)

chunks = json.loads(open("data/technical_manuals.json", "r").read())

# Ingest chunks with batching
table_english = ingest_chunks_to_table(table, chunks)

Table mining_docs_embed_english_v3 already exists


Processing chunks: 100%|██████████| 3/3 [00:01<00:00,  1.72it/s]

361 chunks ingested into the database





In [4]:
# Example search with Cohere embeddings
results = table_english.search("maximum load capacity").limit(5).to_list()
print(results)
print("\nSearch Results:")
for result in results:
    print(f"Equipment: {result['equipment_model']}")
    print(f"Text: {result['text'][:100]}...")
    print(f"Score: {result['_distance']}\n")

[{'chunk_id': 'LH517i-specs-4765981248', 'text': 'Maximum payload capacity varies based on grade conditions. On level ground, the LH517i loader...', 'vector': [0.2471923828125, -0.70263671875, -0.11053466796875, 0.1614990234375, 0.572265625, 0.4033203125, 0.172607421875, -0.366943359375, -0.07391357421875, 0.225830078125, -0.252685546875, -0.296142578125, 0.008026123046875, -0.0499267578125, 0.153564453125, -0.34326171875, 0.611328125, -0.09130859375, -0.426513671875, 0.0308074951171875, -0.25732421875, 0.252197265625, -0.01166534423828125, 0.30615234375, 0.1845703125, 0.25927734375, -0.146728515625, 0.035736083984375, -0.26904296875, 0.1539306640625, 0.03631591796875, -0.3330078125, -0.47607421875, 0.5263671875, -0.51513671875, 0.3291015625, 0.052520751953125, 0.0009412765502929688, -0.1510009765625, -0.11370849609375, -0.08331298828125, 0.414794921875, 0.1126708984375, 0.2286376953125, 0.0738525390625, 0.137939453125, -0.307861328125, -0.053466796875, 0.034576416015625, -0.0651245117

In [5]:
def calculate_mrr(predictions: list[str], gt: list[str]):
    mrr = 0
    for label in gt:
        if label in predictions:
            # Find the relevant item that has the smallest index
            mrr = max(mrr, 1 / (predictions.index(label) + 1))
    return mrr


def calculate_recall(predictions: list[str], gt: list[str]):
    # Calculate the proportion of relevant items that were retrieved
    return len([label for label in gt if label in predictions]) / len(gt)

In [6]:
from braintrust import Score
from lancedb.rerankers import CohereReranker
import lancedb
from lancedb.table import Table
from lancedb.rerankers import Reranker
from typing import Literal, Optional

db = lancedb.connect("./mining_equipment_db")
# Define Our Metrics
metrics = [("recall", calculate_recall), ("mrr", calculate_mrr)]
k = [1, 3, 5, 10, 15, 20, 25, 30, 35, 40]


def retrieve(
    question: str,
    table: Table,
    max_k=25,
    mode: Literal["vector", "fts", "hybrid"] = "vector",
    reranker: Optional[Reranker] = None,
    hooks=None,
):
    results = table.search(question, query_type=mode).limit(max_k)
    if reranker:
        results = results.rerank(reranker=reranker)
    return [
        {"id": result["chunk_id"], "query": result["text"]} for result in results.to_list()
    ]


# Similar to our previous section, we can use the id of each item to compute the recall and MRR metrics.
def evaluate_braintrust(input, output, **kwargs):
    # Debug prints
    print("Predictions:", [item["id"] for item in output])  # First 3 predictions
    print("Ground Truth:", kwargs["metadata"]["chunk_id"])
    
    predictions = [item["id"] for item in output]
    labels = [kwargs["metadata"]["chunk_id"]]

    scores = []
    for metric, score_fn in metrics:
        for subset_k in k:
            score = score_fn(predictions[:subset_k], labels)
            print(f"{metric}@{subset_k}: {score}")  # Debug print
            scores.append(
                Score(
                    name=f"{metric}@{subset_k}",
                    score=score,
                    metadata={"query": input, "result": output, **kwargs["metadata"]},
                )
            )

    return scores

In [7]:
def task(query):
    # Access the query text from the 'input' field
    query_text = query['input']
    
    # Get the expected chunk_id from metadata
    expected_chunk_id = query['metadata']['chunk_id']
    
    # Run your retrieval on the query text
    results = retrieve(query_text, table)
    
    # Get the retrieved chunk IDs
    retrieved_chunk_ids = [r.id for r in results]
    
    # Check if the expected chunk_id is in the retrieved results
    correct = expected_chunk_id in retrieved_chunk_ids
    
    return {
        'correct': correct,
        'retrieved': retrieved_chunk_ids,
        'expected': expected_chunk_id
    }

In [8]:
from braintrust import init_dataset, Eval
from itertools import product
import uuid


# Load subset of evaluation queries
evaluation_queries = [
    item for item in init_dataset(project="industrial_rag", name="Equipment-Questions-V1")
]

# Evaluation configurations
available_rerankers = {
    "rerank-english-v3.0": CohereReranker(
        model_name="rerank-english-v3.0", column="text"
    ),
    "none": None,
}

search_query_modes = ["hybrid", "vector"]

embedding_model_to_table = {
    "mining_docs_embed_english_v3": table,
}

# Run evaluations
evaluation_results = []
experiment_id = str(uuid.uuid4())


for reranker_name, search_mode, embedding_model in product(
    available_rerankers, search_query_modes, embedding_model_to_table
):
    # Get model instances
    current_reranker = available_rerankers[reranker_name]
    current_table = embedding_model_to_table[embedding_model]
    print(current_reranker, current_table)

    # Configure retrieval size
    retrieval_limit = 40 if current_reranker else 20

    # Run evaluation
    benchmark_result = await Eval(
        name="industrial_rag_benchmark",
        experiment_name=f"{experiment_id}-{reranker_name}-{search_mode}-{embedding_model}",
        task=lambda query: retrieve(
            question=query,
            max_k=retrieval_limit,
            table=current_table,
            mode=search_mode,
            reranker=current_reranker,
        ),
        data=evaluation_queries,
        scores=[evaluate_braintrust],
        metadata={
            "embedding_model": embedding_model,
            "reranker": reranker_name,
            "query_mode": search_mode,
            "retrieval_limit": retrieval_limit,
        },
    )

    # Process benchmark results
    performance_scores = benchmark_result.summary.scores
    for metric_name, score_data in performance_scores.items():
        metric_type, top_k = metric_name.split("@")
        evaluation_results.append(
            {
                "metric": metric_type,
                "k": int(top_k),
                "reranker": reranker_name,
                "embedding_model": embedding_model,
                "query_type": search_mode,
                "score": score_data.score,
            }
        )

<lancedb.rerankers.cohere.CohereReranker object at 0x132a04410> LanceTable(connection=LanceDBConnection(/Users/shubham/Desktop/industrial-rag/mining_equipment_db), name="mining_docs_embed_english_v3")


fatal: Not a valid object name HEAD^
Experiment 4dc9386d-0044-46e7-844d-13936a1244d2-rerank-english-v3.0-hybrid-mining_docs_embed_english_v3 is running at https://www.braintrust.dev/app/shubham/p/industrial_rag_benchmark/experiments/4dc9386d-0044-46e7-844d-13936a1244d2-rerank-english-v3.0-hybrid-mining_docs_embed_english_v3
`Eval()` was called from an async context. For better performance, it is recommended to use `await EvalAsync()` instead.
industrial_rag_benchmark [experiment_name=4dc9386d-0044-46e7-844d-13936a1244d2-rerank-english-v3.0-hybrid-mining_docs_embed_english_v3] (data): 81it [00:00, 119795.00it/s]


industrial_rag_benchmark [experiment_name=4dc9386d-0044-46e7-844d-13936a1244d2-rerank-english-v3.0-hybrid-mini…

Predictions: ['MT65-manual-5711663808', 'MT65-manual-5024165888', 'MT65-manual-5219191936', 'MT65-manual-5148335808', 'MT65-manual-4802856768', 'MT65-specs-5035165376', 'MT65-specs-4775725504', 'MT65-specs-5156492416', 'MT65-specs-5219601280', 'MT65-specs-5714011072', 'MT65-manual-4766005248', 'MT65-manual-5698572288', 'MT65-manual-4418971648', 'MT65-manual-4395966400', 'MT65-manual-4384594880', 'MT65-safety-4418966016', 'MT65-safety-4384508480', 'MT65-safety-4395960064', 'MT65-safety-5698830400', 'MT65-safety-4802911872', 'MT42-specs-5156495488', 'MT42-specs-5714007040', 'MT42-specs-5219604352', 'MT42-specs-5035168448', 'MT42-specs-4802904320', 'TH320-operation-5035097728', 'TH320-operation-5711668544', 'TH320-operation-5219599232', 'TH320-operation-5156113408', 'TH320-operation-4802868480', 'TH551i-manual-5698831168', 'TH551i-manual-4802910720', 'TH551i-manual-5144336128', 'TH551i-manual-5219293120', 'TH551i-manual-5034890752', 'TH551i-manual-5034449536', 'TH551i-manual-4395687872', 

fatal: Not a valid object name HEAD^
Experiment 4dc9386d-0044-46e7-844d-13936a1244d2-rerank-english-v3.0-vector-mining_docs_embed_english_v3 is running at https://www.braintrust.dev/app/shubham/p/industrial_rag_benchmark/experiments/4dc9386d-0044-46e7-844d-13936a1244d2-rerank-english-v3.0-vector-mining_docs_embed_english_v3
industrial_rag_benchmark [experiment_name=4dc9386d-0044-46e7-844d-13936a1244d2-rerank-english-v3.0-vector-mining_docs_embed_english_v3] (data): 81it [00:00, 138217.50it/s]


industrial_rag_benchmark [experiment_name=4dc9386d-0044-46e7-844d-13936a1244d2-rerank-english-v3.0-vector-mini…

Predictions: ['MT42-specs-4802904320', 'MT42-specs-5219604352', 'MT42-specs-5156495488', 'MT42-specs-5035168448', 'MT42-specs-5714007040', 'MT42-specs-4384518976', 'MT42-specs-4418895360', 'MT42-specs-4802863168', 'MT42-specs-4401994944', 'MT42-specs-5695531520', 'MT42-specs-5219569152', 'MT42-specs-5156369216', 'MT42-specs-4802911744', 'MT42-specs-5698846144', 'MT42-specs-5035084224', 'MT65-manual-5219191936', 'MT65-manual-5711663808', 'MT65-manual-5148335808', 'MT65-manual-5024165888', 'MT65-manual-4802856768', 'MT65-specs-5156492416', 'MT65-specs-5219601280', 'MT65-specs-5714011072', 'MT65-specs-5035165376', 'MT65-specs-4775725504', 'TH320-operation-5219599232', 'TH320-operation-5711668544', 'TH320-operation-5035097728', 'TH320-operation-4802868480', 'TH320-operation-5156113408', 'TH551i-manual-5034890752', 'TH551i-manual-4802910720', 'TH551i-manual-5144336128', 'TH551i-manual-5698831168', 'TH551i-manual-5219293120', 'TH551i-manual-4802861568', 'TH551i-manual-4418893824', 'TH551i-ma

fatal: Not a valid object name HEAD^
Experiment 4dc9386d-0044-46e7-844d-13936a1244d2-none-hybrid-mining_docs_embed_english_v3 is running at https://www.braintrust.dev/app/shubham/p/industrial_rag_benchmark/experiments/4dc9386d-0044-46e7-844d-13936a1244d2-none-hybrid-mining_docs_embed_english_v3
industrial_rag_benchmark [experiment_name=4dc9386d-0044-46e7-844d-13936a1244d2-none-hybrid-mining_docs_embed_english_v3] (data): 81it [00:00, 71238.96it/s]


industrial_rag_benchmark [experiment_name=4dc9386d-0044-46e7-844d-13936a1244d2-none-hybrid-mining_docs_embed_e…

Predictions: ['MT42-specs-4802911744', 'MT42-specs-5698846144', 'MT42-specs-5035084224', 'MT42-specs-5156369216', 'MT42-specs-5219569152', 'TH551i-safety-4775724800', 'TH551i-safety-5156491392', 'TH551i-safety-5219600256', 'TH551i-safety-5711660928', 'TH551i-safety-5035164352', 'MT42-specs-5695531520', 'MT42-specs-4802863168', 'MT42-specs-4401994944', 'MT42-specs-4384518976', 'MT42-specs-4418895360', 'TH663i-maintenance-4773103808', 'TH663i-maintenance-4773104704', 'TH663i-maintenance-4775733824', 'TH545i-safety-4395967232', 'TH545i-safety-5221977600']
Ground Truth: MT42-specs-4833102464
recall@1: 0.0
recall@3: 0.0
recall@5: 0.0
recall@10: 0.0
recall@15: 0.0
recall@20: 0.0
recall@25: 0.0
recall@30: 0.0
recall@35: 0.0
recall@40: 0.0
mrr@1: 0
mrr@3: 0
mrr@5: 0
mrr@10: 0
mrr@15: 0
mrr@20: 0
mrr@25: 0
mrr@30: 0
mrr@35: 0
mrr@40: 0
Predictions: ['MT42-specs-5156495488', 'MT42-specs-5035168448', 'MT42-specs-5219604352', 'MT42-specs-5714007040', 'MT42-specs-4802904320', 'MT65-manual-514833580

fatal: Not a valid object name HEAD^
Experiment 4dc9386d-0044-46e7-844d-13936a1244d2-none-vector-mining_docs_embed_english_v3 is running at https://www.braintrust.dev/app/shubham/p/industrial_rag_benchmark/experiments/4dc9386d-0044-46e7-844d-13936a1244d2-none-vector-mining_docs_embed_english_v3
industrial_rag_benchmark [experiment_name=4dc9386d-0044-46e7-844d-13936a1244d2-none-vector-mining_docs_embed_english_v3] (data): 81it [00:00, 152897.67it/s]


industrial_rag_benchmark [experiment_name=4dc9386d-0044-46e7-844d-13936a1244d2-none-vector-mining_docs_embed_e…

Predictions: ['MT42-specs-4802911744', 'MT42-specs-5035084224', 'MT42-specs-5698846144', 'MT42-specs-5156369216', 'MT42-specs-5219569152', 'TH551i-safety-5156491392', 'TH551i-safety-5711660928', 'TH551i-safety-5219600256', 'TH551i-safety-5035164352', 'TH551i-safety-4775724800', 'MT42-specs-4384518976', 'MT42-specs-4418895360', 'MT42-specs-4401994944', 'MT42-specs-5695531520', 'MT42-specs-4802863168', 'TH663i-maintenance-4773103808', 'TH663i-maintenance-4775733824', 'TH663i-maintenance-4773104704', 'TH545i-safety-4418888128', 'TH545i-safety-4395967232']
Ground Truth: MT42-specs-4833102464
recall@1: 0.0
recall@3: 0.0
recall@5: 0.0
recall@10: 0.0
recall@15: 0.0
recall@20: 0.0
recall@25: 0.0
recall@30: 0.0
recall@35: 0.0
recall@40: 0.0
mrr@1: 0
mrr@3: 0
mrr@5: 0
mrr@10: 0
mrr@15: 0
mrr@20: 0
mrr@25: 0
mrr@30: 0
mrr@35: 0
mrr@40: 0
Predictions: ['MT42-specs-5156495488', 'MT42-specs-5035168448', 'MT42-specs-5219604352', 'MT42-specs-5714007040', 'MT42-specs-4802904320', 'MT65-manual-514833580

In [9]:
for i in product(
    available_rerankers, search_query_modes, embedding_model_to_table
):
    print(i)

('rerank-english-v3.0', 'hybrid', 'mining_docs_embed_english_v3')
('rerank-english-v3.0', 'vector', 'mining_docs_embed_english_v3')
('none', 'hybrid', 'mining_docs_embed_english_v3')
('none', 'vector', 'mining_docs_embed_english_v3')


In [10]:
result

{'chunk_id': 'TH320-operation-4802868480',
 'text': 'The TH320 truck’s load-sensing hydraulics provide improved efficiency by optimizing power delivery based on load requirements.',
 'vector': [0.45703125,
  -0.1614990234375,
  0.01375579833984375,
  -0.2347412109375,
  0.08624267578125,
  0.47705078125,
  0.299072265625,
  -0.274658203125,
  0.1171875,
  0.310546875,
  -0.638671875,
  -0.1807861328125,
  0.041717529296875,
  0.441650390625,
  -0.08966064453125,
  -0.1368408203125,
  0.6630859375,
  0.0259552001953125,
  -0.137451171875,
  0.055877685546875,
  0.10577392578125,
  0.1729736328125,
  0.49072265625,
  -0.053192138671875,
  0.1422119140625,
  -0.017364501953125,
  -0.388916015625,
  0.1094970703125,
  0.09417724609375,
  0.320556640625,
  0.136962890625,
  -0.45263671875,
  -0.26220703125,
  0.432861328125,
  -0.88525390625,
  0.383544921875,
  -0.07489013671875,
  0.2120361328125,
  0.01422882080078125,
  0.24755859375,
  -0.34130859375,
  0.435791015625,
  -0.36791992187

In [11]:
# Debug evaluation queries
print("Sample evaluation query:")
print(evaluation_queries[0])

# Check the metadata structure
print("\nMetadata structure:")
for key in evaluation_queries[0].metadata.keys():
    print(f"- {key}")

Sample evaluation query:
{'id': '0299c902-f174-48d8-b83c-6d27225de27a', '_xact_id': '1000194321695049818', 'created': '2024-12-28T08:12:45.220Z', 'project_id': 'c8441880-33fc-49b7-9076-e994aaddb490', 'dataset_id': '9c0a62ae-e3e2-4c8d-b045-316ddca42fe2', 'input': 'How does the braking system of the MT42 truck ensure safe operation, particularly when navigating steep inclines?', 'expected': ['The MT42 truck’s advanced braking system includes wet disc brakes for maximum stopping power, even on steep grades.'], 'metadata': {'chunk': 'The MT42 truck’s advanced braking system includes wet disc brakes for maximum stopping power, even on steep grades.', 'chunk_id': 'MT42-specs-4833102464', 'document_type': 'specs', 'equipment_type': 'truck', 'equipment_model': 'MT42'}, 'tags': None, 'span_id': '03d60427-2068-4f61-9817-68f611f9d948', 'root_span_id': '03d60427-2068-4f61-9817-68f611f9d948', 'is_root': True, 'origin': None}

Metadata structure:


AttributeError: 'dict' object has no attribute 'metadata'

In [20]:
print(evaluation_queries)

[{'id': '0299c902-f174-48d8-b83c-6d27225de27a', '_xact_id': '1000194321695049818', 'created': '2024-12-28T08:12:45.220Z', 'project_id': 'c8441880-33fc-49b7-9076-e994aaddb490', 'dataset_id': '9c0a62ae-e3e2-4c8d-b045-316ddca42fe2', 'input': 'How does the braking system of the MT42 truck ensure safe operation, particularly when navigating steep inclines?', 'expected': ['The MT42 truck’s advanced braking system includes wet disc brakes for maximum stopping power, even on steep grades.'], 'metadata': {'chunk': 'The MT42 truck’s advanced braking system includes wet disc brakes for maximum stopping power, even on steep grades.', 'chunk_id': 'MT42-specs-4833102464', 'document_type': 'specs', 'equipment_type': 'truck', 'equipment_model': 'MT42'}, 'tags': None, 'span_id': '03d60427-2068-4f61-9817-68f611f9d948', 'root_span_id': '03d60427-2068-4f61-9817-68f611f9d948', 'is_root': True, 'origin': None}, {'id': '02b82783-927c-4743-b682-1b4a923e1ea6', '_xact_id': '1000194321695049818', 'created': '202