### Install and Initialize RapidFire AI

In [1]:
try:
    import rapidfireai
    print("‚úÖ rapidfireai already installed")
except ImportError:
    !pip install rapidfireai  # Takes 1 min
    !rapidfireai init --evals # Takes 1 min

Collecting rapidfireai
  Downloading rapidfireai-0.12.8-py3-none-any.whl.metadata (24 kB)
Collecting flask-cors>=6.0.1 (from rapidfireai)
  Downloading flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting waitress>=3.0.2 (from rapidfireai)
  Downloading waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting jq>=1.10.0 (from rapidfireai)
  Downloading jq-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting jedi>=0.16 (from rapidfireai)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting uv>=0.8.14 (from rapidfireai)
  Downloading uv-0.9.21-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfireai-0.12.8-py3-none-any.whl (46.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m46.4/46.4 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading flask_cors-6.0.2-py3-none-

### Intall and intialize mlflow (to plot metrics)

In [2]:
try:
  import mlflow
  print("mlflow is already installed")
except:
  !pip install mlflow

Collecting mlflow
  Downloading mlflow-3.8.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.8.1 (from mlflow)
  Downloading mlflow_skinny-3.8.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.8.1 (from mlflow)
  Downloading mlflow_tracing-3.8.1-py3-none-any.whl.metadata (19 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.0 (from mlflow)
  Downloading huey-2.5.5-py3-none-any.whl.metadata (4.8 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.8.1->mlflow)
  Downloading databricks_sdk-0.76.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m

### Import RapidFire Components

In [3]:
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

from rapidfireai import Experiment
from rapidfireai.evals.automl import List, RFLangChainRagSpec, RFvLLMModelConfig, RFPromptManager, RFGridSearch
import re, json
from typing import List as listtype, Dict, Any

# NB: If you get "AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'" from Colab, just rerun this cell

### Load Dataset, Rename Columns, and Downsample Data

sample_fraction set extremely low (0.1) to avoid Colab disconnect

In [4]:
from datasets import load_dataset
import pandas as pd
import json, random
from pathlib import Path

#FiQA dataset
dataset_dir = Path("/content/tutorial_notebooks/rag-contexteng/datasets")

#Load all the files
fiqa_dataset = load_dataset("json", data_files=str(dataset_dir / "fiqa" / "queries.jsonl"), split="train")
fiqa_dataset = fiqa_dataset.rename_columns({"text": "query", "_id": "query_id"})
qrels = pd.read_csv(str(dataset_dir / "fiqa" / "qrels.tsv"), sep="\t")
qrels = qrels.rename(
    columns={"query-id": "query_id", "corpus-id": "corpus_id", "score": "relevance"}
)

#Downsample queries and corpus JOINTLY
sample_fraction = 0.1
rseed = 1
random.seed(rseed)

# Sample queries
sample_size = int(len(fiqa_dataset) * sample_fraction)
fiqa_dataset = fiqa_dataset.shuffle(seed=rseed).select(range(sample_size))

# Convert query_ids to integers for matching
query_ids = set([int(qid) for qid in fiqa_dataset["query_id"]])

# All the corpus docs should now be pointing to a relevant query
qrels_filtered = qrels[qrels["query_id"].isin(query_ids)]
relevant_corpus_ids = set(qrels_filtered["corpus_id"].tolist())

print(f"Using {len(fiqa_dataset)} queries")
print(f"Found {len(relevant_corpus_ids)} relevant documents for these queries")

# Load corpus and filter to relevant docs
input_file = dataset_dir / "fiqa" / "corpus.jsonl"
output_file = dataset_dir / "fiqa" / "corpus_sampled.jsonl"

with open(input_file, 'r') as f:
    all_corpus = [json.loads(line) for line in f]

# Filter out any irrelevant documents
sampled_corpus = [doc for doc in all_corpus if int(doc["_id"]) in relevant_corpus_ids]

# Write sampled corpus
with open(output_file, 'w') as f:
    for doc in sampled_corpus:
        f.write(json.dumps(doc) + '\n')

print(f"Sampled {len(sampled_corpus)} documents from {len(all_corpus)} total")
print(f"Saved to: {output_file}")
print(f"Filtered qrels to {len(qrels_filtered)} relevance judgments")

# Update qrels to match
qrels = qrels_filtered

Generating train split: 0 examples [00:00, ? examples/s]

Using 664 queries
Found 1721 relevant documents for these queries
Sampled 1721 documents from 57638 total
Saved to: /content/tutorial_notebooks/rag-contexteng/datasets/fiqa/corpus_sampled.jsonl
Filtered qrels to 1721 relevance judgments


### Create Experiment

In [5]:
experiment = Experiment(experiment_name="exp1-fiqa-rag-colab", mode="evals")

Created directory for database at /content/rapidfireai/db
Experiment exp1-fiqa-rag-colab created with Experiment ID: 1 at /content/rapidfireai/rapidfire_experiments/exp1-fiqa-rag-colab
Created directory: /content/rapidfireai/logs/exp1-fiqa-rag-colab
üåê Google Colab detected. Ray dashboard URL: https://8855-gpu-t4-s-11pj5o8h5f5f7-b.us-west1-0.prod.colab.dev
üåê Google Colab detected. Dispatcher URL: https://8851-gpu-t4-s-11pj5o8h5f5f7-b.us-west1-0.prod.colab.dev


### Define Partial Multi-Config Knobs for LangChain part of RAG Pipeline using RapidFire AI Wrapper APIs

Note: encoding algorithm here is gpt2 with chunk size 150 and overlap 20 as well as chunk size 200 with overlao 60

In [6]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Per-Actor batch size for hardware efficiency
batch_size = 50


rag_gpu = RFLangChainRagSpec(
    document_loader=DirectoryLoader(
        path=str(dataset_dir / "fiqa"),
        glob="corpus_sampled.jsonl",
        loader_cls=JSONLoader,
        loader_kwargs={
            "jq_schema": ".",
            "content_key": "text",
            "metadata_func": lambda record, metadata: {
                "corpus_id": int(record.get("_id"))
            },  # store the document id
            "json_lines": True,
            "text_content": False,
        },
        sample_seed=42,
    ),
    # chunking strategies with different chunk sizes (data chunking knob varied)
    text_splitter=List([
            RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                encoding_name="gpt2", chunk_size=150, chunk_overlap=20
            ),
            RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                encoding_name="gpt2", chunk_size=200, chunk_overlap=60
            ),
        ],
    ),
    embedding_cls=HuggingFaceEmbeddings,
    embedding_kwargs={
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "model_kwargs": {"device": "cuda:0"},
        "encode_kwargs": {"normalize_embeddings": True, "batch_size": batch_size},
    },
    vector_store=None,  # uses FAISS by default
    search_type="similarity",
    search_kwargs={"k": 2},
    # 2 reranking strategies with different top-n values (reranking knob varied)
    reranker_cls=CrossEncoderReranker,
    reranker_kwargs={
        "model_name": "cross-encoder/ms-marco-MiniLM-L6-v2",
        "model_kwargs": {"device": "cpu"},
        "top_n": List([1, 2]),
    },
    enable_gpu_search=True,
)

### Define Data Processing and Postprocessing Functions

In [7]:
def sample_preprocess_fn(
    batch: Dict[str, listtype], rag: RFLangChainRagSpec, prompt_manager: RFPromptManager
) -> Dict[str, listtype]:
    """Function to prepare the final inputs given to the generator model"""

    INSTRUCTIONS = "Utilize your financial knowledge, give your answer or opinion to the input question or subject matter."

    # Perform batched retrieval over all queries; returns a list of lists of k documents per query
    all_context = rag.get_context(batch_queries=batch["query"], serialize=False)


    retrieved_documents = [
        [doc.metadata["corpus_id"] for doc in docs] for docs in all_context
    ]


    serialized_context = rag.serialize_documents(all_context)
    batch["query_id"] = [int(query_id) for query_id in batch["query_id"]]
    return {
        "prompts": [
            [
                {"role": "system", "content": INSTRUCTIONS},
                {
                    "role": "user",
                    "content": f"Here is some relevant context:\n{context}. \nNow answer the following question using the context provided earlier:\n{question}",
                },
            ]
            for question, context in zip(batch["query"], serialized_context)
        ],
        "retrieved_documents": retrieved_documents,
        **batch,
    }


def sample_postprocess_fn(batch: Dict[str, listtype]) -> Dict[str, listtype]:
    """Function to postprocess outputs produced by generator model"""
    batch["ground_truth_documents"] = [
        qrels[qrels["query_id"] == query_id]["corpus_id"].tolist()
        for query_id in batch["query_id"]
    ]
    return batch

### Define Custom Eval Metrics Functions
Note: MRR is the focus of the experiment which meausres if the #1 ranked value is the correct retrieved document

In [8]:
import math


def compute_ndcg_at_k(retrieved_docs: set, expected_docs: set, k=5):
    """Utility function to compute NDCG@k"""
    relevance = [1 if doc in expected_docs else 0 for doc in list(retrieved_docs)[:k]]
    dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(relevance))

    # IDCG: perfect ranking limited by min(k, len(expected_docs))
    ideal_length = min(k, len(expected_docs))
    ideal_relevance = [3] * ideal_length + [0] * (k - ideal_length)
    idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ideal_relevance))

    return dcg / idcg if idcg > 0 else 0.0


def compute_rr(retrieved_docs: set, expected_docs: set):
    """Utility function to compute Reciprocal Rank (RR) for a single query"""
    rr = 0
    for i, retrieved_doc in enumerate(retrieved_docs):
        if retrieved_doc in expected_docs:
            rr = 1 / (i + 1)
            break
    return rr


def sample_compute_metrics_fn(batch: Dict[str, listtype]) -> Dict[str, Dict[str, Any]]:
    """Function to compute all eval metrics based on retrievals and/or generations"""

    true_positives, precisions, recalls, f1_scores, ndcgs, rrs = 0, [], [], [], [], []
    total_queries = len(batch["query"])

    for pred, gt in zip(batch["retrieved_documents"], batch["ground_truth_documents"]):
        expected_set = set(gt)
        retrieved_set = set(pred)

        true_positives = len(expected_set.intersection(retrieved_set))
        precision = true_positives / len(retrieved_set) if len(retrieved_set) > 0 else 0
        recall = true_positives / len(expected_set) if len(expected_set) > 0 else 0
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall) > 0
            else 0
        )

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        ndcgs.append(compute_ndcg_at_k(retrieved_set, expected_set, k=5))
        rrs.append(compute_rr(retrieved_set, expected_set))
## below will return the correct metrics
    return {
        "Total": {"value": total_queries},
        "Precision": {"value": sum(precisions) / total_queries},
        "Recall": {"value": sum(recalls) / total_queries},
        "F1 Score": {"value": sum(f1_scores) / total_queries},
        "NDCG@5": {"value": sum(ndcgs) / total_queries},
        "MRR": {"value": sum(rrs) / total_queries},
    }


def sample_accumulate_metrics_fn(
    aggregated_metrics: Dict[str, listtype],
) -> Dict[str, Dict[str, Any]]:
    """Function to accumulate eval metrics across all batches"""

    num_queries_per_batch = [m["value"] for m in aggregated_metrics["Total"]]
    total_queries = sum(num_queries_per_batch)
    algebraic_metrics = ["Precision", "Recall", "F1 Score", "NDCG@5", "MRR"]

    return {
        "Total": {"value": total_queries},
        **{
            metric: {
                "value": sum(
                    m["value"] * queries
                    for m, queries in zip(
                        aggregated_metrics[metric], num_queries_per_batch
                    )
                )
                / total_queries,
                "is_algebraic": True,
                "value_range": (0, 1),
            }
            for metric in algebraic_metrics
        },
    }

### Define Partial Multi-Config Knobs for vLLM Generator part of RAG Pipeline using RapidFire AI Wrapper APIs

 Qwen2.5-0.5B-Instruct (0.5B parameters) is perfect for Colab's memory constraints and feasible for this experiment

 Here also has the configs below which can be varied like max_model_len, use_fpc

In [9]:
vllm_config1 = RFvLLMModelConfig(
    model_config={
        "model": "Qwen/Qwen2.5-0.5B-Instruct",
        "dtype": "half",
        "gpu_memory_utilization": 0.25,
        "tensor_parallel_size": 1,
        "distributed_executor_backend": "mp",
        "enable_chunked_prefill": False,
        "enable_prefix_caching": False,
        "max_model_len": 6000,
        "disable_log_stats": True,  # Disable vLLM progress logging
        "enforce_eager": True,
        "disable_custom_all_reduce": True,
    },
    sampling_params={
        "temperature": 0.8,
        "top_p": 0.95,
        "max_tokens": 128,
    },
    rag=rag_gpu,
    prompt_manager=None,
)

batch_size = 3 # Smaller batch size for generation
config_set = {
    "vllm_config": vllm_config1,  # Only 1 generator, but it represents 4 full configs
    "batch_size": batch_size,
    "preprocess_fn": sample_preprocess_fn,
    "postprocess_fn": sample_postprocess_fn,
    "compute_metrics_fn": sample_compute_metrics_fn,
    "accumulate_metrics_fn": sample_accumulate_metrics_fn,
    "online_strategy_kwargs": {
        "strategy_name": "normal",
        "confidence_level": 0.95,
        "use_fpc": True,
    },
}

### Create Config Group

In [10]:
config_group = RFGridSearch(config_set)

### Display Ray Dashboard

In [11]:
from google.colab import output
output.serve_kernel_port_as_iframe(8855)

<IPython.core.display.Javascript object>

### Run Multi-Config Evals + Launch Interactive Run Controller


RapidFire AI also provides an Interactive Controller panel UI for Colab that lets you manage executing runs dynamically in real-time from the notebook:



In [12]:
# Launch evals of all RAG configs in the config_group with swap granularity of 4 chunks
results = experiment.run_evals(
    config_group=config_group,
    dataset=fiqa_dataset,
    num_actors=1,
    num_shards=4,
    seed=42,
)

=== Preprocessing RAG Sources ===


RAG Source ID,Status,Duration,Details
1,Complete,59.5s,"FAISS, GPU"
2,Complete,59.9s,"FAISS, GPU"



=== Multi-Config Experiment Progress ===


Run ID,Model,Status,Progress,Conf. Interval,search_type,rag_k,top_n,chunk_size,chunk_overlap,sampling_params,model_config,Precision,Recall,F1 Score,NDCG@5,MRR,Throughput,Total,Samples Processed,Processing Time,Samples Per Second,model_name,run_id
1,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,2.0,1.0,150.0,20.0,"{'temperature': 0.8, 'top_p': 0.95, 'max_tokens': 128}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'tensor_parallel_size': 1, 'distributed_executor_backend': 'mp', 'enable_chunked_prefill': False, 'enable_prefix_caching': False, 'max_model_len': 6000, 'disable_log_stats': True, 'enforce_eager': True, 'disable_custom_all_reduce': True}","55.42% [55.42%, 55.42%]","49.03% [49.03%, 49.03%]","47.65% [47.65%, 47.65%]","16.83% [16.83%, 16.83%]","65.51% [65.51%, 65.51%]",0.2/s,664,664,5164.64 seconds,0.13,Qwen/Qwen2.5-0.5B-Instruct,1.0
2,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,2.0,2.0,150.0,20.0,"{'temperature': 0.8, 'top_p': 0.95, 'max_tokens': 128}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'tensor_parallel_size': 1, 'distributed_executor_backend': 'mp', 'enable_chunked_prefill': False, 'enable_prefix_caching': False, 'max_model_len': 6000, 'disable_log_stats': True, 'enforce_eager': True, 'disable_custom_all_reduce': True}","55.42% [55.42%, 55.42%]","49.03% [49.03%, 49.03%]","47.65% [47.65%, 47.65%]","16.83% [16.83%, 16.83%]","65.51% [65.51%, 65.51%]",0.2/s,664,664,4513.67 seconds,0.15,Qwen/Qwen2.5-0.5B-Instruct,2.0
3,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,2.0,1.0,200.0,60.0,"{'temperature': 0.8, 'top_p': 0.95, 'max_tokens': 128}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'tensor_parallel_size': 1, 'distributed_executor_backend': 'mp', 'enable_chunked_prefill': False, 'enable_prefix_caching': False, 'max_model_len': 6000, 'disable_log_stats': True, 'enforce_eager': True, 'disable_custom_all_reduce': True}","54.44% [54.44%, 54.44%]","47.88% [47.88%, 47.88%]","46.68% [46.68%, 46.68%]","16.53% [16.53%, 16.53%]","64.53% [64.53%, 64.53%]",0.2/s,664,664,4162.07 seconds,0.16,Qwen/Qwen2.5-0.5B-Instruct,3.0
4,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,2.0,2.0,200.0,60.0,"{'temperature': 0.8, 'top_p': 0.95, 'max_tokens': 128}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'tensor_parallel_size': 1, 'distributed_executor_backend': 'mp', 'enable_chunked_prefill': False, 'enable_prefix_caching': False, 'max_model_len': 6000, 'disable_log_stats': True, 'enforce_eager': True, 'disable_custom_all_reduce': True}","54.44% [54.44%, 54.44%]","47.88% [47.88%, 47.88%]","46.68% [46.68%, 46.68%]","16.53% [16.53%, 16.53%]","64.53% [64.53%, 64.53%]",0.2/s,664,664,3888.49 seconds,0.17,Qwen/Qwen2.5-0.5B-Instruct,4.0


Here results_df returns the data frame that gives the correct metrics

In [13]:
# Convert results dict to DataFrame
results_df = pd.DataFrame([
    {k: v['value'] if isinstance(v, dict) and 'value' in v else v for k, v in {**metrics_dict, 'run_id': run_id}.items()}
    for run_id, (_, metrics_dict) in results.items()
])

results_df

Unnamed: 0,run_id,model_name,search_type,rag_k,top_n,chunk_size,chunk_overlap,sampling_params,model_config,Samples Processed,Processing Time,Samples Per Second,Total,Precision,Recall,F1 Score,NDCG@5,MRR
0,1,Qwen/Qwen2.5-0.5B-Instruct,similarity,2,1,150,20,"{'temperature': 0.8, 'top_p': 0.95, 'max_token...","{'dtype': 'half', 'gpu_memory_utilization': 0....",664,5164.64 seconds,0.13,664,0.554217,0.490263,0.476464,0.168279,0.65512
1,2,Qwen/Qwen2.5-0.5B-Instruct,similarity,2,2,150,20,"{'temperature': 0.8, 'top_p': 0.95, 'max_token...","{'dtype': 'half', 'gpu_memory_utilization': 0....",664,4513.67 seconds,0.15,664,0.554217,0.490263,0.476464,0.168279,0.65512
2,3,Qwen/Qwen2.5-0.5B-Instruct,similarity,2,1,200,60,"{'temperature': 0.8, 'top_p': 0.95, 'max_token...","{'dtype': 'half', 'gpu_memory_utilization': 0....",664,4162.07 seconds,0.16,664,0.544428,0.478755,0.466758,0.165288,0.645331
3,4,Qwen/Qwen2.5-0.5B-Instruct,similarity,2,2,200,60,"{'temperature': 0.8, 'top_p': 0.95, 'max_token...","{'dtype': 'half', 'gpu_memory_utilization': 0....",664,3888.49 seconds,0.17,664,0.544428,0.478755,0.466758,0.165288,0.645331


This code checks to see if the results printed out the metrics you like

In [14]:
print(results_df)

   run_id                  model_name search_type  rag_k  top_n  chunk_size  \
0       1  Qwen/Qwen2.5-0.5B-Instruct  similarity      2      1         150   
1       2  Qwen/Qwen2.5-0.5B-Instruct  similarity      2      2         150   
2       3  Qwen/Qwen2.5-0.5B-Instruct  similarity      2      1         200   
3       4  Qwen/Qwen2.5-0.5B-Instruct  similarity      2      2         200   

   chunk_overlap                                    sampling_params  \
0             20  {'temperature': 0.8, 'top_p': 0.95, 'max_token...   
1             20  {'temperature': 0.8, 'top_p': 0.95, 'max_token...   
2             60  {'temperature': 0.8, 'top_p': 0.95, 'max_token...   
3             60  {'temperature': 0.8, 'top_p': 0.95, 'max_token...   

                                        model_config  Samples Processed  \
0  {'dtype': 'half', 'gpu_memory_utilization': 0....                664   
1  {'dtype': 'half', 'gpu_memory_utilization': 0....                664   
2  {'dtype': 'half', 'g

This adds plots to the folder in colab that can be viewed

In [15]:
import matplotlib.pyplot as plt
import mlflow

with mlflow.start_run():
    for metric in ["Precision", "Recall", "F1 Score", "NDCG@5", "MRR"]:
        plt.figure()
        plt.plot(results_df["run_id"], results_df[metric], marker="o")
        plt.title(f"{metric} vs run")
        plt.xlabel("Run ID")
        plt.ylabel(metric.upper())
        plt.grid(True)
        fname = f"{metric}_plot.png"
        plt.savefig(fname)
        mlflow.log_artifact(fname)
        plt.close()


2025/12/30 23:18:50 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/30 23:18:50 INFO mlflow.store.db.utils: Updating database tables
2025/12/30 23:18:50 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/30 23:18:50 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/30 23:18:50 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/30 23:18:50 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/30 23:18:50 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/30 23:18:50 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/30 23:18:50 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/30 23:18:50 INFO alembic.runtime.migration: Running 

### End Experiment

In [16]:
from google.colab import output
from IPython.display import display, HTML

display(HTML('''
<button id="continue-btn" style="padding: 10px 20px; font-size: 16px;">Click to End Experiment</button>
'''))

# eval_js blocks until the Promise resolves
output.eval_js('''
new Promise((resolve) => {
    document.getElementById("continue-btn").onclick = () => {
        document.getElementById("continue-btn").disabled = true;
        document.getElementById("continue-btn").innerText = "Continuing...";
        resolve("clicked");
    };
})
''')

# Actually end the experiment after the button is clicked
experiment.end()
print("Done!")

Experiment exp1-fiqa-rag-colab ended
Done!


### View RapidFire AI Log Files

In [17]:
# Get the experiment-specific log file
log_file = experiment.get_log_file_path()

print(f"üìÑ Log File: {log_file}")
print()

if log_file.exists():
    print("=" * 80)
    print(f"Last 30 lines of {log_file.name}:")
    print("=" * 80)
    with open(log_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines[-30:]:
            print(line.rstrip())
else:
    print(f"‚ùå Log file not found: {log_file}")

üìÑ Log File: /content/rapidfireai/logs/exp1-fiqa-rag-colab/rapidfire.log

Last 30 lines of rapidfire.log:
2025-12-30 23:08:30 | QueryProcessingActor-0 | INFO | query_actor.py:169 | [exp1-fiqa-rag-colab:QueryProcessingActor-0] Deserializing FAISS index for this actor...
2025-12-30 23:08:30 | sentence_transformers.SentenceTransformer | INFO | SentenceTransformer.py:227 | Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-12-30 23:08:32 | QueryProcessingActor-0 | INFO | query_actor.py:178 | [exp1-fiqa-rag-colab:QueryProcessingActor-0] Recreated embedding function: HuggingFaceEmbeddings
2025-12-30 23:08:32 | QueryProcessingActor-0 | INFO | query_actor.py:187 | [exp1-fiqa-rag-colab:QueryProcessingActor-0] Created independent FAISS vector store for this actor
2025-12-30 23:08:32 | QueryProcessingActor-0 | INFO | query_actor.py:196 | [exp1-fiqa-rag-colab:QueryProcessingActor-0] Recreated retriever with search_type=similarity
2025-12-30 23:08:32 | QueryProcessin