In [None]:
!pip install flask flask-cors pyngrok


Collecting flask-cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Downloading flask_cors-5.0.1-py3-none-any.whl (11 kB)
Downloading pyngrok-7.2.8-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok, flask-cors
Successfully installed flask-cors-5.0.1 pyngrok-7.2.8


In [None]:
# Quietly install the retrieval/generation dependencies (FAISS for the vector index,
# huggingface_hub for login, bitsandbytes for 4-bit loading, plus tiktoken and blobfile).
!pip -qq install faiss-cpu huggingface_hub tiktoken blobfile bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m93.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
%%writefile ingest.py

import argparse
import os
import pickle

import faiss
import numpy as np
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer


def load_data(csv_path: str):
    """
    Read the consolidated CSV and return two parallel lists.

    - docs:  the ``Answer`` column as strings (missing values become '')
    - metas: one dict per row with ``doc_id``, ``question``, ``source`` and
      ``url`` taken from the ``Document_ID``, ``Question``,
      ``Document_Source`` and ``Document_URL`` columns
    """
    frame = pd.read_csv(csv_path)

    # text that will later be chunked; NaN is replaced before the str cast
    answers = frame['Answer'].fillna('').astype(str).tolist()

    # per-row metadata, kept in the same order as `answers`
    records = []
    for _, record in frame.iterrows():
        records.append({
            'doc_id': record['Document_ID'],
            'question': record['Question'],
            'source': record['Document_Source'],
            'url': record['Document_URL']
        })

    return answers, records


def chunk_text(text: str, size: int, overlap: int):
    """
    Simple word-based chunking with overlap.

    The text is split on whitespace; consecutive chunks start
    ``size - overlap`` words apart and contain at most ``size`` words.
    Chunking stops with the first chunk that reaches the end of the text,
    so no redundant tail chunks are produced.

    Args:
        text: input text.
        size: maximum number of words per chunk; must be > 0.
        overlap: words shared by consecutive chunks; must satisfy
            ``0 <= overlap < size``.

    Returns:
        List of chunk strings (empty if ``text`` contains no words).

    Raises:
        ValueError: if ``size`` / ``overlap`` would not give a positive stride.
    """
    # Validate up front: a zero stride makes range() fail with an opaque
    # message and a negative stride would silently yield no chunks at all.
    if size <= 0:
        raise ValueError(f"size must be a positive number of words, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    tokens = text.split()
    stride = size - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(' '.join(tokens[start:start + size]))
        # this chunk already covers the tail of the text -> done
        if start + size >= len(tokens):
            break
    return chunks


def embed_chunks(
    chunks: list[str],
    tokenizer: AutoTokenizer,
    model: AutoModel,
    device: torch.device
) -> np.ndarray:
    """
    Tokenize + forward-pass each chunk, mean-pool the last hidden states.

    Args:
        chunks: chunk texts to embed (must not be empty).
        tokenizer: tokenizer matching ``model``.
        model: encoder whose output exposes ``last_hidden_state``.
        device: device the tokenized inputs are moved to (the model is
            expected to already live there).

    Returns:
        An (N × D) array with one mean-pooled embedding per chunk.

    Raises:
        ValueError: if ``chunks`` is empty.
    """
    if not chunks:
        raise ValueError("embed_chunks() received no chunks to embed")

    # `truncation=True` alone is a no-op when the tokenizer carries no usable
    # maximum length (transformers then warns and disables truncation), so an
    # over-long chunk would be fed to the model untruncated. Derive an
    # explicit limit from the tokenizer and/or the model config; absurdly
    # large "unset" sentinel values are ignored.
    candidates = (
        getattr(tokenizer, 'model_max_length', None),
        getattr(getattr(model, 'config', None), 'max_position_embeddings', None),
    )
    limits = [v for v in candidates if isinstance(v, int) and 0 < v < 10**6]
    max_length = min(limits) if limits else None  # None keeps the previous behavior

    model.eval()
    all_embeds = []
    with torch.no_grad():
        for text in chunks:
            tokens = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
                padding='longest'
            ).to(device)

            outputs = model(**tokens)
            last_hidden = outputs.last_hidden_state    # (1, seq_len, D)
            mask = tokens['attention_mask'].unsqueeze(-1)  # (1, seq_len, 1)
            # mean-pool only over non-padded tokens
            summed = (last_hidden * mask).sum(dim=1)      # (1, D)
            counts = mask.sum(dim=1)                      # (1, 1)
            pooled = summed / counts                      # (1, D)
            all_embeds.append(pooled.squeeze(0).cpu().numpy())

    return np.vstack(all_embeds)  # shape (N, D)


def build_and_save_index(
    embeddings: np.ndarray,
    metas: list[dict],
    out_dir: str
):
    """
    Build a FlatIP FAISS index over ``embeddings`` and persist it together
    with ``metas``.

    The embeddings are L2-normalized first (the array passed in is the one
    handed to ``faiss.normalize_L2``), so inner product equals cosine
    similarity. Writes ``faiss.index`` and ``metadata.pkl`` into ``out_dir``,
    creating the directory if needed.
    """
    # unit-length vectors: IP search == cosine search
    faiss.normalize_L2(embeddings)

    flat_index = faiss.IndexFlatIP(embeddings.shape[1])
    flat_index.add(embeddings)

    os.makedirs(out_dir, exist_ok=True)
    index_path = os.path.join(out_dir, 'faiss.index')
    meta_path = os.path.join(out_dir, 'metadata.pkl')

    faiss.write_index(flat_index, index_path)
    with open(meta_path, 'wb') as handle:
        pickle.dump(metas, handle)


def main(args):
    """
    Run the ingestion pipeline: load the CSV, chunk every document, embed
    the chunks and write the FAISS index + metadata to ``args.out_dir``.

    ``args`` must provide ``csv_path``, ``chunk_size``, ``overlap``,
    ``model`` and ``out_dir``.
    """
    # 1) load documents and their per-row metadata
    documents, doc_metas = load_data(args.csv_path)

    # 2) chunk every document; each chunk inherits its document's metadata
    #    plus its position inside that document
    all_chunks, chunk_metas = [], []
    for body, doc_meta in zip(documents, doc_metas):
        pieces = chunk_text(body, args.chunk_size, args.overlap)
        for position, piece in enumerate(pieces):
            all_chunks.append(piece)
            chunk_metas.append({**doc_meta, 'chunk_id': position})

    # 3) load the embedding model and embed all chunks
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModel.from_pretrained(args.model).to(device)
    embeddings = embed_chunks(all_chunks, tokenizer, model, device)

    # 4) build the FAISS index and persist it with the chunk metadata
    build_and_save_index(embeddings, chunk_metas, args.out_dir)
    print(f"Indexed {len(all_chunks)} chunks. Index + metadata saved to '{args.out_dir}'.")


if __name__ == "__main__":
    # Command-line interface: one positional CSV path plus optional tuning flags.
    cli = argparse.ArgumentParser(
        description="RAG ingestion: CSV → chunks → PUBMEDBERT → FAISS"
    )
    cli.add_argument(
        "csv_path",
        help="path to your consolidated CSV (e.g. all_data.csv)"
    )

    # (flags, keyword options) for every optional argument, in declaration order
    optional_args = [
        (("--out_dir", "-o"),
         dict(default="index_data", help="where to write faiss.index + metadata.pkl")),
        (("--chunk_size", "-c"),
         dict(type=int, default=200, help="max words per chunk")),
        (("--overlap", "-l"),
         dict(type=int, default=50, help="words overlap between consecutive chunks")),
        (("--model", "-m"),
         dict(default="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
              help="HuggingFace model ID for PubMedBERT")),
    ]
    for flags, options in optional_args:
        cli.add_argument(*flags, **options)

    main(cli.parse_args())


Writing ingest.py


In [None]:
# Build the index from all_data.csv; faiss.index and metadata.pkl are written to /outputs.
!python ingest.py all_data.csv --out_dir /outputs

tokenizer_config.json: 100% 28.0/28.0 [00:00<00:00, 229kB/s]
config.json: 100% 385/385 [00:00<00:00, 2.66MB/s]
vocab.txt: 100% 225k/225k [00:00<00:00, 3.49MB/s]
2025-05-14 04:32:55.104899: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747197175.390533    1935 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747197175.466718    1935 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-14 04:32:56.087511: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow w

In [None]:
%%writefile retrieve_generate.py

import argparse
import os
import pickle

import faiss
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

from huggingface_hub import login

def load_data(csv_path: str):
    """
    Load the CSV and return (docs, metas).

    docs holds the ``Answer`` column as strings (missing values become '');
    metas holds one dict per row with ``doc_id``, ``source`` and ``url``.
    """
    table = pd.read_csv(csv_path)
    raw_docs = table['Answer'].fillna('').astype(str).tolist()

    base_metas = []
    for _, entry in table.iterrows():
        base_metas.append({
            'doc_id': entry['Document_ID'],
            'source': entry['Document_Source'],
            'url': entry['Document_URL'],
        })
    return raw_docs, base_metas

def chunk_text(text: str, size: int, overlap: int):
    """
    Word-based chunking with overlap (same scheme as ingest.py).

    Windows of at most ``size`` words start every ``size - overlap`` words;
    iteration ends with the first window that reaches the end of the text.
    """
    words = text.split()
    total = len(words)
    stride = size - overlap

    pieces = []
    for begin in range(0, total, stride):
        window = words[begin:begin + size]
        if len(window) == 0:
            break
        pieces.append(' '.join(window))
        # the window just emitted already covers the final word
        if begin + size >= total:
            break
    return pieces

def embed_query(text: str, tokenizer: AutoTokenizer, model: AutoModel, device: torch.device) -> np.ndarray:
    """
    Embed a single query string and L2-normalize it (for IndexFlatIP / cosine sim).

    The embedding is the attention-mask-weighted mean of the model's last
    hidden states; the returned array has shape (1, D).
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding='longest',
    ).to(device)

    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state          # (1, seq_len, D)
        weights = encoded['attention_mask'].unsqueeze(-1)    # (1, seq_len, 1)
        token_sum = (hidden * weights).sum(dim=1)            # (1, D)
        token_count = weights.sum(dim=1)                     # (1, 1)
        vector = (token_sum / token_count).cpu().numpy()     # (1, D)

    # normalization is applied to `vector` itself; the same array is returned
    faiss.normalize_L2(vector)
    return vector

def main():
    """
    CLI entry point: retrieve the top-k chunks for a query from the FAISS
    index written by ingest.py and generate an answer with the chat LLM.

    Steps: parse arguments, log in to the Hugging Face Hub, load index +
    metadata, re-chunk the CSV to recover chunk texts, embed the query,
    search, build the prompt, generate, and print contexts + answer.

    Raises:
        SystemExit: if the re-chunked texts do not line up with the stored
            index/metadata (e.g. --chunk_size/--overlap differ from ingest).
    """
    parser = argparse.ArgumentParser("RAG Retrieval + Llama-2 Generation")
    parser.add_argument("--csv_path",    "-c", required=True, help="Your consolidated CSV")
    parser.add_argument("--index_dir",   "-i", default="index_data", help="Where ingest.py saved faiss.index + metadata.pkl")
    parser.add_argument("--embed_model", "-e", default="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
                        help="HuggingFace PubMedBERT model for embeddings")
    parser.add_argument("--llm_model",   "-l", default="meta-llama/Llama-2-7b-chat-hf",
                        help="HuggingFace Llama-2 chat model")
    parser.add_argument("--chunk_size",  type=int, default=200, help="Must match ingest.py")
    parser.add_argument("--overlap",     type=int, default=50,  help="Must match ingest.py")
    parser.add_argument("--top_k",       type=int, default=5,   help="How many chunks to retrieve")
    parser.add_argument("--query",       "-q", required=True,   help="Your question")
    parser.add_argument("--max_new_tokens", type=int, default=512, help="Generation length")
    # Falling back to the environment keeps the token out of shell history / notebooks.
    parser.add_argument("--hf_token", default=os.environ.get("HF_TOKEN"),
                        help="Hugging Face access token (defaults to the HF_TOKEN environment variable)")
    args = parser.parse_args()

    login(token=args.hf_token)

    # 1) Load FAISS index + metadata
    idx = faiss.read_index(os.path.join(args.index_dir, "faiss.index"))
    with open(os.path.join(args.index_dir, "metadata.pkl"), "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only use index dirs you created yourself.
        chunk_metas = pickle.load(f)

    # 2) Re-load & re-chunk docs to recover chunk texts (same order as ingest.py)
    docs, _ = load_data(args.csv_path)
    all_chunks = []
    for doc_text in docs:
        all_chunks.extend(chunk_text(doc_text, args.chunk_size, args.overlap))

    # Index positions are only meaningful if the re-chunking reproduces the
    # ingestion exactly; fail loudly instead of returning the wrong passages.
    if not (len(all_chunks) == len(chunk_metas) == idx.ntotal):
        raise SystemExit(
            f"Chunk mismatch: {len(all_chunks)} re-chunked texts vs "
            f"{len(chunk_metas)} metadata entries vs {idx.ntotal} indexed vectors. "
            "Use the same CSV, --chunk_size and --overlap as ingest.py."
        )

    # 3) Prepare embed model for retrieval
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embed_tokenizer = AutoTokenizer.from_pretrained(args.embed_model)
    embed_model     = AutoModel.from_pretrained(args.embed_model).to(device)

    # 4) Embed the query and search
    q_vec = embed_query(args.query, embed_tokenizer, embed_model, device)
    distances, indices = idx.search(q_vec, args.top_k)

    # 5) Gather retrieved context
    retrieved = []
    for dist, idx_ in zip(distances[0], indices[0]):
        # FAISS pads the result with label -1 when fewer than top_k vectors
        # exist; -1 would otherwise silently select the *last* chunk.
        if idx_ < 0:
            continue
        retrieved.append({
            "score": float(dist),
            "meta":  chunk_metas[idx_],
            "text":  all_chunks[idx_],
        })

    # 6) Build prompt for Llama-2 chat
    context = "\n\n".join(
        f"[{i+1}] Source: {r['meta']['source']} | Text: {r['text']}"
        for i, r in enumerate(retrieved)
    )
    system_prompt = (
        "You are a knowledgeable medical assistant. "
        "Use the following retrieved context to answer the user's question."
    )
    user_prompt = (
        f"{system_prompt}\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {args.query}\n"
        f"Answer:"
    )

    # 7) Load Llama-2 chat model (4-bit quantized) for generation
    bnb_config = BitsAndBytesConfig(load_in_4bit=True)
    llm_tokenizer = AutoTokenizer.from_pretrained(
        args.llm_model, use_fast=True, use_auth_token=True
    )
    llm_model = AutoModelForCausalLM.from_pretrained(
        args.llm_model,
        device_map="auto",
        quantization_config=bnb_config,
    )

    # 8) Tokenize & generate
    inputs = llm_tokenizer(user_prompt, return_tensors="pt").to(llm_model.device)
    output_ids = llm_model.generate(
        **inputs,
        max_new_tokens=args.max_new_tokens,
        temperature=0.2,
        top_p=0.9,
        eos_token_id=llm_tokenizer.eos_token_id,
        pad_token_id=llm_tokenizer.pad_token_id,
    )
    # generate() returns prompt + continuation for decoder-only models;
    # decode only the newly generated tokens so the answer is not the prompt echoed back.
    prompt_len = inputs["input_ids"].shape[1]
    answer = llm_tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True).strip()

    # 9) Print out the answer + options for inspection
    print("\n=== Retrieved Contexts ===")
    for r in retrieved:
        print(f"• (score={r['score']:.4f}) {r['text'][:200]}…\n")
    print("\n=== Answer ===")
    print(answer)

if __name__ == "__main__":
    main()



usage: RAG Retrieval + Llama-2 Generation [-h] --csv_path CSV_PATH
                                          [--index_dir INDEX_DIR]
                                          [--embed_model EMBED_MODEL]
                                          [--llm_model LLM_MODEL]
                                          [--chunk_size CHUNK_SIZE]
                                          [--overlap OVERLAP] [--top_k TOP_K]
                                          --query QUERY
                                          [--max_new_tokens MAX_NEW_TOKENS]
                                          [--hf_token HF_TOKEN]
RAG Retrieval + Llama-2 Generation: error: the following arguments are required: --csv_path/-c, --query/-q


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
def get_final_response(
    query,
    csv_path="all_data.csv",
    index_dir="/outputs",
    embed_model="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    llm_model="meta-llama/Llama-2-7b-chat-hf",
    chunk_size=200,
    overlap=50,
    top_k=5,
    max_new_tokens=512,
    hf_token=None
):
    """
    Answer ``query`` with retrieval-augmented generation.

    Loads the FAISS index + metadata from ``index_dir``, re-chunks
    ``csv_path`` to recover the chunk texts, retrieves the ``top_k`` most
    similar chunks for the query and lets the chat LLM answer from them.

    Args:
        query: the user's question.
        csv_path: CSV that was ingested (needs the ``Answer``,
            ``Document_ID``, ``Document_Source`` and ``Document_URL`` columns).
        index_dir: directory containing ``faiss.index`` and ``metadata.pkl``.
        embed_model: Hugging Face model ID used to embed the query.
        llm_model: Hugging Face model ID of the causal LM used for generation.
        chunk_size, overlap: chunking parameters; must equal those used at
            ingestion time.
        top_k: number of chunks to retrieve.
        max_new_tokens: generation length limit.
        hf_token: Hugging Face access token passed to ``login``.

    Returns:
        The generated answer text (without the prompt).

    Raises:
        ValueError: if the re-chunked texts do not line up with the stored
            index/metadata.
    """
    import os, pickle, faiss, torch
    import pandas as pd
    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig
    from huggingface_hub import login

    def load_data(csv_path):
        df = pd.read_csv(csv_path)
        docs = df['Answer'].fillna('').astype(str).tolist()
        metas = [
            {'doc_id': row['Document_ID'], 'source': row['Document_Source'], 'url': row['Document_URL']}
            for _, row in df.iterrows()
        ]
        return docs, metas

    def chunk_text(text, size, overlap):
        # Must reproduce ingest.py's chunking exactly: stop after the first
        # chunk that reaches the end of the text. Without that stop, extra
        # tail chunks are emitted and every later chunk position shifts, so
        # FAISS ids would point at the wrong texts.
        tokens = text.split()
        chunks = []
        for start in range(0, len(tokens), size - overlap):
            chunk = tokens[start:start + size]
            if not chunk:
                break
            chunks.append(' '.join(chunk))
            if start + size >= len(tokens):
                break
        return chunks

    def embed_query(text, tokenizer, model, device):
        # attention-mask-weighted mean of the last hidden states, L2-normalized
        toks = tokenizer(text, return_tensors='pt', truncation=True, padding='longest').to(device)
        with torch.no_grad():
            out = model(**toks).last_hidden_state
            mask = toks['attention_mask'].unsqueeze(-1)
            pooled = ((out * mask).sum(dim=1) / mask.sum(dim=1)).cpu().numpy()
        faiss.normalize_L2(pooled)
        return pooled

    # ✅ Authenticate
    login(token=hf_token)

    # Load index + data
    idx = faiss.read_index(os.path.join(index_dir, "faiss.index"))
    with open(os.path.join(index_dir, "metadata.pkl"), "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only use index dirs you created yourself.
        chunk_metas = pickle.load(f)

    docs, _ = load_data(csv_path)
    all_chunks = [chunk for doc in docs for chunk in chunk_text(doc, chunk_size, overlap)]

    # Positions returned by FAISS index into all_chunks/chunk_metas, so all
    # three must describe the same chunk sequence.
    if not (len(all_chunks) == len(chunk_metas) == idx.ntotal):
        raise ValueError(
            f"Chunk mismatch: {len(all_chunks)} re-chunked texts vs "
            f"{len(chunk_metas)} metadata entries vs {idx.ntotal} indexed vectors. "
            "Use the same CSV, chunk_size and overlap as at ingestion."
        )

    # Embed
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embed_tokenizer = AutoTokenizer.from_pretrained(embed_model)
    encoder = AutoModel.from_pretrained(embed_model).to(device)
    q_vec = embed_query(query, embed_tokenizer, encoder, device)
    distances, indices = idx.search(q_vec, top_k)

    # Prompt
    retrieved = [
        {"score": float(dist), "meta": chunk_metas[idx_], "text": all_chunks[idx_]}
        for dist, idx_ in zip(distances[0], indices[0])
        # FAISS pads with label -1 when fewer than top_k vectors exist;
        # -1 would otherwise silently select the last chunk.
        if idx_ >= 0
    ]
    context = "\n\n".join(
        f"[{i+1}] Source: {r['meta']['source']} | Text: {r['text']}" for i, r in enumerate(retrieved)
    )
    user_prompt = (
        f"You are a knowledgeable medical assistant.\n\nContext:\n{context}\n\n"
        f"Question: {query}\nAnswer:"
    )

    # Generate
    bnb_config = BitsAndBytesConfig(load_in_4bit=True)
    llm_tokenizer = AutoTokenizer.from_pretrained(llm_model, use_fast=True, use_auth_token=True)
    generator = AutoModelForCausalLM.from_pretrained(
        llm_model,
        device_map="auto",
        quantization_config=bnb_config,
    )
    inputs = llm_tokenizer(user_prompt, return_tensors="pt").to(generator.device)
    output_ids = generator.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.2,
        top_p=0.9,
        eos_token_id=llm_tokenizer.eos_token_id,
        pad_token_id=llm_tokenizer.pad_token_id,
    )
    # generate() returns prompt + continuation for decoder-only models; decode
    # only the new tokens so callers get the answer, not the prompt echoed back.
    prompt_len = inputs["input_ids"].shape[1]
    return llm_tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True).strip()


In [None]:
from pyngrok import ngrok
# Shut down any ngrok process still running from an earlier cell/run before a new tunnel is opened.
ngrok.kill()

In [None]:
# Alternative to ngrok: install localtunnel globally and expose local port 5000.
# `youruniquename` is a placeholder subdomain; the resulting URL is printed by `lt`.
# NOTE(review): `lt` appears to keep running until interrupted (the captured output ends with ^C).
!npm install -g localtunnel
!lt --port 5000 --subdomain youruniquename

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K
added 22 packages in 2s
[1G[0K⠦[1G[0K
[1G[0K⠦[1G[0K3 packages are looking for funding
[1G[0K⠦[1G[0K  run `npm fund` for details
[1G[0K⠦[1G[0Kyour url is: https://youruniquename.loca.lt


INFO:werkzeug:127.0.0.1 - - [11/May/2025 21:41:07] "OPTIONS /api/ask HTTP/1.1" 200 -


📥 Received query: what is cure for cough?


INFO:werkzeug:127.0.0.1 - - [11/May/2025 21:41:07] "[35m[1mPOST /api/ask HTTP/1.1[0m" 500 -


❌ Error: Error in faiss::FileIOReader::FileIOReader(const char*) at /project/faiss/faiss/impl/io.cpp:67: Error: 'f' failed: could not open /content/outputs/faiss.index for reading: No such file or directory


INFO:werkzeug:127.0.0.1 - - [11/May/2025 21:44:55] "OPTIONS /api/ask HTTP/1.1" 200 -


^C


In [None]:
# updated_backend_api.py

import threading
from flask import Flask, request, jsonify, make_response
from flask_cors import CORS
from pyngrok import ngrok, conf


import os

# Secrets must not live in the notebook: read them from the environment
# (set NGROK_AUTHTOKEN and HF_TOKEN before running this cell). A missing
# variable raises KeyError immediately instead of failing later with an
# authentication error. Any token that was previously pasted here should be
# revoked and re-issued.
conf.get_default().auth_token = os.environ["NGROK_AUTHTOKEN"]
HF_TOKEN = os.environ["HF_TOKEN"]


# Flask application that serves the RAG endpoint.
app = Flask(__name__)

# Enable CORS on every route for any origin so a separately hosted frontend can call the API.
# NOTE(review): a wildcard origin together with supports_credentials=True is very permissive —
# confirm this is intended before exposing the tunnel publicly.
CORS(
    app,
    resources={r"/*": {"origins": "*"}},
    supports_credentials=True
)

@app.before_request
def handle_options():
    """
    Answer CORS preflight requests directly.

    For an OPTIONS request an empty response carrying the allow-origin,
    allow-methods and allow-headers headers is returned; for every other
    method nothing is returned, so normal routing continues.
    """
    if request.method != "OPTIONS":
        return None

    preflight = make_response()
    cors_headers = {
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST,OPTIONS",
        "Access-Control-Allow-Headers": "Content-Type,Authorization",
    }
    for header_name, header_value in cors_headers.items():
        preflight.headers[header_name] = header_value
    return preflight

@app.route("/api/ask", methods=["GET", "POST", "OPTIONS"])
def ask():
    """
    RAG question-answering endpoint.

    GET  -> plain-text liveness message.
    POST -> expects a JSON object ``{"query": "<question>"}`` and returns
            ``{"answer": "<text>"}``.

    Returns HTTP 400 when the body is not a JSON object or the query is
    empty, and HTTP 500 with the error text when answer generation fails.
    """
    if request.method == "GET":
        return "✅ Flask is running! POST JSON {query: ...} to get an answer."

    # silent=True yields None for an unparsable body instead of raising, so
    # malformed input is reported as a client error rather than a 500.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"answer": "Error: request body must be a JSON object with a 'query' field"}), 400

    query = str(data.get("query") or "").strip()
    if not query:
        # avoid loading the models just to answer an empty question
        return jsonify({"answer": "Error: 'query' must be a non-empty string"}), 400
    print("📥 Received query:", query)

    try:
        answer = get_final_response(query, hf_token=HF_TOKEN)
        return jsonify({"answer": answer})
    except Exception as e:
        print("❌ Error in get_final_response:", e)
        return jsonify({"answer": f"Error: {e}"}), 500

def run_app():
    """Start the Flask server on all interfaces (0.0.0.0), port 5000, with debug mode off."""
    app.run(host="0.0.0.0", port=5000, debug=False)

if __name__ == "__main__":
    # Stop any existing ngrok process, then open a tunnel to local port 5000.
    ngrok.kill()
    public_url = ngrok.connect(5000)
    print(f"\n🌐 Ngrok URL (use this in your React frontend): {public_url}\n")
    # Flask runs in a daemon thread — presumably so this cell returns and later cells can call the API.
    threading.Thread(target=run_app, daemon=True).start()



🌐 Ngrok URL (use this in your React frontend): NgrokTunnel: "https://e5e0-34-34-110-22.ngrok-free.app" -> "http://localhost:5000"



In [None]:
# Testing url

import requests
# Smoke test: POST a sample question to the Flask server on localhost:5000 and print the JSON reply.
r = requests.post("http://127.0.0.1:5000/api/ask", json={"query": "What is anemia?"})
print(r.json())

📥 Received query: What is anemia?


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
INFO:werkzeug:127.0.0.1 - - [11/May/2025 18:40:09] "[35m[1mPOST /api/ask HTTP/1.1[0m" 500 -


❌ Error: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
{'answer': 'Error: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. '}


# New Section

In [None]:
!python retrieve_generate.py --csv_path /content/all_data.csv --index_dir=/content/outputs --query "What causes L-arginine:glycine amidinotransferase deficiency?" --hf_token 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

In [None]:
#Evaluation
!pip install sacrebleu nltk --quiet

import nltk
nltk.download('punkt')

import pandas as pd
from sacrebleu import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_bleu(csv_path: str, hf_token: str):
    """
    Generate an answer for every question in the CSV and report BLEU
    against the reference answers.

    Prints the corpus-level BLEU (sacrebleu) followed by a smoothed
    sentence-level BLEU (NLTK) per row. A question whose generation fails
    is scored with an empty prediction, except for unexpected ``ValueError``s,
    which are re-raised.

    Args:
        csv_path: CSV with ``Question`` and ``Answer`` columns.
        hf_token: Hugging Face access token forwarded to ``get_final_response``.
    """
    df = pd.read_csv(csv_path)
    # Missing cells are read as float NaN, which has no .split() and is not a
    # valid BLEU segment; coerce to strings the same way ingestion does.
    questions = df['Question'].fillna('').astype(str).tolist()
    references = df['Answer'].fillna('').astype(str).tolist()

    preds = []
    for q in questions:
        try:
            ans = get_final_response(q, hf_token=hf_token)
        except ValueError as e:
            # Only the known "model does not fit on the GPU" failure is tolerated.
            if "Some modules are dispatched on the CPU or the disk" not in str(e):
                raise
            print(f" Skipping query {q!r}: not enough GPU memory for the quantized model")
            ans = ""
        except Exception as e:
            print(f" Unexpected error on query {q!r}: {e}")
            ans = ""
        preds.append(ans)

    bleu = corpus_bleu(preds, [references])
    print(f"\n➡️ Corpus BLEU: {bleu.score:.2f}\n")
    print(bleu)

    print("\n➡️ Sentence-level BLEU:")
    smooth = SmoothingFunction().method4
    for i, (p, r) in enumerate(zip(preds, references), 1):
        score = sentence_bleu([r.split()], p.split(), smoothing_function=smooth)
        print(f"{i:3d}: {score:.3f}")

import os

# The Hugging Face token is read from the environment rather than hardcoded;
# a missing HF_TOKEN raises KeyError before any model is loaded.
evaluate_bleu(
    csv_path="all_data.csv",
    hf_token=os.environ["HF_TOKEN"]
)
