In [2]:
#!/usr/bin/env python3
"""
RAG-Integrated Language Model Selector
- Runs several language-ID models over multilingual samples
- Collects accuracy & latency
- Builds FAISS vector DB over benchmarks.csv
- Retrieves evidence for model selection
- Uses Ollama (Qwen) to pick best model based on:
    • Highest accuracy
    • Or fastest model meeting target accuracy
"""

import warnings
import logging
import time
import json
import torch
import pandas as pd
import numpy as np
import faiss

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer

# Optional libraries
try:
    import langid
    LANGID_AVAILABLE = True
except:
    LANGID_AVAILABLE = False
    print("langid.py not installed. Install with: pip install langid")

try:
    from ollama import chat
    OLLAMA_AVAILABLE = True
except:
    OLLAMA_AVAILABLE = False
    print("Ollama not installed. RAG-LM selection disabled.")

warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

# Config
BENCHMARK_PATH = "benchmarks_ld.csv"
TOP_K_EVIDENCE = 5

LANGUAGE_MODELS = [
    "papluca/xlm-roberta-base-language-detection",
    "Joshi-Aryan/Fine_Tuned_HF_Language_Identification_Model",
    "langid.py"
]

# Device
DEVICE = (
    torch.device("cuda") if torch.cuda.is_available()
    else torch.device("mps") if torch.backends.mps.is_available()
    else torch.device("cpu")
)
print("Using device:", DEVICE)

# Load multilingual samples
languages = ["en", "fr", "es", "de"]
texts, true_labels = [], []

for lang in languages:
    ds = load_dataset("wiki40b", lang, split="train[:25]")
    texts.extend(ds["text"])
    true_labels.extend([lang] * len(ds))

# Model runners
def run_model_hf(model_name, texts):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(DEVICE)
    model.eval()

    preds = []
    for t in texts:
        enc = tokenizer(t, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
        with torch.no_grad():
            out = model(**enc)
        pred_id = out.logits.argmax(dim=1).item()
        preds.append(model.config.id2label[pred_id].lower()[:2])
    return preds

def run_langid_model(texts):
    if not LANGID_AVAILABLE:
        raise RuntimeError("langid.py not installed.")
    return [langid.classify(t)[0] for t in texts]

def run_any_model(model_name, texts):
    if model_name == "langid.py":
        return run_langid_model(texts)
    return run_model_hf(model_name, texts)

# Execute all models
def run_batch(texts, true_labels):
    results = []
    for model in LANGUAGE_MODELS:
        try:
            print(f"Running model: {model}")
            start = time.time()
            preds = run_any_model(model, texts)
            latency = round(time.time() - start, 2)
            acc = round(accuracy_score(true_labels, preds) * 100, 2)

            print(f"Acc={acc}% | Latency={latency}s")
            results.append({
                "model": model,
                "accuracy": acc,
                "latency": latency
            })
        except Exception as e:
            print(f"FAILED {model}: {e}")
    return results

# Build FAISS (RAG)
def build_vector_db(path):
    df = pd.read_csv(path)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    docs = [
        f"Model: {r['model']} | Dataset: {r['dataset']} | Accuracy: {r['accuracy']} | Latency: {r['latency']} | Notes: {r['notes']}"
        for _, r in df.iterrows()
    ]
    emb = embedder.encode(docs, show_progress_bar=False)
    dim = emb.shape[1]

    index = faiss.IndexFlatL2(dim)
    index.add(np.array(emb, dtype="float32"))
    return df, index, embedder, docs

def rag_retrieve(query, df, index, embedder, docs, k=TOP_K_EVIDENCE):
    q_emb = embedder.encode([query]).astype("float32")
    D, I = index.search(q_emb, k)
    return "\n".join(docs[i] for i in I[0])

# Qwen model selection
def qwen_select(results, rag_context, goal, target_acc=None):
    if not OLLAMA_AVAILABLE:
        print("Ollama not available.")
        return None

    prompt = f"""
        You are a strict evaluator selecting the best **language identification** model.

        Retrieved benchmark evidence:
        {rag_context}

        Evaluation results on sample data:
        {json.dumps(results, indent=2)}

        Selection rules:
        - If goal=accuracy → choose highest accuracy (tie → lowest latency)
        - If goal=latency → choose fastest model with accuracy ≥ {target_acc}
        - Output ONLY the model name.
    """

    resp = chat(model="qwen2.5:7b", messages=[{"role": "user", "content": prompt}])
    return resp.message.content.strip()

# Main
if __name__ == "__main__":
    print("\n1) Most accurate model")
    print("2) Fastest model above accuracy threshold")
    choice = input("\nQuery options:").strip()

    results = run_batch(texts, true_labels)

    df, index, embedder, docs = build_vector_db(BENCHMARK_PATH)
    rag = rag_retrieve(
        "best multilingual language identification model",
        df, index, embedder, docs
    )

    if choice == "1":
        selected = qwen_select(results, rag, goal="accuracy")
    else:
        target = float(input("Enter target accuracy: "))
        selected = qwen_select(results, rag, goal="latency", target_acc=target)

    print("\n=== SELECTED MODEL ===")
    print(selected)


Using device: mps

1) Most accurate model
2) Fastest model above accuracy threshold
Running model: papluca/xlm-roberta-base-language-detection
Acc=95.0% | Latency=6.77s
Running model: Joshi-Aryan/Fine_Tuned_HF_Language_Identification_Model
Acc=50.0% | Latency=6.55s
Running model: langid.py
Acc=99.0% | Latency=0.05s

=== SELECTED MODEL ===
Based on the provided evaluation results and selection rules, here is the chosen model:

For **accuracy** as the goal:
- The highest accuracy among the evaluated models is 99.0%, which is achieved by the `langid.py` model.

Therefore, the selected model for the accuracy goal is: **langid.py**

For **latency** as the goal:
- The fastest model with an accuracy ≥ None (i.e., any non-zero accuracy) is the `papluca/xlm-roberta-base-language-detection` model with a latency of 6.77.

Therefore, the selected model for the latency goal is: **papluca/xlm-roberta-base-language-detection**
