**Azure Open AI Endpoint and Key:  GPT - 4o - Mini Model**

In [None]:
import os
from openai import AzureOpenAI

endpoint = "https://areypragir-4130-gpt4omi-resource.cognitiveservices.azure.com/"
model_name = "gpt-4o-mini"
deployment = "gpt-4o-mini"

subscription_key = "#hidden"
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": "I am going to Paris, what should I see?",
        }
    ],
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
    model=deployment
)

print(response.choices[0].message.content)

Paris is a city rich in history, art, culture, and beauty. Here’s a list of must-see attractions and experiences to consider during your visit:

1. **Eiffel Tower** - A symbol of Paris, you can either admire it from the ground or take an elevator to the top for stunning views of the city.

2. **Louvre Museum** - Home to thousands of works of art, including the Mona Lisa and the Venus de Milo. It's advisable to plan your visit, as it can be overwhelming due to its size.

3. **Notre-Dame Cathedral** - Although it is undergoing restoration, the exterior remains impressive. Explore the Île de la Cité while you’re there.

4. **Sacré-Cœur Basilica** - Located on Montmartre hill, this basilica offers beautiful views of Paris and features stunning mosaics inside.

5. **Champs-Élysées and Arc de Triomphe** - Stroll down this famous avenue and visit the iconic arch at the western end.

6. **Palace of Versailles** - A day trip from Paris, this opulent palace and its gardens are a perfect example 

In [None]:
pip install ir_datasets tqdm

Collecting ir_datasets
  Downloading ir_datasets-0.5.11-py3-none-any.whl.metadata (12 kB)
Collecting inscriptis>=2.2.0 (from ir_datasets)
  Downloading inscriptis-2.6.0-py3-none-any.whl.metadata (25 kB)
Collecting trec-car-tools>=2.5.4 (from ir_datasets)
  Downloading trec_car_tools-2.6-py3-none-any.whl.metadata (640 bytes)
Collecting lz4>=3.1.10 (from ir_datasets)
  Downloading lz4-4.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting warc3-wet>=0.2.3 (from ir_datasets)
  Downloading warc3_wet-0.2.5-py3-none-any.whl.metadata (2.2 kB)
Collecting warc3-wet-clueweb09>=0.2.5 (from ir_datasets)
  Downloading warc3-wet-clueweb09-0.2.5.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting zlib-state>=0.1.3 (from ir_datasets)
  Downloading zlib_state-0.1.10-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Collecting ijson>=3.1.3 (from ir_datasets)
  Downloading ijson-3.4.0.post0-

**Collecting Base Query Documents from msmarco and beir**

In [None]:
"""
FINAL Step A — Build base_dataset_questq.jsonl
Compatible with your available BEIR datasets.
Collects query–document pairs for multi-attribute dataset building.
"""

import ir_datasets, random, json
from tqdm import tqdm

# ✅ Choose only available dataset identifiers
SAMPLES = {
    "msmarco-passage/train": 400,            # open-domain search
    "beir/cqadupstack/programmers": 100,     # StackOverflow-style Q&A
    "beir/fever": 100,                       # fact verification
    "beir/scidocs": 50,                      # academic domain
    "beir/quora": 50,                        # question paraphrasing
}

random.seed(42)

def sample_dataset(name, sample_size):
    print(f"\n📘 Processing {name}")
    ds = ir_datasets.load(name)

    # Ensure dataset has the necessary parts
    for needed in ["queries_iter", "docs_iter", "qrels_iter"]:
        if not hasattr(ds, needed):
            raise AttributeError(f"Dataset {name} missing: {needed}")

    queries = {q.query_id: q.text for q in ds.queries_iter()}
    docs = {d.doc_id: d.text for d in ds.docs_iter()}
    qrels = list(ds.qrels_iter())

    # Build mapping query → list of relevant doc IDs
    pos_map = {}
    for q in qrels:
        if q.relevance > 0:
            pos_map.setdefault(q.query_id, []).append(q.doc_id)

    valid_qids = list(pos_map.keys())
    if not valid_qids:
        print(f"⚠️ No valid qrels for {name}")
        return []

    chosen_qids = random.sample(valid_qids, min(sample_size, len(valid_qids)))
    results = []

    for qid in tqdm(chosen_qids):
        query = queries.get(qid, "")
        pos_docs = pos_map.get(qid, [])
        for did in pos_docs[:3]:  # take up to 3 positive docs
            if did in docs:
                results.append({
                    "dataset": name,
                    "query_id": qid,
                    "query": query,
                    "document": docs[did],
                    "relevance": 1
                })

    print(f"✅ Collected {len(results)} pairs from {name}")
    return results


# ------------------ MAIN EXECUTION ------------------
all_data = []
for name, n in SAMPLES.items():
    try:
        all_data.extend(sample_dataset(name, n))
    except Exception as e:
        print(f"⚠️ Skipping {name}: {e}")

# Save the combined dataset
with open("1_base_dataset_questq.jsonl", "w", encoding="utf8") as f:
    for item in all_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"\n🎯 Done! Collected {len(all_data)} total query–doc pairs.")
print("📂 Output saved to base_dataset_questq.jsonl")


**Adding Attributes to Base Dataset**

In [None]:
import json, time, random
from tqdm import tqdm
from openai import AzureOpenAI

# 🔹 Azure setup
endpoint = "https://areypragir-4130-gpt4omi-resource.cognitiveservices.azure.com/"
api_version = "2024-12-01-preview"
deployment = "gpt-4o-mini"
api_key = "#hidden"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=api_key
)

# 🔹 Load base dataset
with open("1_base_dataset_questq.jsonl") as f:
    base_data = [json.loads(l) for l in f]

# 🔹 Few-shot examples to guide GPT
FEWSHOT = """
Examples:
1️⃣ Query: "what is the purpose of DNA replication"
Document: "DNA replication ensures each cell gets an exact copy of the DNA during cell division."
Attributes:
{
 "audience": "Student",
 "keyword": ["Biology", "Genetics"],
 "format": "Academic Paper",
 "language": "English",
 "length": "Short",
 "source": "Wikipedia"
}

2️⃣ Query: "price of a bushel of wheat"
Document: "Interactive chart of historical daily wheat prices..."
Attributes:
{
 "audience": "Researcher",
 "keyword": ["Economics", "Agriculture"],
 "format": "Report",
 "language": "English",
 "length": "Short",
 "source": "NewsSite"
}
"""

# 🔹 Attribute generation function
def get_attributes(query, document, retries=2):
    prompt = f"""
You are labeling information retrieval data using InfoSearch-style attributes.

{FEWSHOT}

Now label this new pair.
Return ONLY a valid JSON dictionary (no explanations, no markdown).

Query: {query}
Document: {document[:800]}
JSON:
"""
    for _ in range(retries):
        try:
            resp = client.chat.completions.create(
                model=deployment,
                temperature=0.4,
                max_tokens=250,
                messages=[{"role": "user", "content": prompt}]
            )
            content = resp.choices[0].message.content.strip()
            # Clean and try parsing JSON
            start = content.find("{")
            end = content.rfind("}") + 1
            json_part = content[start:end]
            attrs = json.loads(json_part)
            # Ensure all expected keys exist
            required = {"audience", "keyword", "format", "language", "length", "source"}
            if required.issubset(attrs.keys()):
                return attrs
        except Exception as e:
            time.sleep(1)
            continue
    return None

# 🔹 Process all samples
enriched = []
for i, item in enumerate(tqdm(base_data, desc="Annotating")):
    attrs = get_attributes(item["query"], item["document"])
    if attrs:
        item.update(attrs)
        enriched.append(item)

# 🔹 Save
with open("2_base_with_multi_attri.jsonl", "w", encoding="utf8") as f:
    for e in enriched:
        f.write(json.dumps(e, ensure_ascii=False) + "\n")

print(f"✅ Created multi_attr_dataset.jsonl with {len(enriched)} labeled pairs.")


**Multi-Attribute Instructed + Reversed Query Generation**

In [None]:
#1. Install required packages ---
!pip install openai aiohttp nest_asyncio -q

#2. Imports and setup ---
import json, re, time, asyncio, nest_asyncio
from openai import AsyncAzureOpenAI
nest_asyncio.apply()

#3. Azure OpenAI configuration ---
API_KEY = "#hidden"
ENDPOINT = "https://areypragir-4130-gpt4omi-resource.cognitiveservices.azure.com/"
API_VERSION = "2024-12-01-preview"
MODEL = "gpt-4o-mini"

client = AsyncAzureOpenAI(
    api_key=API_KEY,
    azure_endpoint=ENDPOINT,
    api_version=API_VERSION
)

#4. Rate-limit and save settings ---
TOKENS_PER_MINUTE = 100_000
REQUESTS_PER_MINUTE = 1000
MAX_CONCURRENT_REQUESTS = 8
SLEEP_BETWEEN_REQUESTS = 60 / REQUESTS_PER_MINUTE
SAVE_INTERVAL = 50   # Auto-save every 50 queries

def build_prompt(query, attributes):
    """Prompt asking GPT to return instructed and reversed versions as strict JSON."""
    attr_text = ", ".join([f"{k}: {v}" for k, v in attributes.items() if v])
    return f"""
You are generating search queries with multiple document-level attributes.

Given a base query and its attributes, produce:
1. An instructed version that naturally includes 2–3 attributes.
2. A reversed instructed version that logically negates those same attributes.

Return output ONLY as a JSON object with two keys:
"instructed_query" and "reversed_query".

Example:
Base query: "best travel destinations in Europe"
Attributes: format=blog, language=English, audience=layman

Output:
{{
  "instructed_query": "List the best travel destinations in Europe. Please provide a blog in English for laymen.",
  "reversed_query": "List the best travel destinations in Europe. Please do not provide a blog in English for laymen."
}}

Now for this:
Base query: "{query}"
Attributes: {attr_text}
"""

def safe_json_parse(text):
    """Try robust JSON parsing; fallback to regex extraction if needed."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        instructed = re.search(r'"?instructed[_ ]?query"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
        reversed_q = re.search(r'"?reversed[_ ]?query"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
        return {
            "instructed_query": instructed.group(1).strip() if instructed else "",
            "reversed_query": reversed_q.group(1).strip() if reversed_q else ""
        }

# Async GPT-4 call for one entry

async def process_entry(entry):
    query = entry["query"]
    attributes = {
        "audience": entry.get("audience", ""),
        "format": entry.get("format", ""),
        "language": entry.get("language", ""),
        "length": entry.get("length", ""),
        "source": entry.get("source", "")
    }

    prompt = build_prompt(query, attributes)

    try:
        response = await client.chat.completions.create(
            model=MODEL,
            temperature=0.4,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}  # force JSON output
        )

        content = response.choices[0].message.content.strip()
        parsed = safe_json_parse(content)

        entry["instructed_query"] = parsed.get("instructed_query", "")
        entry["reversed_query"] = parsed.get("reversed_query", "")
        return entry

    except Exception as e:
        print(f"❌ Error on {entry.get('query_id','?')}: {e}")
        return None


# Async batch generator

async def generate_instructed_queries():
    input_file = "2_base_with_multi_attri.jsonl"
    output_file = "multi_attr_instructed.jsonl"

    with open(input_file, "r") as f:
        data = [json.loads(line) for line in f]

    print(f"📘 Loaded {len(data)} entries from {input_file}")

    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    processed = []

    async def limited_process(entry):
        async with semaphore:
            result = await process_entry(entry)
            await asyncio.sleep(SLEEP_BETWEEN_REQUESTS)
            return result

    tasks = [limited_process(e) for e in data]

    for i, coro in enumerate(asyncio.as_completed(tasks), 1):
        result = await coro
        if result:
            processed.append(result)

        # Auto-save every N records
        if i % SAVE_INTERVAL == 0:
            with open(output_file, "a") as f:
                for p in processed[-SAVE_INTERVAL:]:
                    f.write(json.dumps(p, ensure_ascii=False) + "\n")
            print(f"💾 Auto-saved {i} entries...")

    # Final save
    with open(output_file, "a") as f:
        for p in processed:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"✅ Finished generating {len(processed)} instructed queries!")
    print(f"📁 Output: {output_file}")


await generate_instructed_queries()


📘 Loaded 715 entries from multi_attr_dataset.jsonl
💾 Auto-saved 50 entries...
💾 Auto-saved 100 entries...
💾 Auto-saved 150 entries...
💾 Auto-saved 200 entries...
💾 Auto-saved 250 entries...
💾 Auto-saved 300 entries...
💾 Auto-saved 350 entries...
💾 Auto-saved 400 entries...
💾 Auto-saved 450 entries...
💾 Auto-saved 500 entries...
💾 Auto-saved 550 entries...
💾 Auto-saved 600 entries...
💾 Auto-saved 650 entries...
💾 Auto-saved 700 entries...
✅ Finished generating 715 instructed queries!
📁 Output: multi_attr_instructed.jsonl


**Document Rewriting + Hard Negative Generation**

In [None]:
# ====================================================
# 🚀 Step 4 — Document Rewriting + Hard Negative Generation
# ====================================================

!pip install openai aiohttp nest_asyncio -q

import json, re, time, asyncio, nest_asyncio
from openai import AsyncAzureOpenAI
nest_asyncio.apply()

# --- Azure config (same as Step 3) ---
API_KEY = "#hidden"
ENDPOINT = "https://areypragir-4130-gpt4omi-resource.cognitiveservices.azure.com/"
API_VERSION = "2024-12-01-preview"
MODEL = "gpt-4o-mini"

client = AsyncAzureOpenAI(
    api_key=API_KEY,
    azure_endpoint=ENDPOINT,
    api_version=API_VERSION
)

# --- Rate control ---
MAX_CONCURRENT_REQUESTS = 8
REQUESTS_PER_MINUTE = 1000
SLEEP_BETWEEN_REQUESTS = 60 / REQUESTS_PER_MINUTE
SAVE_INTERVAL = 50


# ====================================================
# 🧠 Prompt builder
# ====================================================

def build_doc_prompt(document, attributes):
    attr_text = ", ".join([f"{k}: {v}" for k, v in attributes.items() if v])
    return f"""
You are refining a document for an information-retrieval benchmark.

Task 1 – Rewrite the base document so that it **fully satisfies all given attributes**.
Task 2 – Create **one hard negative document** that is **topically similar** but **violates at least one attribute**
(e.g., wrong format, language, or audience).

Return **only** a JSON object:
{{
  "positive_doc": "... rewritten version matching attributes ...",
  "hard_negative_doc": "... realistic but violating version ..."
}}

Base document:
\"\"\"{document}\"\"\"
Attributes: {attr_text}
"""

# ====================================================
# ⚙️ Async single entry processor
# ====================================================

def safe_json_parse(text):
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pos = re.search(r'"?positive[_ ]?doc"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
        neg = re.search(r'"?hard[_ ]?negative[_ ]?doc"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
        return {
            "positive_doc": pos.group(1).strip() if pos else "",
            "hard_negative_doc": neg.group(1).strip() if neg else ""
        }

async def process_doc(entry):
    document = entry.get("document", "")
    attributes = {
        "audience": entry.get("audience", ""),
        "format": entry.get("format", ""),
        "language": entry.get("language", ""),
        "length": entry.get("length", ""),
        "source": entry.get("source", "")
    }

    prompt = build_doc_prompt(document, attributes)
    try:
        response = await client.chat.completions.create(
            model=MODEL,
            temperature=0.5,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        content = response.choices[0].message.content.strip()
        parsed = safe_json_parse(content)
        entry["positive_doc"] = parsed.get("positive_doc", "")
        entry["hard_negative_doc"] = parsed.get("hard_negative_doc", "")
        return entry
    except Exception as e:
        print(f"❌ Error on {entry.get('query_id','?')}: {e}")
        return None


# ====================================================
# 🚀 Async batch processor
# ====================================================

async def rewrite_documents():
    input_file = "multi_attr_instructed.jsonl"
    output_file = "multi_attr_docs.jsonl"

    with open(input_file, "r") as f:
        data = [json.loads(line) for line in f]

    print(f"📘 Loaded {len(data)} entries from {input_file}")
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    processed = []

    async def limited(entry):
        async with semaphore:
            result = await process_doc(entry)
            await asyncio.sleep(SLEEP_BETWEEN_REQUESTS)
            return result

    tasks = [limited(e) for e in data]

    for i, coro in enumerate(asyncio.as_completed(tasks), 1):
        res = await coro
        if res:
            processed.append(res)
        if i % SAVE_INTERVAL == 0:
            with open(output_file, "a") as f:
                for p in processed[-SAVE_INTERVAL:]:
                    f.write(json.dumps(p, ensure_ascii=False) + "\n")
            print(f"💾 Saved {i} entries...")

    # final save
    with open(output_file, "a") as f:
        for p in processed:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"✅ Finished rewriting {len(processed)} documents!")
    print(f"📁 Output: {output_file}")

# ====================================================
# ▶️ Run rewriting inside Colab/Jupyter
# ====================================================
await rewrite_documents()


📘 Loaded 2818 entries from multi_attr_instructed.jsonl
💾 Saved 50 entries...
❌ Error on 919029: 'NoneType' object has no attribute 'strip'
💾 Saved 100 entries...
💾 Saved 150 entries...
💾 Saved 200 entries...
💾 Saved 250 entries...
💾 Saved 300 entries...
💾 Saved 350 entries...
💾 Saved 400 entries...
💾 Saved 450 entries...
💾 Saved 500 entries...
💾 Saved 550 entries...
💾 Saved 600 entries...
💾 Saved 650 entries...
💾 Saved 700 entries...
💾 Saved 750 entries...
💾 Saved 800 entries...
💾 Saved 850 entries...
💾 Saved 900 entries...
💾 Saved 950 entries...
💾 Saved 1000 entries...
💾 Saved 1050 entries...
💾 Saved 1100 entries...
💾 Saved 1150 entries...
💾 Saved 1200 entries...
💾 Saved 1250 entries...
💾 Saved 1300 entries...
❌ Error on 737282: 'NoneType' object has no attribute 'strip'
💾 Saved 1350 entries...
💾 Saved 1400 entries...
💾 Saved 1450 entries...
💾 Saved 1500 entries...
💾 Saved 1550 entries...
💾 Saved 1600 entries...
💾 Saved 1650 entries...
💾 Saved 1700 entries...
💾 Saved 1750 entries...
💾

**UPDATED WITH ATTRIBUTES MENTIONED**

**Multi-Attribute Instructed + Reversed Query Generation + Document Rewriting + Hard Negative Generation** (Updated)


In [None]:
# =========================================================
# STEP 1: Install & Imports
# =========================================================
!pip install openai aiohttp nest_asyncio -q

import json, re, time, asyncio, nest_asyncio, random, copy
from openai import AsyncAzureOpenAI
nest_asyncio.apply()

# =========================================================
# STEP 2: Azure OpenAI Configuration
# =========================================================
API_KEY = "#hidden"
ENDPOINT = "https://areypragir-4130-gpt4omi-resource.cognitiveservices.azure.com/"
API_VERSION = "2024-12-01-preview"
MODEL = "gpt-4o-mini"

client = AsyncAzureOpenAI(
    api_key=API_KEY,
    azure_endpoint=ENDPOINT,
    api_version=API_VERSION
)

# =========================================================
# STEP 3: Rate Control
# =========================================================
MAX_CONCURRENT_REQUESTS = 6
REQUESTS_PER_MINUTE = 1000
SLEEP_BETWEEN_REQUESTS = 60 / REQUESTS_PER_MINUTE
SAVE_INTERVAL = 50

# =========================================================
# STEP 4: Helper Functions
# =========================================================

def safe_json_parse(text):
    """Try to parse model output as JSON; fallback to regex."""
    try:
        return json.loads(text)
    except Exception:
        instructed = re.search(r'"?instructed[_ ]?query"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
        reversed_q = re.search(r'"?reversed[_ ]?query"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
        pos = re.search(r'"?positive[_ ]?doc"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
        neg = re.search(r'"?hard[_ ]?negative[_ ]?doc"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
        return {
            "instructed_query": instructed.group(1).strip() if instructed else "",
            "reversed_query": reversed_q.group(1).strip() if reversed_q else "",
            "positive_doc": pos.group(1).strip() if pos else "",
            "hard_negative_doc": neg.group(1).strip() if neg else ""
        }

def build_instruct_prompt(query, attributes):
    """Prompt for Instructed + Reversed query generation."""
    attr_text = ", ".join([f"{k}: {v}" for k, v in attributes.items() if v])
    return f"""
You are generating search queries with multiple document-level attributes.

Given a base query and its attributes, produce:
1. An instructed version that naturally includes all given attributes.
2. A reversed instructed version that logically negates them.

Return a JSON object:
{{
  "instructed_query": "...",
  "reversed_query": "..."
}}

Base query: "{query}"
Attributes: {attr_text}
"""

def build_doc_prompt(document, attributes):
    """Prompt for rewriting + hard negative generation."""
    attr_text = ", ".join([f"{k}: {v}" for k, v in attributes.items() if v])
    return f"""
You are refining a document for an information-retrieval benchmark.

Task 1 – Rewrite the base document so that it **fully satisfies all given attributes**.
Task 2 – Create one **hard negative document** that is **topically similar** but violates one or two attributes.

Return only JSON:
{{
  "positive_doc": "... rewritten version ...",
  "hard_negative_doc": "... violating version ..."
}}

Base document:
\"\"\"{document}\"\"\"
Attributes: {attr_text}
"""

def sample_attr_combinations(entry, num_combinations=5):
    """Generate 3–5 random 2–3 attribute combinations for a single query."""
    all_attrs = {
        "audience": entry.get("audience", ""),
        "format": entry.get("format", ""),
        "language": entry.get("language", ""),
        "length": entry.get("length", ""),
        "source": entry.get("source", "")
    }
    non_empty = {k: v for k, v in all_attrs.items() if v}
    num_combinations = random.randint(3, 5)

    combos = []
    for _ in range(num_combinations):
        chosen = random.sample(list(non_empty.keys()), k=min(random.randint(2, 3), len(non_empty)))
        combos.append({k: non_empty[k] for k in chosen})
    return combos

# =========================================================
# STEP 5: Async Generation
# =========================================================

async def generate_instructed_queries():
    input_file = "2_base_with_multi_attri.jsonl"
    output_file = "3_instructed_reverse_queries_with_attri.jsonl"

    with open(input_file, "r") as f:
        base_data = [json.loads(line) for line in f]

    print(f"📘 Loaded {len(base_data)} core queries")
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    processed = []

    async def process_combination(base_entry, combo, combo_id):
        """Generate instructed and reversed queries for one combination."""
        new_entry = copy.deepcopy(base_entry)
        new_entry["attributes"] = combo
        new_entry["combo_id"] = combo_id

        prompt = build_instruct_prompt(base_entry["query"], combo)
        try:
            response = await client.chat.completions.create(
                model=MODEL,
                temperature=0.4,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            parsed = safe_json_parse(response.choices[0].message.content)
            new_entry["instructed_query"] = parsed.get("instructed_query", "")
            new_entry["reversed_query"] = parsed.get("reversed_query", "")
            new_entry["query_type"] = "expanded"
            return new_entry
        except Exception as e:
            print(f"❌ Error on {base_entry.get('query_id')}: {e}")
            return None

    tasks = []
    for entry in base_data:
        combos = sample_attr_combinations(entry)
        for i, combo in enumerate(combos):
            tasks.append(process_combination(entry, combo, i+1))

    async def limited_task(task):
        async with semaphore:
            res = await task
            await asyncio.sleep(SLEEP_BETWEEN_REQUESTS)
            return res

    wrapped_tasks = [limited_task(t) for t in tasks]
    for i, coro in enumerate(asyncio.as_completed(wrapped_tasks), 1):
        res = await coro
        if res:
            processed.append(res)
        if i % SAVE_INTERVAL == 0:
            with open(output_file, "a") as f:
                for p in processed[-SAVE_INTERVAL:]:
                    f.write(json.dumps(p, ensure_ascii=False) + "\n")
            print(f"💾 Auto-saved {i} combinations...")

    with open(output_file, "a") as f:
        for p in processed:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"✅ Generated {len(processed)} instructed + reversed combinations!")

# =========================================================
# STEP 6: Document Rewriting
# =========================================================

async def rewrite_documents():
    input_file = "3_instructed_reverse_queries_with_attri.jsonl"
    output_file = "4_rewritten_docs_hard_negatives_with_attri.jsonl"

    with open(input_file, "r") as f:
        data = [json.loads(line) for line in f]

    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    processed = []

    async def process_doc(entry):
        attributes = entry.get("attributes", {})
        prompt = build_doc_prompt(entry.get("document", ""), attributes)
        try:
            response = await client.chat.completions.create(
                model=MODEL,
                temperature=0.5,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            parsed = safe_json_parse(response.choices[0].message.content)
            violated = random.sample(
                list(attributes.keys()), k=min(random.randint(1, 2), len(attributes))
            ) if attributes else []
            entry["positive_doc"] = parsed.get("positive_doc", "")
            entry["hard_negative_doc"] = parsed.get("hard_negative_doc", "")
            entry["violated_attributes"] = violated
            return entry
        except Exception as e:
            print(f"❌ Error on query_id={entry.get('query_id')}: {e}")
            return None

    tasks = [process_doc(e) for e in data]
    async def limited(entry_task):
        async with semaphore:
            res = await entry_task
            await asyncio.sleep(SLEEP_BETWEEN_REQUESTS)
            return res

    wrapped_tasks = [limited(t) for t in tasks]
    for i, coro in enumerate(asyncio.as_completed(wrapped_tasks), 1):
        res = await coro
        if res:
            processed.append(res)
        if i % SAVE_INTERVAL == 0:
            with open(output_file, "a") as f:
                for p in processed[-SAVE_INTERVAL:]:
                    f.write(json.dumps(p, ensure_ascii=False) + "\n")
            print(f"💾 Saved {i} rewritten docs...")

    with open(output_file, "a") as f:
        for p in processed:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"✅ Finished rewriting {len(processed)} documents!")

# =========================================================
# STEP 7: Run Sequentially (Colab-safe)
# =========================================================
await generate_instructed_queries()
await rewrite_documents()


📘 Loaded 715 core queries
💾 Auto-saved 50 combinations...
💾 Auto-saved 100 combinations...
💾 Auto-saved 150 combinations...
💾 Auto-saved 200 combinations...
💾 Auto-saved 250 combinations...
💾 Auto-saved 300 combinations...
💾 Auto-saved 350 combinations...
💾 Auto-saved 400 combinations...
💾 Auto-saved 450 combinations...
💾 Auto-saved 500 combinations...
💾 Auto-saved 550 combinations...
💾 Auto-saved 600 combinations...
💾 Auto-saved 650 combinations...
💾 Auto-saved 700 combinations...
💾 Auto-saved 750 combinations...
💾 Auto-saved 800 combinations...
💾 Auto-saved 850 combinations...
💾 Auto-saved 900 combinations...
💾 Auto-saved 950 combinations...
💾 Auto-saved 1000 combinations...
💾 Auto-saved 1050 combinations...
💾 Auto-saved 1100 combinations...
💾 Auto-saved 1150 combinations...
💾 Auto-saved 1200 combinations...
💾 Auto-saved 1250 combinations...
💾 Auto-saved 1300 combinations...
💾 Auto-saved 1350 combinations...
💾 Auto-saved 1400 combinations...
💾 Auto-saved 1450 combinations...
💾 Auto-s

In [None]:
import json

input_file = "4_rewritten_docs_hard_negatives_with_attri.jsonl"          # your input file
output_file = "final_sorted.jsonl"  # sorted output file

with open(input_file, "r") as f:
    data = [json.loads(line) for line in f if line.strip()]

def sort_key(entry):
    qid = str(entry.get("query_id", "")).strip()
    # Try to convert to int if purely numeric, else fallback to string
    return (0, int(qid)) if qid.isdigit() else (1, qid.lower())

data.sort(key=sort_key)

with open(output_file, "w") as f:
    for entry in data:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"✅ Sorted {len(data)} records by query_id.")
print(f"📁 Output saved to: {output_file}")


**Evaluation Process - J**

In [None]:
!pip install pyserini tqdm numpy pandas -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.5/178.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m115.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyserini (pyproject.toml) ... [?25l[?25hdone


**Dense Retrieval Models**

*BM25*

In [None]:
!pip install rank_bm25 tqdm numpy pandas -q


In [3]:
import json, numpy as np, pandas as pd
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from collections import defaultdict

# ---------------- SAFE LOAD ----------------
def safe_load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"⚠ Skipping invalid JSON line {i}")
    return data

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
queries = safe_load_jsonl("final_sorted.jsonl")

# Build corpus and mappings
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

# Tokenize corpus
tokenized_corpus = [c.lower().split() for c in corpus]
bm25 = BM25Okapi(tokenized_corpus)
print(f"✅ Loaded {len(queries)} queries, {len(corpus)} documents.")

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk(query, k=10):
    if not query:
        return []
    tokens = query.lower().split()
    scores = bm25.get_scores(tokens)
    ranked_idx = np.argsort(scores)[::-1][:k]
    return [(doc_ids[i], float(scores[i])) for i in ranked_idx]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K used in metrics

for q in tqdm(queries, desc="Evaluating BM25"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"
        Rori = retrieve_topk(q.get("query", ""), k=K)
        Rins = retrieve_topk(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk(q.get("reversed_query", ""), k=K)

        # ranks
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        # fraction of attributes satisfied in gold doc
        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-correct non-linear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })
    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print("\n📊 BM25 Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

✅ Loaded 9596 queries, 9576 documents.


Evaluating BM25: 100%|██████████| 9596/9596 [20:12<00:00,  7.91it/s]


📊 BM25 Evaluation Metrics (SOP-aligned):
{
  "mSICR": 0.03511879949979158,
  "mWISE": -1.0332243928067535,
  "MDCR_strict": 0.01979991663192997,
  "MDCR_soft": 0.22087328053355565
}





*Bge-Large-v1.5*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "BAAI/bge-large-en-v1.5"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*E5-Large-v2*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "intfloat/e5-large-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*Instructor-XL*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "hkunlp/instructor-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*GTE-Qwen2*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "thenlper/gte-small"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*E5-Mistral-ins*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "intfloat/e5-mistral-7b-instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*GritLM*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "GritLM/GritLM-7B"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*SFR-Embedding-2-R*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "Salesforce/SFR-Embedding-2-R"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*NV-Embed-v2*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "nvidia/NV-Embed-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

**Point-wise Re-ranking Models**

*Mistral-ins-v0.2*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*Llama-3.1*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

**List-wise Re-ranking**

*Zephyr-beta*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "HuggingFaceH4/zephyr-7b-beta"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*RankVicuna-v1*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "microsoft/RankVicuna-v1"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))

*RankZephyr-v1*

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch, json, numpy as np, pandas as pd
from tqdm import tqdm

# ---------------- MODEL SETUP ----------------
model_name = "HuggingFaceH4/RankZephyr-7b-v1"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

# ---------------- LOAD FILES ----------------
with open("query-doc.json", "r", encoding="utf-8") as f:
    qdoc = json.load(f)
with open("final_sorted.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f]

# ---------------- CORPUS PREPARATION ----------------
corpus, doc_ids, qid_map = [], [], {}
for entry in qdoc:
    qid = entry.get("query_id")
    qid_map[qid] = []
    for doc in entry.get("documents", []):
        text = doc.get("text", "").strip()
        if not text:
            continue
        corpus.append(text)
        did = f"{qid}_{doc.get('doc_id', 'unk')}"
        doc_ids.append(did)
        qid_map[qid].append(did)

print(f"Encoding {len(corpus)} documents using {model_name}...")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, batch_size=32, show_progress_bar=True)

# ---------------- RETRIEVAL HELPERS ----------------
def retrieve_topk_dense(query, k=10):
    if not query:
        return []
    query_emb = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k)
    return [(doc_ids[idx], float(cos_scores[idx])) for idx in top_results.indices]

def find_rank(docid, ranking):
    for i, (d, _) in enumerate(ranking):
        if d == docid:
            return i + 1
    return len(ranking) + 1

# ---------------- EVALUATION ----------------
results = []
K = 10  # top-K cutoff

for q in tqdm(queries, desc=f"Evaluating {model_name}"):
    try:
        qid = q.get("query_id")
        if qid not in qid_map:
            continue

        pos_doc = f"{qid}_doc_1"

        # Retrieve results for each query type
        Rori = retrieve_topk_dense(q.get("query", ""), k=K)
        Rins = retrieve_topk_dense(q.get("instructed_query", ""), k=K)
        Rrev = retrieve_topk_dense(q.get("reversed_query", ""), k=K)

        # Rank positions
        Rori_rank = find_rank(pos_doc, Rori)
        Rins_rank = find_rank(pos_doc, Rins)
        Rrev_rank = find_rank(pos_doc, Rrev)

        # ---------- mSICR ----------
        Iq = int((Rins_rank < Rori_rank) and (Rrev_rank > Rori_rank))

        # ---------- mWISE ----------
        m = len(q.get("attributes", {}))
        pos_field = q.get("positive_doc", "")
        pos_text = ""
        if isinstance(pos_field, dict):
            pos_text = pos_field.get("text", "").lower()
        elif isinstance(pos_field, str):
            pos_text = pos_field.lower()

        sat = sum(1 for v in q.get("attributes", {}).values() if str(v).lower() in pos_text)
        frac_satisfied = sat / max(1, m)
        frac_violated = 1 - frac_satisfied

        delta_ins = Rori_rank - Rins_rank
        delta_rev = Rrev_rank - Rori_rank

        # SOP-aligned nonlinear formulation
        reward = frac_satisfied * (1 - np.sqrt(abs(delta_ins) / K)) * (1 / np.sqrt(max(1, Rins_rank)))
        penalty = -frac_violated * (1 + np.sqrt(abs(delta_rev) / K))  # heavier penalty for reversed
        mwise = reward + penalty

        # ---------- MDCR ----------
        dims = list(q.get("attributes", {}).keys())
        mq = len(dims)
        top_k_docs = retrieve_topk_dense(q.get("query", ""), k=K)

        def satisfies(doc_text, attr_dict):
            doc_text = doc_text.lower()
            return {d: int(str(v).lower() in doc_text) for d, v in attr_dict.items()}

        strict_scores, soft_scores = [], []
        for docid, _ in top_k_docs:
            try:
                idx = doc_ids.index(docid)
                text = corpus[idx].lower()
                s = satisfies(text, q.get("attributes", {}))
                strict_scores.append(np.prod(list(s.values())) if mq > 0 else 0)
                soft_scores.append(sum(s.values()) / mq if mq > 0 else 0)
            except ValueError:
                continue

        MDCR_strict = max(strict_scores) if strict_scores else 0
        MDCR_soft = max(soft_scores) if soft_scores else 0

        results.append({
            "query_id": qid,
            "mSICR": Iq,
            "mWISE": mwise,
            "MDCR_strict": MDCR_strict,
            "MDCR_soft": MDCR_soft
        })

    except Exception as e:
        print(f"⚠ Skipping query {q.get('query_id', '?')} due to error: {e}")
        continue

# ---------------- AGGREGATION ----------------
df = pd.DataFrame(results)
metrics = {
    "model": model_name,
    "mSICR": df["mSICR"].mean(),
    "mWISE": df["mWISE"].mean(),
    "MDCR_strict": df["MDCR_strict"].mean(),
    "MDCR_soft": df["MDCR_soft"].mean()
}

print(f"\n📊 {model_name} Evaluation Metrics (SOP-aligned):")
print(json.dumps(metrics, indent=2))