**Azure OpenAI Endpoint and Key: GPT-4o-mini Model**

In [None]:
import os
from openai import AzureOpenAI

# Connection settings for the Azure OpenAI resource that hosts the model.
endpoint = "https://areypragir-4130-gpt4omi-resource.cognitiveservices.azure.com/"
model_name = "gpt-4o-mini"  # not referenced again in this cell
deployment = "gpt-4o-mini"

# Read the key from the environment so the secret never has to be pasted into
# the notebook; the original placeholder is kept as the fallback value.
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY", "#hidden")
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# Smoke test: one chat completion to confirm the endpoint, key and deployment
# work together before the longer labeling cells are run.
response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": "I am going to Paris, what should I see?",
        },
    ],
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
    model=deployment,  # the deployment name is what gets passed as `model`
)

print(response.choices[0].message.content)

Paris is a city rich in history, art, culture, and beauty. Here’s a list of must-see attractions and experiences to consider during your visit:

1. **Eiffel Tower** - A symbol of Paris, you can either admire it from the ground or take an elevator to the top for stunning views of the city.

2. **Louvre Museum** - Home to thousands of works of art, including the Mona Lisa and the Venus de Milo. It's advisable to plan your visit, as it can be overwhelming due to its size.

3. **Notre-Dame Cathedral** - Although it is undergoing restoration, the exterior remains impressive. Explore the Île de la Cité while you’re there.

4. **Sacré-Cœur Basilica** - Located on Montmartre hill, this basilica offers beautiful views of Paris and features stunning mosaics inside.

5. **Champs-Élysées and Arc de Triomphe** - Stroll down this famous avenue and visit the iconic arch at the western end.

6. **Palace of Versailles** - A day trip from Paris, this opulent palace and its gardens are a perfect example 

In [1]:
pip install ir_datasets tqdm

Collecting ir_datasets
  Downloading ir_datasets-0.5.11-py3-none-any.whl.metadata (12 kB)
Collecting inscriptis>=2.2.0 (from ir_datasets)
  Downloading inscriptis-2.6.0-py3-none-any.whl.metadata (25 kB)
Collecting trec-car-tools>=2.5.4 (from ir_datasets)
  Downloading trec_car_tools-2.6-py3-none-any.whl.metadata (640 bytes)
Collecting lz4>=3.1.10 (from ir_datasets)
  Downloading lz4-4.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting warc3-wet>=0.2.3 (from ir_datasets)
  Downloading warc3_wet-0.2.5-py3-none-any.whl.metadata (2.2 kB)
Collecting warc3-wet-clueweb09>=0.2.5 (from ir_datasets)
  Downloading warc3-wet-clueweb09-0.2.5.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting zlib-state>=0.1.3 (from ir_datasets)
  Downloading zlib_state-0.1.10-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Collecting ijson>=3.1.3 (from ir_datasets)
  Downloading ijson-3.4.0.post0-

**Collecting Base Query–Document Pairs from MS MARCO and BEIR**

In [None]:
"""
FINAL Step A — Build base_dataset_questq.jsonl
Compatible with your available BEIR datasets.
Collects query–document pairs for multi-attribute dataset building.
"""

import ir_datasets, random, json
from tqdm import tqdm

# Dataset identifier (as passed to ir_datasets.load) -> number of queries to sample.
# Choose only identifiers that are available in ir_datasets.
# NOTE(review): sample_dataset() needs qrels on the identifier itself. Some BEIR
# corpora may expose qrels only on their split subsets (e.g. ".../test") — confirm
# each entry below actually yields pairs instead of being skipped.
SAMPLES = {
    "msmarco-passage/train": 400,            # open-domain search
    "beir/cqadupstack/programmers": 100,     # StackOverflow-style Q&A
    "beir/fever": 100,                       # fact verification
    "beir/scidocs": 50,                      # academic domain
    "beir/quora": 50,                        # question paraphrasing
}

# Fixed seed so random.sample() in sample_dataset() picks the same queries on each run.
random.seed(42)

def sample_dataset(name, sample_size):
    """Sample query–document pairs with positive relevance from one dataset.

    Queries are sampled first and only the queries and documents that were
    actually selected are kept in memory, instead of materialising the whole
    corpus in a dict. The returned pairs are the same as before.

    Args:
        name: Dataset identifier passed to ``ir_datasets.load``.
        sample_size: Maximum number of queries to draw. Fewer are used when
            the dataset has fewer queries with a positive judgment.

    Returns:
        A list of dicts with keys ``dataset``, ``query_id``, ``query``,
        ``document`` and ``relevance`` (always 1). Up to three positive
        documents are emitted per sampled query. An empty list is returned
        when no qrel has ``relevance > 0``.

    Raises:
        AttributeError: If the dataset object lacks ``queries_iter``,
            ``docs_iter`` or ``qrels_iter``.
    """
    print(f"\n📘 Processing {name}")
    ds = ir_datasets.load(name)

    # Ensure dataset has the necessary parts
    for needed in ["queries_iter", "docs_iter", "qrels_iter"]:
        if not hasattr(ds, needed):
            raise AttributeError(f"Dataset {name} missing: {needed}")

    # Build mapping query → list of relevant doc IDs
    pos_map = {}
    for q in ds.qrels_iter():
        if q.relevance > 0:
            pos_map.setdefault(q.query_id, []).append(q.doc_id)

    valid_qids = list(pos_map)
    if not valid_qids:
        print(f"⚠️ No valid qrels for {name}")
        return []

    chosen_qids = random.sample(valid_qids, min(sample_size, len(valid_qids)))

    # Only these ids are ever looked up below, so nothing else needs storing.
    wanted_queries = set(chosen_qids)
    wanted_docs = {did for qid in chosen_qids for did in pos_map[qid][:3]}

    queries = {
        q.query_id: q.text
        for q in ds.queries_iter()
        if q.query_id in wanted_queries
    }
    docs = {
        d.doc_id: d.text
        for d in ds.docs_iter()
        if d.doc_id in wanted_docs
    }

    results = []
    for qid in tqdm(chosen_qids):
        query = queries.get(qid, "")
        for did in pos_map[qid][:3]:  # take up to 3 positive docs
            if did in docs:
                results.append({
                    "dataset": name,
                    "query_id": qid,
                    "query": query,
                    "document": docs[did],
                    "relevance": 1
                })

    print(f"✅ Collected {len(results)} pairs from {name}")
    return results


# ------------------ MAIN EXECUTION ------------------
# Gather pairs from every configured dataset. A dataset that fails to load or
# sample is reported and left out, so one bad identifier does not abort the run.
all_data = []
for dataset_name, quota in SAMPLES.items():
    try:
        pairs = sample_dataset(dataset_name, quota)
    except Exception as err:
        print(f"⚠️ Skipping {dataset_name}: {err}")
    else:
        all_data += pairs

# Save the combined dataset as JSON Lines (one object per line).
with open("base_dataset_questq.jsonl", "w", encoding="utf8") as out:
    out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in all_data
    )

print(f"\n🎯 Done! Collected {len(all_data)} total query–doc pairs.")
print("📂 Output saved to base_dataset_questq.jsonl")


**Adding Attributes to Base Dataset**

In [None]:
import json, time, random
from tqdm import tqdm
from openai import AzureOpenAI

# 🔹 Azure setup
import os  # imported here because this cell's import lines do not include it

endpoint = "https://areypragir-4130-gpt4omi-resource.cognitiveservices.azure.com/"
api_version = "2024-12-01-preview"
deployment = "gpt-4o-mini"
# Read the key from the environment so the secret is not stored in the
# notebook; the original placeholder is kept as the fallback value.
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "#hidden")

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=api_key
)

# 🔹 Load base dataset
# The file is written as UTF-8 with ensure_ascii=False, so it must be read as
# UTF-8 too rather than with the platform's default encoding.
with open("base_dataset_questq.jsonl", encoding="utf8") as f:
    base_data = [json.loads(line) for line in f]

# 🔹 Few-shot examples to guide GPT
# Interpolated verbatim into the labeling prompt built by get_attributes().
FEWSHOT = """
Examples:
1️⃣ Query: "what is the purpose of DNA replication"
Document: "DNA replication ensures each cell gets an exact copy of the DNA during cell division."
Attributes:
{
 "audience": "Student",
 "keyword": ["Biology", "Genetics"],
 "format": "Academic Paper",
 "language": "English",
 "length": "Short",
 "source": "Wikipedia"
}

2️⃣ Query: "price of a bushel of wheat"
Document: "Interactive chart of historical daily wheat prices..."
Attributes:
{
 "audience": "Researcher",
 "keyword": ["Economics", "Agriculture"],
 "format": "Report",
 "language": "English",
 "length": "Short",
 "source": "NewsSite"
}
"""

# 🔹 Attribute generation function
def get_attributes(query, document, retries=2):
    """Ask the model to label one query–document pair with six attributes.

    Args:
        query: Query text.
        document: Document text; only its first 800 characters are sent.
        retries: Maximum number of attempts.

    Returns:
        The parsed attribute dict when it is a JSON object containing all of
        ``audience``, ``keyword``, ``format``, ``language``, ``length`` and
        ``source``; otherwise ``None`` once every attempt has failed. The
        reason for the last failure is printed instead of being discarded.
    """
    prompt = f"""
You are labeling information retrieval data using InfoSearch-style attributes.

{FEWSHOT}

Now label this new pair.
Return ONLY a valid JSON dictionary (no explanations, no markdown).

Query: {query}
Document: {document[:800]}
JSON:
"""
    required = {"audience", "keyword", "format", "language", "length", "source"}
    last_error = None

    for attempt in range(retries):
        try:
            resp = client.chat.completions.create(
                model=deployment,
                temperature=0.4,
                max_tokens=250,
                messages=[{"role": "user", "content": prompt}]
            )
            # The message content can be None; treat that as an empty reply.
            content = (resp.choices[0].message.content or "").strip()
            # Keep only the outermost {...} span in case the model adds text
            # around the JSON object.
            start = content.find("{")
            end = content.rfind("}") + 1
            attrs = json.loads(content[start:end])
        except Exception as e:
            # Best-effort call: remember why it failed and retry after a short
            # pause, but do not sleep when no attempt is left.
            last_error = e
            if attempt < retries - 1:
                time.sleep(1)
            continue

        # Ensure the reply is an object with all expected keys
        if isinstance(attrs, dict) and required.issubset(attrs):
            return attrs
        last_error = ValueError("response is missing required attribute keys")

    print(f"⚠️ get_attributes failed after {retries} attempt(s): {last_error}")
    return None

# 🔹 Process all samples
# Pairs the model could not label (get_attributes returned None) are dropped.
enriched = []
for item in tqdm(base_data, desc="Annotating"):
    attrs = get_attributes(item["query"], item["document"])
    if not attrs:
        continue
    # The attribute dict comes from the model, so never let it overwrite the
    # fields already present on the pair (query, document, query_id, ...).
    item.update({k: v for k, v in attrs.items() if k not in item})
    enriched.append(item)

# 🔹 Save
with open("multi_attr_dataset.jsonl", "w", encoding="utf8") as f:
    for e in enriched:
        f.write(json.dumps(e, ensure_ascii=False) + "\n")

print(f"✅ Created multi_attr_dataset.jsonl with {len(enriched)} labeled pairs.")


**Multi-Attribute Instructed + Reversed Query Generation**

In [9]:
#1. Install required packages ---
!pip install openai aiohttp nest_asyncio -q

#2. Imports and setup ---
import json, re, time, asyncio, nest_asyncio
from openai import AsyncAzureOpenAI
nest_asyncio.apply()

#3. Azure OpenAI configuration ---
import os  # imported here because this cell's import line does not include it

# Read the key from the environment so the secret is not stored in the
# notebook; the original placeholder is kept as the fallback value.
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "#hidden")
ENDPOINT = "https://areypragir-4130-gpt4omi-resource.cognitiveservices.azure.com/"
API_VERSION = "2024-12-01-preview"
MODEL = "gpt-4o-mini"  # passed as `model` by process_entry()

client = AsyncAzureOpenAI(
    api_key=API_KEY,
    azure_endpoint=ENDPOINT,
    api_version=API_VERSION
)

#4. Rate-limit and save settings ---
TOKENS_PER_MINUTE = 100_000  # NOTE(review): not referenced by the code visible here
REQUESTS_PER_MINUTE = 1000
MAX_CONCURRENT_REQUESTS = 8  # size of the semaphore in generate_instructed_queries()
# Pause, in seconds, taken after each request while its semaphore slot is held.
SLEEP_BETWEEN_REQUESTS = 60 / REQUESTS_PER_MINUTE
SAVE_INTERVAL = 50   # Auto-save every 50 queries

def build_prompt(query, attributes):
    """Build the prompt that requests an instructed and a reversed query.

    Args:
        query: Base query text, inserted between double quotes.
        attributes: Mapping of attribute name to value. Entries whose value
            is falsy are left out of the rendered attribute list.

    Returns:
        The full prompt string, asking for a JSON object with the keys
        "instructed_query" and "reversed_query".
    """
    attr_text = ", ".join(
        f"{name}: {value}" for name, value in attributes.items() if value
    )
    # Doubled braces are literal braces of the JSON example; the two named
    # fields are filled in by str.format below.
    template = """
You are generating search queries with multiple document-level attributes.

Given a base query and its attributes, produce:
1. An instructed version that naturally includes 2–3 attributes.
2. A reversed instructed version that logically negates those same attributes.

Return output ONLY as a JSON object with two keys:
"instructed_query" and "reversed_query".

Example:
Base query: "best travel destinations in Europe"
Attributes: format=blog, language=English, audience=layman

Output:
{{
  "instructed_query": "List the best travel destinations in Europe. Please provide a blog in English for laymen.",
  "reversed_query": "List the best travel destinations in Europe. Please do not provide a blog in English for laymen."
}}

Now for this:
Base query: "{query}"
Attributes: {attr_text}
"""
    return template.format(query=query, attr_text=attr_text)

def safe_json_parse(text):
    """Extract the instructed/reversed query object from a model reply.

    Parsing is attempted in this order:
      1. the whole text as JSON;
      2. the outermost ``{...}`` span, which covers replies wrapped in a
         markdown code fence or surrounded by prose;
      3. a regex fallback that pulls the two quoted values out of
         non-JSON text.

    Args:
        text: Raw reply text. ``None`` is treated as an empty string.

    Returns:
        A dict. When JSON parsing succeeds the parsed object is returned
        as-is; JSON that is not an object (list, number, ...) is ignored.
        The regex fallback always returns both keys, using "" for a value
        it could not find.
    """
    if text is None:
        text = ""

    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        inner = text[start:end + 1]
        if inner != text:
            candidates.append(inner)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers use .get(), so only a JSON object is acceptable.
        if isinstance(parsed, dict):
            return parsed

    # Last resort: the non-greedy match stops at the first closing quote, so
    # values containing quotes are truncated here.
    instructed = re.search(r'"?instructed[_ ]?query"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
    reversed_q = re.search(r'"?reversed[_ ]?query"?[:\-]?\s*["“](.*?)["”]', text, re.I | re.S)
    return {
        "instructed_query": instructed.group(1).strip() if instructed else "",
        "reversed_query": reversed_q.group(1).strip() if reversed_q else ""
    }

# Async GPT-4 call for one entry

async def process_entry(entry):
    """Generate the instructed and reversed query for one dataset entry.

    Args:
        entry: Dict with at least a "query" key. The attribute keys
            "audience", "format", "language", "length" and "source" are read
            when present. The dict is updated in place on success.

    Returns:
        The same ``entry`` with "instructed_query" and "reversed_query"
        added, or ``None`` when the request fails or the reply does not
        contain both queries. A blank result is treated as a failure, so it
        is never written to the output file.

    Raises:
        KeyError: If ``entry`` has no "query" key.
    """
    query = entry["query"]
    # NOTE(review): the "keyword" attribute is not forwarded to the prompt —
    # confirm that this is intentional.
    attribute_names = ("audience", "format", "language", "length", "source")
    attributes = {name: entry.get(name, "") for name in attribute_names}

    prompt = build_prompt(query, attributes)

    try:
        response = await client.chat.completions.create(
            model=MODEL,
            temperature=0.4,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}  # force JSON output
        )
        # The message content can be None; treat that as an empty reply.
        content = (response.choices[0].message.content or "").strip()
        parsed = safe_json_parse(content)
    except Exception as e:
        print(f"❌ Error on {entry.get('query_id','?')}: {e}")
        return None

    if not isinstance(parsed, dict):
        parsed = {}
    instructed = parsed.get("instructed_query", "")
    reversed_q = parsed.get("reversed_query", "")
    if not (instructed and reversed_q):
        print(f"❌ Error on {entry.get('query_id','?')}: reply had no usable instructed/reversed query")
        return None

    entry["instructed_query"] = instructed
    entry["reversed_query"] = reversed_q
    return entry


# Async batch generator

async def generate_instructed_queries():
    """Run process_entry over the labeled dataset and write the results.

    Reads ``multi_attr_dataset.jsonl``, processes the entries concurrently
    (at most MAX_CONCURRENT_REQUESTS at a time) and writes every successful
    entry exactly once to ``multi_attr_instructed.jsonl``.

    The output file is emptied at the start and then appended to every
    SAVE_INTERVAL completions, plus once at the end. A counter tracks how
    many results are already on disk, so no record is written twice and a
    rerun does not stack onto the previous run's output.
    """
    input_file = "multi_attr_dataset.jsonl"
    output_file = "multi_attr_instructed.jsonl"

    # The input is written as UTF-8 with ensure_ascii=False; read it the same way.
    with open(input_file, "r", encoding="utf8") as f:
        data = [json.loads(line) for line in f]

    print(f"📘 Loaded {len(data)} entries from {input_file}")

    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    processed = []
    saved = 0  # number of items in `processed` already written to disk

    def flush():
        """Append the results that have not been written yet."""
        nonlocal saved
        with open(output_file, "a", encoding="utf8") as f:
            for p in processed[saved:]:
                f.write(json.dumps(p, ensure_ascii=False) + "\n")
        saved = len(processed)

    # Start from an empty file: every entry is regenerated on each run.
    with open(output_file, "w", encoding="utf8"):
        pass

    async def limited_process(entry):
        async with semaphore:
            result = await process_entry(entry)
            # Hold the slot briefly to space requests out.
            await asyncio.sleep(SLEEP_BETWEEN_REQUESTS)
            return result

    tasks = [limited_process(e) for e in data]

    for i, coro in enumerate(asyncio.as_completed(tasks), 1):
        result = await coro
        if result:
            processed.append(result)

        # Auto-save every N completions (failed entries count towards N)
        if i % SAVE_INTERVAL == 0:
            flush()
            print(f"💾 Auto-saved {i} entries...")

    # Final save: only whatever arrived after the last auto-save
    flush()

    print(f"✅ Finished generating {len(processed)} instructed queries!")
    print(f"📁 Output: {output_file}")


# Top-level `await` is accepted in an IPython/Jupyter cell; a plain .py script
# would have to wrap this call, e.g. with asyncio.run(...).
await generate_instructed_queries()


📘 Loaded 715 entries from multi_attr_dataset.jsonl
💾 Auto-saved 50 entries...
💾 Auto-saved 100 entries...
💾 Auto-saved 150 entries...
💾 Auto-saved 200 entries...
💾 Auto-saved 250 entries...
💾 Auto-saved 300 entries...
💾 Auto-saved 350 entries...
💾 Auto-saved 400 entries...
💾 Auto-saved 450 entries...
💾 Auto-saved 500 entries...
💾 Auto-saved 550 entries...
💾 Auto-saved 600 entries...
💾 Auto-saved 650 entries...
💾 Auto-saved 700 entries...
✅ Finished generating 715 instructed queries!
📁 Output: multi_attr_instructed.jsonl
