# HQF-DE: Document Expansion Pipeline

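Expand passages from the MS MARCO subset (1M documents with full qrels coverage) on a Colab GPU runtime. By default the notebook processes the first 100K documents; raise `TOTAL_DOCS` in the configuration cell to cover the full 1M.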

**Dataset:** `collection_subset.tsv` - 1M documents curated to include all qrels-relevant passages

**Estimated time for 100K docs:** ~5-6 hours on A100, ~15-20 hours on T4

In [None]:
# Check GPU
!nvidia-smi

In [None]:
# Install dependencies
!pip install -q torch transformers sentence-transformers scikit-learn pydantic-settings sentencepiece protobuf accelerate bitsandbytes tqdm

In [None]:
# Login to HuggingFace (required for Llama-3 access)
from huggingface_hub import login
login()  # Enter your HuggingFace token when prompted

In [None]:
# Mount Google Drive for checkpoints (important for long runs!)
from google.colab import drive
drive.mount('/content/drive')

# Create directories
!mkdir -p /content/drive/MyDrive/hqf_de/output
!mkdir -p /content/drive/MyDrive/hqf_de/checkpoints
!mkdir -p /content/data /content/cache

In [None]:
# Bring collection_subset.tsv (1M MS MARCO passages with qrels coverage) into the VM
import shutil
from google.colab import files

print("Upload your collection_subset.tsv file (1M MS MARCO passages)")
print("This file contains all documents referenced in qrels for proper evaluation")
uploaded = files.upload()

# Relocate every uploaded file into /content/data, keeping its uploaded name.
for uploaded_name in uploaded:
    destination = f'/content/data/{uploaded_name}'
    shutil.move(uploaded_name, destination)
    print(f"Moved {uploaded_name} to /content/data/")

In [None]:
# Create the hqf_de package
!mkdir -p hqf_de/models hqf_de/pipeline

In [None]:
%%writefile hqf_de/config.py
from pydantic_settings import BaseSettings
from pydantic import Field
from pathlib import Path

class HQFDEConfig(BaseSettings):
    """Central settings for the HQF-DE notebook: paths, model IDs and thresholds.

    Subclasses pydantic-settings' ``BaseSettings``; per that library's
    documentation, field values can also be supplied through the environment.
    """

    # --- Filesystem layout (Colab VM disk and the mounted Google Drive) ---
    project_root: Path = Field(default=Path("/content"))
    data_dir: Path = Field(default=Path("/content/data"))
    # Outputs and checkpoints default to Google Drive paths.
    output_dir: Path = Field(default=Path("/content/drive/MyDrive/hqf_de/output"))
    # Passed as cache_dir / cache_folder when the models are downloaded.
    cache_dir: Path = Field(default=Path("/content/cache"))
    checkpoint_dir: Path = Field(default=Path("/content/drive/MyDrive/hqf_de/checkpoints"))

    # --- Generative LLM used for gap analysis and expansion ---
    llm_model_name: str = Field(default="meta-llama/Meta-Llama-3-8B-Instruct")
    llm_max_new_tokens: int = Field(default=256)
    llm_temperature: float = Field(default=0.7)

    # --- Auxiliary models ---
    doc2query_model: str = Field(default="castorini/doc2query-t5-base-msmarco")
    # NOTE(review): confirm this model ID exists on the Hugging Face Hub.
    nli_model: str = Field(default="microsoft/deberta-v3-large-mnli")
    embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2")
    
    # --- Filtering thresholds and runtime ---
    # NOTE(review): in the code visible in this notebook, NLI, Embedder and
    # Combiner use their own hard-coded defaults (0.9 / 0.85) and do not read
    # these two values; confirm whether that is intended.
    nli_entailment_threshold: float = Field(default=0.9)
    dedup_similarity_threshold: float = Field(default=0.85)
    # Token limit applied when Doc2Query tokenizes a document.
    max_doc_length: int = Field(default=512)
    device: str = Field(default="cuda")

# Shared settings instance imported by the rest of the package.
config = HQFDEConfig()

In [None]:
%%writefile hqf_de/__init__.py
from .config import config
__all__ = ["config"]

In [None]:
%%writefile hqf_de/models/__init__.py
from .llm import LLM
from .doc2query import Doc2Query
from .nli import NLI
from .embeddings import Embedder
__all__ = ["LLM", "Doc2Query", "NLI", "Embedder"]

In [None]:
%%writefile hqf_de/models/llm.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from typing import List
from dataclasses import dataclass
import logging
from ..config import config

logger = logging.getLogger(__name__)

@dataclass
class Expansion:
    """Outcome of one LLM pass (gap analysis + expansion) over a document."""

    text: str  # the source document that was processed
    gaps: List[str]  # parsed items from the gap-analysis prompt
    expansions: List[str]  # parsed items from the expansion prompt


class LLM:
    """Lazy-loading wrapper around a causal LM used to expand documents.

    Expansion is a two-step prompt chain: ``gaps()`` asks the model which
    semantic gaps a document has, then ``expand()`` asks for short factual
    expansions given those gaps. ``run()`` performs both steps.
    """

    GAP_PROMPT = """Analyze this document and list semantic gaps (max 5):
{document}

Gaps:"""

    EXPAND_PROMPT = """Generate brief factual expansions for this document:
{document}

Gaps: {gaps}

Expansions:"""

    def __init__(self, model: str = None, device: str = None, quantize: bool = True):
        """Record settings only; weights are loaded on the first ``load()``.

        Args:
            model: Hub model ID; falls back to ``config.llm_model_name``.
            device: Device string; falls back to ``config.device``.
            quantize: When true, ``load()`` requests 4-bit quantization.
        """
        self.model_name = model or config.llm_model_name
        self.device = device or config.device
        self.quantize = quantize
        self.model = None
        self.tokenizer = None
        self.pipe = None
        self._loaded = False

    def load(self):
        """Load tokenizer, model and generation pipeline once; return ``self``."""
        if self._loaded:
            return self
        logger.info(f"Loading LLM: {self.model_name}")
        quant_config = None
        if self.quantize:
            quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
            )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=config.cache_dir)
        if self.tokenizer.pad_token is None:
            # Fall back to EOS so a pad_token_id can be passed to generation.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            cache_dir=config.cache_dir,
            quantization_config=quant_config,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=config.llm_max_new_tokens,
            temperature=config.llm_temperature,
            do_sample=True,
        )
        self._loaded = True
        return self

    def _fmt(self, doc: str, template: str, **kw) -> str:
        """Fill ``template`` and wrap it as a single user turn in chat markup.

        NOTE(review): the markup starts with an explicit ``<|begin_of_text|>``;
        confirm the tokenizer/pipeline does not prepend a second BOS token.
        """
        content = template.format(document=doc, **kw)
        return f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    def _gen(self, prompt: str) -> str:
        """Generate a completion for ``prompt`` and return it stripped."""
        if not self._loaded:
            self.load()
        result = self.pipe(prompt, return_full_text=False, pad_token_id=self.tokenizer.pad_token_id)
        return result[0]["generated_text"].strip()

    def _parse(self, text: str) -> List[str]:
        """Turn a generated list into at most five clean items.

        Blank lines and lines starting with ``#`` are dropped. A leading list
        marker is removed: ``1.`` / ``1)`` numbering, a ``-`` dash, or a
        ``* `` / ``• `` bullet. Items of five characters or fewer are dropped.

        Only a genuine numbering marker is stripped. A line that merely starts
        with a digit ("1889 was ...", "3.5 million ...") is kept intact rather
        than being cut at its first period.
        """
        import re  # local import keeps this block self-contained

        items = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            # Digits followed directly by "." or ")" and NOT by another digit,
            # so decimals such as "3.5" are not mistaken for numbering.
            line = re.sub(r"^\d+[.)](?!\d)\s*", "", line)
            if line.startswith("-"):
                line = line[1:].strip()
            elif line[:2] in ("* ", "• "):
                line = line[2:].strip()
            if len(line) > 5:
                items.append(line)
        return items[:5]

    def gaps(self, doc: str) -> List[str]:
        """Return the parsed gap list the model produces for ``doc``."""
        return self._parse(self._gen(self._fmt(doc, self.GAP_PROMPT)))

    def expand(self, doc: str, gaps: List[str] = None) -> List[str]:
        """Return parsed expansions for ``doc``; ``gaps`` is rendered one per line ("none" if empty)."""
        gaps_text = "\n".join(gaps) if gaps else "none"
        return self._parse(self._gen(self._fmt(doc, self.EXPAND_PROMPT, gaps=gaps_text)))

    def run(self, doc: str) -> Expansion:
        """Run gap analysis followed by expansion and bundle both results."""
        g = self.gaps(doc)
        e = self.expand(doc, g)
        return Expansion(text=doc, gaps=g, expansions=e)

    def unload(self):
        """Drop model, tokenizer and pipeline references and empty the CUDA cache."""
        if self.model is not None:
            self.model = self.tokenizer = self.pipe = None
            self._loaded = False
            torch.cuda.empty_cache()

In [None]:
%%writefile hqf_de/models/doc2query.py
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from typing import List
import logging
from ..config import config

logger = logging.getLogger(__name__)

class Doc2Query:
    """Lazy-loading T5 wrapper that samples candidate queries for a document."""

    def __init__(self, model: str = None, device: str = None, n: int = 5):
        """Store settings; ``n`` is the default number of sampled sequences."""
        self.model_name = model or config.doc2query_model
        self.device = device or config.device
        self.n = n
        self.model = None
        self.tokenizer = None
        self._loaded = False

    def load(self):
        """Load tokenizer and model on first use, switch to eval mode, return ``self``."""
        if not self._loaded:
            logger.info(f"Loading doc2query: {self.model_name}")
            self.tokenizer = T5Tokenizer.from_pretrained(self.model_name, cache_dir=config.cache_dir)
            t5 = T5ForConditionalGeneration.from_pretrained(self.model_name, cache_dir=config.cache_dir)
            self.model = t5.to(self.device)
            self.model.eval()
            self._loaded = True
        return self

    def gen(self, doc: str, n: int = None) -> List[str]:
        """Sample queries for ``doc`` and return the distinct non-empty ones.

        ``n`` falls back to the instance default when it is falsy. The
        returned list keeps first-seen order and may be shorter than ``n``
        because duplicates and empty decodes are dropped.
        """
        if not self._loaded:
            self.load()
        num_queries = n or self.n
        encoded = self.tokenizer(
            doc,
            max_length=config.max_doc_length,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)
        with torch.no_grad():
            generated = self.model.generate(
                **encoded,
                max_length=64,
                do_sample=True,
                top_k=10,
                num_return_sequences=num_queries,
            )
        decoded = (self.tokenizer.decode(seq, skip_special_tokens=True).strip() for seq in generated)
        # dict.fromkeys keeps insertion order, giving an order-preserving de-dup.
        return list(dict.fromkeys(query for query in decoded if query))

    def unload(self):
        """Release model and tokenizer and empty the CUDA cache (no-op if not loaded)."""
        if not self.model:
            return
        self.model = None
        self.tokenizer = None
        self._loaded = False
        torch.cuda.empty_cache()

In [None]:
%%writefile hqf_de/models/nli.py
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from typing import List, Tuple
from dataclasses import dataclass
import logging
from ..config import config

logger = logging.getLogger(__name__)

@dataclass
class NLIResult:
    """Verdict for one premise/hypothesis pair."""

    hypothesis: str  # the candidate text that was checked
    entailment: float  # score of the "entailment" label (0.0 if scoring failed)
    valid: bool  # True when entailment >= the checker's threshold


class NLI:
    """Lazy-loading entailment checker built on a text-classification pipeline."""

    def __init__(self, model: str = None, device: str = None, threshold: float = 0.9):
        """Store settings only; the pipeline is created by ``load()``.

        Args:
            model: Hub model ID; falls back to ``config.nli_model``.
            device: Device for the pipeline; falls back to ``config.device``.
            threshold: Minimum entailment score for a hypothesis to be valid.
        """
        self.model_name = model or config.nli_model
        self.device = device or config.device
        self.threshold = threshold
        self.pipe = None
        self._loaded = False

    def load(self):
        """Create the pipeline once (returning all label scores); return ``self``."""
        if self._loaded:
            return self
        logger.info(f"Loading NLI: {self.model_name}")
        # Honour the configured device instead of hard-coding GPU 0.
        self.pipe = pipeline("text-classification", model=self.model_name, device=self.device, top_k=None)
        self._loaded = True
        return self

    @staticmethod
    def _label_scores(output) -> dict:
        """Map lower-cased label -> score from the pipeline output.

        The output may be a flat list of ``{"label", "score"}`` dicts or the
        same list wrapped in an outer list, so nesting is unwrapped first.
        """
        rows = [output] if isinstance(output, dict) else output
        while rows and isinstance(rows[0], (list, tuple)):
            rows = rows[0]
        return {row["label"].lower(): row["score"] for row in rows}

    def check(self, premise: str, hypothesis: str) -> NLIResult:
        """Score whether ``premise`` entails ``hypothesis``.

        The two texts are passed as a real text pair, so the tokenizer inserts
        its own separator and pair-aware truncation applies, rather than
        truncating one concatenated string from the right.

        Scoring stays best-effort: on any error the hypothesis is reported as
        not entailed, but the failure is logged instead of being swallowed.
        """
        if not self._loaded:
            self.load()
        try:
            output = self.pipe(
                {"text": premise, "text_pair": hypothesis},
                truncation=True,
                max_length=512,
            )
            scores = self._label_scores(output)
        except Exception as exc:
            logger.warning("NLI scoring failed (%s); treating hypothesis as not entailed", exc)
            return NLIResult(hypothesis=hypothesis, entailment=0.0, valid=False)
        ent = scores.get("entailment", 0.0)
        return NLIResult(hypothesis=hypothesis, entailment=ent, valid=ent >= self.threshold)

    def validate(self, doc: str, expansions: List[str]) -> Tuple[List[str], List[NLIResult]]:
        """Check each expansion against ``doc``.

        Returns:
            ``(valid, results)``: the expansions that passed, and one
            ``NLIResult`` per input expansion in input order.
        """
        if not self._loaded:
            self.load()
        valid, results = [], []
        for exp in expansions:
            r = self.check(doc, exp)
            results.append(r)
            if r.valid:
                valid.append(exp)
        return valid, results

    def unload(self):
        """Drop the pipeline reference and empty the CUDA cache."""
        if self.pipe is not None:
            self.pipe = None
            self._loaded = False
            torch.cuda.empty_cache()

In [None]:
%%writefile hqf_de/models/embeddings.py
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple
import logging
from ..config import config

logger = logging.getLogger(__name__)

class Embedder:
    """Lazy-loading sentence embedder with de-duplication and selection helpers."""

    def __init__(self, model: str = None, device: str = None):
        """Store settings only; the model is created by ``load()``."""
        self.model_name = model or config.embedding_model
        self.device = device or config.device
        self.model = None
        self._loaded = False

    def load(self):
        """Instantiate the SentenceTransformer once; return ``self``."""
        if self._loaded:
            return self
        logger.info(f"Loading embedder: {self.model_name}")
        self.model = SentenceTransformer(self.model_name, cache_folder=str(config.cache_dir), device=self.device)
        self._loaded = True
        return self

    def encode(self, texts: List[str]) -> np.ndarray:
        """Return normalized embeddings for ``texts`` as a NumPy array."""
        if not self._loaded:
            self.load()
        return self.model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)

    def dedup(self, texts: List[str], threshold: float = 0.85) -> Tuple[List[str], List[int]]:
        """Greedily drop texts that are near-duplicates of an earlier kept text.

        Texts are scanned in order. Each kept text removes every later text
        whose cosine similarity to it is >= ``threshold``.

        Returns:
            ``(kept_texts, kept_indices)`` in original order.
        """
        if len(texts) <= 1:
            return texts, list(range(len(texts)))
        embs = self.encode(texts)
        sim = cosine_similarity(embs)
        kept, indices, removed = [], [], set()
        for i in range(len(texts)):
            if i in removed:
                continue
            kept.append(texts[i])
            indices.append(i)
            for j in range(i + 1, len(texts)):
                if sim[i, j] >= threshold:
                    removed.add(j)
        return kept, indices

    def dedup_vs_doc(self, doc: str, expansions: List[str], threshold: float = 0.85) -> List[str]:
        """Keep only expansions whose similarity to ``doc`` is below ``threshold``."""
        if not expansions:
            return []
        doc_emb = self.encode([doc])
        exp_embs = self.encode(expansions)
        sims = cosine_similarity(exp_embs, doc_emb).flatten()
        return [e for e, s in zip(expansions, sims) if s < threshold]

    def select(self, expansions: List[str], n: int = 5, doc: str = None) -> List[str]:
        """Greedily pick ``n`` expansions balancing relevance and diversity.

        Each round scores every unpicked candidate as
        ``0.5 * relevance + 0.5 * diversity``. Relevance is the similarity to
        ``doc`` (1.0 when no doc is given). Diversity is one minus the highest
        similarity to anything already picked (1.0 for the first pick). Ties
        go to the earliest candidate. If there are at most ``n`` expansions
        they are returned unchanged.

        The pairwise similarity matrix is computed once up front rather than
        once per candidate per round.
        """
        if len(expansions) <= n:
            return expansions
        embs = self.encode(expansions)
        if doc:
            rel = cosine_similarity(embs, self.encode([doc])).flatten()
        else:
            rel = np.ones(len(expansions))
        pair_sim = cosine_similarity(embs)
        chosen: List[int] = []  # pick order, also used to index pair_sim
        chosen_set = set()  # O(1) membership test
        for _ in range(n):
            best_idx, best_score = -1, -float("inf")
            for i in range(len(expansions)):
                if i in chosen_set:
                    continue
                div = 1.0 - pair_sim[i, chosen].max() if chosen else 1.0
                score = 0.5 * rel[i] + 0.5 * div
                if score > best_score:
                    best_score, best_idx = score, i
            if best_idx < 0:
                break  # no candidate with a comparable score is left
            chosen.append(best_idx)
            chosen_set.add(best_idx)
        return [expansions[i] for i in chosen]

    def unload(self):
        """Drop the model reference and empty the CUDA cache."""
        # Explicit None check: do not rely on the model object's truthiness.
        if self.model is not None:
            self.model = None
            self._loaded = False
            torch.cuda.empty_cache()

In [None]:
%%writefile hqf_de/pipeline/__init__.py
from .expander import Expander
from .combiner import Combiner
__all__ = ["Expander", "Combiner"]

In [None]:
%%writefile hqf_de/pipeline/combiner.py
from typing import List, Dict, Any
from dataclasses import dataclass, field
from ..models.embeddings import Embedder

# Filler words; an expansion made up mostly of these is discarded by Combiner._filter.
GENERIC = {"information", "details", "things", "stuff", "content", "topic", "subject", "matter", "example", "case", "way", "method", "people", "time", "place", "thing"}

@dataclass
class Combined:
    """Result of merging semantic expansions and generated queries for one document."""

    original: str  # the untouched document text
    semantic: List[str] = field(default_factory=list)  # semantic expansions that survived filtering
    queries: List[str] = field(default_factory=list)  # queries that survived filtering
    final: List[str] = field(default_factory=list)  # de-duplicated, capped list appended to the doc
    text: str = ""  # document followed by the final expansions

class Combiner:
    """Filters, de-duplicates and caps expansions, then appends them to the document."""

    def __init__(self, embedder: Embedder = None, threshold: float = 0.85, max_exp: int = 10):
        """
        Args:
            embedder: Embedder to use; a fresh one is created when omitted.
            threshold: Cosine-similarity cutoff used for both de-dup passes.
            max_exp: Maximum number of expansions appended to a document.
        """
        self.embedder = embedder or Embedder()
        self.threshold = threshold
        self.max_exp = max_exp
        self._loaded = False

    def _load(self):
        """Load the embedder the first time it is needed."""
        if not self._loaded:
            self.embedder.load()
            self._loaded = True

    def _filter(self, expansions: List[str], doc: str = None) -> List[str]:
        """Drop low-value candidates using cheap lexical rules.

        A candidate is removed when it has fewer than 3 or more than 50
        words, when more than half of its words are in ``GENERIC``, or when it
        already appears verbatim (case-insensitively) in ``doc``.
        """
        # Lower-case the document once instead of once per candidate.
        doc_lower = doc.lower() if doc else None
        out = []
        for e in expansions:
            e = e.strip()
            words = e.lower().split()
            if not 3 <= len(words) <= 50:
                continue
            if sum(w in GENERIC for w in words) / len(words) > 0.5:
                continue
            if doc_lower is not None and e.lower() in doc_lower:
                continue
            out.append(e)
        return out

    def combine(self, doc: str, semantic: List[str], queries: List[str]) -> Combined:
        """Merge both expansion sources into the final expanded text.

        Steps: lexical filter, pairwise de-dup, removal of near-duplicates of
        the document, then a cap at ``max_exp`` (diversity-aware selection
        when there are more candidates than the cap).
        """
        self._load()
        sem = self._filter(semantic, doc)
        q = self._filter(queries, doc)
        all_exp = sem + q
        if len(all_exp) > 1:
            # Pairwise de-dup only makes sense with at least two candidates.
            all_exp, _ = self.embedder.dedup(all_exp, self.threshold)
        if all_exp:
            # A single surviving candidate can still paraphrase the document,
            # so this check runs whenever anything is left.
            all_exp = self.embedder.dedup_vs_doc(doc, all_exp, self.threshold)
        if len(all_exp) > self.max_exp:
            final = self.embedder.select(all_exp, self.max_exp, doc)
        else:
            final = all_exp[:self.max_exp]
        # Joining as one list avoids a dangling space when nothing is appended.
        text = " ".join([doc, *final])
        return Combined(original=doc, semantic=sem, queries=q, final=final, text=text)

    def unload(self):
        """Unload the embedder if this combiner loaded it."""
        if self._loaded:
            self.embedder.unload()
            self._loaded = False

In [None]:
%%writefile hqf_de/pipeline/expander.py
from typing import List, Dict, Any
from dataclasses import dataclass, field
import logging
from ..models.llm import LLM
from ..models.nli import NLI
from ..models.doc2query import Doc2Query
from ..models.embeddings import Embedder
from .combiner import Combiner
from ..config import config

logger = logging.getLogger(__name__)

@dataclass
class Result:
    """Per-document record of every pipeline stage."""

    doc_id: str
    original: str  # untouched document text
    expanded: str  # document plus appended expansions
    gaps: List[str] = field(default_factory=list)  # gaps reported by the LLM
    raw: List[str] = field(default_factory=list)  # LLM expansions before NLI
    valid: List[str] = field(default_factory=list)  # expansions returned by NLI validation
    queries: List[str] = field(default_factory=list)  # Doc2Query output
    final: List[str] = field(default_factory=list)  # expansions actually appended

class Expander:
    """Runs the full pipeline: LLM expansion, NLI validation, Doc2Query, combine.

    Each stage is optional and best-effort. A failing stage is logged and the
    pipeline continues with whatever the remaining stages produced. Usable as
    a context manager, which loads on entry and unloads on exit.
    """

    def __init__(self, use_llm: bool = True, use_nli: bool = True, use_d2q: bool = True):
        """Create (but do not load) the enabled stage models and the combiner."""
        self.llm = LLM() if use_llm else None
        self.nli = NLI() if use_nli else None
        self.d2q = Doc2Query() if use_d2q else None
        self.combiner = Combiner(Embedder())
        self._loaded = False

    @staticmethod
    def _join(doc: str, extras: List[str]) -> str:
        """Append ``extras`` to ``doc`` separated by single spaces.

        With no extras the document is returned as-is, matching the
        ``expanded=doc`` default used in ``expand``.
        """
        return " ".join([doc, *extras])

    def load(self):
        """Load every enabled stage model once; return ``self``."""
        if self._loaded:
            return self
        if self.llm:
            self.llm.load()
        if self.nli:
            self.nli.load()
        if self.d2q:
            self.d2q.load()
        self._loaded = True
        return self

    def expand(self, doc_id: str, doc: str) -> Result:
        """Expand one document through all enabled stages.

        Never raises for stage failures. If the combiner itself fails, the
        validated expansions and queries are appended as-is, capped at the
        combiner's ``max_exp``.
        """
        if not self._loaded:
            self.load()
        result = Result(doc_id=doc_id, original=doc, expanded=doc)
        raw = []
        if self.llm:
            try:
                exp = self.llm.run(doc)
                result.gaps = exp.gaps
                raw = exp.expansions
                result.raw = raw
            except Exception as e:
                # logger.exception keeps the traceback, which logger.error dropped.
                logger.exception("LLM error: %s", e)
        # Without NLI, or if NLI fails, the raw expansions are passed on unvalidated.
        valid = raw
        if self.nli and raw:
            try:
                valid, _ = self.nli.validate(doc, raw)
                result.valid = valid
            except Exception as e:
                logger.exception("NLI error: %s", e)
        queries = []
        if self.d2q:
            try:
                queries = self.d2q.gen(doc)
                result.queries = queries
            except Exception as e:
                logger.exception("D2Q error: %s", e)
        try:
            combined = self.combiner.combine(doc, valid, queries)
            result.final = combined.final
            result.expanded = combined.text
        except Exception as e:
            logger.exception("Combiner error: %s", e)
            result.final = (valid + queries)[:self.combiner.max_exp]
            result.expanded = self._join(doc, result.final)
        return result

    def d2q_only(self, doc_id: str, doc: str) -> Result:
        """Baseline: append Doc2Query output only (no LLM, NLI or combiner)."""
        if self.d2q and not self.d2q._loaded:
            self.d2q.load()
        queries = self.d2q.gen(doc) if self.d2q else []
        return Result(doc_id=doc_id, original=doc, expanded=self._join(doc, queries), queries=queries, final=queries)

    def unload(self):
        """Unload every stage model and the combiner's embedder."""
        if self.llm:
            self.llm.unload()
        if self.nli:
            self.nli.unload()
        if self.d2q:
            self.d2q.unload()
        self.combiner.unload()
        self._loaded = False

    def __enter__(self):
        return self.load()

    def __exit__(self, *args):
        self.unload()

## Load Models and Test

In [None]:
import logging
# INFO level makes the "Loading ..." messages emitted by the model wrappers visible.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

from hqf_de.pipeline.expander import Expander

# Smoke test: load every model once and expand a single short document.
# `exp` is deliberately left as a notebook global; the processing cell further
# down checks `exp._loaded` and reuses this instance instead of reloading.
exp = Expander()
exp.load()

test_doc = "The Eiffel Tower is a famous landmark in Paris, France. It was built in 1889."
result = exp.expand("test", test_doc)

# Show the intermediate stages so an empty list points at the stage that produced nothing.
print("\n" + "="*60)
print("TEST RESULT:")
print(f"Gaps: {result.gaps}")
print(f"Valid expansions: {result.valid}")
print(f"Queries: {result.queries}")
print(f"Final: {result.final}")
print("="*60)

## Process 100K Documents with Checkpointing

In [None]:
import csv
import json
import os
from tqdm import tqdm
from datetime import datetime
from hqf_de.config import config

# Configuration - Process 100K documents from the 1M subset
TOTAL_DOCS = 100000  # Change to 1000000 to process all 1M docs
CHECKPOINT_EVERY = 1000  # Save every 1000 docs
INPUT_FILE = '/content/data/collection_subset.tsv'  # 1M docs with qrels coverage
OUTPUT_FILE = str(config.output_dir / 'expanded_hqfde.tsv')
D2Q_OUTPUT_FILE = str(config.output_dir / 'expanded_d2q.tsv')
CHECKPOINT_FILE = str(config.checkpoint_dir / 'checkpoint.json')

def load_docs(path, limit=None, skip=0):
    """Load ``(doc_id, text)`` pairs from a raw tab-separated collection file.

    Args:
        path: TSV file with the id in column 0 and the passage in column 1.
        limit: Maximum number of documents to return. ``None`` means no
            limit; ``0`` (or a negative value) returns an empty list.
        skip: Number of leading *valid* documents to skip. Rows with fewer
            than two columns are ignored and never counted, so ``skip`` lines
            up with a count of documents previously returned by this function.

    Returns:
        List of ``(doc_id, text)`` tuples in file order.
    """
    if limit is not None and limit <= 0:
        return []
    docs = []
    skipped = 0
    with open(path, 'r', encoding='utf-8', newline='') as f:
        # The collection is plain TSV, not CSV-quoted. QUOTE_NONE stops a
        # passage that begins with '"' from swallowing the rows that follow.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) < 2:
                continue
            if skipped < skip:
                skipped += 1
                continue
            docs.append((row[0], row[1]))
            if limit is not None and len(docs) >= limit:
                break
    return docs

def save_checkpoint(processed, output_file):
    """Persist progress so an interrupted run can resume.

    The checkpoint is written to a sibling temporary file and then moved over
    ``CHECKPOINT_FILE`` with ``os.replace``. An interruption during the write
    therefore cannot leave a half-written JSON file, which ``load_checkpoint``
    would fail to parse.

    Args:
        processed: Total number of documents written to the output so far.
        output_file: Path of the output file the count refers to.
    """
    payload = {
        'processed': processed,
        'output_file': output_file,
        'timestamp': datetime.now().isoformat()
    }
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(payload, f)
    os.replace(tmp_path, CHECKPOINT_FILE)
    print(f"Checkpoint saved: {processed} documents processed")

def load_checkpoint():
    """Return the saved checkpoint dict, or None when no checkpoint file exists."""
    if not os.path.exists(CHECKPOINT_FILE):
        return None
    with open(CHECKPOINT_FILE, 'r') as checkpoint_fh:
        state = json.load(checkpoint_fh)
    return state

def append_results(results, path):
    """Append one ``doc_id<TAB>expanded`` row per result to the TSV at ``path``.

    The file is created when missing. Rows are written with the csv module's
    default dialect apart from the tab delimiter, so rows end in CRLF and a
    field containing a double quote is written quoted.
    NOTE(review): confirm the downstream indexer expects that format.
    """
    file_mode = 'w' if not os.path.exists(path) else 'a'
    rows = ([item.doc_id, item.expanded] for item in results)
    with open(path, file_mode, encoding='utf-8', newline='') as out:
        csv.writer(out, delimiter='\t').writerows(rows)

print(f"Configuration:")
print(f"  Input: {INPUT_FILE}")
print(f"  Total documents to process: {TOTAL_DOCS:,}")
print(f"  Checkpoint every: {CHECKPOINT_EVERY:,}")
print(f"  Output: {OUTPUT_FILE}")

In [None]:
# Check for an existing checkpoint and decide where to start
checkpoint = load_checkpoint()
start_from = 0

if checkpoint:
    start_from = checkpoint['processed']
    print(f"Resuming from checkpoint: {start_from:,} documents already processed")
    print(f"Last saved: {checkpoint['timestamp']}")
    # Resuming appends to OUTPUT_FILE; if it is gone, the earlier rows are lost
    # and the result would silently cover only part of the collection.
    if not os.path.exists(OUTPUT_FILE):
        print(f"WARNING: checkpoint found but {OUTPUT_FILE} is missing - "
              f"delete {CHECKPOINT_FILE} to restart from scratch")
else:
    print("Starting fresh - no checkpoint found")
    # Clear output files if starting fresh
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

# Clamp at zero: TOTAL_DOCS may have been lowered below the checkpointed count.
remaining = max(TOTAL_DOCS - start_from, 0)
print(f"Documents remaining: {remaining:,}")

In [None]:
# Run HQF-DE expansion on 100K documents
from hqf_de.pipeline.expander import Expander, Result
import time

# Load documents. When the checkpoint already covers TOTAL_DOCS there is
# nothing left to do; load_docs is not called with a zero limit, because a
# falsy limit may be interpreted as "no limit".
docs_to_load = max(TOTAL_DOCS - start_from, 0)
print(f"Loading documents from {start_from:,}...")
docs = load_docs(INPUT_FILE, limit=docs_to_load, skip=start_from) if docs_to_load else []
print(f"Loaded {len(docs):,} documents")

# Reuse the expander from the test cell when it exists and is still loaded;
# otherwise build one (also covers the case where the test cell was skipped).
if 'exp' not in globals() or not exp._loaded:
    exp = Expander()
    exp.load()

# Process with checkpointing
batch_results = []
processed = start_from
start_time = time.time()

for i, (doc_id, text) in enumerate(tqdm(docs, desc="HQF-DE Expansion")):
    try:
        result = exp.expand(doc_id, text)
        batch_results.append(result)
    except Exception as e:
        print(f"Error on doc {doc_id}: {e}")
        # Keep the original doc on error so output rows stay aligned with input
        batch_results.append(Result(doc_id=doc_id, original=text, expanded=text))

    # Checkpoint every N documents
    if (i + 1) % CHECKPOINT_EVERY == 0:
        append_results(batch_results, OUTPUT_FILE)
        processed += len(batch_results)
        save_checkpoint(processed, OUTPUT_FILE)

        # Print stats
        elapsed = time.time() - start_time
        rate = (i + 1) / elapsed
        remaining_docs = len(docs) - (i + 1)
        eta = remaining_docs / rate if rate > 0 else 0
        print(f"  Rate: {rate:.2f} docs/sec | ETA: {eta/3600:.1f} hours")

        batch_results = []

# Save final (partial) batch
if batch_results:
    append_results(batch_results, OUTPUT_FILE)
    processed += len(batch_results)
    save_checkpoint(processed, OUTPUT_FILE)

total_time = time.time() - start_time
print(f"\nHQF-DE Complete!")
print(f"Total processed: {processed:,} documents")
print(f"Total time: {total_time/3600:.2f} hours")
print(f"Output: {OUTPUT_FILE}")

In [None]:
# Now run Doc2Query-only baseline for comparison
print("\nRunning Doc2Query-only baseline...")

# Unload full pipeline, load d2q-only
exp.unload()
exp_d2q = Expander(use_llm=False, use_nli=False, use_d2q=True)
exp_d2q.load()

# Reload all docs for d2q
docs = load_docs(INPUT_FILE, limit=TOTAL_DOCS)
print(f"Processing {len(docs):,} documents with Doc2Query only...")

# Write each row as soon as it is produced instead of collecting all results
# first, so a crash or disconnect late in the run keeps the finished rows.
# As in the HQF-DE loop, a failing document falls back to its original text so
# one bad input cannot abort the run.
d2q_failures = 0
with open(D2Q_OUTPUT_FILE, 'w', encoding='utf-8', newline='') as f:
    w = csv.writer(f, delimiter='\t')
    for doc_id, text in tqdm(docs, desc="Doc2Query"):
        try:
            expanded = exp_d2q.d2q_only(doc_id, text).expanded
        except Exception as e:
            print(f"Error on doc {doc_id}: {e}")
            expanded = text
            d2q_failures += 1
        w.writerow([doc_id, expanded])

if d2q_failures:
    print(f"Doc2Query failed on {d2q_failures:,} documents (original text kept)")
print(f"Doc2Query baseline saved: {D2Q_OUTPUT_FILE}")
exp_d2q.unload()

In [None]:
# Send both result files to the browser
from google.colab import files

print("Downloading expanded files...")
for result_path in (OUTPUT_FILE, D2Q_OUTPUT_FILE):
    files.download(result_path)

closing_lines = (
    "\nDone! Files downloaded:",
    f"  1. expanded_hqfde.tsv - Full HQF-DE pipeline",
    f"  2. expanded_d2q.tsv - Doc2Query baseline",
    f"\nThese files contain docs from collection_subset.tsv which has 100% qrels coverage.",
    f"Use these with your C++ indexer for evaluation.",
)
for closing_line in closing_lines:
    print(closing_line)

## Summary Statistics

In [None]:
# Report output sizes and the follow-up evaluation steps
import os

hqf_size, d2q_size = (
    os.path.getsize(result_path) / (1024*1024)
    for result_path in (OUTPUT_FILE, D2Q_OUTPUT_FILE)
)

banner = "="*60
summary_lines = [
    banner,
    "SUMMARY",
    banner,
    f"Documents processed: {TOTAL_DOCS:,}",
    f"Input file: collection_subset.tsv (1M docs with 100% qrels coverage)",
    f"",
    f"HQF-DE output: {hqf_size:.1f} MB",
    f"Doc2Query output: {d2q_size:.1f} MB",
    f"",
    "Next steps:",
    "  1. Download the TSV files",
    "  2. Run your C++ indexer on both files",
    "  3. Evaluate with qrels.dev.tsv, qrels.eval.one.tsv, qrels.eval.two.tsv",
    "  4. Compare nDCG@10 and Recall@1000 between HQF-DE and Doc2Query",
    banner,
]
for summary_line in summary_lines:
    print(summary_line)