In [2]:
# !pip install transformers
# !pip install datasets
# !pip install faiss-cpu
import os
import torch
import numpy as np
from torch import nn
from datasets import load_from_disk, Dataset
from transformers import (
    BertModel,
    BertTokenizer,
    BertConfig,
    TrainingArguments,
    Trainer
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class BiEncoderQA(nn.Module):
    def __init__(self, model_name="bert-base-uncased"):
        super().__init__()  # Correct init for modern PyTorch
        self.clue_encoder = BertModel.from_pretrained(model_name)
        self.answer_encoder = BertModel.from_pretrained(model_name)
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self,
                clue_input_ids, clue_attention_mask,
                answer_input_ids, answer_attention_mask):
        clue_outputs = self.clue_encoder(input_ids=clue_input_ids, attention_mask=clue_attention_mask)
        clue_emb = clue_outputs.last_hidden_state[:, 0, :]

        answer_outputs = self.answer_encoder(input_ids=answer_input_ids, attention_mask=answer_attention_mask)
        answer_emb = answer_outputs.last_hidden_state[:, 0, :]

        sim_matrix = torch.matmul(clue_emb, answer_emb.transpose(0, 1))
        targets = torch.arange(sim_matrix.size(0)).to(sim_matrix.device)
        loss = self.loss_fct(sim_matrix, targets)

        return {
            "loss": loss,
            "sim_matrix": sim_matrix,
            "clue_emb": clue_emb,
            "answer_emb": answer_emb,
        }


In [3]:
# Unzip the file into a folder named 'crosswordqa_segmented'
!unzip /content/crosswordqa_segmented.zip -d /content/crosswordqa_segmented

Archive:  /content/crosswordqa_segmented.zip
   creating: /content/crosswordqa_segmented/crosswordqa_segmented/
  inflating: /content/crosswordqa_segmented/crosswordqa_segmented/dataset_dict.json  
   creating: /content/crosswordqa_segmented/crosswordqa_segmented/validation/
  inflating: /content/crosswordqa_segmented/crosswordqa_segmented/validation/data-00000-of-00001.arrow  
  inflating: /content/crosswordqa_segmented/crosswordqa_segmented/validation/dataset_info.json  
  inflating: /content/crosswordqa_segmented/crosswordqa_segmented/validation/state.json  


In [5]:
# 2) Load the crosswordqa_segmented dataset from disk
dataset_path = "/content/crosswordqa_segmented/crosswordqa_segmented"
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at: {dataset_path}")

ds = load_from_disk(dataset_path)
print(ds)

# We assume ds has at least a "train" split, with fields: "clue", "segmented_answer"



FileNotFoundError: No such files: '/content/crosswordqa_segmented/crosswordqa_segmented/train/dataset_info.json', nor '/content/crosswordqa_segmented/crosswordqa_segmented/train/state.json' found. Expected to load a `Dataset` object but provided path is not a `Dataset`.

In [None]:
def tokenize_qa(batch):
    # batch["clue"] and batch["answer"] are lists of strings
    clue_encoding = tokenizer(
        batch["clue"],
        truncation=True,
        padding="max_length",
        max_length=32
    )
    answer_encoding = tokenizer(
        batch["answer"],
        truncation=True,
        padding="max_length",
        max_length=32
    )

    return {
        "clue_input_ids": clue_encoding["input_ids"],
        "clue_attention_mask": clue_encoding["attention_mask"],
        "answer_input_ids": answer_encoding["input_ids"],
        "answer_attention_mask": answer_encoding["attention_mask"],
    }


In [None]:
# We'll create a new train_dataset with "clue" and "answer" fields,
# copying from "clue" and "segmented_answer" in the ds["train"].
def map_to_train_fields(example):
    return {
        "clue": example["clue"],
        "answer": example["segmented_answer"]
    }


In [None]:
# We map the training split to unify field names for tokenize_qa
train_dataset = ds["train"].map(map_to_train_fields,batched=True)
train_dataset = train_dataset.filter(lambda x: x["clue"] and x["answer"])
# Now tokenize
train_dataset = train_dataset.map(tokenize_qa, batched=True)

print(train_dataset)
train_dataset.save_to_disk("train_dataset_bert_path")

Map:   0%|          | 0/6420790 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6420790 [00:00<?, ? examples/s]

Map:   0%|          | 0/6420133 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'clue', 'answer', 'segmented_answer', 'clue_input_ids', 'clue_attention_mask', 'answer_input_ids', 'answer_attention_mask'],
    num_rows: 6420133
})


Saving the dataset (0/15 shards):   0%|          | 0/6420133 [00:00<?, ? examples/s]

In [4]:
train_dataset = load_from_disk("train_dataset_bert_path")
len(train_dataset)

6420133

In [19]:
small_train = train_dataset.train_test_split(test_size=0.10, seed=42)["test"]
small_train.save_to_disk("small_train_dataset10")

Saving the dataset (2/2 shards): 100%|██████████| 642014/642014 [04:07<00:00, 2591.37 examples/s]


In [None]:
import transformers
print("Transformers version:", transformers.__version__)
print("Transformers path:", transformers.__file__)


Transformers version: 4.51.1
Transformers path: d:\Restricted_project\Andi_Mandi_Shandi_iske_andar_gya_toh\BKL_aa_hi_gye_naa\toh_suno_TMKC\myenv\lib\site-packages\transformers\__init__.py


In [4]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

class BiEncoderDataCollator:
    def __call__(self, features):
        # Convert list of dicts into dict of lists
        batch = {
            "clue_input_ids": torch.tensor([f["clue_input_ids"] for f in features], dtype=torch.long),
            "clue_attention_mask": torch.tensor([f["clue_attention_mask"] for f in features], dtype=torch.long),
            "answer_input_ids": torch.tensor([f["answer_input_ids"] for f in features], dtype=torch.long),
            "answer_attention_mask": torch.tensor([f["answer_attention_mask"] for f in features], dtype=torch.long),
        }
        return batch


In [5]:
# 4) Initialize the model

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


from transformers import BertModel, BertTokenizer, BertConfig, TrainingArguments, Trainer
model = BiEncoderQA("bert-base-uncased")

# 5) Set up TrainingArguments
training_args = TrainingArguments(
    output_dir="./bi_encoder_qa",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
    eval_strategy="no",  # Disable evaluation during training
    fp16=torch.cuda.is_available()
)

# args = TrainingArguments(
#     output_dir="./test_output",
#     evaluation_strategy="epoch",
#     num_train_epochs=1,
#     per_device_train_batch_size=4
# )

print("✅ TrainingArguments initialized successfully!")




✅ TrainingArguments initialized successfully!


In [6]:
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("GPU:", torch.cuda.get_device_name(0))


Torch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1
GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [7]:
# 6) Initialize Trainer
small_train=load_from_disk("small_train_dataset10")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    data_collator=BiEncoderDataCollator(), 
    tokenizer=tokenizer,
)
# 7) Train the model (Optional)
trainer.train()

  trainer = Trainer(


Step,Training Loss
100,8.1882
200,2.6962
300,2.3327
400,2.2219
500,2.1053
600,2.1116
700,2.0013
800,2.1023
900,1.9852
1000,1.885


TrainOutput(global_step=60189, training_loss=0.7824161881419253, metrics={'train_runtime': 13857.668, 'train_samples_per_second': 138.987, 'train_steps_per_second': 4.343, 'total_flos': 0.0, 'train_loss': 0.7824161881419253, 'epoch': 3.0})

In [8]:
save_path="bi_encoder_qa_model"
model.clue_encoder.save_pretrained(f"{save_path}/clue_encoder")
model.answer_encoder.save_pretrained(f"{save_path}/answer_encoder")
print(f"Model saved to {save_path}")

Model saved to bi_encoder_qa_model


In [9]:
tokenizer.save_pretrained(f"({save_path}/tokenizer")
print(f"Tokenizer saved to {save_path}/tokenizer")

Tokenizer saved to bi_encoder_qa_model/tokenizer


In [5]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

class BiEncoderQA(torch.nn.Module):
    def __init__(self, clue_encoder=None, answer_encoder=None):
        super().__init__()
        self.clue_encoder = clue_encoder or BertModel.from_pretrained("bert-base-uncased")
        self.answer_encoder = answer_encoder or BertModel.from_pretrained("bert-base-uncased")
        self.loss_fct = torch.nn.CrossEntropyLoss()

    def forward(self, clue_input_ids, clue_attention_mask, answer_input_ids, answer_attention_mask):
        clue_emb = self.clue_encoder(clue_input_ids, clue_attention_mask).last_hidden_state[:, 0, :]
        answer_emb = self.answer_encoder(answer_input_ids, answer_attention_mask).last_hidden_state[:, 0, :]
        sim_matrix = torch.matmul(clue_emb, answer_emb.transpose(0, 1))
        targets = torch.arange(sim_matrix.size(0)).to(sim_matrix.device)
        loss = self.loss_fct(sim_matrix, targets)
        return {
            "loss": loss,
            "sim_matrix": sim_matrix,
            "clue_emb": clue_emb,
            "answer_emb": answer_emb,
        }


# Load the encoders
clue_encoder = BertModel.from_pretrained("bi_encoder_qa_model/clue_encoder")
answer_encoder = BertModel.from_pretrained("bi_encoder_qa_model/answer_encoder")
tokenizer = BertTokenizer.from_pretrained("bi_encoder_qa_model/tokenizer")

# Initialize model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BiEncoderQA(clue_encoder, answer_encoder).to(device)

# Print model to confirm it's loaded correctly
print(f"Model loaded and moved to device: {device}")

# Load the answer set file (the same one used during training)
with open("answer_set_segmented.txt", "r", encoding="utf-8") as f:
    file_answers = [line.strip() for line in f if line.strip()]

print(f"✅ Loaded {len(file_answers)} answers.")

# Function to compute embeddings for answers in batches
def compute_answer_embeddings(answer_list, model, tokenizer, device="cuda", batch_size=16):
    all_embeddings = []
    for i in tqdm(range(0, len(answer_list), batch_size), desc="Embedding answers"):
        batch = answer_list[i:i+batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, max_length=32, return_tensors="pt").to(device)
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                output = model.answer_encoder(**encoded)
                batch_emb = output.last_hidden_state[:, 0, :]  # [CLS] embedding
        all_embeddings.append(batch_emb.cpu().float().numpy())
    return np.vstack(all_embeddings)


Model loaded and moved to device: cuda
✅ Loaded 437721 answers.


In [6]:
model = BiEncoderQA(clue_encoder, answer_encoder).to(device)


In [5]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Now import faiss and run normally
import faiss

# Compute answer embeddings
answer_embeddings = compute_answer_embeddings(file_answers, model, tokenizer, device=device)

# Build FAISS index for cosine similarity
dim = answer_embeddings.shape[1]
faiss.normalize_L2(answer_embeddings)
faiss_index = faiss.IndexFlatIP(dim)
faiss_index.add(answer_embeddings)

print(f"✅ FAISS index built with {faiss_index.ntotal} entries.")

  with torch.cuda.amp.autocast():
Embedding answers: 100%|██████████| 27358/27358 [04:25<00:00, 102.93it/s]


✅ FAISS index built with 437721 entries.


In [7]:
# Inference function (search for top-k answers given a clue)
def answer_for_clue(clue, model, tokenizer, faiss_index, answer_list, device="cuda", k=10):
    model.eval()
    encoded = tokenizer(clue, return_tensors="pt", truncation=True, padding=True, max_length=32).to(device)
    with torch.no_grad():
        output = model.clue_encoder(**encoded)
        clue_emb = output.last_hidden_state[:, 0, :]
    clue_emb = clue_emb.cpu().numpy()
    faiss.normalize_L2(clue_emb)
    distances, indices = faiss_index.search(clue_emb, k)
    return [answer_list[i] for i in indices[0]]

# Test with a clue
test_clue = "Capital of France"
true_answer = "Paris"
predicted_answers = answer_for_clue(test_clue, model, tokenizer, faiss_index, file_answers, device=device, k=1000)

found = False
for i, ans in enumerate(predicted_answers, 1):
    # print(f"{i}. {ans}")
    if ans.strip().lower() == true_answer.strip().lower():
        print(f"\n🎯 Match found at rank {i}!")
        found = True

if not found:
    print("\n❌ Correct answer not found in top-k predictions.")



NameError: name 'faiss_index' is not defined

In [10]:
faiss.write_index(faiss_index, "faiss_index.bin")

In [9]:
import faiss
faiss_index = faiss.read_index("faiss_index.bin")

In [1]:
from datasets import load_from_disk

# Load the full dataset
ds = load_from_disk("crosswordqa_segmented")

# Take 10% of the test set
test_full = ds["validation"]
test_small = test_full.train_test_split(test_size=0.10, seed=42)["test"]

print(f"✅ Full test size: {len(test_full)}")
print(f"✅ 10% test subset size: {len(test_small)}")

  from .autonotebook import tqdm as notebook_tqdm


✅ Full test size: 361458
✅ 10% test subset size: 36146


In [1]:
from tqdm import tqdm

k = 100
topk_hits = 0
total = len(test_small)

for example in tqdm(test_small, desc=f"Evaluating Top-{k} Accuracy"):
    clue = example["clue"]
    true_answer = example["segmented_answer"].strip().lower()

    predicted = answer_for_clue(clue, model, tokenizer, faiss_index, file_answers, device=device, k=k)
    predicted_clean = [p.strip().lower() for p in predicted]

    if true_answer in predicted_clean:
        topk_hits += 1

topk_accuracy = topk_hits / total
print(f"\n✅ Top-{k} Accuracy on {total} test examples: {topk_accuracy:.2%}")


NameError: name 'test_small' is not defined

In [3]:
import torch
import faiss
import numpy as np
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

# Load the answer list
with open("answer_set_segmented.txt", "r", encoding="utf-8") as f:
    file_answers = [line.strip() for line in f if line.strip()]

print(f"✅ Loaded {len(file_answers)} answers")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Function to embed answers in batches
def compute_answer_embeddings(answer_list, model, tokenizer, device="cuda", batch_size=1024):
    all_embeddings = []
    for i in tqdm(range(0, len(answer_list), batch_size), desc="Embedding answers"):
        batch = answer_list[i:i+batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, max_length=32, return_tensors="pt").to(device)
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                output = model.answer_encoder(**encoded)
                batch_emb = output.last_hidden_state[:, 0, :]
        all_embeddings.append(batch_emb.cpu().float().numpy())
    return np.vstack(all_embeddings)

# Generate answer embeddings
answer_embeddings = compute_answer_embeddings(file_answers, model, tokenizer, device=device)

# Build FAISS index for cosine similarity
dim = answer_embeddings.shape[1]
faiss.normalize_L2(answer_embeddings)
faiss_index = faiss.IndexFlatIP(dim)
faiss_index.add(answer_embeddings)
print(f"✅ FAISS index built with {faiss_index.ntotal} entries")


✅ Loaded 437721 answers


NameError: name 'model' is not defined

In [None]:
# Reload later
from transformers import BertModel, BertTokenizer
clue_encoder = BertModel.from_pretrained("bi_encoder_model/clue_encoder")
answer_encoder = BertModel.from_pretrained("bi_encoder_model/answer_encoder")
tokenizer = BertTokenizer.from_pretrained("bi_encoder_model/tokenizer")

model = BiEncoderQA(clue_encoder, answer_encoder)

In [None]:
import faiss

# Load saved FAISS index
faiss_index = faiss.read_index("faiss_index.bin")

# Load the answer list
with open("answer_set_segmented.txt", "r", encoding="utf-8") as f:
    file_answers = [line.strip() for line in f if line.strip()]


In [None]:
def answer_for_clue(clue, model, tokenizer, faiss_index, answer_list, device="cuda" if torch.cuda.is_available() else "cpu", k=3):
    model.eval()
    encoded = tokenizer(clue, return_tensors="pt", truncation=True, padding=True, max_length=32).to(device)
    with torch.no_grad():
        clue_output = model.clue_encoder(**encoded)
        clue_emb = clue_output.last_hidden_state[:, 0, :]  # CLS token
        clue_emb = clue_emb.cpu().numpy()
    # Normalize for cosine similarity
    faiss.normalize_L2(clue_emb)
    distances, indices = faiss_index.search(clue_emb, k)
    return [answer_list[i] for i in indices[0]]


In [None]:
test_clue = "Capital of France"
predicted_answers = answer_for_clue(test_clue, model, tokenizer, faiss_index, file_answers)
print("Clue:", test_clue)
print("Top predictions:", predicted_answers)


In [9]:
# 8) Inference with FAISS (like the original example)
#    We assume you built a large answer_set file (answer_set_segmented.txt).
#    We'll just show how to embed & index it.

answer_file = "/content/answer_set_no_segment.txt"
if not os.path.exists(answer_file):
    print(f"❌ {answer_file} not found. Please provide a valid path.")
else:
    # Load your large answer set from file
    with open(answer_file, "r", encoding="utf-8") as f:
        file_answers = [line.strip() for line in f if line.strip()]

    print(f"✅ Loaded {len(file_answers)} answers from {answer_file}")

    # Function to compute embeddings for answers
    def compute_answer_embeddings(answer_list, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
        model.eval()
        embeddings = []
        with torch.no_grad():
            for ans in answer_list:
                encoded = tokenizer(
                    ans,
                    return_tensors="pt",
                    truncation=True,
                    padding=True,
                    max_length=32
                ).to(device)
                outputs = model.answer_encoder(**encoded)
                emb = outputs.last_hidden_state[:, 0, :]
                embeddings.append(emb.cpu().numpy())
        # shape: (num_answers, hidden_dim)
        return np.vstack(embeddings)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Compute embeddings
    answer_embeddings = compute_answer_embeddings(file_answers, model, tokenizer, device=device)
    dim = answer_embeddings.shape[1]

    # Build FAISS index for cosine similarity
    faiss.normalize_L2(answer_embeddings)
    faiss_index = faiss.IndexFlatIP(dim)
    faiss_index.add(answer_embeddings)
    print(f"✅ Built FAISS index with {faiss_index.ntotal} embeddings.")

# Inference function
    def answer_for_clue(clue, model, tokenizer, faiss_index, answer_list, device="cuda" if torch.cuda.is_available() else "cpu", k=3):
        model.eval()
        encoded = tokenizer(clue, return_tensors="pt", truncation=True, padding=True, max_length=32).to(device)
        with torch.no_grad():
            clue_output = model.clue_encoder(**encoded)
            clue_emb = clue_output.last_hidden_state[:, 0, :]
            clue_emb = clue_emb.cpu().numpy()
        # Normalize clue for cosine
        faiss.normalize_L2(clue_emb)
        distances, indices = faiss_index.search(clue_emb, k)
        candidates = [answer_list[idx] for idx in indices[0]]
        return candidates

    # Test
    test_clue = "What is the capital of France?"
    predicted_answers = answer_for_clue(test_clue, model, tokenizer, faiss_index, file_answers, device=device, k=3)
    print("Clue:", test_clue)
    print("Candidate Answers:", predicted_answers)

✅ Loaded 437721 answers from /content/answer_set_no_segment.txt


KeyboardInterrupt: 

In [11]:
# 8) Inference with FAISS (like the original example)
#    We assume you built a large answer_set file (answer_set_segmented.txt).
#    We'll just show how to embed & index it.

import os
import torch
import numpy as np
import faiss
from transformers import BertModel, BertTokenizer, TrainingArguments, Trainer
from tqdm import tqdm  # Import tqdm for progress tracking

answer_file = "/content/answer_set_no_segment.txt"
if not os.path.exists(answer_file):
    print(f"❌ {answer_file} not found. Please provide a valid path.")
else:
    # Load your large answer set from file
    with open(answer_file, "r", encoding="utf-8") as f:
        file_answers = [line.strip() for line in f if line.strip()]

    print(f"✅ Loaded {len(file_answers)} answers from {answer_file}")

    # Function to compute embeddings for answers with progress tracking
    def compute_answer_embeddings(answer_list, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu", batch_size=4096):
      """
      Compute embeddings in batches, with mixed precision.
      batch_size=4096 is a good starting point; increase it until you
      fill ~12–14 GB of your 16 GB GPU.
      """
      model.eval()
      # Cast model to half precision for faster inference & lower memory
      model.answer_encoder.half()

      all_embs = []
      with torch.no_grad():
          for i in tqdm(range(0, len(answer_list), batch_size), desc="Embedding answers"):
              batch = answer_list[i : i + batch_size]
              # Tokenize batch: dynamic pad to max in batch
              encoded = tokenizer(
                  batch,
                  return_tensors="pt",
                  truncation=True,
                  padding=True,      # pad to longest in this batch
                  max_length=32
              ).to(device)

              # Mixed‑precision inference
              with torch.cuda.amp.autocast():
                  outputs = model.answer_encoder(**encoded)
                  # take CLS token
                  emb = outputs.last_hidden_state[:, 0, :]

              # Move to CPU and convert to numpy
              all_embs.append(emb.cpu().float().numpy())

      # Stack into (N, D)
      return np.vstack(all_embs)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Compute embeddings
    answer_embeddings = compute_answer_embeddings(file_answers, model, tokenizer, device=device)
    dim = answer_embeddings.shape[1]

    # Build FAISS index for cosine similarity
    faiss.normalize_L2(answer_embeddings)  # normalize embeddings for cosine similarity
    faiss_index = faiss.IndexFlatIP(dim)
    faiss_index.add(answer_embeddings)  # adds embeddings to the FAISS index
    print(f"✅ Built FAISS index with {faiss_index.ntotal} embeddings.")

    # Inference function remains the same
    def answer_for_clue(clue, model, tokenizer, faiss_index, answer_list, device="cuda" if torch.cuda.is_available() else "cpu", k=3):
        model.eval()
        encoded = tokenizer(clue, return_tensors="pt", truncation=True, padding=True, max_length=32).to(device)
        with torch.no_grad():
            clue_output = model.clue_encoder(**encoded)
            clue_emb = clue_output.last_hidden_state[:, 0, :]  # CLS token
            clue_emb = clue_emb.cpu().numpy()
        # Normalize clue embedding for cosine similarity
        faiss.normalize_L2(clue_emb)
        distances, indices = faiss_index.search(clue_emb, k)
        # Retrieve top k answer candidates using the indices
        candidates = [answer_list[idx] for idx in indices[0]]
        return candidates

    # Test inference with a sample clue
    test_clue = "What is the capital of France?"
    predicted_answers = answer_for_clue(test_clue, model, tokenizer, faiss_index, file_answers, device=device, k=3)
    print("Clue:", test_clue)
    print("Candidate Answers:", predicted_answers)


✅ Loaded 437721 answers from /content/answer_set_no_segment.txt


  with torch.cuda.amp.autocast():
Embedding answers: 100%|██████████| 107/107 [02:06<00:00,  1.18s/it]


✅ Built FAISS index with 437721 embeddings.
Clue: What is the capital of France?
Candidate Answers: ['what is sorare', 'what is the title of', 'what is the name for a']


In [31]:
from datasets import load_from_disk
from tqdm import tqdm
import torch
import numpy as np
import time
from typing import List, Dict, Any

class BatchEvaluator:
    def __init__(self, model, tokenizer, faiss_index, file_answers, device, batch_size=16, k=10):
        self.model = model
        self.tokenizer = tokenizer
        self.faiss_index = faiss_index
        self.file_answers = file_answers
        self.device = device
        self.batch_size = batch_size
        self.k = k

        # Try to inspect the model's embedding function to use it directly
        # This avoids the issue with forward() needing both query and answer
        if hasattr(model, 'get_clue_embeddings'):
            print("Using model's get_clue_embeddings method")
            self.embed_function = model.get_clue_embeddings
        elif hasattr(model, 'encode_query'):
            print("Using model's encode_query method")
            self.embed_function = model.encode_query
        else:
            print("No specific embedding function found, will extract from model directly")
            self.embed_function = None

    def get_embeddings(self, input_ids, attention_mask=None):
        """Extract embeddings directly from the CLM or encoder model component"""
        # Check if the model has a specific clue encoder component
        if hasattr(self.model, 'clue_encoder'):
            encoder = self.model.clue_encoder
            outputs = encoder(input_ids, attention_mask=attention_mask)
            return outputs.last_hidden_state[:, 0, :]
        # Or check if it has a query encoder
        elif hasattr(self.model, 'query_encoder'):
            encoder = self.model.query_encoder
            outputs = encoder(input_ids, attention_mask=attention_mask)
            return outputs.last_hidden_state[:, 0, :]
        # Otherwise try to access the backbone model directly
        elif hasattr(self.model, 'transformer'):
            encoder = self.model.transformer
            outputs = encoder(input_ids, attention_mask=attention_mask)
            return outputs.last_hidden_state[:, 0, :]
        else:
            # No direct encoder access, return None to indicate we need to use the forward method
            return None

    def process_single_clue(self, clue: str):
        """Process a single clue"""
        # Tokenize the query input
        inputs = self.tokenizer(clue, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)

        # Get embeddings
        with torch.no_grad():
            # Extract input tensors
            query_input_ids = inputs['input_ids']
            query_attention_mask = inputs.get('attention_mask', None)

            # First try to use a dedicated embedding function
            if self.embed_function is not None:
                if query_attention_mask is not None:
                    query_emb = self.embed_function(query_input_ids, query_attention_mask)
                else:
                    query_emb = self.embed_function(query_input_ids)
            else:
                # Try to get embeddings directly from the encoder component
                query_emb = self.get_embeddings(query_input_ids, query_attention_mask)

                # If that doesn't work, we need to bypass the forward method
                if query_emb is None:
                    # Inspect model to find encoders
                    if hasattr(self.model, 'clue_encoder') and self.model.clue_encoder is not None:
                        # Use the clue encoder directly
                        outputs = self.model.clue_encoder(query_input_ids, attention_mask=query_attention_mask)
                        query_emb = outputs.last_hidden_state[:, 0, :]
                    else:
                        # Create dummy answer inputs with matching dtype
                        # Get the dtype of the query inputs
                        input_dtype = query_input_ids.dtype

                        # Create dummy answer inputs matching the dtype
                        answer_input_ids = torch.zeros((1, 1), dtype=input_dtype).to(self.device)
                        answer_attention_mask = torch.ones((1, 1), dtype=input_dtype).to(self.device)

                        try:
                            # Try to access internal model attributes to get embeddings directly
                            # This avoids the similarity calculation that's causing the dtype error

                            # Save the original forward method
                            original_forward = self.model.forward

                            # Define a custom forward that just returns the embeddings
                            def custom_forward(self_model, clue_input_ids, clue_attention_mask, answer_input_ids, answer_attention_mask):
                                # Get the clue encoder outputs
                                clue_outputs = self_model.clue_encoder(
                                    clue_input_ids,
                                    attention_mask=clue_attention_mask
                                )
                                # Return just the clue embeddings
                                return clue_outputs.last_hidden_state[:, 0, :]

                            # Replace the forward method temporarily
                            self.model.forward = custom_forward.__get__(self.model, type(self.model))

                            # Call the model with our custom forward
                            query_emb = self.model(
                                query_input_ids,
                                query_attention_mask,
                                answer_input_ids,
                                answer_attention_mask
                            )

                        except Exception as e:
                            print(f"Error extracting embeddings: {e}")
                            # If the custom approach fails, print a detailed model structure
                            print(f"Model structure: {dir(self.model)}")
                            raise
                        finally:
                            # Restore the original forward method
                            self.model.forward = original_forward

        # Convert to numpy for FAISS - ensure it's the right shape
        if len(query_emb.shape) == 3:  # [batch, seq_len, hidden]
            query_emb = query_emb[:, 0, :]  # Take CLS token

        query_vector = query_emb.cpu().numpy()

        # Search in FAISS index
        scores, indices = self.faiss_index.search(query_vector, self.k)

        # Get the actual answers
        predictions = [self.file_answers[idx] for idx in indices[0]]

        return predictions

    def process_batch(self, clues: List[str]) -> List[List[str]]:
        """Process clues one by one"""
        all_predictions = []

        # Process each clue individually
        for clue in clues:
            predictions = self.process_single_clue(clue)
            all_predictions.append(predictions)

        return all_predictions

    def evaluate(self, validation_set):
        """Evaluate the model on the entire validation set"""
        total_examples = len(validation_set)

        # Initialize counters
        top1_hits = 0
        topk_hits = 0
        total = 0

        # Create progress bar
        pbar = tqdm(total=total_examples, desc="Evaluating")
        pbar.set_postfix_str(f"Top-1: 0.00% | Top-{self.k}: 0.00%")

        start_time = time.time()
        last_update_time = start_time

        # Process the dataset in batches
        for i in range(0, total_examples, self.batch_size):
            # Get batch indices
            end_idx = min(i + self.batch_size, total_examples)
            batch = validation_set.select(range(i, end_idx))

            # Extract clues and answers
            batch_clues = batch["clue"]
            batch_true_answers = [ans.strip().lower() for ans in batch["segmented_answer"]]

            # Get predictions for all clues in batch
            batch_preds = self.process_batch(batch_clues)

            # Process results
            for true_answer, preds in zip(batch_true_answers, batch_preds):
                preds_clean = [p.strip().lower() for p in preds]

                total += 1
                if true_answer == preds_clean[0]:
                    top1_hits += 1
                if true_answer in preds_clean:
                    topk_hits += 1

            # Update progress bar
            pbar.update(len(batch))

            # Update metrics display more frequently
            current_time = time.time()
            if total % 100 == 0 or (current_time - last_update_time) > 5:
                top1_acc_current = top1_hits / total
                topk_acc_current = topk_hits / total
                elapsed = current_time - start_time
                examples_per_second = total / elapsed if elapsed > 0 else 0
                eta = (total_examples - total) / examples_per_second if examples_per_second > 0 else 0

                pbar.set_postfix_str(
                    f"Top-1: {top1_acc_current:.2%} | Top-{self.k}: {topk_acc_current:.2%} | "
                    f"Speed: {examples_per_second:.1f} ex/s | ETA: {eta/60:.1f}m"
                )
                last_update_time = current_time

        pbar.close()

        # Final accuracy results
        top1_acc = top1_hits / total
        topk_acc = topk_hits / total

        return {
            "top1_accuracy": top1_acc,
            "topk_accuracy": topk_acc,
            "k": self.k,
            "total_examples": total
        }
    def inspect_model(self):
      """Print detailed model information to help debug"""
      print("\nModel Information:")
      print(f"Model type: {type(self.model)}")
      print(f"Model attributes: {dir(self.model)}")

      # Try to access encoder components
      if hasattr(self.model, 'clue_encoder'):
          print(f"Clue encoder type: {type(self.model.clue_encoder)}")
      if hasattr(self.model, 'answer_encoder'):
          print(f"Answer encoder type: {type(self.model.answer_encoder)}")

      # Check if model has specific embedding methods
      for method in ['get_clue_embeddings', 'encode_query', 'get_query_embeddings']:
          if hasattr(self.model, method):
              print(f"Model has {method} method")

# Add this method to the BatchEvaluator class
# Call it in the __init__ method to debug model structure

In [35]:
# Create evaluator instance
evaluator = BatchEvaluator(
    model=model,
    tokenizer=tokenizer,
    faiss_index=faiss_index,
    file_answers=file_answers,
    device=device,
    batch_size=32,
    k=10
)

# Add this debugging step
evaluator.inspect_model()

# Test with a single example first before running full evaluation
test_clue = val_set[0]["clue"]
test_result = evaluator.process_single_clue(test_clue)
print(f"Test clue: {test_clue}")
print(f"Test result: {test_result}")
print(f"Expected: {val_set[0]['segmented_answer']}")

# If the test is successful, run the full evaluation
# results = evaluator.evaluate(val_set)

No specific embedding function found, will extract from model directly

Model Information:
Model type: <class '__main__.BiEncoderQA'>
Model attributes: ['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_compiled_call_impl', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hook

In [9]:
import os
import torch
import numpy as np
import faiss
from transformers import BertModel, BertTokenizer
from tqdm import tqdm

answer_file = "answer_set_segmented.txt"
if not os.path.exists(answer_file):
    print(f"❌ {answer_file} not found. Please provide a valid path.")
else:
    # Load your large answer set from file
    with open(answer_file, "r", encoding="utf-8") as f:
        file_answers = [line.strip() for line in f if line.strip()]

    print(f"✅ Loaded {len(file_answers)} answers from {answer_file}")

    # Function to compute embeddings for answers in batches with mixed precision
    def compute_answer_embeddings(answer_list, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu", batch_size=64):
        model.eval()
        all_embeddings = []
        # Process in large batches
        for i in tqdm(range(0, len(answer_list), batch_size), desc="Embedding answers"):
            batch_answers = answer_list[i:i+batch_size]
            # Tokenize batch: dynamic padding to the longest example in the batch
            encoded = tokenizer(
                batch_answers,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=32
            ).to(device)

            # Use mixed precision (remove device_type, just set dtype)
            with torch.cuda.amp.autocast(dtype=torch.float16):
                outputs = model.answer_encoder(**encoded)
                batch_emb = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding

            # Move batch embeddings to CPU and convert to float32
            all_embeddings.append(batch_emb.cpu().float().detach().numpy())
        return np.vstack(all_embeddings)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Compute embeddings in batches
    answer_embeddings = compute_answer_embeddings(file_answers, model, tokenizer, device=device, batch_size=8)
    dim = answer_embeddings.shape[1]

    # Build FAISS index for cosine similarity
    faiss.normalize_L2(answer_embeddings)
    faiss_index = faiss.IndexFlatIP(dim)
    faiss_index.add(answer_embeddings)
    print(f"✅ Built FAISS index with {faiss_index.ntotal} embeddings.")

    # Inference function remains the same:
    def answer_for_clue(clue, model, tokenizer, faiss_index, answer_list, device="cuda" if torch.cuda.is_available() else "cpu", k=3):
        model.eval()
        encoded = tokenizer(clue, return_tensors="pt", truncation=True, padding=True, max_length=32).to(device)
        with torch.no_grad():
            clue_output = model.clue_encoder(**encoded)
            clue_emb = clue_output.last_hidden_state[:, 0, :]  # CLS token
            clue_emb = clue_emb.cpu().numpy()
        # Normalize clue embedding for cosine similarity
        faiss.normalize_L2(clue_emb)
        distances, indices = faiss_index.search(clue_emb, k)
        candidates = [answer_list[idx] for idx in indices[0]]
        return candidates

    # Test inference with a sample clue
    test_clue = "What is the capital of France?"
    predicted_answers = answer_for_clue(test_clue, model, tokenizer, faiss_index, file_answers, device=device, k=3)
    print("Clue:", test_clue)
    print("Candidate Answers:", predicted_answers)


✅ Loaded 437721 answers from answer_set_segmented.txt


NameError: name 'model' is not defined

In [9]:
faiss.write_index(faiss_index, "faiss_index.bin")
print("✅ FAISS index saved to faiss_index.bin")

✅ FAISS index saved to faiss_index.bin


In [8]:
np.save(embeddings_file, answer_embeddings)
print(f"✅ Saved embeddings to {embeddings_file}.")

NameError: name 'embeddings_file' is not defined

In [12]:
# Test inference with a sample clue
test_clue = "Milky gemstones"
predicted_answers = answer_for_clue(test_clue, model, tokenizer, faiss_index, file_answers, device=device, k=1000)
print("Clue:", test_clue)
print("Candidate Answers:", predicted_answers)

Clue: Milky gemstones
Candidate Answers: ['gemstones', 'precious gems', 'rare gems', 'sapphires', 'precious stones', 'witch hazels', 'pipestones', 'egg beakers', 'stone sandsticks', 'crystal gazers', 'iron pellets', 'fairy cakes', 'gallic stones', 'diamondearrings', 'curling stones', 'garnets', 'ribalds', 'lodestones', 'dilithium crystals', 'glassshards', 'korn flakes', 'woodenware', 'holy stones', 'rare metals', 'tea cakes', 'tinfoils', 'precious metals', 'capstones', 'prisms', 'jewel boxes', 'tilestones', 'dyeweeds', 'granites', 'adozen redroses', 'power teals', 'beet sandpieces', 'emeralds', 'ice pellets', 'brawnflakes', 'mints', 'tapioca pearls', 'microbursts', 'green clovers', 'honeycombs', 'naval oranges', 'watchbands', 'golden raisins', 'wheelhorses', 'pearly gametes', 'water melons', 'poultices', 'mineral spirits', 'lotuses', 'talismans', 'nut shells', 'fruit preserves', 'friedonion rings', 'robin sandroses', 'stinkweeds', 'crescent moons', 'choice morsels', 'candy canes', 'poe