In [40]:
import os
import torch
import numpy as np
import faiss
from torch import nn
from datasets import load_from_disk, Dataset
from transformers import (
    BertModel,
    BertTokenizer,
    BertConfig,
    TrainingArguments,
    Trainer
)

In [42]:
# 2) Load the crosswordqa_segmented dataset from disk
dataset_path = "crosswordqa_segmented"
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at: {dataset_path}")

ds = load_from_disk(dataset_path)
# print(ds)

# We assume ds has at least a "train" split, with fields: "clue", "segmented_answer"

# 3) Tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [43]:
def tokenize_qa(batch):
    # batch["clue"] and batch["answer"] are lists of strings
    clue_encoding = tokenizer(
        batch["clue"],
        truncation=True,
        padding="max_length",
        max_length=32
    )
    answer_encoding = tokenizer(
        batch["answer"],
        truncation=True,
        padding="max_length",
        max_length=32
    )

    return {
        "clue_input_ids": clue_encoding["input_ids"],
        "clue_attention_mask": clue_encoding["attention_mask"],
        "answer_input_ids": answer_encoding["input_ids"],
        "answer_attention_mask": answer_encoding["attention_mask"],
    }

In [44]:
# We'll create a new train_dataset with "clue" and "answer" fields,
# copying from "clue" and "segmented_answer" in the ds["train"].
def map_to_train_fields(example):
    return {
        "clue": example["clue"],
        "answer": example["segmented_answer"]
    }


In [45]:
# Replace your processing code with:

# We map the training split to unify field names for tokenize_qa

train_dataset = ds["train"].map(map_to_train_fields, batched=True)

# Print a sample to debug
print("Sample before tokenization:")
# print(train_dataset[0])

# Try without batched mode first
train_dataset = train_dataset.filter(lambda x: x["clue"] and x["answer"])
train_dataset = train_dataset.map(tokenize_qa, batched=False)

# print(train_dataset)
train_dataset.save_to_disk("train_dataset_bert_path")

Sample before tokenization:


Filter: 100%|██████████| 6420790/6420790 [00:17<00:00, 361985.98 examples/s]
Map: 100%|██████████| 6420133/6420133 [28:52<00:00, 3705.76 examples/s]
Saving the dataset (15/15 shards): 100%|██████████| 6420133/6420133 [00:15<00:00, 413102.72 examples/s]


In [56]:
class BiEncoderQA(nn.Module):
    def _init_(self, model_name="bert-base-uncased"):
        super()._init_()  # Correct init for modern PyTorch
        self.clue_encoder = BertModel.from_pretrained(model_name)
        self.answer_encoder = BertModel.from_pretrained(model_name)
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, 
                clue_input_ids, clue_attention_mask,
                answer_input_ids, answer_attention_mask):
        clue_outputs = self.clue_encoder(input_ids=clue_input_ids, attention_mask=clue_attention_mask)
        clue_emb = clue_outputs.last_hidden_state[:, 0, :]

        answer_outputs = self.answer_encoder(input_ids=answer_input_ids, attention_mask=answer_attention_mask)
        answer_emb = answer_outputs.last_hidden_state[:, 0, :]

        sim_matrix = torch.matmul(clue_emb, answer_emb.transpose(0, 1))
        targets = torch.arange(sim_matrix.size(0)).to(sim_matrix.device)
        loss = self.loss_fct(sim_matrix, targets)

        return {
            "loss": loss,
            "sim_matrix": sim_matrix,
            "clue_emb": clue_emb,
            "answer_emb": answer_emb,
        }

In [58]:
# 4) Initialize the model
model = BiEncoderQA("bert-base-uncased")

# 5) Set up TrainingArguments
training_args = TrainingArguments(
    output_dir="./bi_encoder_qa",
    num_train_epochs=3,                # adjust for real training
    per_device_train_batch_size=8,     # tune for GPU RAM
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="no",  # or "steps"/"epoch" if you have a val set
    fp16=torch.cuda.is_available(),  # Use fp16 if GPU supports it
)

# 6) Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

TypeError: BiEncoderQA.__init__() takes 1 positional argument but 2 were given

In [None]:
# 7) Train the model (Optional)
# trainer.train()

In [None]:
# 8) Inference with FAISS (like the original example)
#    We assume you built a large answer_set file (answer_set_segmented.txt).
#    We'll just show how to embed & index it.

answer_file = "answer_set_segmented.txt"
if not os.path.exists(answer_file):
    print(f"❌ {answer_file} not found. Please provide a valid path.")
else:
    # Load your large answer set from file
    with open(answer_file, "r", encoding="utf-8") as f:
        file_answers = [line.strip() for line in f if line.strip()]

    print(f"✅ Loaded {len(file_answers)} answers from {answer_file}")

    # Function to compute embeddings for answers
    def compute_answer_embeddings(answer_list, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
        model.eval()
        embeddings = []
        with torch.no_grad():
            for ans in answer_list:
                encoded = tokenizer(
                    ans,
                    return_tensors="pt",
                    truncation=True,
                    padding=True,
                    max_length=32
                ).to(device)
                outputs = model.answer_encoder(**encoded)
                emb = outputs.last_hidden_state[:, 0, :]
                embeddings.append(emb.cpu().numpy())
        # shape: (num_answers, hidden_dim)
        return np.vstack(embeddings)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Compute embeddings
    answer_embeddings = compute_answer_embeddings(file_answers, model, tokenizer, device=device)
    dim = answer_embeddings.shape[1]

    # Build FAISS index for cosine similarity
    faiss.normalize_L2(answer_embeddings)
    faiss_index = faiss.IndexFlatIP(dim)
    faiss_index.add(answer_embeddings)
    print(f"✅ Built FAISS index with {faiss_index.ntotal} embeddings.")

# Inference function
    def answer_for_clue(clue, model, tokenizer, faiss_index, answer_list, device="cuda" if torch.cuda.is_available() else "cpu", k=3):
        model.eval()
        encoded = tokenizer(clue, return_tensors="pt", truncation=True, padding=True, max_length=32).to(device)
        with torch.no_grad():
            clue_output = model.clue_encoder(**encoded)
            clue_emb = clue_output.last_hidden_state[:, 0, :]
            clue_emb = clue_emb.cpu().numpy()
        # Normalize clue for cosine
        faiss.normalize_L2(clue_emb)
        distances, indices = faiss_index.search(clue_emb, k)
        candidates = [answer_list[idx] for idx in indices[0]]
        return candidates

    # Test
    test_clue = "What is the capital of France?"
    predicted_answers = answer_for_clue(test_clue, model, tokenizer, faiss_index, file_answers, device=device, k=3)
    print("Clue:", test_clue)
    print("Candidate Answers:", predicted_answers)