In [6]:
import os
import json
import zipfile
import random
from typing import Optional, Tuple, Dict, List

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn import model_selection
from sklearn.metrics import f1_score, precision_recall_curve

from sentence_transformers import SentenceTransformer, CrossEncoder

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

DEFAULT_EXTRACT_DIR = "data/quora_question_pairs"
DEFAULT_EXTRACTED_CSV = os.path.join(DEFAULT_EXTRACT_DIR, "train.csv")
LOCAL_CSV_CANDIDATES = [
    "data/quora/train.csv",
    "data/train.csv",
    "dataset/train.csv",
    "train.csv",
]
LOCAL_ZIP_CANDIDATES = [
    "data/quora/train.csv.zip",
    "data/train.csv.zip",
    "dataset/train.csv.zip",
    "train.csv.zip",
]

def get_device() -> str:
    return "cuda" if torch.cuda.is_available() else "cpu"

def trainer_amp_args() -> Dict[str, bool]:
    if torch.cuda.is_available():
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            return {"bf16": True, "fp16": False}
        return {"bf16": False, "fp16": True}
    return {"bf16": False, "fp16": False}

def set_global_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def ensure_data_csv(data_csv: Optional[str]) -> str:
    if data_csv:
        if os.path.exists(data_csv):
            print(f"[data] Using provided CSV: {data_csv}")
            return data_csv
        raise FileNotFoundError(f"--data_csv provided but not found: {data_csv}")
    for cand in LOCAL_CSV_CANDIDATES:
        if os.path.exists(cand):
            print(f"[data] Found CSV at: {cand}")
            return cand
    for zpath in LOCAL_ZIP_CANDIDATES:
        if os.path.exists(zpath):
            print(f"[data] Found ZIP at: {zpath}")
            os.makedirs(DEFAULT_EXTRACT_DIR, exist_ok=True)
            with zipfile.ZipFile(zpath, "r") as zip_ref:
                zip_ref.extractall(DEFAULT_EXTRACT_DIR)
            if os.path.exists(DEFAULT_EXTRACTED_CSV):
                print(f"[data] Extracted to: {DEFAULT_EXTRACTED_CSV}")
                return DEFAULT_EXTRACTED_CSV
            for root, _, files in os.walk(DEFAULT_EXTRACT_DIR):
                if "train.csv" in files:
                    found = os.path.join(root, "train.csv")
                    print(f"[data] Extracted and found: {found}")
                    return found
            raise FileNotFoundError(f"ZIP extracted, but train.csv not found under {DEFAULT_EXTRACT_DIR}")
    raise FileNotFoundError(
        "Could not locate dataset CSV. Place train.csv in data/quora/ or data/ "
        "or pass --data_csv /path/to/train.csv. A train.csv.zip in those locations is also supported."
    )

def load_quora_df(csv_path: str, limit_rows: Optional[int] = None) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    # Expected columns: id,qid1,qid2,question1,question2,is_duplicate
    df = df.dropna(subset=["question1", "question2", "is_duplicate"]).copy()
    df = df.rename(columns={"is_duplicate": "label"})
    df["label"] = df["label"].astype(int)
    if limit_rows is not None and limit_rows > 0:
        df = df.iloc[:limit_rows].reset_index(drop=True)
    return df[["question1", "question2", "label"]].reset_index(drop=True)

def get_or_create_splits(
    df: pd.DataFrame,
    output_dir: str,
    seed: int = 42,
    test_size: float = 0.1,
    val_size_of_train: float = 0.1111,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    splits_dir = os.path.join(output_dir, "splits")
    train_p = os.path.join(splits_dir, "train.csv")
    val_p = os.path.join(splits_dir, "val.csv")
    test_p = os.path.join(splits_dir, "test.csv")

    if os.path.exists(train_p) and os.path.exists(val_p) and os.path.exists(test_p):
        print(f"[splits] Loading existing splits from {splits_dir}")
        train_df = pd.read_csv(train_p)
        val_df = pd.read_csv(val_p)
        test_df = pd.read_csv(test_p)
        return train_df, val_df, test_df

    print(f"[splits] Creating new splits and saving to {splits_dir}")
    os.makedirs(splits_dir, exist_ok=True)
    train_temp, test_df = model_selection.train_test_split(
        df, test_size=test_size, random_state=seed, stratify=df["label"]
    )
    train_df, val_df = model_selection.train_test_split(
        train_temp, test_size=val_size_of_train, random_state=seed, stratify=train_temp["label"]
    )
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    train_df.to_csv(train_p, index=False)
    val_df.to_csv(val_p, index=False)
    test_df.to_csv(test_p, index=False)
    return train_df, val_df, test_df

def add_sentence_cols(df: pd.DataFrame, label_float: bool = True) -> pd.DataFrame:
    out = df.copy()
    out["sentence1"] = out["question1"]
    out["sentence2"] = out["question2"]
    out["label"] = out["label"].astype(float if label_float else int)
    return out[["sentence1", "sentence2", "label"]]

def make_contrastive_df(df: pd.DataFrame) -> pd.DataFrame:
    # ContrastiveLoss expects two separate sentence inputs
    return add_sentence_cols(df, label_float=True)

def df_to_st_dataset(df: pd.DataFrame) -> Dataset:
    return Dataset.from_pandas(df.reset_index(drop=True))

def tune_threshold_from_scores(y_true: np.ndarray, scores: np.ndarray) -> float:
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    f1s = 2 * precision * recall / (precision + recall + 1e-12)
    best_idx = int(np.nanargmax(f1s[:-1])) if len(thresholds) > 0 else 0
    thr = thresholds[best_idx] if len(thresholds) > 0 else 0.5
    return float(thr)

@torch.no_grad()
def evaluate_biencoder_f1(
    model: SentenceTransformer,
    df: pd.DataFrame,
    threshold: Optional[float] = None,
    batch_size: int = 512,
) -> Tuple[float, float]:
    device = model.device if hasattr(model, "device") else get_device()
    sents1 = df["question1"].tolist()
    sents2 = df["question2"].tolist()
    emb1 = model.encode(sents1, batch_size=batch_size, convert_to_tensor=True,
                        normalize_embeddings=True, show_progress_bar=False, device=device)
    emb2 = model.encode(sents2, batch_size=batch_size, convert_to_tensor=True,
                        normalize_embeddings=True, show_progress_bar=False, device=device)
    sims = (emb1 * emb2).sum(dim=1).detach().cpu().numpy()
    y_true = df["label"].to_numpy().astype(int)
    thr = threshold if threshold is not None else tune_threshold_from_scores(y_true, sims)
    y_pred = (sims >= thr).astype(int)
    f1 = f1_score(y_true, y_pred)
    return float(f1), float(thr)

def evaluate_crossencoder_f1(
    model: CrossEncoder,
    df: pd.DataFrame,
    threshold: Optional[float] = None,
    batch_size: int = 256,
) -> Tuple[float, float]:
    pairs = list(zip(df["question1"].tolist(), df["question2"].tolist()))
    scores = np.array(model.predict(pairs, batch_size=batch_size, show_progress_bar=False)).reshape(-1)
    y_true = df["label"].to_numpy().astype(int)
    thr = threshold if threshold is not None else tune_threshold_from_scores(y_true, scores)
    y_pred = (scores >= thr).astype(int)
    f1 = f1_score(y_true, y_pred)
    return float(f1), float(thr)

def safe_write_json(path: str, data: Dict):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        json.dump(data, f, indent=2)




In [7]:
## Cell 3 — Config (edit paths/models here)
# Path to your Kaggle CSV; leave empty to auto-detect known locations
DATA_CSV = "data/quora_question_pairs/train.csv"  # set to "" to auto-locate
OUTPUT_DIR = "outputs_quora"

# Models
BI_MODEL = "microsoft/xtremedistil-l6-h256-uncased"
CE_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Training/eval
EPOCHS = 1
LEARNING_RATE = 5e-5
TRAIN_BS = 64
EVAL_BS = 256
WARMUP_RATIO = 0.1
MAX_SEQ_LENGTH = 256
SEED = 42
LIMIT_ROWS = 0  # e.g., 20000 for a quick dry-run

# Ensure dirs exist and seed set
os.makedirs(OUTPUT_DIR, exist_ok=True)
set_global_seed(SEED)

# Load data / splits (idempotent)
csv_path = ensure_data_csv(DATA_CSV if DATA_CSV else None)
df_all = load_quora_df(csv_path, limit_rows=LIMIT_ROWS if LIMIT_ROWS > 0 else None)
train_df, val_df, test_df = get_or_create_splits(df_all, OUTPUT_DIR, seed=SEED)
print(train_df.shape, val_df.shape, test_df.shape)


[data] Using provided CSV: data/quora_question_pairs/train.csv
[splits] Loading existing splits from outputs_quora/splits
(323433, 3) (40425, 3) (40429, 3)


In [8]:
## Cell 4 — Experiment 1: Baseline bi-encoder (no fine-tuning)
from sentence_transformers import SentenceTransformer

device = get_device()
model = SentenceTransformer(BI_MODEL, device=device)
model.max_seq_length = MAX_SEQ_LENGTH

f1_val, thr_val = evaluate_biencoder_f1(model, val_df, threshold=None, batch_size=EVAL_BS)
f1_test, thr_test = evaluate_biencoder_f1(model, test_df, threshold=thr_val, batch_size=EVAL_BS)

res = {
    "experiment": "baseline_biencoder",
    "val_f1": f1_val, "val_threshold": thr_val,
    "test_f1": f1_test, "test_threshold": thr_test,
    "model_name": BI_MODEL
}
save_path = os.path.join(OUTPUT_DIR, "baseline_biencoder", "result.json")
safe_write_json(save_path, res)

No sentence-transformers model found with name microsoft/xtremedistil-l6-h256-uncased. Creating a new one with mean pooling.


In [11]:
## Cell 5 — Experiment 2: Bi-encoder (CosineSimilarityLoss)
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

device = get_device()
model = SentenceTransformer(BI_MODEL, device=device)
model.max_seq_length = MAX_SEQ_LENGTH

train_ds = df_to_st_dataset(add_sentence_cols(train_df, label_float=True))
val_ds = df_to_st_dataset(add_sentence_cols(val_df, label_float=True))

loss = losses.CosineSimilarityLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_df["question1"].tolist(),
    sentences2=val_df["question2"].tolist(),
    scores=val_df["label"].astype(float).tolist(),
    main_similarity=SimilarityFunction.COSINE,
    name="dev-cosine",
)

mp = trainer_amp_args()
out_dir = os.path.join(OUTPUT_DIR, "bi_cosine")
args_tr = SentenceTransformerTrainingArguments(
    output_dir=out_dir,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    warmup_ratio=WARMUP_RATIO,
    fp16=mp["fp16"], bf16=mp["bf16"],
    eval_strategy="epoch", save_strategy="epoch", save_total_limit=2,
    report_to="none", logging_steps=100, load_best_model_at_end=False,
    remove_unused_columns=False, dataloader_num_workers=min(4, os.cpu_count() or 1),
)

trainer = SentenceTransformerTrainer(
    model=model, args=args_tr, train_dataset=train_ds, eval_dataset=val_ds,
    loss=loss, evaluator=evaluator
)
trainer.train()
model.save(out_dir)

# Reload and evaluate
model = SentenceTransformer(out_dir, device=device)
model.max_seq_length = MAX_SEQ_LENGTH
f1_val, thr_val = evaluate_biencoder_f1(model, val_df, threshold=None, batch_size=EVAL_BS)
f1_test, thr_test = evaluate_biencoder_f1(model, test_df, threshold=thr_val, batch_size=EVAL_BS)

res = {
    "experiment": "bi_cosine",
    "val_f1": f1_val, "val_threshold": thr_val,
    "test_f1": f1_test, "test_threshold": thr_test,
    "save_dir": out_dir
}
safe_write_json(os.path.join(out_dir, "result.json"), res)

No sentence-transformers model found with name microsoft/xtremedistil-l6-h256-uncased. Creating a new one with mean pooling.
                                                                     

Epoch,Training Loss,Validation Loss,Dev-cosine Pearson Cosine,Dev-cosine Spearman Cosine
1,0.1417,0.152555,0.639202,0.645726


In [12]:
## Cell 6 — Experiment 3: Bi-encoder (ContrastiveLoss)
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

device = get_device()
model = SentenceTransformer(BI_MODEL, device=device)
model.max_seq_length = MAX_SEQ_LENGTH

# IMPORTANT: two sentence columns for contrastive
train_ds = df_to_st_dataset(make_contrastive_df(train_df))
val_ds = df_to_st_dataset(make_contrastive_df(val_df))

loss = losses.ContrastiveLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_df["question1"].tolist(),
    sentences2=val_df["question2"].tolist(),
    scores=val_df["label"].astype(float).tolist(),
    main_similarity=SimilarityFunction.COSINE,
    name="dev-contrastive",
)

mp = trainer_amp_args()
out_dir = os.path.join(OUTPUT_DIR, "bi_contrastive")
args_tr = SentenceTransformerTrainingArguments(
    output_dir=out_dir,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    warmup_ratio=WARMUP_RATIO,
    fp16=mp["fp16"], bf16=mp["bf16"],
    eval_strategy="epoch", save_strategy="epoch", save_total_limit=2,
    report_to="none", logging_steps=100, load_best_model_at_end=False,
    remove_unused_columns=False, dataloader_num_workers=min(4, os.cpu_count() or 1),
)

# Sanity check to avoid the earlier assertion
assert set(["sentence1", "sentence2", "label"]).issubset(set(train_ds.column_names)), train_ds.column_names

trainer = SentenceTransformerTrainer(
    model=model, args=args_tr, train_dataset=train_ds, eval_dataset=val_ds,
    loss=loss, evaluator=evaluator
)
trainer.train()
model.save(out_dir)

model = SentenceTransformer(out_dir, device=device)
model.max_seq_length = MAX_SEQ_LENGTH
f1_val, thr_val = evaluate_biencoder_f1(model, val_df, threshold=None, batch_size=EVAL_BS)
f1_test, thr_test = evaluate_biencoder_f1(model, test_df, threshold=thr_val, batch_size=EVAL_BS)

res = {
    "experiment": "bi_contrastive",
    "val_f1": f1_val, "val_threshold": thr_val,
    "test_f1": f1_test, "test_threshold": thr_test,
    "save_dir": out_dir
}
safe_write_json(os.path.join(out_dir, "result.json"), res)



No sentence-transformers model found with name microsoft/xtremedistil-l6-h256-uncased. Creating a new one with mean pooling.
                                                                     

Epoch,Training Loss,Validation Loss,Dev-contrastive Pearson Cosine,Dev-contrastive Spearman Cosine
1,0.0164,0.017613,0.602302,0.663451


In [13]:
## Cell 7 — Experiment 4: Bi-encoder (MultipleNegativesRankingLoss)
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

device = get_device()
model = SentenceTransformer(BI_MODEL, device=device)
model.max_seq_length = MAX_SEQ_LENGTH

train_pos = train_df[train_df["label"] == 1].copy()
train_ds = df_to_st_dataset(add_sentence_cols(train_pos, label_float=True))
val_ds = df_to_st_dataset(add_sentence_cols(val_df, label_float=True))

loss = losses.MultipleNegativesRankingLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_df["question1"].tolist(),
    sentences2=val_df["question2"].tolist(),
    scores=val_df["label"].astype(float).tolist(),
    main_similarity=SimilarityFunction.COSINE,
    name="dev-mnrl",
)

mp = trainer_amp_args()
out_dir = os.path.join(OUTPUT_DIR, "bi_mnrl")
args_tr = SentenceTransformerTrainingArguments(
    output_dir=out_dir,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    warmup_ratio=WARMUP_RATIO,
    fp16=mp["fp16"], bf16=mp["bf16"],
    eval_strategy="epoch", save_strategy="epoch", save_total_limit=2,
    report_to="none", logging_steps=100, load_best_model_at_end=False,
    remove_unused_columns=False, dataloader_num_workers=min(4, os.cpu_count() or 1),
)

trainer = SentenceTransformerTrainer(
    model=model, args=args_tr, train_dataset=train_ds, eval_dataset=val_ds,
    loss=loss, evaluator=evaluator
)
trainer.train()
model.save(out_dir)

model = SentenceTransformer(out_dir, device=device)
model.max_seq_length = MAX_SEQ_LENGTH
f1_val, thr_val = evaluate_biencoder_f1(model, val_df, threshold=None, batch_size=EVAL_BS)
f1_test, thr_test = evaluate_biencoder_f1(model, test_df, threshold=thr_val, batch_size=EVAL_BS)

res = {
    "experiment": "bi_mnrl",
    "val_f1": f1_val, "val_threshold": thr_val,
    "test_f1": f1_test, "test_threshold": thr_test,
    "save_dir": out_dir
}
safe_write_json(os.path.join(out_dir, "result.json"), res)


No sentence-transformers model found with name microsoft/xtremedistil-l6-h256-uncased. Creating a new one with mean pooling.
                                                                     

Epoch,Training Loss,Validation Loss,Dev-mnrl Pearson Cosine,Dev-mnrl Spearman Cosine
1,0.1121,1.545256,0.47169,0.530116


In [15]:
# Cell 8 — Experiment 5: Cross-encoder (fixed evaluator)
from torch.utils.data import DataLoader
from sentence_transformers import CrossEncoder, InputExample

# IMPORTANT: use the CrossEncoder evaluator, not the bi-encoder one
try:
    from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
    has_ce_eval = True
except Exception:
    has_ce_eval = False

device = get_device()

train_samples = [InputExample(texts=[r.question1, r.question2], label=float(r.label))
                 for r in train_df.itertuples(index=False)]
val_samples = [InputExample(texts=[r.question1, r.question2], label=float(r.label))
               for r in val_df.itertuples(index=False)]

ce_dir = os.path.join(OUTPUT_DIR, "cross_encoder")
os.makedirs(ce_dir, exist_ok=True)

ce = CrossEncoder(CE_MODEL, num_labels=1, device=device, max_length=MAX_SEQ_LENGTH)

# Build the proper evaluator for CrossEncoder
if has_ce_eval:
    # Preferred path (sentence-transformers >= 2.x)
    try:
        evaluator = CEBinaryClassificationEvaluator.from_input_examples(val_samples, name="val-ce")
    except TypeError:
        # Fallback signature for older versions
        s1 = [t.texts[0] for t in val_samples]
        s2 = [t.texts[1] for t in val_samples]
        y = [float(t.label) for t in val_samples]
        evaluator = CEBinaryClassificationEvaluator(s1, s2, y, name="val-ce")
else:
    evaluator = None  # if import fails, skip in-training eval (we still evaluate after training)

train_loader = DataLoader(
    train_samples,
    shuffle=True,
    batch_size=max(8, min(64, TRAIN_BS // 2)),
    num_workers=min(4, os.cpu_count() or 1),
    pin_memory=(device == "cuda"),
)

total_steps = len(train_loader) * int(EPOCHS)
warmup_steps = int(total_steps * float(WARMUP_RATIO))
use_amp = (device == "cuda")

ce.fit(
    train_dataloader=train_loader,
    evaluator=evaluator,  # can be None
    epochs=int(EPOCHS),
    warmup_steps=warmup_steps,           # CrossEncoder.fit expects steps, not ratio
    optimizer_params={"lr": float(LEARNING_RATE)},
    show_progress_bar=True,
    evaluation_steps=max(500, len(train_loader) // 5) if evaluator is not None else 0,
    output_path=ce_dir,
    use_amp=use_amp,
)

# Reload and evaluate
ce = CrossEncoder(ce_dir, num_labels=1, device=device, max_length=MAX_SEQ_LENGTH)
f1_val, thr_val = evaluate_crossencoder_f1(ce, val_df, threshold=None, batch_size=min(256, EVAL_BS))
f1_test, thr_test = evaluate_crossencoder_f1(ce, test_df, threshold=thr_val, batch_size=min(256, EVAL_BS))

res = {
    "experiment": "cross_encoder",
    "val_f1": f1_val, "val_threshold": thr_val,
    "test_f1": f1_test, "test_threshold": thr_test,
    "save_dir": ce_dir
}
safe_write_json(os.path.join(ce_dir, "result.json"), res)
res

Step,Training Loss,Validation Loss,Val-ce Accuracy,Val-ce Accuracy Threshold,Val-ce F1,Val-ce F1 Threshold,Val-ce Precision,Val-ce Recall,Val-ce Average Precision
2021,0.3382,No log,0.862956,0.263354,0.821417,-0.314074,0.767904,0.882948,0.879187
4042,0.3081,No log,0.87666,-0.104631,0.839009,-0.434809,0.792389,0.891457,0.898017
6063,0.2931,No log,0.883686,0.142398,0.849901,-0.127329,0.811658,0.891926,0.908089
8084,0.2785,No log,0.891404,-0.117452,0.858282,-0.360567,0.825363,0.893936,0.917659
10105,0.27,No log,0.890909,-0.269954,0.857097,-0.49682,0.816802,0.901575,0.917374


{'experiment': 'cross_encoder',
 'val_f1': 0.8582824059183017,
 'val_threshold': -0.36038997769355774,
 'test_f1': 0.856766905228024,
 'test_threshold': -0.36038997769355774,
 'save_dir': 'outputs_quora/cross_encoder'}

In [16]:
## Cell 9 — Load summary across completed experiments
import glob, json, os

results = []
for path in glob.glob(os.path.join(OUTPUT_DIR, "*", "result.json")):
    try:
        with open(path) as f:
            results.append(json.load(f))
    except Exception as e:
        print("Failed to read", path, e)

results_sorted = sorted(results, key=lambda x: x["test_f1"], reverse=True)
for r in results_sorted:
    print(f"{r['experiment']:16s}  Test F1: {r['test_f1']:.4f}  (Val F1: {r['val_f1']:.4f}, thr_val={r['val_threshold']:.4f})")

cross_encoder     Test F1: 0.8568  (Val F1: 0.8583, thr_val=-0.3604)
bi_contrastive    Test F1: 0.7690  (Val F1: 0.7703, thr_val=0.7891)
bi_cosine         Test F1: 0.7552  (Val F1: 0.7573, thr_val=0.5904)
bi_mnrl           Test F1: 0.6975  (Val F1: 0.6990, thr_val=0.7489)
baseline_biencoder  Test F1: 0.6043  (Val F1: 0.6041, thr_val=0.9696)
