In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import AdamW

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from sentence_transformers import SentenceTransformer, util as st_util
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm


SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Using device:", device)

Using device: cuda


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

SEED = 42


faq = pd.read_excel("/content/drive/MyDrive/NLPfaq/university_faq_expanded_dataset_v3.xlsx")
faq = faq.dropna(subset=["question", "answer"]).reset_index(drop=True)

squad = load_dataset("squad")
print("dataset loaded")
df_squad = pd.DataFrame({
    "university": "squad",
    "category": "general",
    "question": squad["train"]["question"],
    "answer": [a["text"][0] for a in squad["train"]["answers"]],
})


df_squad = df_squad.sample(2000, random_state=SEED)

combined = pd.concat([faq, df_squad], ignore_index=True)
combined = combined.sample(frac=1.0, random_state=SEED).reset_index(drop=True)


stage1_train_df, stage1_temp_df = train_test_split(combined, test_size=0.30, random_state=SEED)
stage1_val_df, stage1_test_df = train_test_split(stage1_temp_df, test_size=0.50, random_state=SEED)

faq_train_df, faq_temp_df = train_test_split(faq, test_size=0.30, random_state=SEED)
faq_val_df, faq_test_df = train_test_split(faq_temp_df, test_size=0.50, random_state=SEED)

print("Stage1 train:", len(stage1_train_df), "Stage2 train FAQ:", len(faq_train_df))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

dataset loaded
Stage1 train: 1610 Stage2 train FAQ: 210


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
from datasets import load_dataset
import pandas as pd


squad = load_dataset("squad")
print("dataset loaded")
df_squad = pd.DataFrame({
    "university": "squad",
    "category": "general",
    "question": squad["train"]["question"],
    "answer": [a["text"][0] for a in squad["train"]["answers"]],
})

df_squad = df_squad.sample(2000, random_state=42)
print(f"squad sample count: {len(df_squad)}")
df_squad.head()

dataset loaded
squad sample count: 2000


Unnamed: 0,university,category,question,answer
9983,squad,general,What year was the Banská Akadémia founded?,1735
43267,squad,general,What is another speed that can also be reporte...,SOS-based speed
81021,squad,general,Where were the use of advanced materials and t...,Sumerian temples and palaces
49374,squad,general,Who is elected every even numbered year?,mayor
53414,squad,general,What was the purpose of top secret ICBM commit...,decide on the feasibility of building an ICBM ...


In [None]:
faq = pd.read_excel("/content/drive/MyDrive/NLPfaq/university_faq_expanded_dataset_v3.xlsx")
faq = faq.dropna(subset=["question", "answer"]).reset_index(drop=True)

combined = pd.concat([faq, df_squad], ignore_index=True)
print(f"\nTotal combined dataset: {len(combined)} QA pairs")
print(f"  - FAQ data: {len(faq)}")
print(f"  - SQuAD data: {len(df_squad)}")

combined.to_excel("/content/drive/MyDrive/NLPfaq/university_faq_with_squad_large.xlsx", index=False)
print("\nSaved combined dataset")


Total combined dataset: 2300 QA pairs
  - FAQ data: 300
  - SQuAD data: 2000

Saved combined dataset


In [None]:
df = combined.copy()

print("Total QA pairs:", len(df))
print("\nColumns:", df.columns.tolist())
print("\nSample rows:")
display(df.head())

Total QA pairs: 2300

Columns: ['id', 'university', 'category', 'question', 'answer', 'notes']

Sample rows:


Unnamed: 0,id,university,category,question,answer,notes
0,1.0,Georgia Tech,admissions,What are the application deadlines?,"Deadlines vary by program, but undergraduate a...",
1,2.0,Georgia Tech,admissions,Do you superscore standardized test scores?,"Yes, we consider the highest section scores ac...",
2,3.0,Georgia Tech,admissions,Are letters of recommendation required?,They are optional for undergraduates but requi...,
3,4.0,Georgia Tech,admissions,Can I apply as an undeclared major?,"Yes, students may apply without declaring a ma...",
4,5.0,Georgia Tech,admissions,What is the average admitted GPA?,Most admitted students have strong academic re...,


In [None]:
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

print(f"Train size: {len(train_df)}")
print(f"Val size:   {len(val_df)}")
print(f"Test size:  {len(test_df)}\n")

if "university" in df.columns:
    print("Universities in dataset:", df["university"].unique())
    print("\nCounts by university:")
    display(df["university"].value_counts())

if "category" in df.columns:
    print("\nCounts by category:")
    display(df["category"].value_counts())

Train size: 1610
Val size:   345
Test size:  345

Universities in dataset: ['Georgia Tech' 'UC Berkeley' 'UCLA' 'Harvard University'
 'Stanford University' 'MIT' 'Princeton University'
 'Carnegie Mellon University' 'University of Michigan'
 'University of Illinois Urbana-Champaign' 'UT Austin' 'NYU'
 'Columbia University' 'Duke University' 'Yale University'
 'Cornell University' 'University of Chicago' 'Purdue University'
 'University of Southern California' 'Boston University'
 'Penn State University' 'University of Washington' 'Virginia Tech'
 'squad']

Counts by university:


Unnamed: 0_level_0,count
university,Unnamed: 1_level_1
squad,2000
Georgia Tech,20
UCLA,20
UC Berkeley,20
Stanford University,12
MIT,12
Princeton University,12
Harvard University,12
Carnegie Mellon University,12
University of Michigan,12



Counts by category:


Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
general,2000
admissions,75
registration,75
housing,75
financial_aid,75


In [None]:
import re

def normalize_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def exact_match(pred: str, gold: str) -> float:
    return float(normalize_text(pred) == normalize_text(gold))

def f1_score_str(pred: str, gold: str) -> float:
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    if len(pred_tokens) == 0 and len(gold_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0

    common = {}
    for t in gold_tokens:
        common[t] = common.get(t, 0) + 1

    num_same = 0
    for t in pred_tokens:
        if common.get(t, 0) > 0:
            num_same += 1
            common[t] -= 1

    if num_same == 0:
        return 0.0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

In [None]:

baseline_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)


all_answers = df["answer"].tolist()
answer_embeddings = baseline_model.encode(
    all_answers, convert_to_tensor=True, show_progress_bar=True
)

def evaluate_baseline(test_df, all_answers, answer_embeddings, model):
    em_scores = []
    f1_scores = []
    correct = 0
    total = 0

    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        q = row["question"]
        gold_answer = row["answer"]

        q_emb = model.encode(q, convert_to_tensor=True)
        scores = st_util.cos_sim(q_emb, answer_embeddings)[0]
        top_idx = int(torch.argmax(scores).cpu().item())
        pred_answer = all_answers[top_idx]

        em_scores.append(exact_match(pred_answer, gold_answer))
        f1_scores.append(f1_score_str(pred_answer, gold_answer))
        correct += exact_match(pred_answer, gold_answer)
        total += 1

    em = float(np.mean(em_scores))
    f1 = float(np.mean(f1_scores))
    acc = correct / total if total > 0 else 0.0
    return em, f1, acc

baseline_em, baseline_f1, baseline_acc = evaluate_baseline(
    test_df, all_answers, answer_embeddings, baseline_model
)

print("Embedding Retrieval Baseline - all-MiniLM-L6-v2")
print(f"Exact Match: {baseline_em:.4f}")
print(f"F1:          {baseline_f1:.4f}")
print(f"Top-1 Acc:   {baseline_acc:.4f}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/345 [00:00<?, ?it/s]

Embedding Retrieval Baseline - all-MiniLM-L6-v2
Exact Match: 0.2145
F1:          0.2473
Top-1 Acc:   0.2145


In [None]:
# Dataset for Bi-Encoder with improved approach

class BiEncoderDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.questions = df['question'].tolist()
        self.answers = df['answer'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]

        # Encode question
        q_encoding = self.tokenizer(
            question,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Encode answer
        a_encoding = self.tokenizer(
            answer,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'question_input_ids': q_encoding['input_ids'].squeeze(),
            'question_attention_mask': q_encoding['attention_mask'].squeeze(),
            'answer_input_ids': a_encoding['input_ids'].squeeze(),
            'answer_attention_mask': a_encoding['attention_mask'].squeeze(),
        }

print('Done with Bi-encoder class ')


Done with Bi-encoder class 


In [None]:
# Bi-Encoder Model with Mean Pooling (IMPROVED)

class BiEncoderModel(torch.nn.Module):
    def __init__(self, model_name='distilbert-base-uncased'):
        super().__init__()
        from transformers import AutoModel
        self.encoder = AutoModel.from_pretrained(model_name)
        self.model_name = model_name

    def mean_pooling(self, token_embeddings, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        return self.mean_pooling(outputs.last_hidden_state, attention_mask)

    def encode_questions(self, input_ids, attention_mask):
        return self.forward(input_ids, attention_mask)

    def encode_answers(self, input_ids, attention_mask):
        return self.forward(input_ids, attention_mask)

def contrastive_loss_with_hard_negatives(question_emb, answer_emb, temperature=0.05):
    question_emb = torch.nn.functional.normalize(question_emb, dim=1)
    answer_emb = torch.nn.functional.normalize(answer_emb, dim=1)

    similarity = torch.matmul(question_emb, answer_emb.T) / temperature


    batch_size = question_emb.shape[0]
    labels = torch.arange(batch_size).to(question_emb.device)

    loss_q2a = torch.nn.functional.cross_entropy(similarity, labels)
    loss_a2q = torch.nn.functional.cross_entropy(similarity.T, labels)
    base_loss = (loss_q2a + loss_a2q) / 2

    mask = torch.eye(batch_size, device=similarity.device).bool()
    neg_similarity = similarity.masked_fill(mask, float('-inf'))


    hard_neg_sim = neg_similarity.max(dim=1)[0]
    pos_sim = similarity.diagonal()

    hard_neg_loss = torch.clamp(hard_neg_sim - pos_sim + 0.2, min=0).mean()


    total_loss = base_loss + 0.1 * hard_neg_loss

    return total_loss

In [None]:
# Train Bi-Encoder
from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
from torch.optim import AdamW
from tqdm.auto import tqdm


MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BiEncoderModel(MODEL_NAME).to(device)

train_dataset = BiEncoderDataset(train_df, tokenizer)
val_dataset = BiEncoderDataset(val_df, tokenizer)


batch_size = 64
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


num_epochs = 10
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

total_steps = len(train_loader) * num_epochs
num_warmup_steps = len(train_loader)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps
)

print(f'   Training bi-encoder for {num_epochs} epochs')
print(f'   Batch size: {batch_size}')
print(f'   Learning rate: {learning_rate}')
print(f'   Train batches per epoch: {len(train_loader)}')
print(f'   Total gradient updates: {total_steps}')
print(f'   Warmup steps: {num_warmup_steps}\n')

best_val_loss = float('inf')
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    print(f'\n{'='*60}')
    print(f'Epoch {epoch+1}/{num_epochs}')
    print('='*60)

    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc='Training'):
        q_ids = batch['question_input_ids'].to(device)
        q_mask = batch['question_attention_mask'].to(device)
        a_ids = batch['answer_input_ids'].to(device)
        a_mask = batch['answer_attention_mask'].to(device)

        q_emb = model.encode_questions(q_ids, q_mask)
        a_emb = model.encode_answers(a_ids, a_mask)
        loss = contrastive_loss_with_hard_negatives(q_emb, a_emb)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f'Train loss: {avg_train_loss:.4f}')

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            q_ids = batch['question_input_ids'].to(device)
            q_mask = batch['question_attention_mask'].to(device)
            a_ids = batch['answer_input_ids'].to(device)
            a_mask = batch['answer_attention_mask'].to(device)

            q_emb = model.encode_questions(q_ids, q_mask)
            a_emb = model.encode_answers(a_ids, a_mask)
            val_loss += contrastive_loss_with_hard_negatives(q_emb, a_emb).item()

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f'Val loss:   {avg_val_loss:.4f}')

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), '/content/distilbert_biencoder_best.pt')
        print('Saved best model')

    if epoch > 3 and avg_val_loss > val_losses[-2]:
        print('⚠️  Val loss increased - possible overfitting')

print(f'\n{'='*60}')
print(f'Training complete!')
print(f'   Best val loss: {best_val_loss:.4f}')
print(f'   Final train loss: {train_losses[-1]:.4f}')
print('='*60)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

   Training bi-encoder for 10 epochs
   Batch size: 64
   Learning rate: 5e-05
   Train batches per epoch: 26
   Total gradient updates: 260
   Warmup steps: 26


Epoch 1/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 2.6606


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.5811
Saved best model

Epoch 2/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 1.2264


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.1673
Saved best model

Epoch 3/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 0.5463


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.1212
Saved best model

Epoch 4/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 0.2771


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.1386

Epoch 5/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 0.1652


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.2172
⚠️  Val loss increased - possible overfitting

Epoch 6/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 0.1273


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.1868

Epoch 7/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 0.0906


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.1985
⚠️  Val loss increased - possible overfitting

Epoch 8/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 0.0677


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.2127
⚠️  Val loss increased - possible overfitting

Epoch 9/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 0.0632


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.2129
⚠️  Val loss increased - possible overfitting

Epoch 10/10


Training:   0%|          | 0/26 [00:00<?, ?it/s]

Train loss: 0.0667


Validation:   0%|          | 0/6 [00:00<?, ?it/s]

Val loss:   1.2121

Training complete!
   Best val loss: 1.1212
   Final train loss: 0.0667


In [None]:
model.load_state_dict(torch.load('/content/distilbert_biencoder_best.pt'))
model.eval()


# Get training data
train_questions = train_df['question'].tolist()
train_answers = train_df['answer'].tolist()

all_answer_embeddings = []
with torch.no_grad():
    for i in tqdm(range(0, len(train_answers), 32), desc='Encoding answers'):
        batch_answers = train_answers[i:i+32]
        enc = tokenizer(
            batch_answers,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        answer_emb = model.encode_answers(
            enc['input_ids'].to(device),
            enc['attention_mask'].to(device)
        )
        all_answer_embeddings.append(answer_emb.cpu())

all_answer_embeddings = torch.cat(all_answer_embeddings, dim=0)
print(f'   Index contains {len(train_answers)} answer embeddings\n')

def retrieve_biencoder(query, top_k=1):

    with torch.no_grad():
        enc = tokenizer(
            query,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        q_emb = model.encode_questions(
            enc['input_ids'].to(device),
            enc['attention_mask'].to(device)
        )

        q_emb = torch.nn.functional.normalize(q_emb, dim=1)
        answer_emb = torch.nn.functional.normalize(all_answer_embeddings.to(device), dim=1)

        similarities = torch.matmul(q_emb, answer_emb.T).squeeze()
        top_k = min(top_k, len(train_answers))
        top_scores, top_indices = torch.topk(similarities, k=top_k)

        results = []
        for idx, score in zip(top_indices.cpu().numpy(), top_scores.cpu().numpy()):
            results.append({
                'question': train_questions[idx],
                'answer': train_answers[idx],
                'score': float(score)
            })

        return results

print('Testing retrieval...')
test_question = test_df.iloc[0]['question']
test_answer = test_df.iloc[0]['answer']
results = retrieve_biencoder(test_question, top_k=3)

print(f'\nTest Question: {test_question}')
print(f'Gold Answer: {test_answer}\n')
print('Top 3 Retrieved:')
for i, r in enumerate(results, 1):
    print(f'  {i}. Score: {r["score"]:.3f}')
    print(f'     Answer: {r["answer"][:150]}...')
    print()

Encoding answers:   0%|          | 0/51 [00:00<?, ?it/s]

   Index contains 1610 answer embeddings

Testing retrieval...

Test Question: How do I register for classes?
Gold Answer: Students register using the online registration portal available through their student account.

Top 3 Retrieved:
  1. Score: 0.805
     Answer: Students register using the online registration portal available through their student account....

  2. Score: 0.667
     Answer: Time tickets indicate when students may begin the registration process....

  3. Score: 0.667
     Answer: Time tickets indicate when students may begin the registration process....



In [None]:
#evaluate test set
print('Evaluating on test set...')
print(f'Test set size: {len(test_df)}\n')

biencoder_em = 0
biencoder_f1 = 0
biencoder_acc = 0

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc='Evaluating'):
    results = retrieve_biencoder(row['question'], top_k=1)
    pred = results[0]['answer'] if results else ''
    gold = row['answer']

    biencoder_em += exact_match(pred, gold)
    biencoder_f1 += f1_score_str(pred, gold)
    biencoder_acc += exact_match(pred, gold)

biencoder_em /= len(test_df)
biencoder_f1 /= len(test_df)
biencoder_acc /= len(test_df)

print('DistilBERT Bi-Encoder Results')
print(f'Exact Match: {biencoder_em:.4f} ({biencoder_em*100:.2f}%)')
print(f'F1 Score:    {biencoder_f1:.4f} ({biencoder_f1*100:.2f}%)')
print(f'Top-1 Acc:   {biencoder_acc:.4f} ({biencoder_acc*100:.2f}%)')

Evaluating on test set...
Test set size: 345



Evaluating:   0%|          | 0/345 [00:00<?, ?it/s]

DistilBERT Bi-Encoder Results
Exact Match: 0.0319 (3.19%)
F1 Score:    0.1216 (12.16%)
Top-1 Acc:   0.0319 (3.19%)


In [None]:
results_df = pd.DataFrame([
    {
        'Model': 'Baseline (SentenceTransformer)',
        'Exact Match': baseline_em,
        'F1': baseline_f1,
        'Top-1 Acc': baseline_acc
    },
    {
        'Model': 'Fine-tuned Bi-Encoder',
        'Exact Match': biencoder_em,
        'F1': biencoder_f1,
        'Top-1 Acc': biencoder_acc
    }
])


print('FINAL RESULTS COMPARISON')

print(results_df.to_string(index=False))


styled = results_df.style.format({
    'Exact Match': '{:.2%}',
    'F1': '{:.2%}',
    'Top-1 Acc': '{:.2%}'
}).background_gradient(subset=['Exact Match', 'F1', 'Top-1 Acc'], cmap='RdYlGn')

display(styled)

em_improvement = biencoder_em - baseline_em
f1_improvement = biencoder_f1 - baseline_f1
acc_improvement = biencoder_acc - baseline_acc

print(f'\nPerformance Results:')
print(f'  Exact Match improvement: {em_improvement:+.2%} ({em_improvement*100:+.2f} points)')
print(f'  F1 improvement:          {f1_improvement:+.2%} ({f1_improvement*100:+.2f} points)')
print(f'  Accuracy improvement:    {acc_improvement:+.2%} ({acc_improvement*100:+.2f} points)')

if biencoder_em > baseline_em:
    improvement_pct = (biencoder_em - baseline_em) / baseline_em * 100
    print(f'\n Fine-tuning improved retrieval by {improvement_pct:.1f}%')
elif biencoder_em < baseline_em:
    print('\nBaseline still outperforms fine-tuned model')
else:
    print('\nSimilar performance - fine-tuning matched baseline')

FINAL RESULTS COMPARISON
                         Model  Exact Match       F1  Top-1 Acc
Baseline (SentenceTransformer)     0.214493 0.247326   0.214493
         Fine-tuned Bi-Encoder     0.031884 0.121550   0.031884


Unnamed: 0,Model,Exact Match,F1,Top-1 Acc
0,Baseline (SentenceTransformer),21.45%,24.73%,21.45%
1,Fine-tuned Bi-Encoder,3.19%,12.16%,3.19%



Performance Results:
  Exact Match improvement: -18.26% (-18.26 points)
  F1 improvement:          -12.58% (-12.58 points)
  Accuracy improvement:    -18.26% (-18.26 points)

Baseline still outperforms fine-tuned model
