In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import libraries
import pandas as pd
import numpy as np

# Load datasets
# All competition files live in the same read-only input folder; keeping the
# folder in one place means a dataset rename only has to be changed once.
DATA_DIR = "/kaggle/input/adm-lt-2024-2025-hackathon-rag"

corpus_df = pd.read_csv(f"{DATA_DIR}/corpus.csv")
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
example_submission_df = pd.read_csv(f"{DATA_DIR}/example_submission.csv")

# Fail fast on schema problems: the cells below read these columns and write one
# answer per submission row, and the encoding / inference steps are slow, so a
# mismatch should surface here rather than after the long run.
assert {"id", "passage"} <= set(corpus_df.columns), "corpus.csv must contain 'id' and 'passage'"
assert "question" in test_df.columns, "test.csv must contain a 'question' column"
assert len(example_submission_df) == len(test_df), \
    "example_submission.csv must have exactly one row per test question"

# Preview datasets
print("Corpus:", corpus_df.shape)
print(corpus_df.head(2), "\n")

print("Train:", train_df.shape)
print(train_df.head(2), "\n")

print("Test:", test_df.shape)
print(test_df.head(2), "\n")

print("Submission Format:")
print(example_submission_df.head())

# Stage 1: Retriever (Dense Embeddings + FAISS)


https://huggingface.co/pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb

This is a sentence-transformers model based on BioBERT, trained on datasets relevant to natural language inference, semantic similarity, and biomedical QA:

MNLI, SNLI: Natural Language Inference

SCINLI, SCITAIL, MEDNLI: Science and Medical entailment datasets

STSB: Sentence-level similarity benchmark

It outputs 768-dimensional embeddings that capture sentence meaning, ideal for semantic search in biomedical texts.



In [None]:
# ---------------------------
# Step 1: Load biomedical passages
# ---------------------------

# Pull the two corpus columns out as plain Python lists:
#   corpus_texts - the biomedical passages, i.e. the retrievable "knowledge"
#   corpus_ids   - the matching unique passage IDs (same order as corpus_texts)
corpus_texts, corpus_ids = (corpus_df[column].tolist() for column in ("passage", "id"))

In [None]:
"""
!pip install -q torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 \
  transformers sentence-transformers faiss-cpu
"""
# Run once per session

In [None]:
import torch
# Report whether a CUDA GPU is visible; the models are moved onto it when
# available, which makes encoding and inference considerably faster
gpu_available = torch.cuda.is_available()
print(gpu_available)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# ---------------------------
# Step 2: Load BioBERT model
# ---------------------------

# Sentence-embedding checkpoint: maps each input text to a dense vector
model_name = 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb'

# Use the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer converts sentences into token IDs for the encoder
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder, place it on the target device and switch it to inference
# mode (eval() turns off dropout etc.). Its raw output is token-level
# embeddings, which are pooled into sentence vectors later on.
model = AutoModel.from_pretrained(model_name).to(device).eval()

In [None]:
from tqdm import tqdm
import faiss

# ---------------------------
# Step 3: Mean Pooling Function
# ---------------------------

def mean_pooling(model_output, attention_mask):
    """
    Collapse token-level embeddings into a single vector per sequence.

    Args:
        model_output: model output whose first element holds the token
            embeddings, shaped (batch_size, seq_len, hidden_size).
        attention_mask: (batch_size, seq_len) mask, non-zero for real tokens
            and zero for padding.

    Returns:
        Tensor of shape (batch_size, hidden_size): the average of the
        embeddings of the non-padding tokens of each sequence.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the hidden dimension so padded positions contribute zero
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = (hidden_states * mask).sum(dim=1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

# ---------------------------
# Step 4: Encode Corpus Passages
# ---------------------------

batch_size = 32
embedding_batches = []

# Encode the corpus in chunks of `batch_size` passages to keep GPU memory bounded.
# Gradients are never needed here, so the whole loop runs with autograd disabled.
with torch.no_grad():
    for start in tqdm(range(0, len(corpus_texts), batch_size)):
        batch_texts = corpus_texts[start:start + batch_size]
        # Tokenize the batch (padded to its longest passage, truncated to 256 tokens)
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt', max_length=256)
        # Move every tokenizer tensor to the same device as the model
        encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}
        # Forward pass through the encoder, then pool token embeddings into sentence vectors
        model_output = model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        # Keep the result on the CPU as a NumPy array
        embedding_batches.append(sentence_embeddings.cpu().numpy())

# Stack all batches into one 2D NumPy array: one row per passage
corpus_embeddings = np.vstack(embedding_batches)

# Scale every row to unit L2 norm in place, so inner product = cosine similarity
faiss.normalize_L2(corpus_embeddings)

In [None]:
import faiss
# The encoding cell above already L2-normalised corpus_embeddings in place, so
# normalising a second time would be a redundant no-op. Instead, verify the
# invariant the inner-product index relies on: every row has unit length, which
# is what makes inner product equal to cosine similarity.
embedding_norms = np.linalg.norm(corpus_embeddings, axis=1)
assert np.allclose(embedding_norms, 1.0, atol=1e-3), \
    "corpus_embeddings must be L2-normalised before they are added to the FAISS index"

In [None]:
# ---------------------------
# Step 5: Create FAISS Index
# ---------------------------

# IndexFlatIP: flat (non-hierarchical) index that scores by inner product.
# Because the embeddings were normalised, inner product ≈ cosine similarity.
embedding_dim = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)

# Add all corpus embeddings to the index so they can be searched by similarity
index.add(corpus_embeddings)
print(f"FAISS index created with {index.ntotal} vectors.")

In [None]:
# ---------------------------
# Step 6: Define Retrieval Function
# ---------------------------

def retrieve_top_k(question, k=5):
    """
    Retrieve the corpus passages most similar to a question.

    The question is embedded with the same encoder and pooling used for the
    corpus, L2-normalised, and searched against the FAISS inner-product index.

    Args:
        question: the question text.
        k: maximum number of passages to return.

    Returns:
        A list of at most `k` passages from `corpus_texts`, ordered as returned
        by the index search (best match first). The list is empty when
        `k <= 0`, and shorter than `k` when the index holds fewer than `k`
        vectors.
    """
    # Nothing to retrieve; also avoids asking FAISS for a non-positive k
    if k <= 0:
        return []

    # Tokenize the single question and move the tensors to the model's device
    encoded = tokenizer([question], return_tensors='pt', padding=True, truncation=True, max_length=256)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        output = model(**encoded)
    # Pool token embeddings into one sentence embedding for the question
    q_embed = mean_pooling(output, encoded['attention_mask'])
    # Convert to NumPy and normalise so it is comparable with the indexed vectors
    q_embed = q_embed.cpu().numpy()
    faiss.normalize_L2(q_embed)

    # FAISS returns the top-k most similar vectors from the index
    _scores, indices = index.search(q_embed, k)
    # FAISS pads missing results with index -1; skip those, otherwise
    # corpus_texts[-1] would silently return the last passage of the corpus
    return [corpus_texts[idx] for idx in indices[0] if idx >= 0]


https://huggingface.co/ktrapeznikov/biobert_v1.1_pubmed_squad_v2

BioBERT model fine-tuned on SQuAD v2, designed for extractive QA in biomedical domains

Pretrained on PubMed abstracts

Fine-tuned on SQuAD v2, which includes unanswerable questions, so the model can also predict that a passage contains no answer

F1 ~79 on SQuAD v2, suitable for biomedical-style QA

In [None]:
from transformers import AutoModelForQuestionAnswering

# ---------------------------
# Step 7: QA model 
# ---------------------------

# Extractive QA checkpoint and its matching tokenizer
qa_model_name = "ktrapeznikov/biobert_v1.1_pubmed_squad_v2"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)

# Load the QA model and place it on the same device as the retriever
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_model = qa_model.to(device)

# Switch to inference mode (left as the last expression so the cell still shows the model)
qa_model.eval()


In [None]:
# ---------------------------
# Step 8: Generate Answer from Retrieved Passages
# ---------------------------

def _best_context_span(start_logits, end_logits, context_mask, max_answer_len):
    """
    Find the highest-scoring answer span that lies entirely inside the context.

    Args:
        start_logits: 1-D tensor of start scores, one per input token.
        end_logits: 1-D tensor of end scores, one per input token.
        context_mask: 1-D bool tensor, True for tokens allowed in an answer.
        max_answer_len: maximum span length, in tokens.

    Returns:
        (start, end, score) with inclusive token indices, or None when no
        admissible span exists.
    """
    neg_inf = float("-inf")
    start_scores = start_logits.masked_fill(~context_mask, neg_inf)
    end_scores = end_logits.masked_fill(~context_mask, neg_inf)
    # span_scores[s, e] = score of the span that starts at token s and ends at token e
    span_scores = start_scores.unsqueeze(1) + end_scores.unsqueeze(0)

    # Only keep spans with start <= end that are at most max_answer_len tokens long
    positions = torch.arange(span_scores.size(0), device=span_scores.device)
    span_lengths = positions.unsqueeze(0) - positions.unsqueeze(1) + 1
    admissible = (span_lengths >= 1) & (span_lengths <= max_answer_len)
    span_scores = span_scores.masked_fill(~admissible, neg_inf)

    flat_best = int(torch.argmax(span_scores))
    start, end = divmod(flat_best, span_scores.size(1))
    score = span_scores[start, end].item()
    if score == neg_inf:
        return None
    return start, end, score


def generate_answer(question, contexts, max_answer_len=30):
    """
    Extract the best answer to `question` from a list of candidate passages.

    Each passage is scored independently with the QA model. Within a passage
    the start and end positions are chosen jointly, so the span is always
    valid (start <= end), is limited to `max_answer_len` tokens, and only
    covers passage tokens - never [CLS], [SEP] or the question itself. A
    passage is skipped when its best span does not score higher than the
    "no answer" prediction (both start and end on the first token, [CLS]).

    Args:
        question: the question text.
        contexts: iterable of passages to search for an answer.
        max_answer_len: maximum answer length in tokens.

    Returns:
        The highest-scoring non-empty answer string across all passages, or
        "" when no passage yields an answer.
    """
    best_answer = ""
    best_score = -float("inf")
    # Loop through the top-k retrieved contexts
    for context in contexts:
        # Prepare the input for the QA model as a question-context pair
        inputs = qa_tokenizer(
            question,
            context,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        ).to(device)

        with torch.no_grad():
            outputs = qa_model(**inputs)  # scores for every possible start and end position

        # Batch size is 1, so work with the single row of each tensor
        start_logits = outputs.start_logits[0]
        end_logits = outputs.end_logits[0]
        input_ids = inputs["input_ids"][0]

        # Passage tokens are the second segment (token_type_ids == 1), minus its closing [SEP]
        context_mask = (inputs["token_type_ids"][0] == 1) & (input_ids != qa_tokenizer.sep_token_id)

        span = _best_context_span(start_logits, end_logits, context_mask, max_answer_len)
        if span is None:
            continue
        start_idx, end_idx, score = span

        # Score of predicting "no answer in this passage" (span collapsed onto [CLS])
        null_score = (start_logits[0] + end_logits[0]).item()
        if score <= null_score:
            continue

        # Convert the token IDs of the span back into a string
        answer = qa_tokenizer.decode(input_ids[start_idx : end_idx + 1], skip_special_tokens=True).strip()

        # Keep only the best answer found so far
        if answer and score > best_score:
            best_answer = answer
            best_score = score

    return best_answer


In [None]:
# ---------------------------
# Step 9: Run on Test Set
# ---------------------------

# For every test question: retrieve the top-5 most similar passages
# (FAISS + BioBERT embeddings), then let the QA model (BioBERT-SQuAD)
# extract the most likely answer from those passages.
predictions = [
    generate_answer(question, retrieve_top_k(question, k=5))
    for question in tqdm(test_df["question"])
]




In [None]:
# ---------------------------
# Step 10: Generate submission
# ---------------------------

# Attach the predicted answers to the submission template and save it
example_submission_df = example_submission_df.assign(answer=predictions)
example_submission_df.to_csv("submission2.csv", index=False)

print(example_submission_df.head())


In [None]:
# Show the first ten question/answer pairs as a quick sanity check
preview_pairs = zip(test_df["question"].iloc[:10], predictions[:10])
for q, a in preview_pairs:
    print(f"\nQuestion: {q}\nAnswer: {a}")


In [None]:
# Only answers that are paired with a test question are counted
evaluated_answers = [a for _q, a in zip(test_df["question"], predictions)]

total = len(evaluated_answers)
# An answer counts as missing when it is empty or whitespace-only
missing = sum(1 for a in evaluated_answers if not a.strip())
# NOTE: this is not a real F1 score - every non-empty answer is simply assumed
# to be perfect (1.0) and every empty one scores 0.0; the value is not printed.
f1_total = sum((1.0 if a.strip() else 0.0 for a in evaluated_answers), 0)

# Summary
print("\n--- STATISTICHE ---")
print(f"Totale domande valutate: {total}")
print(f"Risposte mancanti: {missing}")
print(f"Risposte presenti: {total - missing}")

The answer-extraction step returns an empty answer when the QA model does not find a valid answer span in the retrieved passages (i.e., it is not confident that any of them contains the answer). Because the Kaggle submission format does not allow empty answers, we replace each empty answer with a placeholder so that our predictions can be submitted.

In [None]:
# Fill in missing answers with a default string
# Replace missing (NaN) answers, then blank / whitespace-only ones, with the placeholder "No"
filled_answers = example_submission_df["answer"].fillna("No")
is_blank = filled_answers.str.strip() == ""
example_submission_df["answer"] = filled_answers.mask(is_blank, "No")


In [None]:
# Write the placeholder-filled submission to the Kaggle working directory
submission_path = "/kaggle/working/submission_na.csv"
example_submission_df.to_csv(path_or_buf=submission_path, index=False)

# Confirm where the file was written
print("Submission file created:", submission_path)