In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Cell 1/11 - Install & imports
!pip install -q datasets sentence-transformers faiss-cpu transformers nltk accelerate keybert[gensim] sentencepiece scikit-learn rank-bm25 evaluate bert-score rouge-score sacremoses

import os, sys, time, json, pickle, warnings, re
warnings.filterwarnings("ignore")

import numpy as np
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer, CrossEncoder, util
from transformers import AutoTokenizer, AutoModelForCausalLM
from keybert import KeyBERT
import faiss
from rank_bm25 import BM25Okapi

# Evaluation libs
from evaluate import load as eval_load

# quick eval metrics
bleu_metric = eval_load("bleu")
rouge_metric = eval_load("rouge")
bertscore_metric = eval_load("bertscore")

# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def log(msg): print(f"[{time.strftime('%H:%M:%S')}] {msg}")
log(f"Device: {device} | CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    try:
        print("GPU:", torch.cuda.get_device_name(0))
    except Exception:
        pass


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4

2025-09-20 16:54:42.040204: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758387282.233695      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758387282.298115      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[16:54:58] Device: cuda | CUDA available: True
GPU: Tesla T4


In [3]:
# Cell 2/11 - Load dataset and convert to QA pairs
log("Loading dataset and converting to QA pairs...")

# CHANGE THIS if you want a different dataset:7
DATASET_NAME = "altaidevorg/women-health-mini"
SPLIT = "train"

ds = load_dataset(DATASET_NAME, split=SPLIT)

qa_data = []
for row in ds:
    # many datasets store dialogues under different keys; handle robustly
    conv = None
    if isinstance(row, dict):
        for k in ["conversations", "conversation", "dialog", "dialogue", "messages"]:
            if k in row:
                conv = row[k]
                break
    if not conv:
        # skip rows that don't contain dialogue-like content
        continue

    # conv is often a list of dicts or list of strings
    user_msgs = []
    assistant_msgs = []

    if isinstance(conv, list) and len(conv) > 0:
        if isinstance(conv[0], dict):
            # common shapes:
            #  - {"from": "human", "value": "..."}
            #  - {"role": "user", "content": "..."}
            if "from" in conv[0] and "value" in conv[0]:
                user_msgs = [m["value"] for m in conv if m.get("from") in ("human", "user")]
                assistant_msgs = [m["value"] for m in conv if m.get("from") in ("assistant", "bot", "system")]
            elif "role" in conv[0] and "content" in conv[0]:
                user_msgs = [m["content"] for m in conv if m.get("role") in ("user", "human")]
                assistant_msgs = [m["content"] for m in conv if m.get("role") in ("assistant", "bot")]
            else:
                # fallback: find any textual fields
                texts = []
                for m in conv:
                    if isinstance(m, dict):
                        # pick first textual field
                        for field in ["value", "content", "text", "utterance"]:
                            if field in m and isinstance(m[field], str):
                                texts.append(m[field])
                                break
                    elif isinstance(m, str):
                        texts.append(m)
                if len(texts) >= 2:
                    user_msgs = [texts[0]]
                    assistant_msgs = texts[1:]
        else:
            # conv is a list of strings
            if len(conv) >= 2:
                user_msgs = [conv[0]]
                assistant_msgs = conv[1:]

    if user_msgs and assistant_msgs:
        question = " ".join(user_msgs).strip()
        answer = " ".join(assistant_msgs).strip()
        if question and answer:
            qa_data.append({"question": question, "answer": answer})

if len(qa_data) == 0:
    raise ValueError("No QA pairs extracted from dataset. Inspect dataset structure or change DATASET_NAME.")

train_data, test_data = train_test_split(qa_data, test_size=0.2, random_state=42)
log(f"Prepared QA dataset: {len(train_data)} train, {len(test_data)} test")
print("Sample QA pair (train):", train_data[0])


[16:54:58] Loading dataset and converting to QA pairs...


README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


women-health-mini.jsonl:   0%|          | 0.00/35.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10348 [00:00<?, ? examples/s]

[16:55:01] Prepared QA dataset: 8278 train, 2070 test
Sample QA pair (train): {'question': 'What are some natural ways to manage PMS symptoms?', 'answer': "Many women experience premenstrual syndrome (PMS), a collection of physical and emotional symptoms that occur in the days leading up to menstruation. While the severity varies greatly,  managing PMS naturally is possible for many, focusing on lifestyle changes and dietary adjustments.  Here's what I recommend, emphasizing evidence-based approaches:\n\n\n**Dietary Changes:**\n\n* **Reduce Refined Carbohydrates and Sugar:**  These can worsen mood swings and bloating.  Opt for complex carbohydrates like whole grains, fruits, and vegetables for sustained energy release and better blood sugar control.\n* **Increase Magnesium Intake:** Magnesium plays a crucial role in regulating mood and reducing muscle cramps.  Good sources include dark leafy greens, nuts, seeds, and legumes. Consider a magnesium supplement, but always consult with me f

In [4]:
# Cell 3/11 - Chunking helpers
def create_sentence_chunks(data, window_size=3, stride=1, min_chars=40):
    chunks = []
    for idx, item in enumerate(data):
        text = item.get("answer", "") or item.get("text", "")
        if not isinstance(text, str):
            continue
        sentences = sent_tokenize(text)
        if not sentences:
            if len(text) > min_chars:
                chunks.append({"chunk": text, "source_idx": idx})
            continue
        if len(sentences) <= window_size:
            chunks.append({"chunk": " ".join(sentences), "source_idx": idx})
            continue
        for i in range(0, max(1, len(sentences) - window_size + 1), stride):
            chunk = " ".join(sentences[i:i+window_size])
            chunks.append({"chunk": chunk, "source_idx": idx})
    return chunks

from transformers import AutoTokenizer
# token-chunking helper (optional)
def create_token_chunks(data, tokenizer, chunk_tokens=400, overlap=100, field="answer"):
    chunks = []
    for idx, item in enumerate(data):
        text = item.get(field, "") or item.get("text", "")
        if not isinstance(text, str) or len(text.strip())==0:
            continue
        tokens = tokenizer.encode(text, add_special_tokens=False)
        if not tokens:
            continue
        start = 0
        while start < len(tokens):
            end = min(start + chunk_tokens, len(tokens))
            piece = tokenizer.decode(tokens[start:end], skip_special_tokens=True)
            if piece and piece.strip():
                chunks.append({"chunk": piece, "source_idx": idx})
            if end == len(tokens):
                break
            start = max(0, end - overlap)
    return chunks

log("Chunking helpers ready.")


[16:55:01] Chunking helpers ready.


In [5]:
# Cell 4/11 - Load models (embedding + reranker + HyDE + generator)
log("Loading models (this can take time)...")

# Recommended biomedical sentence-transformer (semantic similarity = better completeness)
EMBED_MODEL = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"  # domain-specific SBERT-style model
# Fallback to fast model if the above fails
FALLBACK_EMBED = "sentence-transformers/all-MiniLM-L6-v2"

try:
    embedder = SentenceTransformer(EMBED_MODEL, device=device)
    log(f"Loaded biomedical embedder: {EMBED_MODEL}")
except Exception as e:
    log(f"Could not load {EMBED_MODEL}: {e}. Falling back to {FALLBACK_EMBED}")
    embedder = SentenceTransformer(FALLBACK_EMBED, device=device)

# Cross-encoder reranker
CE_MODEL = "cross-encoder/ms-marco-MiniLM-L6-v2"
reranker = CrossEncoder(CE_MODEL, device=device)
log(f"Cross-encoder loaded: {CE_MODEL}")

# HyDE model (use BioGPT-Large for domain-grounded HyDE)
HYDE_MODEL = "microsoft/BioGPT-Large"
try:
    hyde_tokenizer = AutoTokenizer.from_pretrained(HYDE_MODEL)
    if hyde_tokenizer.pad_token is None:
        hyde_tokenizer.pad_token = hyde_tokenizer.eos_token
    hyde_model = AutoModelForCausalLM.from_pretrained(HYDE_MODEL).to(device)
    log(f"HyDE model loaded: {HYDE_MODEL}")
except Exception as e:
    log(f"Failed to load HyDE {HYDE_MODEL}: {e}. Falling back to gpt2")
    HYDE_MODEL = "gpt2"
    hyde_tokenizer = AutoTokenizer.from_pretrained(HYDE_MODEL)
    if hyde_tokenizer.pad_token is None:
        hyde_tokenizer.pad_token = hyde_tokenizer.eos_token
    hyde_model = AutoModelForCausalLM.from_pretrained(HYDE_MODEL).to(device)
    log("HyDE fallback gpt2 loaded.")

# Generation model: prefer Mistral-7B-Instruct (heavy). Provide fallback to BioGPT-Large for smaller GPU.
GEN_MODEL_PREFERRED = "microsoft/BioGPT-Large"

try:
    gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_PREFERRED)
    if gen_tokenizer.pad_token is None:
        gen_tokenizer.pad_token = gen_tokenizer.eos_token
    gen_model = AutoModelForCausalLM.from_pretrained(GEN_MODEL_PREFERRED).to(device)
    log(f"Generator loaded: {GEN_MODEL_PREFERRED}")
except Exception as e:
    log(f"Could not load {GEN_MODEL_PREFERRED}: {e}. Falling back to {GEN_MODEL_FALLBACK}")
    GEN_MODEL_PREFERRED = GEN_MODEL_FALLBACK
    gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_PREFERRED)
    if gen_tokenizer.pad_token is None:
        gen_tokenizer.pad_token = gen_tokenizer.eos_token
    gen_model = AutoModelForCausalLM.from_pretrained(GEN_MODEL_PREFERRED).to(device)
    log(f"Generator fallback loaded: {GEN_MODEL_PREFERRED}")

# KeyBERT for keyword extraction (lightweight)
kw_model = KeyBERT(model=embedder)
STOPWORDS = set(stopwords.words('english'))

log("All models loaded (or fell back).")


[16:55:01] Loading models (this can take time)...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[16:55:09] Loaded biomedical embedder: pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

[16:55:11] Cross-encoder loaded: cross-encoder/ms-marco-MiniLM-L6-v2


tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.29G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.28G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

[16:57:04] HyDE model loaded: microsoft/BioGPT-Large
[16:57:13] Generator loaded: microsoft/BioGPT-Large
[16:57:13] All models loaded (or fell back).


In [6]:
# Cell 5/11 - Build chunks, compute embeddings, build FAISS + BM25; save index & id2doc
log("Creating sentence chunks from training data...")
sentence_chunks = create_sentence_chunks(train_data, window_size=3, stride=1)
id2doc = [c["chunk"] for c in sentence_chunks]
log(f"Created {len(id2doc)} chunks")

# Compute embeddings (may take time)
log("Computing embeddings for all chunks...")
chunk_embeddings = embedder.encode(id2doc, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True, batch_size=64).astype("float32")

# FAISS index
dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(chunk_embeddings)
log(f"FAISS index built: {index.ntotal} vectors, dim={dim}")

# Save index and id2doc to disk (so you don't need to reindex)
faiss.write_index(index, "faiss.index")
with open("id2doc.pkl", "wb") as f:
    pickle.dump(id2doc, f)
log("Saved FAISS index (faiss.index) and id2doc.pkl")

# BM25
bm25_corpus = [word_tokenize(txt.lower()) for txt in id2doc]
bm25 = BM25Okapi(bm25_corpus)
log("BM25 built.")


[16:57:13] Creating sentence chunks from training data...
[16:57:17] Created 222376 chunks
[16:57:17] Computing embeddings for all chunks...


Batches:   0%|          | 0/3475 [00:00<?, ?it/s]

[17:17:28] FAISS index built: 222376 vectors, dim=768
[17:17:28] Saved FAISS index (faiss.index) and id2doc.pkl
[17:18:42] BM25 built.


In [7]:
# Cell 6/11 - Retrieval: HyDE + FAISS + BM25 + CrossEncoder rerank

def get_hyde_answer(query, max_length=100):
    prompt = f"Question: {query}\nAnswer:"
    inputs = hyde_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        outputs = hyde_model.generate(
            **inputs,
            max_length=inputs['input_ids'].shape[1] + max_length,
            do_sample=False,
            repetition_penalty=1.2,
            pad_token_id=hyde_tokenizer.eos_token_id
        )
    text = hyde_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Clean a bit
    return re.sub(r"\s+", " ", text.split("Answer:")[-1].strip())

def retrieve_with_hyde(query, k=30, hyde_weight=0.5, faiss_alpha=0.6):
    # HyDE pseudo-answer for improved embedding
    hyde_txt = get_hyde_answer(query)
    q_emb = embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
    h_emb = embedder.encode([hyde_txt], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
    merged_emb = (1 - hyde_weight) * q_emb + hyde_weight * h_emb

    D, I = index.search(merged_emb, k)
    faiss_candidates = list(dict.fromkeys(I[0].tolist()))

    tokenized_query = word_tokenize(query.lower())
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top = np.argsort(bm25_scores)[::-1][:k].tolist()

    # Normalize FAISS scores & BM25
    faiss_norm = (D[0] - D[0].min()) / (D[0].ptp() + 1e-9)
    faiss_map = {idx: float(score) for idx, score in zip(I[0].tolist(), faiss_norm.tolist())}
    bm25_norm = (bm25_scores - bm25_scores.min()) / (bm25_scores.ptp() + 1e-9)

    merged = {}
    for idx in set(faiss_candidates + bm25_top):
        merged[idx] = faiss_alpha * faiss_map.get(idx, 0.0) + (1 - faiss_alpha) * float(bm25_norm[idx])
    merged_sorted = sorted(merged.items(), key=lambda x: x[1], reverse=True)[:k]
    candidate_indices = [idx for idx, _ in merged_sorted]
    return candidate_indices, hyde_txt

def rerank_chunks(query, candidate_indices, top_k=5, use_cross_encoder=True):
    texts = [id2doc[i] for i in candidate_indices]
    if use_cross_encoder and reranker:
        pairs = [(query, t) for t in texts]
        with torch.no_grad():
            scores = reranker.predict(pairs)
        order = np.argsort(scores)[::-1][:top_k]
        return [texts[i] for i in order]
    else:
        q_emb = embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True)
        t_emb = embedder.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
        sims = np.dot(t_emb, q_emb[0])
        order = np.argsort(sims)[::-1][:top_k]
        return [texts[i] for i in order]


In [8]:
# Cell 7/11 - Generation (Mistral or fallback) and extract_then_summarize

STRICT_FALLBACK = "Not enough information in the provided context."

def clean_output(text):
    # Remove meta tokens and artifacts and extra whitespace
    text = re.sub(r"(Answer:|Final answer:|<\s*/?\s\w+\s*>|</s>|▃)", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\s+", " ", text).strip()
    # remove leading/trailing punctuation
    return text.strip(" \n\r\t\"'")

def generate_answer(query, contexts, max_new_tokens=220):
    # contexts: list of strings (chunks)
    if isinstance(contexts, list):
        context_block = "\n\n---\n".join(contexts)
    else:
        context_block = str(contexts)

    # Clean prompt: do NOT include meta instructions in final answer area
    prompt = f"Context:\n{context_block}\n\nQuestion: {query}\n\nAnswer:"
    inputs = gen_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    with torch.no_grad():
        outputs = gen_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            repetition_penalty=1.12,
            pad_token_id=gen_tokenizer.eos_token_id
        )
    text = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = clean_output(text.split("Answer:")[-1])
    return cleaned or STRICT_FALLBACK

def extract_then_summarize(query, contexts, top_sents=8, max_new_tokens=220):
    hyde_txt = get_hyde_answer(query)
    sents = []
    for ctx in contexts:
        sents.extend([s for s in sent_tokenize(ctx) if len(s.strip())>5])
    if not sents:
        return STRICT_FALLBACK

    qh_emb = embedder.encode([query, hyde_txt], normalize_embeddings=True, convert_to_tensor=True)
    s_emb = embedder.encode(sents, normalize_embeddings=True, convert_to_tensor=True)

    sim_q = util.cos_sim(s_emb, qh_emb[0]).cpu().numpy().reshape(-1)
    sim_h = util.cos_sim(s_emb, qh_emb[1]).cpu().numpy().reshape(-1)
    sims = 0.7 * sim_q + 0.3 * sim_h
    order = np.argsort(sims)[::-1][:top_sents]
    extract_block = "\n".join([sents[i] for i in order])

    prompt = f"Extracted sentences:\n{extract_block}\n\nQuestion: {query}\n\nAnswer:"
    inputs = gen_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    with torch.no_grad():
        outputs = gen_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            repetition_penalty=1.12,
            pad_token_id=gen_tokenizer.eos_token_id
        )
    text = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = clean_output(text.split("Answer:")[-1])
    return cleaned or STRICT_FALLBACK


In [9]:
# Cell 8/11 - Evaluation metrics

def extract_keywords(text, top_n=8):
    if not text or len(text.split())<3:
        return []
    kws = kw_model.extract_keywords(text, keyphrase_ngram_range=(1,2), stop_words='english', top_n=top_n)
    return [kw for kw, _ in kws]

def retrieval_relevance_score(query, top_k=10):
    q_emb = embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
    D, I = index.search(q_emb, top_k)
    scores = np.clip(D[0], -1.0, 1.0)
    return float(((scores + 1.0) / 2.0).mean())

def answer_completeness_score(answer, retrieved_chunks, sim_threshold=0.62):
    # Semantic completeness: how many unique key-facts from retrieved chunks are semantically present in answer
    all_facts = []
    for chunk in retrieved_chunks:
        all_facts.extend(extract_keywords(chunk, top_n=4))
    unique_facts = list(dict.fromkeys([f.lower().strip() for f in all_facts if f]))
    if not unique_facts:
        return 0.0

    # embed each fact and the answer as sentences
    try:
        answer_emb = embedder.encode([answer], normalize_embeddings=True, convert_to_tensor=True)
        fact_embs = embedder.encode(unique_facts, normalize_embeddings=True, convert_to_tensor=True)
        sims = util.cos_sim(answer_emb, fact_embs).cpu().numpy()[0]
        covered = (sims >= sim_threshold).sum()
        return float(covered / max(1, len(unique_facts)))
    except Exception as e:
        log(f"Error in completeness scoring: {e}")
        return 0.0

def faithfulness_score(answer, context_chunks, sentence_threshold=0.5):
    if not answer or not context_chunks:
        return 0.0
    ans_sents = sent_tokenize(answer)
    ctx_sents = []
    for c in context_chunks:
        ctx_sents += sent_tokenize(c)
    if not ans_sents or not ctx_sents:
        return 0.0
    try:
        ans_emb = embedder.encode(ans_sents, normalize_embeddings=True, convert_to_tensor=True)
        ctx_emb = embedder.encode(ctx_sents, normalize_embeddings=True, convert_to_tensor=True)
        sim_mtx = util.cos_sim(ans_emb, ctx_emb).cpu().numpy()
        max_sims = np.max(sim_mtx, axis=1)
        faithful = (max_sims >= sentence_threshold).sum()
        return float(faithful / max(1, len(ans_sents)))
    except Exception as e:
        log(f"Error in faithfulness scoring: {e}")
        return 0.0

# NLG metrics wrapper
def compute_nlg_metrics(pred, ref):
    # expects strings
    try:
        # BLEU expects tokenized list of references and hypothesis tokens
        bleu_res = bleu_metric.compute(predictions=[pred.split()], references=[[ref.split()]])
        rouge_res = rouge_metric.compute(predictions=[pred], references=[ref])
        bert_res = bertscore_metric.compute(predictions=[pred], references=[ref], lang="en")
        return {"bleu": bleu_res.get("bleu", 0.0), "rouge": rouge_res, "bertscore": bert_res}
    except Exception as e:
        log(f"NLG metrics error: {e}")
        return {}


In [10]:
# Cell 9/11 - Full RAG pipeline orchestration

def rag_pipeline_with_metrics(query, retrieve_k=20, rerank_k=5, context_chunks=3, strategy="generate", log_diagnostics=False):
    log(f"Processing query: {query[:120]}...")
    candidate_indices, hyde_txt = retrieve_with_hyde(query, k=retrieve_k)
    reranked_texts = rerank_chunks(query, candidate_indices, top_k=rerank_k, use_cross_encoder=True)
    selected_contexts = reranked_texts[:context_chunks]
    if log_diagnostics:
        print("Selected contexts:")
        for i, c in enumerate(selected_contexts):
            print(i, c[:250].replace("\n", " "), "...")
    # Answer generation
    if strategy == "extract_then_summarize":
        answer = extract_then_summarize(query, selected_contexts, top_sents=8)
    else:
        answer = generate_answer(query, selected_contexts, max_new_tokens=220)

    # Compute metrics
    retrieval_score = retrieval_relevance_score(query, top_k=retrieve_k)
    completeness = answer_completeness_score(answer, selected_contexts)
    faith = faithfulness_score(answer, selected_contexts)
    composite = 0.4 * retrieval_score + 0.3 * completeness + 0.3 * faith

    return {
        "query": query,
        "answer": answer,
        "context": selected_contexts,
        "retrieval_score": retrieval_score,
        "completeness_score": completeness,
        "faithfulness_score": faith,
        "composite_score": composite,
        "hyde_answer": hyde_txt
    }

log("RAG pipeline ready.")


[17:18:42] RAG pipeline ready.


In [11]:
# Cell 10/11 - Quick test function
def test_pipeline(queries=None, strategy="generate", verbose=True):
    if queries is None:
        queries = ["Management of BI-RADS categories (0–6): what each means and recommended actions."]
        ["List key components of the first prenatal visit: labs, ultrasound timing, and risk assessment."]
        ["Define preeclampsia and severe features; outline diagnosis and acute management."]
        ["Screening and management approach for gestational diabetes: timing, tests, and treatment thresholds."]
        
    results = []
    for q in queries:
        try:
            r = rag_pipeline_with_metrics(q, retrieve_k=20, rerank_k=5, context_chunks=3, strategy=strategy, log_diagnostics=True)
            results.append(r)
            if verbose:
                print("\n" + "="*80)
                print("Q:", r["query"])
                print("\nHyDE (short):", r["hyde_answer"][:200])
                print("\nAnswer:\n", r["answer"])
                print("\nScores: Retrieval=%.3f  Completeness=%.3f  Faithfulness=%.3f  Composite=%.3f" % (
                    r["retrieval_score"], r["completeness_score"], r["faithfulness_score"], r["composite_score"]
                ))
        except Exception as e:
            print("Error:", e)
    return results

# run quick test
sample_results = test_pipeline()


[17:18:42] Processing query: Management of BI-RADS categories (0–6): what each means and recommended actions....


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected contexts:
0 The availability and usage of gender-responsive WASH (Water, Sanitation, and Hygiene) facilities are crucial for menstrual hygiene management (MHM). Let's break down what that means and the current realities. **What are Gender-Responsive WASH Facilit ...
1 This helps to identify any abnormalities or cancerous cells. **3. Management Strategies:** Once the cause is identified, we can discuss appropriate management options. ...
2 **While awaiting your answers, let's discuss immediate actions:**  * **Keep track of your bleeding. **  Use a menstrual diary to note the amount and duration of your bleeding. This will be invaluable for your assessment and future management. ...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Q: Management of BI-RADS categories (0–6): what each means and recommended actions.

HyDE (short): The management of the category 3 lesions is based on the results of the biopsy, while for the other categories it depends on the clinical context. < / FREETEXT > < / PARAGRAPH > ▃ < PARAGRAPH > < FREE

Answer:
 It is not possible to distinguish between normal and abnormal findings on ultrasound.

Scores: Retrieval=0.724  Completeness=0.000  Faithfulness=0.000  Composite=0.290


In [12]:
# Cell 11/11 - Evaluate on a small test sample and save outputs
def evaluate_on_testset(test_data, sample_size=20, strategy="generate"):
    log(f"Evaluating on {sample_size} test samples...")
    results = []
    for i, sample in enumerate(test_data[:sample_size]):
        try:
            r = rag_pipeline_with_metrics(sample['question'], strategy=strategy)
            r['ground_truth'] = sample['answer']
            # optional NLG metrics
            r['nlg'] = compute_nlg_metrics(r['answer'], sample['answer'])
            results.append(r)
            if (i+1) % 5 == 0:
                log(f"Processed {i+1}/{sample_size}")
        except Exception as e:
            log(f"Error on sample {i}: {e}")
            continue
    # print summary
    if results:
        print("\n" + "="*80)
        print("EVALUATION RESULTS:")
        print("Average Retrieval Score: %.3f" % np.mean([x['retrieval_score'] for x in results]))
        print("Average Completeness Score: %.3f" % np.mean([x['completeness_score'] for x in results]))
        print("Average Faithfulness Score: %.3f" % np.mean([x['faithfulness_score'] for x in results]))
        print("Average Composite Score: %.3f" % np.mean([x['composite_score'] for x in results]))
        # optionally save to disk
        with open("eval_results.pkl", "wb") as f:
            pickle.dump(results, f)
        log("Saved eval_results.pkl")
    return results

# run evaluation on a small sample
eval_results = evaluate_on_testset(test_data, sample_size=10, strategy="generate")


[17:18:48] Evaluating on 10 test samples...
[17:18:48] Processing query: What are the specific instructions for washing and storing cloth menstrual pads?...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:18:52] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['Do', 'you', 'wash', ..., 'used', 'cloth', 'pads?'],
Input references: [['Washing', 'and', 'storing', 'cloth', 'menstrual', 'pads', 'correctly', 'is', 'crucial', 'for', 'hygiene', 'and', 'preventing', 'infections.', "Here's", 'a', 'detailed', 'guide', 'based', 'on', 'best', 'practices', 'and', 'hygiene', 'standards:', '**1.', 'Pre-Wash', 'Treatment:**', '*', '**Immediately', 'after', 'use:**', 'Rinse', 'your', 'used', 'pad', 'thoroughly', 'under', 'cold', 'running', 'water.', 'This', 'removes', 'the', 'bulk', 'of', 'menstrual', 'blood', 'and', 'prevents', 'staining.', 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:18:56] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ["I'm", '6', '0', ..., 'and', 'night', 'sweats.'],
Input references: [['The', 'age-related', 'changes', 'in', 'your', 'menstrual', 'cycle', 'are', 'significant', 'and', 'occur', 'in', 'predictable', 'stages,', 'primarily', 'influenced', 'by', 'hormonal', 'fluctuations.', "Let's", 'break', 'down', 'the', 'key', 'phases:', '**Puberty', '(Menarche):**', 'The', 'onset', 'of', 'menstruation,', 'typically', 'between', '11', 'and', '15', 'years', 'old,', 'marks', 'the', 'beginning.', 'Initial', 'cycles', 'are', 'often', 'irregular,', 'due', 'to', 'the', 'developing', 'hypothal

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:19:01] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['Pelvic', 'inflammatory', 'disease', ..., 'and', 'Chlamydia', 'trachomatis.'],
Input references: [['Pelvic', 'Inflammatory', 'Disease', '(PID)', 'is', 'a', 'serious', 'infection', 'of', 'the', 'female', 'reproductive', 'organs.', "It's", 'crucial', 'to', 'understand', 'that', 'PID', "isn't", 'a', 'single', 'disease', 'but', 'rather', 'a', 'spectrum', 'of', 'infections', 'that', 'can', 'involve', 'the', 'uterus,', 'fallopian', 'tubes,', 'and', 'ovaries.', 'The', 'symptoms', 'can', 'vary', 'greatly,', 'and', 'unfortunately,', 'many', 'women', 'experience', 'no', 'symptom

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:19:13] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['Cervical', 'cancer', 'screening', ..., 'hypopharynx,', 'and', 'nasopharynx.'],
Input references: [["It's", 'commendable', 'that', "you're", 'proactive', 'about', 'your', 'gynecological', 'health.', "Let's", 'discuss', 'your', 'concerns', 'about', 'gynecological', 'cancers', 'and', 'how', 'to', 'mitigate', 'your', 'risk.', 'Several', 'factors', 'influence', 'the', 'risk', 'of', 'developing', 'these', 'cancers,', 'and', 'thankfully,', 'many', 'are', 'modifiable.', 'The', 'most', 'common', 'gynecological', 'cancers', 'include:', '*', '**Cervical', 'cancer:**', 'Human', '

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:19:23] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['No"', '1', '4.', ..., 'talk', 'to', 'someone'],
Input references: [['Having', 'sex', 'during', 'your', 'period', 'is', 'perfectly', 'safe', 'and', 'can', 'be', 'enjoyable', 'for', 'many', 'couples,', 'but', 'it', 'does', 'require', 'some', 'adjustments', 'and', 'considerations.', "Here's", 'what', 'you', 'should', 'know:', '**Preparation', 'and', 'Comfort:**', '*', '**Hygiene:**', 'This', 'is', 'paramount.', 'Consider', 'showering', 'or', 'washing', 'beforehand', 'to', 'feel', 'fresh', 'and', 'clean.', 'Having', 'some', 'extra', 'towels', 'handy', 'is', 'also', 'a', '

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:19:26] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['The', 'effects', 'of', ..., 'fertility', 'are', 'temporary.'],
Input references: [['Depo-Provera,', 'or', 'medroxyprogesterone', 'acetate,', 'is', 'a', 'long-acting,', 'injectable', 'contraceptive.', 'Its', 'impact', 'on', 'your', 'menstrual', 'cycle', 'and', 'fertility', 'is', 'significant', 'and', 'predictable,', 'though', 'the', 'experience', 'can', 'vary', 'slightly', 'from', 'woman', 'to', 'woman.', '**Effect', 'on', 'Menstrual', 'Cycle:**', 'Depo-Provera', 'works', 'primarily', 'by', 'suppressing', 'ovulation.', 'This', 'means', 'your', 'ovaries', "don't", 'rele

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:19:31] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['Do', 'you', 'think', 'exercise', 'will', 'help?'],
Input references: [["It's", 'completely', 'understandable', 'to', 'feel', 'concerned', 'about', 'erratic', 'and', 'heavier', 'periods', 'during', 'perimenopause.', 'This', 'is', 'a', 'common', 'experience', 'as', 'your', 'body', 'transitions', 'towards', 'menopause.', "Let's", 'discuss', 'some', 'ways', 'to', 'manage', 'these', 'symptoms.', 'The', 'hormonal', 'fluctuations', 'characteristic', 'of', 'perimenopause', '–', 'the', 'gradual', 'decline', 'in', 'estrogen', 'and', 'progesterone', '–', 'are', 'the', 'primary',

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:19:36] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ["It's", 'a', 'very', ..., 'ends', 'with', 'fertilization.'],
Input references: [['The', 'fertile', 'window', 'is', 'the', 'period', 'in', 'a', "woman's", 'menstrual', 'cycle', 'when', 'she', 'is', 'most', 'likely', 'to', 'conceive.', "It's", 'not', 'a', 'precisely', 'defined', 'timeframe', 'like,', 'say,', 'a', 'calendar', 'date,', 'because', 'individual', 'cycles', 'vary.', 'However,', 'we', 'can', 'pinpoint', 'it', 'with', 'reasonable', 'accuracy', 'based', 'on', 'understanding', 'ovulation.', '**Defining', 'the', 'Fertile', 'Window:**', 'The', 'fertile', 'window', '

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:19:39] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['There', 'are', 'many', ..., 'with', 'hormone', 'therapy.'],
Input references: [['Finding', 'a', 'supportive', 'community', 'can', 'be', 'incredibly', 'beneficial', 'when', 'navigating', 'hormonal', 'issues.', 'There', 'are', 'several', 'avenues', 'you', 'can', 'explore,', 'depending', 'on', 'your', 'preferences', 'and', 'the', 'specifics', 'of', 'your', 'concerns.', '**Online', 'Communities:**', 'The', 'internet', 'offers', 'a', 'wealth', 'of', 'support,', 'offering', 'anonymity', 'and', 'accessibility.', 'Look', 'for', 'online', 'forums', 'or', 'groups', 'dedicated',

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[17:19:50] NLG metrics error: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['No."', '-*', '*', ..., '0.', 'Do', 'you'],
Input references: [["It's", 'completely', 'understandable', 'to', 'feel', 'shame', 'or', 'embarrassment', 'around', 'menstruation.', 'For', 'centuries,', 'societal', 'norms', 'have', 'often', 'shrouded', 'periods', 'in', 'secrecy', 'and', 'negativity,', 'leading', 'many', 'women', 'to', 'internalize', 'these', 'feelings.', 'But', 'the', 'truth', 'is,', 'menstruation', 'is', 'a', 'natural,', 'healthy', 'bodily', 'function,', 'a', 'sign', 'of', 'reproductive', 'health,', 'and', 'nothing', 'to', 'be', 'ashamed', 'of.', "Let's", 