In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import requests
import pymupdf
import io 
import re
import nltk
nltk.download('punkt_tab', download_dir='./punkt')  # Ensure that NLTK's sentence tokenizer is available
from nltk.tokenize import sent_tokenize
nltk.data.path.append('./punkt')
import psycopg2
import hashlib
import os
import faiss
import numpy as np
from scipy.spatial.distance import cosine
from dotenv import load_dotenv
from openai import OpenAI
from collections import defaultdict

[nltk_data] Downloading package punkt_tab to ./punkt...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Config Start ---------------------------------------------------------------------------------------------------------------------------

url = "https://www.who.int/europe/publications/i"

load = "yes" # change to no, if you don't want to load data 
pages = 1
load_dotenv()

# api_key = os.getenv('OPENAI_API_KEY')
# client = OpenAI(api_key=HF_TOKEN)

# Use HF_TOKEN instead of OpenAI key
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN not set in environment. Add HF_TOKEN=your_token to your .env")

# Choose a sentence-embedding model available on HF. Example: all-mpnet-base-v2
HF_EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "sentence-transformers/all-mpnet-base-v2")
HF_API_BASE = "https://api-inference.huggingface.co/pipeline/feature-extraction"
HF_API_URL = f"{HF_API_BASE}/{HF_EMBEDDING_MODEL}"

# Session for requests (re-uses connections, sets headers)
requests_session = requests.Session()
requests_session.headers.update({
    "Authorization": f"Bearer {HF_TOKEN}",
    "User-Agent": "who-rag-bot/0.1 (+contact@example.com)"
})

# Config End ----------------------------------------------------------------------------------------------------------------------------

In [None]:
# Ensure punkt tokenizer is present (download once)
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

In [3]:
def make_driver():
    opts = Options()
    opts.add_argument("--headless")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    
    # ChromeDriverManager automatically downloads/sets up ChromeDriver
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=opts)
    return driver


# def make_driver():
#     opts = Options()
#     opts.add_argument("--headless=new")
#     opts.add_argument("--no-sandbox")
#     opts.add_argument("--disable-dev-shm-usage")
#     # add any other opts you need (download dir if you plan to actually download files etc.)
#     driver = webdriver.Chrome(ChromeDriverManager().install(), options=opts)
#     return driver


def fetch_links(start: int, end: int, url: str, pause: int) -> set:
    """
    Fetch pdf links from paginated WHO listing page (simple approach).
    """
    driver = make_driver()
    driver.get(url)
    download_links = set()

    for page_number in range(start, end + 1):
        try:
            input_field = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'input.k-textbox'))
            )

            input_field.clear()
            input_field.send_keys(str(page_number))

            next_page_button = driver.find_element(By.CSS_SELECTOR, 'a[aria-label="Go to the next page"]')
            next_page_button.click()

            time.sleep(pause)

            links = driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")
            for link in links:
                href = link.get_attribute("href")
                if href:
                    download_links.add(href)
            print(f"Loaded page {page_number}... {len(download_links)} in set.")

        except Exception as e:
            print(f"An error occurred on page {page_number}: {e}")
            break

    driver.quit()
    return download_links


def fetch_pdf(link: str) -> str:
    """
    Download PDF via requests and extract text using PyMuPDF (fitz).
    """
    resp = requests_session.get(link, timeout=60)
    resp.raise_for_status()
    filestream = io.BytesIO(resp.content)
    with fitz.open(stream=filestream, filetype="pdf") as doc:
        text = []
        for page in doc:
            text.append(page.get_text("text"))
    return "\n".join(text)


def split_large_paragraph(paragraph: str, max_words: int, overlap: int) -> list:
    """
    Break a large paragraph into word-windowed chunks using sentence tokenization for better boundaries.
    Returns list[str].
    """
    sentences = sent_tokenize(paragraph)
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        words_in_sentence = sentence.split()
        sentence_len = len(words_in_sentence)

        if current_word_count + sentence_len > max_words and current_chunk:
            chunks.append(" ".join(current_chunk))
            # keep overlap words (from end)
            if overlap > 0:
                tail = " ".join(current_chunk).split()[-overlap:]
                current_chunk = tail.copy()
                current_word_count = len(current_chunk)
            else:
                current_chunk = []
                current_word_count = 0

        current_chunk.extend(words_in_sentence)
        current_word_count += sentence_len

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def split_into_chunks(text: str, max_words=512, overlap=50) -> list:
    """
    Heuristic splitter that recognizes headings/bullets and then uses sentence/window chunking.
    Returns list of dicts: {section, type, content}
    """
    heading_pattern = r'^(?P<heading>[A-Z].+):$'
    bullet_point_pattern = r'^\s*[-•]\s*(?P<bullet>.+)'

    lines = text.split('\n')

    chunks = []
    current_chunk = []
    current_section = None

    for line in lines:
        line = line.strip()
        if not line:
            continue

        heading_match = re.match(heading_pattern, line)
        if heading_match:
            if current_chunk:
                paragraph = ' '.join(current_chunk)
                for sub_chunk in split_large_paragraph(paragraph, max_words, overlap):
                    chunks.append({'section': current_section, 'type': 'paragraph', 'content': sub_chunk})
                current_chunk = []

            current_section = heading_match.group('heading')
            chunks.append({'section': current_section, 'type': 'heading', 'content': line})
            continue

        bullet_match = re.match(bullet_point_pattern, line)
        if bullet_match:
            if current_chunk:
                paragraph = ' '.join(current_chunk)
                for sub_chunk in split_large_paragraph(paragraph, max_words, overlap):
                    chunks.append({'section': current_section, 'type': 'paragraph', 'content': sub_chunk})
                current_chunk = []

            chunks.append({'section': current_section, 'type': 'bullet', 'content': bullet_match.group('bullet')})
            continue

        current_chunk.append(line)

    if current_chunk:
        paragraph = ' '.join(current_chunk)
        for sub_chunk in split_large_paragraph(paragraph, max_words, overlap):
            chunks.append({'section': current_section, 'type': 'paragraph', 'content': sub_chunk})

    return chunks


def create_vector(texts: list, batch_size: int = 8) -> list:
    """
    Use Hugging Face Inference API (feature-extraction pipeline) to obtain embeddings.
    - texts: list[str]
    - returns: list[list[float]] (one embedding per input)
    NOTE: model dimension depends on HF_EMBEDDING_MODEL (update DB/vector store accordingly)
    """
    embeddings = []
    if not texts:
        return []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # HF pipeline accepts either a single input or list of inputs
        resp = requests_session.post(HF_API_URL, json={"inputs": batch}, timeout=120)
        resp.raise_for_status()
        data = resp.json()
        # data should be a list of embedding lists (one per input)
        # Some HF endpoints might return a single embedding if input is single; normalize:
        if isinstance(data, list) and isinstance(data[0], list):
            embeddings.extend(data)
        else:
            # fallback: treat as one embedding per input
            embeddings.append(data)

    return embeddings


def hash_text(text: str) -> str:
    """
    SHA256 hex digest for a single string.
    """
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def hash_texts(texts: list) -> list:
    return [hash_text(t) for t in texts]

In [None]:
# # Example usage (pseudo):
# if __name__ == "__main__":
#     links = fetch_links(1, PAGES, URL, pause=2)
#     for link in links:
#         text = fetch_pdf(link)
#         chunks = split_into_chunks(text, max_words=512, overlap=50)
#         texts = [c["content"] for c in chunks if c["type"] != "heading"]
#         emb = create_vector(texts)
#         hashes = hash_texts(texts)
#         print("Got", len(emb), "embeddings and", len(hashes), "hashes")

In [4]:
from huggingface_hub import InferenceClient

hf_client = InferenceClient(token=HF_TOKEN)

In [None]:
# def create_vector(chunks: list) -> list:
#     """
#     Create vectors from text using Hugging Face embeddings.
#     """
#     # Example model: sentence-transformers/all-MiniLM-L6-v2
#     # This produces 384-dimensional embeddings (so adjust DB schema!)
#     response = hf_client.feature_extraction(
#         model="sentence-transformers/all-MiniLM-L6-v2",
#         inputs=chunks
#     )
    
#     # Response will be a list of embeddings for each input
#     if isinstance(response[0], list):  
#         return response[0]  # return the first vector if single text
#     return response

In [6]:
def ingest_to_database(chunks: list, group_id: int) -> None:
    """
    Connect to Postgres, ensure table exists,
    batch-embed chunks and ingest them.
    """
    conn = psycopg2.connect(
        host="postgres",
        port=5432,
        database="llm",
        user="admin",
        password="admin"
    )
    cur = conn.cursor()

    # Adjust VECTOR dimension to match your HF model (MiniLM = 384)
    cur.execute('''
    CREATE TABLE IF NOT EXISTS text_chunks (
        id SERIAL PRIMARY KEY,
        section_content TEXT,
        type TEXT,
        vector VECTOR(384),  -- match embedding dimension
        hash TEXT UNIQUE,
        group_id INT,
        timestamp TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
    );
    ''')
    conn.commit()

    # Prepare texts for embedding
    texts = [(chunk['section'] or "") + " " + chunk['content'] for chunk in chunks]
    vectors = create_vectors(texts)

    # Insert all chunks
    for i, chunk in enumerate(chunks):
        section_content = texts[i]
        chunk_type = chunk['type']
        vector = vectors[i]
        hash_value = hash_text(section_content)

        cur.execute('''
            INSERT INTO text_chunks (section_content, type, vector, hash, group_id)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hash) DO NOTHING;
        ''', (section_content, chunk_type, vector, hash_value, group_id))

    conn.commit()
    cur.close()
    conn.close()



In [7]:
if load == "yes":
    links = fetch_links(1,pages,url,5)
    number = 0
    for link in links:
        fetch_file = fetch_pdf(link)
        number += 1
        result = split_into_chunks(fetch_file)
        ingest_to_database(result, number)
        print("Data ingested successfully! Loaded link number " + str(number))


NameError: name 'ChromeDriverManager' is not defined

In [None]:
def generate_query_embedding(query: str) -> list:
    """
    Generate embedding for the user's query (HF version).
    """
    response = hf_client.feature_extraction(
        model="sentence-transformers/all-MiniLM-L6-v2",
        inputs=[query]
    )
    return response[0]  # single vector


def search_similar_chunks_pg(query_vector, top_n=5):
    """
    Search using pgvector cosine similarity directly in SQL.
    """
    conn = psycopg2.connect(
        host="postgres",
        port=5432,
        database="llm",
        user="admin",
        password="admin"
    )
    cur = conn.cursor()

    # Perform similarity search directly in SQL
    cur.execute(
        """
        SELECT id, section_content, group_id, 1 - (vector <=> %s::vector) AS similarity
        FROM text_chunks
        ORDER BY vector <=> %s::vector
        LIMIT %s;
        """,
        (query_vector, query_vector, top_n)
    )

    rows = cur.fetchall()
    cur.close()
    conn.close()

    return rows  # (id, content, group_id, similarity)


def search_similar_chunks_faiss(query_vector, top_n=5):
    """
    Alternative FAISS-based retrieval (if you want to load all vectors).
    Not needed if pgvector is available.
    """
    conn = psycopg2.connect(
        host="postgres",
        port=5432,
        database="llm",
        user="admin",
        password="admin"
    )
    cur = conn.cursor()
    cur.execute('SELECT id, section_content, vector, group_id FROM text_chunks')
    rows = cur.fetchall()
    cur.close()
    conn.close()

    # Build FAISS index
    import faiss
    d = len(query_vector)
    index = faiss.IndexFlatL2(d)

    vectors, ids, contents, groups = [], [], [], []
    for row in rows:
        chunk_id, section_content, vector, group_id = row
        chunk_vector = np.array(vector, dtype=np.float32)
        vectors.append(chunk_vector)
        ids.append(chunk_id)
        contents.append(section_content)
        groups.append(group_id)

    vectors_np = np.vstack(vectors).astype(np.float32)
    index.add(vectors_np)

    query_np = np.array(query_vector, dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_np, top_n)

    return [(ids[i], contents[i], float(distances[0][n]), groups[i]) for n, i in enumerate(indices[0])]


In [None]:
def search_similar_chunks_faiss_dot(query_vector, top_n=5):
    """Cosine similarity search using FAISS (via normalized dot-product)."""
    conn = psycopg2.connect(
        host="postgres",
        port=5432,
        database="llm",
        user="admin",
        password="admin"
    )
    cur = conn.cursor()
    cur.execute('SELECT id, section_content, vector, group_id FROM text_chunks')
    rows = cur.fetchall()

    vectors = []
    ids = []
    contents = []
    groups = []

    for row in rows:
        chunk_id, section_content, vector_str, group_id = row
        chunk_vector = parse_pgvector(vector_str)
        chunk_vector = normalize_vector(chunk_vector)
        vectors.append(chunk_vector)
        ids.append(chunk_id)
        contents.append(section_content)
        groups.append(group_id)

    # Build FAISS index
    d = len(query_vector)
    index = faiss.IndexFlatIP(d)  # Inner product index
    vectors_np = np.vstack(vectors).astype(np.float32)
    index.add(vectors_np)

    # Normalize query vector
    query_vector_np = normalize_vector(np.array(query_vector, dtype=np.float32)).reshape(1, -1)

    # Search
    similarities, indices = index.search(query_vector_np, top_n)

    faiss_dot_results = [
        (ids[i], contents[i], float(similarities[0][n]), groups[i])
        for n, i in enumerate(indices[0])
    ]

    cur.close()
    conn.close()

    return faiss_dot_results

In [None]:
def search_similar_chunks_canberra(query_vector, top_n=5):
    conn = psycopg2.connect(
        host="postgres",
        port=5432,
        database="llm",
        user="admin",
        password="admin"
    )
    cur = conn.cursor()
    cur.execute('SELECT id, section_content, vector, group_id FROM text_chunks')
    rows = cur.fetchall()

    similarities = []
    for row in rows:
        chunk_id, section_content, vector_str, group_id = row
        chunk_vector = parse_pgvector(vector_str)  # use the robust parser
        distance = canberra_distance(query_vector, chunk_vector)
        similarities.append((chunk_id, section_content, distance, group_id))

    similarities = sorted(similarities, key=lambda x: x[2])[:top_n]

    cur.close()
    conn.close()
    return similarities


In [None]:
def display_results(results, method_name):
    print(f"\nTop Results ({method_name}):")
    for i, (chunk_id, section_content, similarity, group_id) in enumerate(results, 1):
        snippet = section_content[:300] + ("..." if len(section_content) > 300 else "")
        print(f"{i}. ID: {chunk_id} | Similarity: {similarity:.4f} | Group_id: {group_id}\nText: {snippet}\n")


In [None]:
def compare_top_ids_simple(pg_results, faiss_l2_results, faiss_dot_results, canberra_distance_results, top_n=3):
    id_scores = defaultdict(int)
    id_group_mapping = {}

    def assign_scores(results):
        for rank, result in enumerate(results[:top_n], start=1):
            chunk_id = result[0]
            group_id = result[3]
            score = top_n - rank + 1
            id_scores[chunk_id] += score
            id_group_mapping[chunk_id] = group_id

    for results in [pg_results, faiss_l2_results, faiss_dot_results, canberra_distance_results]:
        assign_scores(results)

    sorted_ids = sorted(id_scores.items(), key=lambda x: -x[1])
    final_output = [(chunk_id, id_group_mapping[chunk_id], score) for chunk_id, score in sorted_ids[:top_n]]
    return final_output


In [None]:
# def get_chunk_by_id_from_db(chunk_id):
#     """Fetch the chunk text directly from the database using the best chunk ID."""
#     # Connect to PostgreSQL
#     conn = psycopg2.connect(
#         host="postgres",
#         port=5432,
#         database="llm",
#         user="admin",
#         password="admin"
#     )
#     cur = conn.cursor()

#     # Query the database for the chunk by its ID
#     cur.execute("SELECT section_content FROM text_chunks WHERE id = %s", (chunk_id,))
#     result = cur.fetchone()

#     cur.close()
#     conn.close()

#     if result:
#         return result[0]  # Return the text content of the chunk
#     else:
#         return None  # In case the chunk ID is not found


In [None]:
from typing import Optional

def get_chunk_by_id_from_db(chunk_id: int) -> Optional[str]:
    """
    Fetch the chunk text directly from the database using the given chunk ID.
    
    Returns:
        The chunk's text content, or None if not found.
    """
    try:
        with psycopg2.connect(
            host="postgres",
            port=5432,
            database="llm",
            user="admin",
            password="admin"
        ) as conn:
            with conn.cursor() as cur:
                cur.execute(
                    sql.SQL("SELECT section_content FROM text_chunks WHERE id = %s"),
                    (chunk_id,)
                )
                result = cur.fetchone()

        if result:
            return result[0]
        return None

    except psycopg2.Error as e:
        print(f"Database error while fetching chunk {chunk_id}: {e}")
        return None