### Install and import dependencies

In [None]:
!pip install langchain faiss-cpu sentence-transformers pymupdf



In [2]:
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os, re


### Section extraction and normalization from PDFs

This cell defines regular expressions to detect common section titles in scientific papers (e.g., *Introduction*, *Methods*, *Results*, etc.), even when preceded by numbering (e.g., "1. Introduction").  
The `normalize_section_title` function is used to **standardize these titles**, mapping variations to canonical categories (e.g., "1. Introduction", "Background" → "introduction").  
This helps to semantically organize and index the content more effectively in the downstream RAG pipeline.


In [None]:
pdf_dir = "../data/Publications"

section_patterns = [
    r"(?:^\d+[\.\)]?\s*)?abstract",
    r"(?:^\d+[\.\)]?\s*)?introduction",
    r"(?:^\d+[\.\)]?\s*)?background",
    r"(?:^\d+[\.\)]?\s*)?materials and methods",
    r"(?:^\d+[\.\)]?\s*)?methods?",
    r"(?:^\d+[\.\)]?\s*)?results",
    r"(?:^\d+[\.\)]?\s*)?results and discussion",
    r"(?:^\d+[\.\)]?\s*)?discussion",
    r"(?:^\d+[\.\)]?\s*)?conclusions?",
    r"(?:^\d+[\.\)]?\s*)?references"
]
section_regex = re.compile("|".join(section_patterns), re.IGNORECASE)

def normalize_section_title(raw_title: str) -> str:
    text = raw_title.lower().strip()

    # Rimuove numbers/dots (example. "1. ", "2)", "3.1 ")
    text = re.sub(r"^\d+(\.\d+)?[\.\)]?\s*", "", text)

    # Explicit mapping
    if "abstract" in text:
        return "abstract"
    elif "introduction" in text:
        return "introduction"
    elif "background" in text:
        return "introduction"
    elif "material" in text or "method" in text:
        return "materials and methods"
    elif "result" in text or "discussion" in text:
        return "results and discussion"
    elif "conclusion" in text:
        return "conclusion"
    elif "reference" in text:
        return "references"
    else:
        return "other"



### Section-wise parsing of scientific PDFs

Loops through all PDF files in the specified folder and uses `PyMuPDFLoader` to extract text page by page.  
It then scans each line for section headers (like "1. Introduction", "Results and Discussion", etc.) using the previously defined regular expressions.

When a section header is detected:
- The content collected so far is saved as a `Document` object with metadata (`source` and `section`).
- The parser starts buffering the new section's content.

At the end of each file, the final section is saved as well.  
The result is a list of semantically segmented sections from your papers — stored in `sectioned_docs` — ready for chunking and indexing.


In [4]:
sectioned_docs = []

for filename in os.listdir(pdf_dir):
    if not filename.endswith(".pdf"):
        continue

    print("File:",filename)
    loader = PyMuPDFLoader(os.path.join(pdf_dir, filename))
    pages = loader.load()
    #break

    current_section = None
    buffer = ""

    for page in pages:
        text = page.page_content.strip()
        lines = text.split("\n")

        for line in lines:
            clean = line.strip().lower()
            section_title = normalize_section_title(clean)

            # Check section title
            if re.fullmatch(section_regex, clean):
                print("Section found:", clean, "Section mapping:",section_title)
                # If there is an open section, save it
                if current_section:
                    sectioned_docs.append(Document(
                        page_content=buffer.strip(),
                        metadata={"source": filename, "section": section_title}
                    ))
                # Start new section
                current_section = section_title
                buffer = ""
            else:
                buffer += line + "\n"

    # Last section at the end of the file
    if current_section and buffer:
        sectioned_docs.append(Document(
            page_content=buffer.strip(),
            metadata={"source": filename, "section": current_section}
        ))

print(f"Extracted section: {len(sectioned_docs)}")


File: 8.Resveratrol Analogues as Dual Inhibitors of Monoamine Oxidase B and Carbonic Anhydrase VII A New Multi-Target Combination for Neurodegenerative Diseases.pdf
Section found: 1. introduction Section mapping: introduction
Section found: 2. results and discussion Section mapping: results and discussion
Section found: 3. materials and methods Section mapping: materials and methods
Section found: 4. conclusions Section mapping: conclusion
Section found: references Section mapping: references
File: 1.Development of a cheminformatics platform for selectivity analyses of carbonic anhydrase inhibitors.pdf
Section found: abstract Section mapping: abstract
Section found: introduction Section mapping: introduction
Section found: materials and methods Section mapping: materials and methods
Section found: results and discussion Section mapping: results and discussion
Section found: conclusions Section mapping: conclusion
Section found: references Section mapping: references
File: 7.Machine Lea

### Chunking, embedding, and FAISS index creation

📦 Final output: a serialized FAISS vector index saved locally ready to support RAG-based querying.


**Text Chunking**  
   It uses `RecursiveCharacterTextSplitter` to break down each section (from `documents`) into manageable chunks of 1000 characters, with 200-character overlap to preserve context across boundaries.

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
)
docs = text_splitter.split_documents(sectioned_docs)
print(f"📄 Documents splitted in {len(docs)} chunk")

📄 Documents splitted in 465 chunk


In [7]:
#Verifies if a GPU is available using PyTorch, which helps speed up embedding when supported.
import torch
print("Is CUDA available?", torch.cuda.is_available())


Is CUDA available? True


**Embedding Model Selection**  
   Three embedding models are suggested:
   - `MiniLM-L6-v2`: fast and lightweight, great for testing
   - `BAAI/bge-small-en-v1.5`: optimized for semantic search and QA tasks (recommended for most use cases)
   - `intfloat/e5-large-v2`: larger and more powerful, ideal for RAG production

   In this setup, the **BGE-small model** is used, with normalized embeddings and loaded on GPU (`device="cuda"`).

**FAISS Index Construction**  
   The chunks are embedded and indexed using FAISS — a fast similarity search library.  
   The resulting `db` object allows for efficient semantic retrieval later on.

In [None]:
VECTOR_PATH = "../vectorstore"

#For fast tests
#embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

#For Q/A
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5", encode_kwargs={"normalize_embeddings": True}, model_kwargs={"device": "cuda"})

#For RAG production
#embedding = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2", encode_kwargs={"normalize_embeddings": True})

#FAISS
db = FAISS.from_documents(docs, embedding)

#Save the FAISS index lovally for later use
db.save_local(VECTOR_PATH)
print(f"🔍 FAISS index saved in '{VECTOR_PATH}'")

🔍 FAISS index saved in '/content/faiss_index'
