<a href="https://colab.research.google.com/github/ramyahramzy/Colab/blob/main/BookEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0) Install

In [1]:
pip install pypdf ebooklib sentence-transformers faiss-cpu unidecode

Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting ebooklib
  Downloading ebooklib-0.19-py3-none-any.whl.metadata (4.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->se

In [2]:
import os, re, math, json
from typing import List, Dict
from pypdf import PdfReader
from ebooklib import epub
from bs4 import BeautifulSoup
from unidecode import unidecode

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

############################################
# 1) EXTRACT TEXT
############################################

In [3]:
def extract_pdf_text(path: str) -> List[Dict]:
    """Read a PDF and return its text, one entry per page.

    Args:
        path: Location of the PDF file, handed straight to ``PdfReader``.

    Returns:
        A list of ``{"page": n, "text": str}`` dicts in document order, with
        ``n`` counting from 1. When ``extract_text()`` returns a falsy value
        for a page, that page is still listed, with ``""`` as its text.
    """
    pdf = PdfReader(path)
    return [
        {"page": number, "text": sheet.extract_text() or ""}
        for number, sheet in enumerate(pdf.pages, start=1)
    ]

def extract_epub_text(path: str) -> List[Dict]:
    """Read an EPUB and return its text as a list of pseudo-page entries.

    Every book item whose type is ``ebooklib.ITEM_DOCUMENT`` is parsed with
    BeautifulSoup, flattened to plain text, and split on runs of two or more
    newlines. Each non-empty segment becomes one entry; entries are numbered
    consecutively from 1 across the whole book (EPUBs have no real pages).

    Args:
        path: Location of the EPUB file, handed straight to ``epub.read_epub``.

    Returns:
        A list of ``{"page": n, "text": str}`` dicts in reading order.
    """
    # Local import: the file only imports the ``epub`` submodule, while the
    # item-type constants live on the top-level package.
    import ebooklib

    book = epub.read_epub(path)
    pages = []
    page_no = 1
    for item in book.get_items():
        # Named constant instead of the bare literal 9.
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        soup = BeautifulSoup(item.get_body_content(), "html.parser")
        txt = soup.get_text(" ", strip=True)
        # Split on large gaps to simulate pages/sections.
        # NOTE(review): text nodes are stripped and joined with a single
        # space above, so blank-line gaps are probably rare and most items
        # likely end up as one segment -- confirm this is the intent.
        for seg in re.split(r"\n{2,}", txt):
            seg = seg.strip()
            if seg:
                pages.append({"page": page_no, "text": seg})
                page_no += 1
    return pages

# pick one:
# pages = extract_pdf_text("my_book.pdf")
# pages = extract_epub_text("my_book.epub")

############################################
# 2) CLEAN + CHUNK
############################################

In [4]:
def clean_text(s: str) -> str:
    """Lightly tidy extracted text: de-hyphenate and collapse whitespace.

    Steps, in order:
      1. Re-join words hyphenated across a line break. A ``-`` followed by
         optional spaces/tabs/CR and then a newline is removed, so both
         ``"adven-\\nture"`` and ``"adven- \\nture"`` become ``"adventure"``.
         (Extracted PDF lines often carry trailing spaces before the newline.)
      2. Replace any whitespace run that ends in a newline with one newline.
      3. Collapse every remaining run of two or more whitespace characters
         to a single space.
      4. Strip leading and trailing whitespace.

    This function does not perform Unicode normalisation and does not remove
    page numbers.

    Args:
        s: Raw text, e.g. one page of extracted PDF text.

    Returns:
        The cleaned text; ``""`` for empty or whitespace-only input.
    """
    # [^\S\n] is "whitespace other than newline": it tolerates trailing
    # blanks (and the \r of CRLF line endings) between the hyphen and the break.
    s = re.sub(r"-[^\S\n]*\n", "", s)
    # After this substitution no whitespace run contains more than one
    # newline, so a separate "collapse blank lines" pass is unnecessary.
    s = re.sub(r"\s+\n", "\n", s)
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip()

def chunk_text(
    text: str,
    chunk_words: int = 800,
    overlap_words: int = 150,
    min_words: int = 30,
) -> List[str]:
    """Split text into overlapping windows of whitespace-separated words.

    Windows start every ``max(1, chunk_words - overlap_words)`` words and hold
    up to ``chunk_words`` words, re-joined with single spaces. Iteration stops
    after the first kept window that reaches the end of the text.

    Args:
        text: Text to split; it is tokenised with ``str.split()``.
        chunk_words: Maximum number of words per chunk.
        overlap_words: Number of words shared by consecutive chunks.
        min_words: Windows with fewer words than this are dropped. This also
            applies to the first window, so a text shorter than ``min_words``
            yields no chunks at all.

    Returns:
        The list of chunk strings, possibly empty.
    """
    words = text.split()
    if not words:
        return []
    # max(1, ...) keeps the loop advancing even if overlap >= chunk size.
    step = max(1, chunk_words - overlap_words)
    total = len(words)
    chunks = []
    for start in range(0, total, step):
        window = words[start:start + chunk_words]
        # Count the slice directly rather than joining and re-splitting.
        if len(window) < min_words:
            continue
        chunks.append(" ".join(window))
        if start + chunk_words >= total:
            break
    return chunks

def build_corpus(pages: List[Dict], book_title="My Book") -> List[Dict]:
    """Turn extracted pages into a flat list of chunk records.

    Each page's ``"text"`` is passed through ``clean_text`` and then
    ``chunk_text`` (default settings); every resulting chunk becomes one
    record.

    Args:
        pages: Dicts with ``"page"`` and ``"text"`` keys.
        book_title: Label used in each record's id and metadata.

    Returns:
        A list of ``{"id", "text", "metadata"}`` dicts, where the id is
        ``"<book_title>_p<page>_c<chunk>"`` and chunk numbers restart at 1
        for every page.
    """
    corpus = []
    for page in pages:
        pieces = chunk_text(clean_text(page["text"]))
        corpus.extend(
            {
                "id": f"{book_title}_p{page['page']}_c{number}",
                "text": piece,
                "metadata": {
                    "book": book_title,
                    "page": page["page"],
                    "chunk": number,
                },
            }
            for number, piece in enumerate(pieces, start=1)
        )
    return corpus


In [5]:
# 0) Install
!pip install pypdf ebooklib sentence-transformers faiss-cpu unidecode

import os, re, math, json
from typing import List, Dict
from pypdf import PdfReader
from ebooklib import epub
from bs4 import BeautifulSoup
from unidecode import unidecode

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer



In [6]:
############################################
# 3) EMBED
############################################

# Choose ONE of the models below (leave exactly one `model_name` line active):
model_name = "sentence-transformers/all-MiniLM-L6-v2"               # English
# model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"  # Arabic+English

# `model` is a notebook-level global; embed_texts() reads it when encoding.
# The progress bars printed below this cell are the model files being fetched.
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:


############################################
# 1) EXTRACT TEXT
############################################

def extract_pdf_text(path: str) -> List[Dict]:
    """Read a PDF and return its text, one entry per page.

    Args:
        path: Location of the PDF file, handed straight to ``PdfReader``.

    Returns:
        A list of ``{"page": n, "text": str}`` dicts in document order, with
        ``n`` counting from 1. When ``extract_text()`` returns a falsy value
        for a page, that page is still listed, with ``""`` as its text.
    """
    pdf = PdfReader(path)
    return [
        {"page": number, "text": sheet.extract_text() or ""}
        for number, sheet in enumerate(pdf.pages, start=1)
    ]

def extract_epub_text(path: str) -> List[Dict]:
    """Read an EPUB and return its text as a list of pseudo-page entries.

    Every book item whose type is ``ebooklib.ITEM_DOCUMENT`` is parsed with
    BeautifulSoup, flattened to plain text, and split on runs of two or more
    newlines. Each non-empty segment becomes one entry; entries are numbered
    consecutively from 1 across the whole book (EPUBs have no real pages).

    Args:
        path: Location of the EPUB file, handed straight to ``epub.read_epub``.

    Returns:
        A list of ``{"page": n, "text": str}`` dicts in reading order.
    """
    # Local import: the file only imports the ``epub`` submodule, while the
    # item-type constants live on the top-level package.
    import ebooklib

    book = epub.read_epub(path)
    pages = []
    page_no = 1
    for item in book.get_items():
        # Named constant instead of the bare literal 9.
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        soup = BeautifulSoup(item.get_body_content(), "html.parser")
        txt = soup.get_text(" ", strip=True)
        # Split on large gaps to simulate pages/sections.
        # NOTE(review): text nodes are stripped and joined with a single
        # space above, so blank-line gaps are probably rare and most items
        # likely end up as one segment -- confirm this is the intent.
        for seg in re.split(r"\n{2,}", txt):
            seg = seg.strip()
            if seg:
                pages.append({"page": page_no, "text": seg})
                page_no += 1
    return pages

# Pick one loader (leave exactly one line active). The result is bound to the
# notebook-level name `pages`, a list of {"page", "text"} dicts.
pages = extract_pdf_text("my_book.pdf")
# pages = extract_epub_text("my_book.epub")

############################################
# 2) CLEAN + CHUNK
############################################

def clean_text(s: str) -> str:
    """Lightly tidy extracted text: de-hyphenate and collapse whitespace.

    Steps, in order:
      1. Re-join words hyphenated across a line break. A ``-`` followed by
         optional spaces/tabs/CR and then a newline is removed, so both
         ``"adven-\\nture"`` and ``"adven- \\nture"`` become ``"adventure"``.
         (Extracted PDF lines often carry trailing spaces before the newline.)
      2. Replace any whitespace run that ends in a newline with one newline.
      3. Collapse every remaining run of two or more whitespace characters
         to a single space.
      4. Strip leading and trailing whitespace.

    This function does not perform Unicode normalisation and does not remove
    page numbers.

    Args:
        s: Raw text, e.g. one page of extracted PDF text.

    Returns:
        The cleaned text; ``""`` for empty or whitespace-only input.
    """
    # [^\S\n] is "whitespace other than newline": it tolerates trailing
    # blanks (and the \r of CRLF line endings) between the hyphen and the break.
    s = re.sub(r"-[^\S\n]*\n", "", s)
    # After this substitution no whitespace run contains more than one
    # newline, so a separate "collapse blank lines" pass is unnecessary.
    s = re.sub(r"\s+\n", "\n", s)
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip()

def chunk_text(
    text: str,
    chunk_words: int = 800,
    overlap_words: int = 150,
    min_words: int = 30,
) -> List[str]:
    """Split text into overlapping windows of whitespace-separated words.

    Windows start every ``max(1, chunk_words - overlap_words)`` words and hold
    up to ``chunk_words`` words, re-joined with single spaces. Iteration stops
    after the first kept window that reaches the end of the text.

    Args:
        text: Text to split; it is tokenised with ``str.split()``.
        chunk_words: Maximum number of words per chunk.
        overlap_words: Number of words shared by consecutive chunks.
        min_words: Windows with fewer words than this are dropped. This also
            applies to the first window, so a text shorter than ``min_words``
            yields no chunks at all.

    Returns:
        The list of chunk strings, possibly empty.
    """
    words = text.split()
    if not words:
        return []
    # max(1, ...) keeps the loop advancing even if overlap >= chunk size.
    step = max(1, chunk_words - overlap_words)
    total = len(words)
    chunks = []
    for start in range(0, total, step):
        window = words[start:start + chunk_words]
        # Count the slice directly rather than joining and re-splitting.
        if len(window) < min_words:
            continue
        chunks.append(" ".join(window))
        if start + chunk_words >= total:
            break
    return chunks

def build_corpus(pages: List[Dict], book_title="My Book") -> List[Dict]:
    """Turn extracted pages into a flat list of chunk records.

    Each page's ``"text"`` is passed through ``clean_text`` and then
    ``chunk_text`` (default settings); every resulting chunk becomes one
    record.

    Args:
        pages: Dicts with ``"page"`` and ``"text"`` keys.
        book_title: Label used in each record's id and metadata.

    Returns:
        A list of ``{"id", "text", "metadata"}`` dicts, where the id is
        ``"<book_title>_p<page>_c<chunk>"`` and chunk numbers restart at 1
        for every page.
    """
    corpus = []
    for page in pages:
        pieces = chunk_text(clean_text(page["text"]))
        corpus.extend(
            {
                "id": f"{book_title}_p{page['page']}_c{number}",
                "text": piece,
                "metadata": {
                    "book": book_title,
                    "page": page["page"],
                    "chunk": number,
                },
            }
            for number, piece in enumerate(pieces, start=1)
        )
    return corpus



def embed_texts(texts: List[str]) -> np.ndarray:
    """Encode strings with the notebook-level ``model`` into float32 vectors.

    Args:
        texts: Strings to embed.

    Returns:
        The encoder output converted to a ``float32`` NumPy array
        (presumably one row per input string -- confirm against the model).
    """
    encoded = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
        # Unit-length vectors, so a dot product acts as cosine similarity.
        normalize_embeddings=True,
    )
    return np.array(encoded, dtype="float32")

############################################
# 4) INDEX (FAISS)
############################################

def build_faiss_index(embeddings: np.ndarray):
    """Build a flat inner-product FAISS index over the given vectors.

    Args:
        embeddings: 2-D array of vectors, one per row; the second dimension
            is used as the index dimensionality.

    Returns:
        A ``faiss.IndexFlatIP`` with all rows of ``embeddings`` added.

    Raises:
        ValueError: If ``embeddings`` is not 2-D. This replaces the opaque
            ``IndexError`` that ``shape[1]`` would raise, e.g. when there was
            nothing to embed and the array came back 1-D and empty.
    """
    if embeddings.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D (n_vectors, dim) array, got shape "
            f"{embeddings.shape}; was the corpus empty?"
        )
    dim = embeddings.shape[1]
    # Inner product; with normalized vectors this approximates cosine.
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    return index

############################################
# 5) QUERY
############################################

def search(index, docs: List[Dict], query: str, top_k=5):
    """Return the records most similar to a query string.

    The query is embedded with ``embed_texts`` and looked up in ``index``.

    Args:
        index: Object exposing ``search(vectors, k)`` that returns a
            (scores, indices) pair, e.g. the index from ``build_faiss_index``.
        docs: Records in the same order as the vectors added to ``index``.
        query: Free-text query.
        top_k: Number of neighbours requested from the index.

    Returns:
        Shallow copies of the matching records, in the order the index
        returned them, each with an added float ``"score"`` key.
    """
    query_vec = embed_texts([query])
    scores, positions = index.search(query_vec, top_k)
    matches = []
    for similarity, position in zip(scores[0], positions[0]):
        # Index slots equal to -1 carry no result and are skipped.
        if position != -1:
            match = docs[position].copy()
            match["score"] = float(similarity)
            matches.append(match)
    return matches



In [8]:
############################################
# 6) PUT IT TOGETHER
############################################

# 1) Load the book: one {"page", "text"} dict per PDF page.
pages = extract_pdf_text("my_book.pdf")
# Bare expression as the cell's last line, so the notebook displays the list.
pages

[{'page': 1, 'text': ' \n'},
 {'page': 2,
  'text': 'PENGUIN   READERS  2000  \n \n \n  \n \n  \n \nwww.penguinreaders.com'},
 {'page': 3,
  'text': ' \n \nThe Adventures of                 \nTom Sawyer \n \nMARK TWAIN \nLevel 1 \n \nRetold by Jacqueline Kehl                                                    \nSeries Editors: Andy Hopkins and Jocelyn Potter '},
 {'page': 4,
  'text': 'Pearson Education Limited                                                                            \nEdinburgh Gate, Harlow,                                                                               \nEssex CM20 2JE, England                                                                              \nand Associated Companies throughout the world. \nISBN 0 582 41923 9 \n \nFirst published 1876                                                                                  \nPublished by Puffin Books 1950                                                                         \nThis edition first 

In [9]:

# OR
# pages = extract_epub_text("/path/to/book.epub")
# Example placeholder:
# pages = [
#     {"page": 1, "text": """Chapter 1. Introduction. This is the first chapter of the book. It introduces the main concepts and ideas that will be discussed in
#     the following chapters. The purpose of this chapter is to provide a foundational understanding of the topic. We will cover the history of the subject,
#     the current state of research, and the future directions. This chapter is essential for anyone who wants to understand the rest of the book.
#     It is recommended to read this chapter carefully before moving on to the next chapters. We will also provide some examples to illustrate the concepts.
#     This chapter is designed to be accessible to beginners, but it also contains information that will be useful for experts. We hope you enjoy reading
#     this chapter and that it helps you to understand the rest of the book."""},
#     {"page": 2, "text": "Chapter 2. Further details and concepts. This chapter delves deeper into the concepts introduced in Chapter 1. We will explore advanced topics and provide more detailed explanations. This chapter assumes that you have read and understood Chapter 1. We will discuss various theories and models related to the topic. We will also present some case studies to illustrate the practical applications of the concepts. This chapter is intended for readers who want to gain a more comprehensive understanding of the subject. We will provide references to additional resources for further reading. This chapter is more challenging than Chapter 1, but it is essential for anyone who wants to become an expert in the field. We hope you find this chapter informative and insightful."},
#     {"page": 3, "text": "Chapter 3. Applications. This chapter focuses on the practical applications of the concepts discussed in the previous chapters. We will show how these concepts can be applied to solve real-world problems. This chapter is intended for readers who are interested in the practical aspects of the subject. We will present various examples and case studies from different domains. We will also discuss the challenges and limitations of applying these concepts in practice. This chapter is designed to be hands-on and practical. We will provide code examples and tutorials to help you get started. We hope you find this chapter useful and that it helps you to apply the concepts to your own projects."}
# ]


# 2) Build corpus: clean and chunk every page into {"id", "text", "metadata"} records.
docs = build_corpus(pages, book_title="Sample Book")

# 3) Embed & index: one vector per chunk text, stored in a FAISS index.
emb = embed_texts([d["text"] for d in docs])
index = build_faiss_index(emb)

# 4) Query: fetch the 5 chunks most similar to the question.
query = "explain the main idea of chapter one"
hits = search(index, docs, query, top_k=5)

# Print one line per hit: score, page/chunk position, and the first 120 characters.
print("\nTop matches:")
for h in hits:
    m = h["metadata"]
    print(f"- score={h['score']:.3f} | page {m['page']} chunk {m['chunk']} | {h['text'][:120]}...")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top matches:
- score=0.381 | page 32 chunk 1 | ACTIVITIES Chapters 1-6 Before you read 1 Find the words in italics in your dictionary. They are all in the story. a Ans...
- score=0.290 | page 11 chunk 1 | killed the doctor with a knife. Tom and Huck watched. Then they went away quickly because they were afraid. They went to...
- score=0.269 | page 13 chunk 1 | look at Tom. She went into school. Tom walked away. He didn’t want to go to school now. He was very sad. Joe Harper was ...
- score=0.269 | page 8 chunk 1 | Tom stayed in the yard, and the boys painted. They painted the fence three times. It was beautiful and white. Tom went i...
- score=0.266 | page 35 chunk 1 | Tom Sawyer loves adventures. He has them at home, at school, and with his friends—Huck Finn and Joe Harper. Tom has one ...


############################################
# 6) PUT IT TOGETHER
############################################

In [None]:
# 1) Load the book
# NOTE(review): this cell reads "book.pdf" while the other cells read
# "my_book.pdf" -- confirm which file is intended.
pages = extract_pdf_text("book.pdf")
# OR
# pages = extract_epub_text("/path/to/book.epub")
# Example placeholder:
# pages = [
#     {"page": 1, "text": """Chapter 1. Introduction. This is the first chapter of the book. It introduces the main concepts and ideas that will be discussed in
#     the following chapters. The purpose of this chapter is to provide a foundational understanding of the topic. We will cover the history of the subject,
#     the current state of research, and the future directions. This chapter is essential for anyone who wants to understand the rest of the book.
#     It is recommended to read this chapter carefully before moving on to the next chapters. We will also provide some examples to illustrate the concepts.
#     This chapter is designed to be accessible to beginners, but it also contains information that will be useful for experts. We hope you enjoy reading
#     this chapter and that it helps you to understand the rest of the book."""},
#     {"page": 2, "text": "Chapter 2. Further details and concepts. This chapter delves deeper into the concepts introduced in Chapter 1. We will explore advanced topics and provide more detailed explanations. This chapter assumes that you have read and understood Chapter 1. We will discuss various theories and models related to the topic. We will also present some case studies to illustrate the practical applications of the concepts. This chapter is intended for readers who want to gain a more comprehensive understanding of the subject. We will provide references to additional resources for further reading. This chapter is more challenging than Chapter 1, but it is essential for anyone who wants to become an expert in the field. We hope you find this chapter informative and insightful."},
#     {"page": 3, "text": "Chapter 3. Applications. This chapter focuses on the practical applications of the concepts discussed in the previous chapters. We will show how these concepts can be applied to solve real-world problems. This chapter is intended for readers who are interested in the practical aspects of the subject. We will present various examples and case studies from different domains. We will also discuss the challenges and limitations of applying these concepts in practice. This chapter is designed to be hands-on and practical. We will provide code examples and tutorials to help you get started. We hope you find this chapter useful and that it helps you to apply the concepts to your own projects."}
# ]


# 2) Build corpus: clean and chunk every page into {"id", "text", "metadata"} records.
docs = build_corpus(pages, book_title="Sample Book")

# 3) Embed & index: one vector per chunk text, stored in a FAISS index.
emb = embed_texts([d["text"] for d in docs])
index = build_faiss_index(emb)




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top matches:
- score=0.743 | page 5 chunk 1 | Introduction One Saturday afternoon Tom wanted to have an adventure because he didn’t want to think about Injun Joe. He ...
- score=0.692 | page 35 chunk 1 | Tom Sawyer loves adventures. He has them at home, at school, and with his friends—Huck Finn and Joe Harper. Tom has one ...
- score=0.533 | page 6 chunk 1 | Chapter 1 The Fence Tom Sawyer lived with his aunt because his mother and father were dead. Tom didn’t like going to sch...
- score=0.496 | page 26 chunk 1 | The two children walked and walked. But they didn’t find the door to the cave. Becky was afraid. She wanted to sit down ...
- score=0.495 | page 18 chunk 1 | Tom wanted to help her. Suddenly he said, “I did it. I tore your book.” “Tom Sawyer, you’re a very bad boy. Stay here af...


# 4) Query

In [10]:
# Ask a question against the existing `index` / `docs` and fetch the top 5 chunks.
# NOTE(review): "lie" is probably a typo for "live" (the book text says Tom
# "lived with his aunt") -- confirm before changing; the recorded output below
# was produced with the query exactly as written.
query = "Why did Tom Sawyer lie with his aunt ?"
hits = search(index, docs, query, top_k=5)

# Print one line per hit: score, page/chunk position, and the first 120 characters.
print("\nTop matches:")
for h in hits:
    m = h["metadata"]
    print(f"- score={h['score']:.3f} | page {m['page']} chunk {m['chunk']} | {h['text'][:120]}...")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top matches:
- score=0.622 | page 18 chunk 1 | Tom wanted to help her. Suddenly he said, “I did it. I tore your book.” “Tom Sawyer, you’re a very bad boy. Stay here af...
- score=0.615 | page 5 chunk 1 | Introduction One Saturday afternoon Tom wanted to have an adventure because he didn’t want to think about Injun Joe. He ...
- score=0.603 | page 9 chunk 1 | “I’m late because I talked to Huck Finn,” Tom said. Then the teacher was very angry. “Sit with the girls,” he said to To...
- score=0.566 | page 35 chunk 1 | Tom Sawyer loves adventures. He has them at home, at school, and with his friends—Huck Finn and Joe Harper. Tom has one ...
- score=0.563 | page 6 chunk 1 | Chapter 1 The Fence Tom Sawyer lived with his aunt because his mother and father were dead. Tom didn’t like going to sch...
