<div style="background-color:#f0f8ff; padding:10px; border:1px solid #ccc; border-radius:5px; font-size:30px;">
Overview
</div>

This notebook demonstrates a simple end-to-end pipeline for converting PDF documents into searchable vector representations using FAISS and multilingual sentence embeddings.

Workflow:
- Extract text from PDF files
- Generate sentence embeddings with a multilingual model
- Store vectors in a FAISS index
- Perform semantic search using a natural language query


<div style="background-color:#f0f8ff; padding:10px; border:1px solid #ccc; border-radius:5px; font-size:30px;">
Indexing
</div>

### Extracting Text from PDF	

In [14]:
import os
import glob
import logging
import pdfplumber

# Suppress pdfminer warnings
logging.getLogger("pdfminer").setLevel(logging.ERROR)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


# Directory containing PDF files
pdf_dir = "../data/input/pdf/"
pdf_files = glob.glob(os.path.join(pdf_dir, "*.pdf"))

print(f"Number of PDF files: {len(pdf_files)}")

# List to store all extracted chunks
all_chunks = []

# Process each PDF file
for file_path in pdf_files:
    chunk_count = 0
    try:
        with pdfplumber.open(file_path) as pdf:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()
                if text:
                    all_chunks.append({
                        "chunk_id": f"{os.path.basename(file_path)}_page_{i+1}",
                        "text": text.strip(),
                        "page": i + 1,
                        "file_path": file_path,
                    })
                    chunk_count += 1
        print(f"{os.path.basename(file_path)}: {chunk_count} chunks extracted")
    except Exception as e:
        print(f"Error reading {os.path.basename(file_path)}: {e}")

# Summary of total chunks
print(f"\nTotal number of chunks: {len(all_chunks)}")

Number of PDF files: 3
chatbot_ip_protection-EA0125007ENN.pdf: 1 chunks extracted
CN-2023-0012.pdf: 2 chunks extracted
a-short-guide.pdf: 12 chunks extracted

Total number of chunks: 15


In [15]:
# Preview the first chunk if available
first = all_chunks[0]
print(f"\nFirst chunk - page {first['page']} ({os.path.basename(first['file_path'])}):\n")
print(first['text'][:300])


First chunk - page 1 (chatbot_ip_protection-EA0125007ENN.pdf):

CASE STUDY Chatbot IP Protection
Background
The Swedish start-up Poseidon owns the word trade mark in Sweden. They specialise in AI-based B2B manufacturing services. The company
uses a specially trained chatbot to meet the needs of its customers and has a particular method for carrying out its activ


In [16]:
import pandas as pd

df = pd.DataFrame(all_chunks)
df

Unnamed: 0,chunk_id,text,page,file_path
0,chatbot_ip_protection-EA0125007ENN.pdf_page_1,CASE STUDY Chatbot IP Protection\nBackground\n...,1,../data/input/pdf/chatbot_ip_protection-EA0125...
1,CN-2023-0012.pdf_page_1,A Simple PDF File\nCN-2023-0012\nThis is a sma...,1,../data/input/pdf/CN-2023-0012.pdf
2,CN-2023-0012.pdf_page_2,Simple PDF File 2\n...continued from page 1. Y...,2,../data/input/pdf/CN-2023-0012.pdf
3,a-short-guide.pdf_page_1,A SHORT GUIDE\nEdition 1(b) May 2024,1,../data/input/pdf/a-short-guide.pdf
4,a-short-guide.pdf_page_2,A Short Guide\nDublin Airport can be a busy pl...,2,../data/input/pdf/a-short-guide.pdf
5,a-short-guide.pdf_page_3,Contents\n1. Finding Your Way Around\n2. Arriv...,3,../data/input/pdf/a-short-guide.pdf
6,a-short-guide.pdf_page_4,A SHORT GUIDE\n1. Finding Your Way Around\nThe...,4,../data/input/pdf/a-short-guide.pdf
7,a-short-guide.pdf_page_5,A SHORT GUIDE\n3. Checking In\nYou may have ch...,5,../data/input/pdf/a-short-guide.pdf
8,a-short-guide.pdf_page_6,A SHORT GUIDE\n4. Security Screening\nOnce you...,6,../data/input/pdf/a-short-guide.pdf
9,a-short-guide.pdf_page_7,A SHORT GUIDE\n5. Your Boarding/Departure Gate...,7,../data/input/pdf/a-short-guide.pdf


## Embedding Text into Vectors	

In [17]:
from pathlib import Path
import glob
import pickle
import pdfplumber
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Define input and output paths
pdf_dir = Path("../data/input/pdf")
index_path = Path("../data/faiss/faiss.index")
meta_path = Path("../data/faiss/faiss_meta.pkl")
index_path.parent.mkdir(parents=True, exist_ok=True)

# Load multilingual embedding model (supports Japanese and English)
model = SentenceTransformer("intfloat/multilingual-e5-small")

# Collect all PDF file paths
pdf_paths = sorted(pdf_dir.glob("*.pdf"))
print(f"Number of PDF files: {len(pdf_paths)}")

# Extract text chunks from PDFs
all_chunks = []

for pdf_path in pdf_paths:
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()
                if text:
                    all_chunks.append({
                        "chunk_id": f"{pdf_path.name}_page_{i+1}",
                        "text": text.strip(),
                        "page": i + 1,
                        "file_path": str(pdf_path)
                    })
        print(f"{pdf_path.name}: extracted {len(pdf.pages)} pages")
    except Exception as e:
        print(f"Error processing {pdf_path.name}: {e}")

print(f"Total number of chunks: {len(all_chunks)}")

# Encode text to embeddings
texts = [chunk["text"] for chunk in all_chunks]
embeddings = model.encode(texts, normalize_embeddings=True)

# Add embedding vectors to each chunk
for chunk, emb in zip(all_chunks, embeddings):
    chunk["embedding"] = emb

# Create and populate FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
vecs = np.array([chunk["embedding"] for chunk in all_chunks]).astype("float32")
index.add(vecs)

print(f"FAISS index size: {index.ntotal} vectors")

# Save FAISS index
faiss.write_index(index, str(index_path))
print(f"Saved FAISS index to: {index_path}")

# Remove embeddings from metadata to reduce size
for chunk in all_chunks:
    chunk["embedding"] = None

# Save chunk metadata as pickle
with open(meta_path, "wb") as f:
    pickle.dump(all_chunks, f)

print(f"Saved metadata to: {meta_path}")


Number of PDF files: 3
CN-2023-0012.pdf: extracted 2 pages
a-short-guide.pdf: extracted 12 pages
chatbot_ip_protection-EA0125007ENN.pdf: extracted 1 pages
Total number of chunks: 15
FAISS index size: 15 vectors
Saved FAISS index to: ../data/faiss/faiss.index
Saved metadata to: ../data/faiss/faiss_meta.pkl


<div style="background-color:#f0f8ff; padding:10px; border:1px solid #ccc; border-radius:5px; font-size:30px;">
Semantic Search
</div>

In [18]:
import faiss
import pickle
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load FAISS index and metadata
index_path = "../data/faiss/faiss.index"
meta_path = "../data/faiss/faiss_meta.pkl"

index = faiss.read_index(index_path)

with open(meta_path, "rb") as f:
    chunks = pickle.load(f)

# Load embedding model (multilingual)
model = SentenceTransformer("intfloat/multilingual-e5-small")

# Define search query
query_text = "airport"

# Encode the query to an embedding vector
query_vec = model.encode([query_text], normalize_embeddings=True).astype("float32")

# Search top-k results in FAISS index
top_k = 5
distances, indices = index.search(query_vec, top_k)

# Format results
results = []
for idx, dist in zip(indices[0], distances[0]):
    if idx < len(chunks):
        results.append({
            "page": chunks[idx]["page"],
            "file_path": chunks[idx]["file_path"],
            "text_preview": chunks[idx]["text"][:150] + "...",
            "score_l2": float(dist)
        })

# Convert results to DataFrame for display
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,page,file_path,text_preview,score_l2
0,11,../data/input/pdf/a-short-guide.pdf,A SHORT GUIDE\nDublin Airport Short and Long T...,0.323328
1,2,../data/input/pdf/a-short-guide.pdf,A Short Guide\nDublin Airport can be a busy pl...,0.347784
2,12,../data/input/pdf/a-short-guide.pdf,A SHORT GUIDE\nDublin Airport Terminals Map\nG...,0.372768
3,8,../data/input/pdf/a-short-guide.pdf,A SHORT GUIDE\n6.\nWashrooms\nThere are washro...,0.381355
4,3,../data/input/pdf/a-short-guide.pdf,Contents\n1. Finding Your Way Around\n2. Arriv...,0.382499
