This notebook provides a complete workflow for downloading PDFs, parsing them, creating a LitePali index, and performing searches. It's designed to run from start to finish without requiring user input.

In [1]:
# Install required libraries
!pip install litepali pdf2image PyPDF2 requests

# This cell installs the necessary libraries:
# - litepali: The document retrieval library used to build and search the page-image index
# - pdf2image: For converting PDF pages to images
# - PyPDF2: For parsing PDF metadata
# - requests: For downloading PDFs from URLs



In [2]:
# Import required libraries
import os
import requests
import PyPDF2
from pdf2image import convert_from_bytes
from litepali import LitePali, ImageFile

# This cell imports the necessary libraries and modules we'll use throughout the notebook.

In [3]:
# Define PDF URLs and create base directory
# PDFs to download; the last URL segment (e.g. "2403.09611.pdf") is reused
# later as the file name and, without the extension, as the folder name.
pdf_urls = [
    "https://arxiv.org/pdf/2403.09611.pdf",
    "https://arxiv.org/pdf/2103.00020.pdf",
    "https://arxiv.org/pdf/2407.01449.pdf"
]
# Everything this notebook writes goes under this folder, which is resolved
# against the current working directory.
base_dir = os.path.join(os.getcwd(), "litepali_data")
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(base_dir, exist_ok=True)

# This cell defines the URLs of the PDFs we want to process and creates a base directory
# inside the current notebook folder to store our data.

In [4]:
# Download PDFs
def download_pdf(url, save_dir, timeout=60):
    """Download the file at ``url`` into ``save_dir`` and return its path.

    The file name is taken from the last path segment of the URL.

    Args:
        url: Address of the PDF to fetch.
        save_dir: Existing directory in which the file is written.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the written file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    filename = url.split('/')[-1]
    save_path = os.path.join(save_dir, filename)

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page under a ".pdf" name.
    response.raise_for_status()

    with open(save_path, 'wb') as f:
        f.write(response.content)
    return save_path

# Fetch every PDF into its own sub-folder of base_dir, named after the file
# name without the ".pdf" extension.
downloaded_paths = []
for url in pdf_urls:
    folder_name = url.split('/')[-1].replace('.pdf', '')
    url_dir = os.path.join(base_dir, folder_name)
    os.makedirs(url_dir, exist_ok=True)
    saved_path = download_pdf(url, url_dir)
    downloaded_paths.append(saved_path)

print("Downloaded PDFs:", downloaded_paths)

# This cell defines a function to download PDFs and then downloads each PDF from the specified URLs.
# Each PDF is saved in its own directory within the base directory.

Downloaded PDFs: ['/teamspace/studios/this_studio/litepali_data/2403.09611/2403.09611.pdf', '/teamspace/studios/this_studio/litepali_data/2103.00020/2103.00020.pdf', '/teamspace/studios/this_studio/litepali_data/2407.01449/2407.01449.pdf']


In [5]:
# Parse PDFs and convert to images
def parse_pdf(pdf_path):
    """Read a PDF's metadata and render each of its pages to a JPEG file.

    The page images are written next to the PDF as ``page_1.jpg``,
    ``page_2.jpg``, ... in page order.

    Args:
        pdf_path: Path of the PDF file on disk.

    Returns:
        A tuple ``(images, metadata, num_pages)`` where ``images`` is the list
        of written JPEG paths, ``metadata`` is the document information
        mapping (an empty dict when the PDF provides none) and ``num_pages``
        is the page count reported by the reader.
    """
    # Extract metadata
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # reader.metadata can be None; normalise it so callers can use .get().
        metadata = reader.metadata or {}
        num_pages = len(reader.pages)

    # Convert pages to images; the context manager closes the handle that the
    # bare open(...).read() used to leak.
    with open(pdf_path, 'rb') as f:
        pdf_images = convert_from_bytes(f.read())

    pdf_dir = os.path.dirname(pdf_path)
    images = []
    for page_number, img in enumerate(pdf_images, start=1):
        img_path = os.path.join(pdf_dir, f"page_{page_number}.jpg")
        img.save(img_path, 'JPEG')
        images.append(img_path)

    return images, metadata, num_pages

# Parse every downloaded PDF and keep one record per document.
parsed_pdfs = []
for pdf_path in downloaded_paths:
    page_images, doc_info, page_count = parse_pdf(pdf_path)
    record = dict(
        pdf_path=pdf_path,
        images=page_images,
        metadata=doc_info,
        num_pages=page_count,
    )
    parsed_pdfs.append(record)

print("Parsed PDFs:", parsed_pdfs)

# This cell defines a function to parse PDFs, extract metadata, and convert pages to images.
# It then applies this function to each downloaded PDF, saving images in the PDF's directory.

Parsed PDFs: [{'pdf_path': '/teamspace/studios/this_studio/litepali_data/2403.09611/2403.09611.pdf', 'images': ['/teamspace/studios/this_studio/litepali_data/2403.09611/page_1.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_2.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_3.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_4.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_5.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_6.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_7.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_8.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_9.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_10.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_11.jpg', '/teamspace/studios/this_studio/litepali_data/2403.09611/page_12.jpg', '/teamspace/studios/this_studio/litepali_data

In [6]:
# Initialize LitePali and add images
litepali = LitePali()

for pdf in parsed_pdfs:
    # A PDF without a document-information dictionary yields None metadata;
    # fall back to an empty mapping so the .get() lookups below cannot fail.
    doc_metadata = pdf['metadata'] or {}
    document_id = os.path.basename(pdf['pdf_path'])

    # Page ids are 1-based, matching the page_<n>.jpg file names.
    for page_id, img_path in enumerate(pdf['images'], start=1):
        litepali.add(ImageFile(
            path=img_path,
            document_id=document_id,
            page_id=page_id,
            # A fresh dict per page, so entries never share one object.
            metadata={
                'title': doc_metadata.get('/Title', ''),
                'author': doc_metadata.get('/Author', ''),
                'num_pages': pdf['num_pages']
            }
        ))

# This cell initializes LitePali and adds each image from the parsed PDFs to the index.
# The document_id is set to the PDF filename, and page_id is set to the page number.

In [7]:
# Process images and create index
# Process the images queued with litepali.add(); the recorded progress lines
# show them being handled four at a time (batch_size=4).
litepali.process(batch_size=4)
print("Index created successfully")
# Kept as the cell's last expression so the statistics dict is displayed.
litepali.index_stats()

# This cell processes the added images and creates the LitePali index.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processed batch 1: 4/109 images
Processed batch 2: 8/109 images
Processed batch 3: 12/109 images
Processed batch 4: 16/109 images
Processed batch 5: 20/109 images
Processed batch 6: 24/109 images
Processed batch 7: 28/109 images
Processed batch 8: 32/109 images
Processed batch 9: 36/109 images
Processed batch 10: 40/109 images
Processed batch 11: 44/109 images
Processed batch 12: 48/109 images
Processed batch 13: 52/109 images
Processed batch 14: 56/109 images
Processed batch 15: 60/109 images
Processed batch 16: 64/109 images
Processed batch 17: 68/109 images
Processed batch 18: 72/109 images
Processed batch 19: 76/109 images
Processed batch 20: 80/109 images
Processed batch 21: 84/109 images
Processed batch 22: 88/109 images
Processed batch 23: 92/109 images
Processed batch 24: 96/109 images
Processed batch 25: 100/109 images
Processed batch 26: 104/109 images
Processed batch 27: 108/109 images
Processed batch 28: 109/109 images
Finished processing. Total images processed: 109
Index 

{'total_images': 109,
 'processed_images': 109,
 'unique_documents': 3,
 'image_extensions': ['.jpg']}

In [8]:
# Save the index
# The index is stored inside base_dir, alongside the per-PDF folders.
index_path = os.path.join(base_dir, "litepali_index")
# NOTE(review): the recorded output shows the "Index saved to ..." line twice,
# so save_index presumably prints its own confirmation as well - confirm.
litepali.save_index(index_path)
print(f"Index saved to {index_path}")

# This cell saves the created index to a file in the base directory for later use.

Index saved to /teamspace/studios/this_studio/litepali_data/litepali_index
Index saved to /teamspace/studios/this_studio/litepali_data/litepali_index


In [9]:
# Load the index (simulating a new session)
# A separate LitePali instance is created so the searches further down run
# against the index read back from disk, not the in-memory `litepali` object.
new_litepali = LitePali()
# index_path is the location the index was written to by save_index().
new_litepali.load_index(index_path)
print("Index loaded successfully")

# This cell demonstrates how to load a previously saved index.

Index loaded from /teamspace/studios/this_studio/litepali_data/litepali_index
Index loaded successfully


In [18]:
# Perform searches
# Sample questions; each one targets one of the three indexed papers.
queries = [
    "What is ColPali?",
    "Explain the concept of vision language models",
    "How does MM1 compare to other multimodal models?",
    "What is CLIP and where I can use it?"
]

# Number of best-matching pages to report per query.
top_k = 3

for query in queries:
    print(f"\nQuery: {query}")
    # Each hit is a mapping with the matched page under 'image' and its
    # relevance under 'score'.
    for result in new_litepali.search(query, k=top_k):
        print(f"Document: {result['image'].document_id}, Page: {result['image'].page_id}, Score: {result['score']}")

# This cell performs searches using sample queries and prints the results.


Query: What is ColPali?
Document: 2407.01449.pdf, Page: 1, Score: 14.75
Document: 2407.01449.pdf, Page: 6, Score: 14.125
Document: 2407.01449.pdf, Page: 2, Score: 13.5625

Query: Explain the concept of vision language models
Document: 2403.09611.pdf, Page: 17, Score: 14.75
Document: 2407.01449.pdf, Page: 1, Score: 14.5625
Document: 2407.01449.pdf, Page: 3, Score: 14.375

Query: How does MM1 compare to other multimodal models?
Document: 2403.09611.pdf, Page: 15, Score: 15.8125
Document: 2403.09611.pdf, Page: 1, Score: 15.75
Document: 2403.09611.pdf, Page: 17, Score: 15.5625

Query: What is CLIP and where I can use it?
Document: 2103.00020.pdf, Page: 20, Score: 17.375
Document: 2103.00020.pdf, Page: 27, Score: 17.125
Document: 2407.01449.pdf, Page: 11, Score: 17.0


In [19]:
# FYI Table for PDF Information

from IPython.display import Markdown

# Hand-written lookup table mapping each indexed file name to its paper title.
table = """
| PDF path | Title |
|----------|-------|
| 2403.09611.pdf | MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training |
| 2103.00020.pdf | Learning Transferable Visual Models From Natural Language Supervision |
| 2407.01449.pdf | ColPali: Efficient Document Retrieval with Vision Language Models |
"""

# Wrap the text in a Markdown object so it is rendered as a table.
rendered_table = Markdown(table)
display(rendered_table)

# This cell creates and displays a Markdown table with PDF paths and titles for reference.


| PDF Path | Title |
|----------|-------|
| 2403.09611.pdf | MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training |
| 2103.00020.pdf | Learning Transferable Visual Models From Natural Language Supervision |
| 2407.01449.pdf | ColPali: Efficient Document Retrieval with Vision Language Models |
