# Document Ingestion with LlamaIndex

This notebook explores document ingestion using LlamaIndex's SimpleDirectoryReader.

In [None]:
# Install dependencies if needed
# !pip install llama-index

In [None]:
from llama_index.core import SimpleDirectoryReader
import os
import shutil

In [None]:
# Path of the PDF to ingest; it is relative, so it resolves against the
# current working directory.
pdf_path = "../docs/DocLayNet.pdf"

# Report the path and whether it is present before any later cell uses it.
print(
    f"Processing: {pdf_path}",
    f"File exists: {os.path.exists(pdf_path)}",
    sep="\n",
)

In [None]:
# Stage the PDF in its own directory so the directory reader can be pointed
# at a folder rather than at a single file.
temp_dir = "llama_temp"

# exist_ok=True lets this cell be re-run without failing on an existing folder.
os.makedirs(temp_dir, exist_ok=True)

# With a directory as dst, shutil.copy keeps the source file name.
shutil.copy(src=pdf_path, dst=temp_dir)
print(f"Copied PDF to {temp_dir}")

In [None]:
# Load everything found in the staging directory into `documents`.
documents = SimpleDirectoryReader(temp_dir).load_data()
print(f"Loaded {len(documents)} documents")

# Only peek at the first document when there is one: indexing an empty list
# would raise IndexError and stop the notebook before the cleanup cell runs.
if documents:
    first_document = documents[0]
    print(f"First document type: {type(first_document)}")
    print(f"First document metadata: {first_document.metadata}")
else:
    print("No documents were loaded; nothing to inspect")

In [None]:
# Preview the first document: a heading, the first 1000 characters of its
# text, then the full character count. Skipped when nothing was loaded.
if documents:
    text = documents[0].text
    print(
        "Text from first document:",
        text[:1000],
        f"\nTotal length: {len(text)}",
        sep="\n",
    )

In [None]:
# Remove the staging directory created earlier in the notebook.
# The existence check keeps this cell safe to re-run (or to run when the
# staging cell never executed) instead of raising FileNotFoundError.
if os.path.isdir(temp_dir):
    shutil.rmtree(temp_dir)
    print("Cleaned up temporary directory")
else:
    print(f"Nothing to clean up: {temp_dir} does not exist")