In [1]:

# Store documents in Qdrant
if all_documents:
    print(f"Creating vector store with {len(all_documents)} document chunks")
    
    # Delete collection if it exists to start fresh
    try:
        client.delete_collection(COLLECTION_NAME)
        print(f"Deleted existing collection '{COLLECTION_NAME}'")
    except Exception:
        pass  # Collection doesn't exist yet
    
    # Create texts and metadata for embedding
    texts = [doc.page_content for doc in all_documents]
    metadatas = [doc.metadata for doc in all_documents]
    
    # Use Qdrant's from_texts method
    vector_store = QdrantVectorStore.from_texts(
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
        url=QDRANT_URL,
        collection_name=COLLECTION_NAME,
        force_recreate=True
    )
    
    print(f"Successfully created embeddings and stored them in Qdrant collection '{COLLECTION_NAME}'")
    return vector_store
else:
    print("No documents were processed.")
    return None


1

In [18]:
import os
import glob
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Load environment variables
load_dotenv()

# Constants
PDF_DIRECTORY = "/Users/ryan/workshop/langchain-agentic-rag/shared"
COLLECTION_NAME = "documents"
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200
EMBEDDING_MODEL = "nomic-embed-text"  # Matches what was used in n8n workflow
QDRANT_URL = "http://localhost:6333"  # Default for local Qdrant

# Find all PDF files in the directory
pdf_files = glob.glob(f"{PDF_DIRECTORY}/*.pdf")

# Initialize embeddings model
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Initialize Qdrant client for collection management
client = QdrantClient(url=QDRANT_URL)

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# Process each PDF file
all_documents = []

pdf_file = pdf_files[1]
# Load the PDF
loader = PyPDFLoader(pdf_file, mode="single")
documents = loader.load()

# Split the document into chunks
split_documents = text_splitter.split_documents(documents)

print(f"  - Extracted {len(split_documents)} chunks from {pdf_file}")

# Add to our collection
all_documents.extend(split_documents)
    

  - Extracted 2 chunks from /Users/ryan/workshop/langchain-agentic-rag/shared/apple-berry-crisp.pdf


In [19]:
all_documents[0].page_content

1993

In [20]:
len(all_documents[1].page_content)

949

In [21]:
documents

[Document(metadata={'producer': 'Qt 4.8.7', 'creator': 'wkhtmltopdf 0.12.6', 'creationdate': '2023-06-05T12:24:40-07:00', 'title': "Apple berry crisp - Canada's Food Guide", 'source': '/Users/ryan/workshop/langchain-agentic-rag/shared/apple-berry-crisp.pdf', 'total_pages': 7}, page_content="FR\nMENU \nCanada.ca\n \n \nHealth\n \n \nFood and nutrition\n \n \nCanada's food guide\n \n \nTips for healthy eating\n \n \nMeal planning and cooking\n \n \nRecipes\nSearch Canada.ca\n\ue003\n\ue114\n>\n>\n>\n>\n>\n>\nCanada’s\nfood\nguide\nApple berry crisp\nVegetarian\nThis irresistible recipe showcases the wonderful combination of apples and\nblueberries. Enjoy it warm on its own or with a small dollop of yogurt. Perfect for\ndessert or as a snack with a glass of water, white milk or unsweetened fortiﬁed soy\nbeverage.\nIngredients\n4 apples, cored and chopped\n500 mL (2 cups) frozen or fresh blueberries\n45 mL (3 tbsp) packed brown sugar\n30 mL (2 tbsp) all purpose ﬂour\nPrep time\n15 min\nCoo