In [1]:
from langchain_community.document_loaders import PyPDFLoader

In [2]:
from dotenv import load_dotenv
import os

# Load environment variables (e.g. LangSmith credentials) from a local .env file.
load_dotenv()

# LangSmith endpoint used for tracing.
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# os.environ only accepts str values, so `os.environ[k] = os.getenv(k)` raises
# TypeError when the variable is missing, and is a no-op when it is present.
# Instead, leave the loaded values untouched and only switch tracing on when an
# API key is actually available, so the notebook still runs without one.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
else:
    os.environ['LANGCHAIN_TRACING_V2'] = 'false'
    print("LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled.")

# The project name is optional for running the pipeline; just report when it is absent.
if not os.getenv("LANGSMITH_PROJECT"):
    print("LANGSMITH_PROJECT is not set.")

## RAG pipeline starts

### Load the document first

In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the source PDF, given relative to the current working directory.
file_path = "./dldg_databricks.pdf"
loader = PyPDFLoader(file_path)

# Read the PDF into a list of Document objects.
# NOTE(review): presumably one Document per page, judging by the 'page' metadata — confirm.
docs = loader.load()
# Show the first Document as a quick sanity check that loading worked.
docs[0]

Document(metadata={'producer': 'Antenna House PDF Output Library 7.1.1639', 'creator': 'AH CSS Formatter V7.1 MR2 for Linux64 : 7.1.3.50324 (2021-04-26T09:47+09)', 'creationdate': '2024-10-28T14:17:09+00:00', 'author': 'Denny Lee, Tristen Wentling, Scott Haines, and Prashanth Babu', 'moddate': '2024-10-28T10:38:19-04:00', 'title': 'Delta Lake: The Definitive Guide', 'source': './dldg_databricks.pdf', 'total_pages': 382, 'page': 0, 'page_label': ''}, page_content='Delta Lake \n  The Definitive Guide\nModern Data Lakehouse Architectures  \nwith Data Lakes\nDenny Lee, Tristen Wentling,  \nScott Haines & Prashanth Babu\nForewords by Michael Armbrust \n& Dominique Brezinski\nCompliments of')

#### The first step is to break the documents into smaller chunks, so that each chunk's token count stays within the limit of the embedding model

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into overlapping chunks ready for embedding.
# NOTE(review): sizes are presumably measured in characters, not tokens — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

##### This step checks whether the token count of any chunk is greater than 500

In [5]:
# show the number of tokens for the chunks
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens in ``string`` under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name passed to ``tiktoken.get_encoding``
            (for example ``'cl100k_base'``).

    Returns:
        The length of the encoded token sequence.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # By default encode() raises ValueError when the text happens to contain a
    # special-token marker such as "<|endoftext|>". For counting tokens in
    # arbitrary document text, treat such markers as ordinary text instead.
    return len(encoding.encode(string, disallowed_special=()))

# Largest token count found across all chunks (0 when there are no chunks).
# NOTE(review): counts use the 'cl100k_base' encoding; the embedding model's own
# tokenizer may count differently — confirm before relying on this as a hard limit.
max_token = max(
    (num_tokens_from_string(chunk.page_content, 'cl100k_base') for chunk in splits),
    default=0,
)

print(f'max-token: {max_token}')

max-token: 404


#### Create the embedding layer

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings

# Load embeddings model (fits in your 4060 VRAM)
# NOTE(review): the E5 model family is documented as expecting "query: " /
# "passage: " prefixes on its inputs; none are added here — confirm whether
# retrieval quality needs them.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")


In [7]:
from langchain.vectorstores import Chroma

# 4. Create (or reopen) the persistent Chroma vectorstore.
# Chroma.from_documents adds the chunks to the on-disk collection every time it
# runs, so re-executing this cell would store each chunk again and similarity
# search would return duplicate hits. Open the collection first and ingest only
# when it is still empty; this also avoids re-embedding on every run.
# Delete ./chroma_db to force a rebuild (e.g. after changing the chunking).
vectorstore = Chroma(
    collection_name="databricks_docs",
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)
if not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(splits)

In [13]:
# 5. Semantic search function
def semantic_search(query: str, k: int = 3, max_chars: int = 10000):
    """Print a preview of, and return, the ``k`` chunks most similar to ``query``.

    Args:
        query: Natural-language search text.
        k: Number of results to request from the vectorstore.
        max_chars: Maximum number of characters of each result to print.
            Only the printed preview is truncated; the returned documents
            are left intact.

    Returns:
        The list returned by ``vectorstore.similarity_search``.
    """
    results = vectorstore.similarity_search(query, k=k)
    for i, r in enumerate(results, 1):
        print(f"\n--- Result {i} ---")
        print(r.page_content[:max_chars])  # preview only; full text stays in `results`
    return results

# 6. Example query
# The bare call on the last line prints the result previews and also shows the
# returned list of documents as the cell output.
query = "What are the key features of Delta lakes?"
semantic_search(query)


--- Result 1 ---
Key Features
Delta Lake comprises the following key features that are fundamental to an
open lakehouse format (please see the VLDB research article “Delta Lake: High-
Performance ACID Table Storage over Cloud Object Stores”  for a deeper dive into
these features):
ACID transactions
Delta Lake ensures that data modifications are performed atomically, consis‐
tently, in isolation, and durably, i.e., with ACID transaction protections. This
means that when multiple concurrent clients or tasks access the data, the system
maintains data integrity. For instance, if a process fails during a data modifi‐
cation, Delta Lake will roll back the changes, ensuring that the data remains
consistent.
Scalable metadata
The metadata of a Delta Lake table is the transaction log, which provides transac‐
tional consistency per the aforementioned ACID transactions. With a petabyte-
scale table, the table’s metadata can itself be exceedingly complicated to maintain.

--- Result 2 ---
Key Fea

[Document(metadata={'page': 33, 'title': 'Delta Lake: The Definitive Guide', 'creationdate': '2024-10-28T14:17:09+00:00', 'creator': 'AH CSS Formatter V7.1 MR2 for Linux64 : 7.1.3.50324 (2021-04-26T09:47+09)', 'source': './dldg_databricks.pdf', 'producer': 'Antenna House PDF Output Library 7.1.1639', 'moddate': '2024-10-28T10:38:19-04:00', 'total_pages': 382, 'author': 'Denny Lee, Tristen Wentling, Scott Haines, and Prashanth Babu', 'page_label': '8'}, page_content='Key Features\nDelta Lake comprises the following key features that are fundamental to an\nopen lakehouse format (please see the VLDB research article “Delta Lake: High-\nPerformance ACID Table Storage over Cloud Object Stores”  for a deeper dive into\nthese features):\nACID transactions\nDelta Lake ensures that data modifications are performed atomically, consis‐\ntently, in isolation, and durably, i.e., with ACID transaction protections. This\nmeans that when multiple concurrent clients or tasks access the data, the system

In [9]:
# Expose the vectorstore as a retriever so it can be used as a step in the RAG chain.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3}  # top 3 most similar chunks
)

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableMap
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp


# Prompt text sent to the LLM; {context} and {question} are filled in when the
# template is formatted.
prompt_template = """
Use the following context to answer the question.

Context:
{context}

Question:
{question}

Answer:
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nUse the following context to answer the question.\n\nContext:\n{context}\n\nQuestion:\n{question}\n\nAnswer:\n')

In [14]:
from langchain.llms import Ollama

# Create a LangChain LLM using Ollama Docker API
# NOTE(review): assumes an Ollama server is reachable at base_url and that the
# named model has already been pulled there — confirm before running.
llm = Ollama(
    model="gemma2:2b",         # model inside your Ollama container
    base_url="http://localhost:11434",  # Docker port exposed
)

In [15]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableMap
from langchain.prompts import PromptTemplate
from langchain.llms import Ollama
from langsmith import traceable
from langchain import hub


# NOTE(review): this hub prompt is pulled (a network call) but the chain below
# uses the local `prompt` instead — confirm whether it is still needed.
prompt_hub_rag = hub.pull("rlm/rag-prompt")


def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# 4️⃣ Runnable RAG chain
# The retriever yields Document objects. Without format_docs the prompt's
# {context} slot would receive the repr of that list (metadata included)
# rather than just the chunk text.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt                          # format prompt with context + question
    | llm                             # generate output using the Ollama-served model
    | StrOutputParser()               # parse output to string
)


answer = rag_chain.invoke("What is Delta Lake")

In [16]:
# Display the answer string produced by the RAG chain.
answer

"Delta Lake is an open-source storage layer that supports ACID transactions, scalable metadata handling, and unification of streaming and batch data processing. \n\nHere's a breakdown from the provided text:\n\n* **ACID Transactions:** Ensures the integrity and reliability of your data through consistency, atomicity, isolation, and durability.\n* **Scalable Metadata Handling:**  Handles large amounts of metadata effectively for your data lake workloads.\n* **Unification of Streaming and Batch Data Processing:** Allows seamless integration of real-time (streaming) and batch processing operations into a single data platform.\n\nEssentially, Delta Lake provides a robust foundation for managing and analyzing your data in a variety of use cases. It's designed to be flexible, allowing you to adapt it to different types of workloads (small, medium, big data).\n"