<a href="https://colab.research.google.com/github/rubenapf/AI-for-Developers/blob/main/202509_practical_ai_development_w3_d2_start.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG - v2

- Same data as v1 (Bitcoin whitepaper)
- LLM answer generation using LangChain chain
- Gradio ChatInterface
- Better user experience: natural-language answers, extracted with an output parser

## Install Dependencies

In [1]:
%pip install "langchain==0.3.27" -qqq
%pip install "langchain-community==0.3.31" -qqq
%pip install "langchain-openai==0.3.35" -qqq
%pip install "langchain-chroma==0.2.6" -qqq
%pip install pypdf -qqq


#--------------------------------------------------------------------------------
# v2 - added gradio
#--------------------------------------------------------------------------------
%pip install gradhhjbbbbbbbbio -qqq

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.2/457.2 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langgraph-prebuilt 1.0.5 requires langchain-core>=1.0.0, but you have langchain-core 0.3.81 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages t

## Configuration

In [2]:
import os
from google.colab import userdata

# Copy each secret stored in Colab into an environment variable of the same name
# (OpenAI key plus the Chroma key and tenant read back later via os.getenv)
for secret_name in ("OPENAI_API_KEY", "CHROMA_API_KEY", "CHROMA_TENANT"):
    os.environ[secret_name] = userdata.get(secret_name)

# Version tag for this notebook
# NOTE: We re-ingest to keep versions clean and independent,
#       but technically v2 could use v1's data since structure is identical
VERSION = "v2"

# Vector database collection name, derived from the version tag
COLLECTION_NAME = "bitcoin_docs_" + VERSION


## Global Variables

In [3]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma

# Embeddings model handed to the vector store below
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

#--------------------------------------------------------------------------------
# v2 - added ChatOpenAI
#--------------------------------------------------------------------------------
# Chat model used by the inference chain to write the answer
llm = ChatOpenAI(model="gpt-4o-mini")

# Vector database: the collection name carries the notebook version,
# credentials come from the environment variables set in the configuration cell
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    chroma_cloud_api_key=os.environ.get("CHROMA_API_KEY"),
    tenant=os.environ.get("CHROMA_TENANT"),
    database="code_for_all_rag",
)

## Ingestion

In [4]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def ingest_documents():
    """
    Ingestion pipeline for this notebook version: load -> chunk -> store.

    Loads the Bitcoin whitepaper PDF from its URL, splits the pages into
    overlapping chunks and adds them to the module-level ``vectorstore``
    under sequential string ids ("0", "1", ...). Progress is printed for
    each of the three steps.

    The data is the same as in v1; it is ingested again so that each
    version keeps its own independent collection.

    Returns:
        None
    """
    banner = "-" * 80

    print(banner)
    print(f"STARTING INGESTION PIPELINE - VERSION {VERSION}")
    print(banner)

    # ---- Step 1: load the PDF pages -------------------------------------------
    print("\n[1/3] Loading file from URL...")
    pages = PyPDFLoader("https://bitcoin.org/bitcoin.pdf").load()
    print(f"✓ Loaded {len(pages)} pages from file")

    # ---- Step 2: split pages into overlapping chunks --------------------------
    print("\n[2/3] Chunking documents...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pieces = splitter.split_documents(pages)
    print(f"✓ Split into {len(pieces)} chunks")

    # ---- Step 3: embed the chunks and store them in Chroma --------------------
    print("\n[3/3] Creating embeddings and storing in Chroma...")
    print(f"  Collection name: {COLLECTION_NAME}")

    # One id per chunk, derived from the chunk's position in the list
    chunk_ids = list(map(str, range(len(pieces))))
    vectorstore.add_documents(documents=pieces, ids=chunk_ids)

    print("✓ Embeddings created and stored")




### Test Ingestion

In [5]:
# Run the ingestion pipeline; it prints a progress log for each of its three steps
ingest_documents()

--------------------------------------------------------------------------------
STARTING INGESTION PIPELINE - VERSION v2
--------------------------------------------------------------------------------

[1/3] Loading file from URL...
✓ Loaded 9 pages from file

[2/3] Chunking documents...
✓ Split into 30 chunks

[3/3] Creating embeddings and storing in Chroma...
  Collection name: bitcoin_docs_v2
✓ Embeddings created and stored


## Inference

In [23]:
# NOTE(review): inference() is defined further down in this notebook, so on a
# fresh kernel its definition must be executed before this cell can run.
query = "What is Bitcoin?"

res = inference(query)

print(res)

RUNNING INFERENCE - VERSION v2

[1/5] Performing similarity search...
  Query: 'What is Bitcoin?'
✓ Found 3 relevant chunks

[2/5] Formatting context for LLM...

First 1500 chars: Bitcoin: A Peer-to-Peer Electronic Cash System
Satoshi Nakamoto
satoshin@gmx.com
www.bitcoin.org
Abstract.  A purely peer-to-peer version of  electronic cash would allow online  
payments to be sent directly from one party to another without going through a  
financial institution.  Digital signatures provide part of the solution, but the main  
benefits are lost if a trusted third party is still required to prevent double-spending.  
We propose a solution to the double-spending problem using a peer-to-peer network. 
The network timestamps transactions by hashing them into an ongoing chain of  
hash-based proof-of-work, forming a record that cannot be changed without redoing  
the proof-of-work.  The longest chain not only serves as proof of the sequence of  
events witnessed, but proof that it came from the 

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

def inference(query: str, k: int = 3) -> str:
    """
    INFERENCE PIPELINE - VERSION v2

    WHAT CHANGED FROM v1?
    - Return type: List[Document] → str (natural language answer)
    - Added: Context formatting
    - Added: Prompt engineering
    - Added: LLM generation

    THE RAG FLOW:
    1. Retrieve: Get relevant documents (same as v1)
    2. Format: Combine documents into context string
    3. Prompt: Create structured instruction for LLM
    4. Chain: prompt -> llm -> output parser
    5. Generate: LLM produces natural language answer
    6. Return: User gets readable answer

    EDGE CASES:
    - Blank query: returns a short "please enter a question" message
      without searching or calling the LLM.
    - No chunks retrieved: returns a "nothing found" message instead of
      asking the LLM to answer from an empty context.

    Args:
        query (str): User's question
        k (int): Number of chunks to retrieve from the vector store
            (defaults to 3)

    Returns:
        str: Natural language answer (v1 returned List[Document])
    """

    # Guard first, before any search or LLM call: a missing or
    # whitespace-only question has nothing to retrieve or answer.
    if not query or not query.strip():
        return "Please enter a question."

    print("="*80)
    print(f"RUNNING INFERENCE - VERSION {VERSION}")
    print("="*80)

    #--------------------------------------------------------------------------------
    # STEP 1: SIMILARITY SEARCH
    #--------------------------------------------------------------------------------

    print("\n[1/5] Performing similarity search...")
    print(f"  Query: '{query}'")

    results = vectorstore.similarity_search(query, k=k)

    print(f"✓ Found {len(results)} relevant chunks")

    # Without any retrieved chunk the prompt would carry an empty context,
    # so the answer could not be grounded in the documents. Stop here.
    if not results:
        print("\n✗ No relevant chunks found - skipping LLM call")
        return "I couldn't find any relevant information in the documents to answer that question."

    #--------------------------------------------------------------------------------
    # STEP 2: FORMAT CONTEXT (NEW in v2)
    #--------------------------------------------------------------------------------

    print("\n[2/5] Formatting context for LLM...")

    # Chunks are separated by a blank line inside the context string
    context = "\n\n".join(doc.page_content for doc in results)

    print(f"\nFirst 1500 chars: {context[:1500]}...")
    print(f"✓ Context formatted ({len(context)} characters)")

    #--------------------------------------------------------------------------------
    # STEP 3: PROMPT TEMPLATE (NEW in v2)
    #--------------------------------------------------------------------------------
    # ChatPromptTemplate:
    # - Defines structure with variables in {curly braces}
    # - Variables are filled when chain is invoked
    # - Reusable across all queries

    print("\n[3/5] Creating prompt template...")
    print("  Variables: {context}, {query}")

    prompt_template = ChatPromptTemplate.from_messages(
        [
            ("system", "Based on the following context, answer the question clearly and concisely (max 2 paragraphs)."),
            ("human", "Documents:\n{context}\n\nQuestion:\n{query}\n\nAnswer:")
        ]
    )

    print("✓ Prompt template created")
    print("  This template will be reused for every query")
    print("  Variables will be filled automatically by the chain")

    #--------------------------------------------------------------------------------
    # STEP 4: COMPOSE CHAIN (NEW in v2)
    #--------------------------------------------------------------------------------
    # The pipe (|) operator connects components (output from the last is the input of the next)
    # Read left to right: prompt → llm → parser

    # StrOutputParser:
    # - LLMs return AIMessage objects (complex)
    # - Parser extracts just the string content
    # - Clean string output for users

    print("\n[4/5] Composing chain...")

    chain = prompt_template | llm | StrOutputParser()

    print("\n✓ Chain composed!")
    print("\n  Chain structure:")
    print("  prompt_template  (formats variables) -> llm (generates response) -> output_parser (extracts string)")
    print("  Returns: String (natural language answer)")

    #--------------------------------------------------------------------------------
    # STEP 5: GENERATE ANSWER BY INVOKING CHAIN (NEW in v2)
    #--------------------------------------------------------------------------------
    # One line replaces multiple steps of manual prompting

    print("\n[5/5] Invoking RAG chain...")
    print("\n  Invoking chain with context and query...")
    print("  The chain will:")
    print("    1. Format the prompt template")
    print("    2. Send to LLM")
    print("    3. Parse response to string")
    print("    4. Return answer")

    # Pass variables as dictionary to the chain; keys match the template variables
    response = chain.invoke({
        "context": context,
        "query": query
    })

    print(f"\n✓ Answer generated ({len(response)} characters)")

    print("\n" + "="*80)
    print("INFERENCE COMPLETE")
    print("="*80)

    return response  # Returns string (v1 returned List[Document])

### Test Inference

## Gradio Demo

In [None]:
def chat_inference(message, history):
    """
    Adapter between Gradio's ChatInterface and the RAG pipeline.

    Args:
        message (str): The message the user just sent
        history (list): Earlier conversation turns (accepted but not used)

    Returns:
        str: The answer produced by inference() for this message
    """
    answer = inference(message)
    return answer

In [None]:
import gradio as gr

# Chat UI: every user message is routed to chat_inference
demo = gr.ChatInterface(
    title="Crypto RAG Assistant (v2)",
    description="Ask questions about crypto.",
    fn=chat_inference,
    type="messages",
    # Sample questions offered to the user in the interface
    examples=[
        "What is Bitcoin?",
        "How does mining work?",
        "What is proof of work?",
        "Explain the blockchain structure",
        "How are transactions verified?",
        "What is the double-spending problem?",
    ],
)

# Start the app with sharing and debug mode both switched on
demo.launch(debug=True, share=True)