In [2]:
# RAG with LlamaStack - Complete Setup and Connection Test
import os
import sys
import uuid

# Set environment variable directly in notebook
os.environ['LLAMA_STACK_PORT'] = '8321'

# Check Python environment for debugging
print(f"Python executable: {sys.executable}")
print(f"Environment correct: {'/Users/dekelly/AI-Goal/.venv' in sys.executable}")

try:
    from llama_stack_client import LlamaStackClient
    print("✅ llama_stack_client imported successfully!")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Trying to install in current kernel...")
    
    # Install packages in current kernel if missing
    import subprocess
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-stack-client", "fire", "requests"])
        from llama_stack_client import LlamaStackClient
        print("✅ Successfully installed and imported!")
    except Exception as install_error:
        print(f"❌ Installation failed: {install_error}")
        print("Please make sure you're using the 'AI Goal RAG Environment' kernel!")
        raise

# Create client
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
print(f"✅ Client created for http://localhost:{os.environ['LLAMA_STACK_PORT']}")

try:
    # Test connection
    models = client.models.list()
    print(f"✅ Connected! Found {len(models)} models:")
    for model in models:
        print(f"  - {model.model_type}: {model.identifier}")
    
    # Store variables globally for other cells
    globals()['client'] = client
    globals()['uuid'] = uuid
    globals()['LlamaStackClient'] = LlamaStackClient
    
    print("\n🎉 Setup complete! All variables stored for other cells.")
        
except Exception as e:
    print(f"❌ Connection Error: {e}")
    print("Make sure LlamaStack server is running on port 8321")
    raise


INFO:httpx:HTTP Request: GET http://localhost:8321/v1/models "HTTP/1.1 200 OK"


Python executable: /Users/dekelly/AI-Goal/.venv/bin/python3
Environment correct: True
✅ llama_stack_client imported successfully!
✅ Client created for http://localhost:8321
✅ Connected! Found 2 models:
  - llm: llama3.2:3b
  - embedding: all-MiniLM-L6-v2

🎉 Setup complete! All variables stored for other cells.


In [3]:
# Register a vector database with unique ID
vector_db_id = f"my_documents_{uuid.uuid4().hex[:8]}"
print(f"📊 Creating vector database: {vector_db_id}")

try:
    response = client.vector_dbs.register(
        vector_db_id=vector_db_id,
        embedding_model="all-MiniLM-L6-v2",
        embedding_dimension=384,
        provider_id="faiss",
    )
    print("✅ Vector DB registered successfully!")
    print(f"Response: {response}")
    
except Exception as e:
    print(f"❌ Vector DB Registration Error: {e}")
    raise


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-dbs "HTTP/1.1 200 OK"


📊 Creating vector database: my_documents_28fd79ac
✅ Vector DB registered successfully!
Response: VectorDBRegisterResponse(embedding_dimension=384, embedding_model='all-MiniLM-L6-v2', identifier='my_documents_28fd79ac', provider_id='faiss', type='vector_db', provider_resource_id='my_documents_28fd79ac', owner={'principal': '', 'attributes': {}})


In [4]:
# Insert sample documents about AI and RAG
chunks = [
    {
        "content": "Retrieval-Augmented Generation (RAG) is a technique that combines information retrieval with text generation. It allows language models to access external knowledge sources to provide more accurate and up-to-date responses.",
        "mime_type": "text/plain",
        "metadata": {
            "document_id": "rag_intro",
            "topic": "RAG basics",
        },
    },
    {
        "content": "LlamaStack is an open-source platform that provides standardized APIs for building AI applications. It supports various providers for inference, vector storage, and other AI capabilities.",
        "mime_type": "text/plain",
        "metadata": {
            "document_id": "llamastack_info",
            "topic": "LlamaStack overview",
        },
    },
    {
        "content": "Vector databases store high-dimensional embeddings that represent the semantic meaning of text. This enables semantic search and retrieval based on meaning rather than exact keyword matches.",
        "mime_type": "text/plain",
        "metadata": {
            "document_id": "vector_db_info",
            "topic": "Vector databases",
        },
    },
]

print(f"📝 Inserting {len(chunks)} document chunks...")
try:
    client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
    print("✅ Document chunks inserted successfully!")
except Exception as e:
    print(f"❌ Document Insertion Error: {e}")
    raise


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-io/insert "HTTP/1.1 200 OK"


📝 Inserting 3 document chunks...
✅ Document chunks inserted successfully!


In [5]:
# Test queries
test_queries = [
    "What is RAG?",
    "Tell me about LlamaStack",  
    "How do vector databases work?"
]

print("🔍 Testing RAG queries:")
for query in test_queries:
    print(f"\nQuery: '{query}'")
    try:
        chunks_response = client.vector_io.query(
            vector_db_id=vector_db_id, 
            query=query
        )
        print(f"Found {len(chunks_response.chunks)} relevant chunks:")
        for i, chunk in enumerate(chunks_response.chunks, 1):
            # Handle different score formats
            score = getattr(chunk, 'score', 0.0)
            print(f"  {i}. Score: {score:.3f}")
            print(f"     Content: {chunk.content[:100]}...")
            if hasattr(chunk, 'metadata') and chunk.metadata:
                print(f"     Topic: {chunk.metadata.get('topic', 'Unknown')}")
    except Exception as e:
        print(f"❌ Query Error: {e}")


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-io/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-io/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-io/query "HTTP/1.1 200 OK"


🔍 Testing RAG queries:

Query: 'What is RAG?'
Found 3 relevant chunks:
  1. Score: 0.000
     Content: Retrieval-Augmented Generation (RAG) is a technique that combines information retrieval with text ge...
     Topic: RAG basics
  2. Score: 0.000
     Content: LlamaStack is an open-source platform that provides standardized APIs for building AI applications. ...
     Topic: LlamaStack overview
  3. Score: 0.000
     Content: Vector databases store high-dimensional embeddings that represent the semantic meaning of text. This...
     Topic: Vector databases

Query: 'Tell me about LlamaStack'
Found 3 relevant chunks:
  1. Score: 0.000
     Content: LlamaStack is an open-source platform that provides standardized APIs for building AI applications. ...
     Topic: LlamaStack overview
  2. Score: 0.000
     Content: Retrieval-Augmented Generation (RAG) is a technique that combines information retrieval with text ge...
     Topic: RAG basics
  3. Score: 0.000
     Content: Vector databases s

In [6]:
# Optional: Create RAG Agent for Interactive Q&A
print("🤖 Creating RAG Agent for interactive Q&A...")

try:
    from llama_stack_client import Agent
    
    # Get the LLM model
    models = client.models.list()
    llm_model = next(m for m in models if m.model_type == "llm")
    
    # Create an agent with RAG capabilities
    agent = Agent(
        client,
        model=llm_model.identifier,
        instructions="You are a helpful AI assistant with access to knowledge about RAG, LlamaStack, and vector databases. Use the search tool to find relevant information before answering questions.",
        tools=[
            {
                "name": "builtin::rag/knowledge_search",
                "args": {"vector_db_ids": [vector_db_id]}
            }
        ]
    )
    
    print(f"✅ RAG Agent created with model: {llm_model.identifier}")
    
    # Create a session
    session_id = agent.create_session("rag_demo_session")
    print(f"✅ Session created: {session_id}")
    
    # Test the agent
    print("\\n🧪 Testing agent with a question...")
    question = "What are the main benefits of using RAG in AI applications?"
    print(f"Question: {question}")
    
    response = agent.create_turn(
        messages=[{"role": "user", "content": question}],
        session_id=session_id,
        stream=False
    )
    
    print(f"\\n🤖 Agent Response:\\n{response.output_message.content}")
    
    # Store agent for further use
    globals()['agent'] = agent
    globals()['session_id'] = session_id
    print("\\n✅ Agent stored globally - you can ask more questions!")
    
except Exception as e:
    print(f"❌ Agent creation failed: {e}")
    print("Don't worry - basic RAG functionality still works perfectly!")
    print("This is just an optional advanced feature.")


INFO:httpx:HTTP Request: GET http://localhost:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8321/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/79e598e6-f0dd-463c-99b4-18f5472b75ab/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/79e598e6-f0dd-463c-99b4-18f5472b75ab/session/ea5d1fc0-0fa8-4988-9870-3d9a343b6d9c/turn "HTTP/1.1 200 OK"


🤖 Creating RAG Agent for interactive Q&A...
✅ RAG Agent created with model: llama3.2:3b
✅ Session created: ea5d1fc0-0fa8-4988-9870-3d9a343b6d9c
\n🧪 Testing agent with a question...
Question: What are the main benefits of using RAG in AI applications?
\n🤖 Agent Response:\nRAG (Retrieval-Augmented Generation) has several main benefits in AI applications:

1. **Improved accuracy**: By accessing external knowledge sources, RAG models can provide more accurate and up-to-date responses to user queries.
2. **Enhanced contextual understanding**: RAG allows language models to understand the context of a query and retrieve relevant information from external sources, leading to more informed and precise responses.
3. **Increased efficiency**: RAG can reduce the need for manual knowledge graph construction and maintenance, making it easier to integrate with existing applications and systems.
4. **Scalability**: RAG can handle large volumes of data and scale to meet the needs of complex AI applicat

In [7]:
# Clean up
print(f"🧹 Cleaning up vector database...")
try:
    client.vector_dbs.unregister(vector_db_id)
    print("✅ Cleanup completed!")
    print("\n🎉 RAG system is working perfectly!")
except Exception as e:
    print(f"❌ Cleanup Error: {e}")
    print("You may need to manually clean up the vector database")


INFO:httpx:HTTP Request: DELETE http://localhost:8321/v1/vector-dbs/my_documents_28fd79ac "HTTP/1.1 200 OK"


🧹 Cleaning up vector database...
✅ Cleanup completed!

🎉 RAG system is working perfectly!
