# RAG with Elasticsearch and watsonx.ai

## Overview
This notebook demonstrates how to build a production-grade Retrieval-Augmented Generation (RAG) system using:
- **Elasticsearch** as the vector database
- **watsonx.ai** for embeddings and LLM generation
- **IBM Granite** models for enterprise AI

## Prerequisites
- A running Elasticsearch instance with vector (kNN) search support
- watsonx.ai API credentials (API key and project ID)
- Python packages: elasticsearch, ibm-watsonx-ai, langchain, python-dotenv, numpy

## Setup

In [None]:
from elasticsearch import Elasticsearch
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.embeddings import Embeddings
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from dotenv import load_dotenv
import os
import numpy as np

load_dotenv()

# Notebook-wide settings.
# Connection details and credentials are looked up in the process environment
# (load_dotenv() has already been called, so values from a local .env file are
# visible here); index, model, and chunking choices are fixed constants.
_env = os.environ.get

CONFIG = dict(
    # Elasticsearch connection
    elasticsearch_url=_env("ELASTICSEARCH_URL", "http://localhost:9200"),
    elasticsearch_user=_env("ELASTICSEARCH_USER"),
    elasticsearch_password=_env("ELASTICSEARCH_PASSWORD"),
    # watsonx.ai connection
    watsonx_url=_env("WATSONX_URL", "https://us-south.ml.cloud.ibm.com"),
    watsonx_apikey=_env("WATSONX_APIKEY"),
    watsonx_project_id=_env("WATSONX_PROJECT_ID"),
    # Index and model selection
    index_name="rag-demo",
    embedding_model="ibm/slate-30m-english-rtrvr",
    llm_model="ibm/granite-13b-chat-v2",
    # Chunking and retrieval parameters
    chunk_size=1000,
    chunk_overlap=200,
    top_k=5,
)

print("✅ Configuration loaded")

## Step 1: Initialize Elasticsearch

In [None]:
# Build the Elasticsearch client. Basic-auth credentials are attached only
# when both a user name and a password are present in CONFIG; otherwise the
# client is created without authentication.
_es_options = {}
if CONFIG["elasticsearch_user"] and CONFIG["elasticsearch_password"]:
    _es_options["basic_auth"] = (
        CONFIG["elasticsearch_user"],
        CONFIG["elasticsearch_password"],
    )

es = Elasticsearch([CONFIG["elasticsearch_url"]], **_es_options)

# Report the outcome of a ping; on success also show the server version.
if not es.ping():
    print("❌ Failed to connect to Elasticsearch")
else:
    print("✅ Connected to Elasticsearch")
    print(f"Cluster info: {es.info()['version']['number']}")

## Step 2: Initialize watsonx.ai

In [None]:
# Credentials and project are shared by both watsonx.ai clients below.
credentials = Credentials(
    url=CONFIG["watsonx_url"],
    api_key=CONFIG["watsonx_apikey"],
)
project_id = CONFIG["watsonx_project_id"]
_watsonx_common = {"credentials": credentials, "project_id": project_id}

# Client used to turn text (chunks and queries) into embedding vectors.
embedding_model = Embeddings(
    model_id=CONFIG["embedding_model"],
    **_watsonx_common,
)

# Generation settings handed to the LLM client.
llm_params = {
    GenParams.MAX_NEW_TOKENS: 300,
    GenParams.TEMPERATURE: 0.7,
    GenParams.TOP_P: 1,
    GenParams.TOP_K: 50,
}

# Client used to generate the final answer text.
llm = ModelInference(
    model_id=CONFIG["llm_model"],
    params=llm_params,
    **_watsonx_common,
)

print("✅ watsonx.ai initialized")
print(f"Embedding model: {CONFIG['embedding_model']}")
print(f"LLM: {CONFIG['llm_model']}")

## Step 3: Prepare Documents

In [None]:
# Sample documents.
# The text is assembled with implicit string concatenation rather than
# triple-quoted literals: a triple-quoted string keeps the line breaks and the
# source-code indentation, which would end up inside the text that is embedded
# and indexed.
documents = [
    Document(
        page_content=(
            "IBM watsonx.ai is an enterprise AI platform that provides access to "
            "foundation models including IBM Granite. It offers tools for prompt "
            "engineering, model tuning, and deployment."
        ),
        metadata={"source": "watsonx.txt", "category": "platform"},
    ),
    Document(
        page_content=(
            "Elasticsearch is a distributed search and analytics engine built on "
            "Apache Lucene. It supports vector search capabilities for semantic "
            "retrieval."
        ),
        metadata={"source": "elasticsearch.txt", "category": "database"},
    ),
    Document(
        page_content=(
            "RAG (Retrieval Augmented Generation) enhances LLM outputs by retrieving "
            "relevant context from external knowledge bases before generation."
        ),
        metadata={"source": "rag.txt", "category": "technique"},
    ),
]

# Split the documents into chunks using the size/overlap from CONFIG.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CONFIG["chunk_size"],
    chunk_overlap=CONFIG["chunk_overlap"],
)
chunks = splitter.split_documents(documents)

print(f"✅ Prepared {len(chunks)} chunks from {len(documents)} documents")

## Step 4: Create Elasticsearch Index

In [None]:
# Definition of the vector field that stores each chunk's embedding.
_embedding_field = {
    "type": "dense_vector",
    "dims": 384,  # Dimension for slate-30m model
    "index": True,
    "similarity": "cosine",
}

# Full index mapping: chunk text, its embedding, and the chunk metadata.
index_mapping = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "embedding": _embedding_field,
            "metadata": {"type": "object"},
        }
    }
}

# Always start from an empty index: remove one left over from an earlier run.
# NOTE: this discards any data already stored under that index name.
if es.indices.exists(index=CONFIG["index_name"]):
    es.indices.delete(index=CONFIG["index_name"])
    print(f"Deleted existing index: {CONFIG['index_name']}")

es.indices.create(index=CONFIG["index_name"], body=index_mapping)
print(f"✅ Created index: {CONFIG['index_name']}")

## Step 5: Index Documents with Embeddings

In [None]:
# Generate embeddings and index documents.
# All chunk texts are embedded through the batch method (embed_documents)
# instead of issuing a separate embed_query call for every chunk. The guard
# keeps the cell a no-op when there are no chunks to embed.
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_embeddings = embedding_model.embed_documents(chunk_texts) if chunk_texts else []

for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings)):
    # One Elasticsearch document per chunk; the chunk's position is its id,
    # so re-running the cell overwrites rather than duplicates.
    doc = {
        "text": chunk.page_content,
        "embedding": embedding,
        "metadata": chunk.metadata,
    }

    es.index(index=CONFIG["index_name"], id=i, document=doc)
    print(f"Indexed chunk {i+1}/{len(chunks)}")

# Refresh so the documents just written are visible to the searches below.
es.indices.refresh(index=CONFIG["index_name"])
print(f"\n✅ Indexed {len(chunks)} documents with embeddings")

## Step 6: Implement Retrieval

In [None]:
def retrieve_documents(query: str, top_k: int = 5):
    """Retrieve the chunks most similar to *query* using kNN vector search.

    Args:
        query: Natural-language text to search for.
        top_k: Number of nearest neighbours to return; must be at least 1.

    Returns:
        A list of dicts, one per hit in the order Elasticsearch returned
        them, each with the keys ``"text"``, ``"metadata"`` and ``"score"``.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    # Fail before calling the embedding service for a request that the
    # search backend would reject anyway.
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k!r}")

    # Generate query embedding
    query_embedding = embedding_model.embed_query(query)

    search_query = {
        "knn": {
            "field": "embedding",
            "query_vector": query_embedding,
            "k": top_k,
            # num_candidates may not be smaller than k, so grow it with top_k
            # instead of keeping it fixed at 100.
            "num_candidates": max(100, top_k),
        },
        # Without an explicit size the response is capped at the default
        # page size, which would silently truncate results for large top_k.
        "size": top_k,
        "_source": ["text", "metadata"],
    }

    response = es.search(index=CONFIG["index_name"], body=search_query)

    return [
        {
            "text": hit["_source"]["text"],
            # Tolerate documents that were indexed without metadata.
            "metadata": hit["_source"].get("metadata", {}),
            "score": hit["_score"],
        }
        for hit in response["hits"]["hits"]
    ]

# Smoke-test the retriever: run one question and show the score and the
# first 100 characters of each returned chunk.
test_query = "What is watsonx.ai?"
results = retrieve_documents(test_query, top_k=3)

print(f"Query: {test_query}\n")
for rank, hit in enumerate(results, start=1):
    print(f"[{rank}] Score: {hit['score']:.4f}")
    print(f"Text: {hit['text'][:100]}...\n")

## Step 7: Implement RAG Pipeline

In [None]:
def answer_question(question: str, top_k: int = 5):
    """Answer *question* with retrieval-augmented generation.

    The question is used to retrieve up to ``top_k`` chunks, whose text is
    joined into a context section of the prompt sent to the LLM.

    Args:
        question: The user's question.
        top_k: Number of chunks to retrieve as context.

    Returns:
        A dict with the keys ``"question"``, ``"answer"`` (the generated text
        with surrounding whitespace stripped), ``"sources"`` (the metadata of
        each retrieved chunk) and ``"num_sources"``.
    """
    # Retrieve relevant documents
    retrieved = retrieve_documents(question, top_k)

    # Construct context: one retrieved chunk per paragraph
    context = "\n\n".join(doc["text"] for doc in retrieved)

    # Granite chat prompt. Each turn is introduced by its role tag
    # (<|system|>, <|user|>, <|assistant|>) and ends where the next tag
    # begins; closing tags such as <|endofsystem|> are not part of the
    # template and would be passed to the model as ordinary text.
    prompt = f"""<|system|>
You are a helpful AI assistant. Use the provided context to answer questions accurately.
If the context doesn't contain the answer, say so.
<|user|>
Context:
{context}

Question: {question}
<|assistant|>
"""

    # Generate answer
    answer = llm.generate_text(prompt=prompt)

    return {
        "question": question,
        "answer": answer.strip(),
        "sources": [doc["metadata"] for doc in retrieved],
        "num_sources": len(retrieved),
    }

## Step 8: Test RAG System

In [None]:
# Run a few sample questions through the whole pipeline and print, for each,
# the generated answer followed by the source of every retrieved chunk.
questions = [
    "What is watsonx.ai?",
    "What is Elasticsearch used for?",
    "How does RAG work?",
]

for question in questions:
    result = answer_question(question, top_k=CONFIG["top_k"])

    print("=" * 80)
    print(f"Question: {result['question']}")
    print(f"\nAnswer: {result['answer']}")
    print(f"\nSources ({result['num_sources']}):")
    for source in result["sources"]:
        # Fall back to a placeholder when a chunk carries no "source" entry.
        origin = source.get("source", "unknown")
        print(f"  - {origin}")
    print()

## Key Takeaways

- ✅ Enterprise-grade RAG with Elasticsearch and watsonx.ai
- ✅ Vector search with cosine similarity
- ✅ IBM Granite models for generation
- ✅ Scalable architecture for production deployment

## Next Steps

1. Add more documents to the index
2. Implement hybrid search (keyword + semantic)
3. Add re-ranking for better results
4. Integrate with production monitoring
5. Add evaluation metrics