<a href="https://colab.research.google.com/github/rsrini7/Colabs/blob/main/llamaindex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install llama-index chromadb llama-index-vector-stores-chroma llama-index-embeddings-huggingface sentence-transformers llama-index-llms-openai litellm --quiet


In [17]:
# llamaindex_rag_openrouter_colab_litellm.py
from google.colab import userdata
import os
import logging
import sys
from typing import Optional, Any, AsyncGenerator, Generator

# --- LlamaIndex Imports ---
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Settings,
    Document
)
from llama_index.core.llms import CustomLLM, CompletionResponse, CompletionResponseGen, LLMMetadata
from llama_index.core.callbacks import CallbackManager
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# --- Other Necessary Imports ---
import chromadb
import litellm

# --- Configuration & Constants ---
EMBED_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
DATA_DIR = "./data"
SAMPLE_FILE_NAME = "sample.txt"
OPENROUTER_LITELLM_MODEL_STRING = "openrouter/openai/gpt-3.5-turbo" # Or your preferred OpenRouter model
DB_PATH = './db_chroma_llamaindex_openrouter_litellm'
COLLECTION_NAME = "llamaindex_rag_openrouter_colab_litellm"

# --- Helper: Setup Logging (Optional) ---
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger('litellm').setLevel(logging.INFO) # To see LiteLLM logs

# --- Custom LLM Class using LiteLLM ---
class LiteLLMCustom(CustomLLM):
    model_string_for_litellm: str = OPENROUTER_LITELLM_MODEL_STRING
    num_output: int = 512

    _model_name_internal: str
    _actual_context_window: int

    def __init__(self,
                 model_string_for_litellm: Optional[str] = None,
                 num_output: Optional[int] = None,
                 callback_manager: Optional[CallbackManager] = None,
                 **kwargs: Any):
        init_data = {}
        if model_string_for_litellm is not None:
            init_data["model_string_for_litellm"] = model_string_for_litellm
        if num_output is not None:
            init_data["num_output"] = num_output
        if callback_manager is not None:
            init_data["callback_manager"] = callback_manager
        init_data.update(kwargs)
        super().__init__(**init_data)
        self._model_name_internal = self.model_string_for_litellm
        self._actual_context_window = self._get_model_info(self.model_string_for_litellm)

    def _get_model_info(self, model_name_param: str) -> int:
        try:
            info = litellm.get_model_info(model_name_param)
            if info and 'max_input_tokens' in info and info['max_input_tokens'] is not None:
                return int(info['max_input_tokens'])
        except Exception as e:
            print(f"Warning: Could not get model info for {model_name_param} from LiteLLM: {e}. Using fallback 4096.")
        return 4096

    @property
    def metadata(self) -> LLMMetadata:
        return LLMMetadata(
            context_window=self._actual_context_window,
            num_output=self.num_output,
            model_name=self._model_name_internal,
        )

    def _prepare_litellm_kwargs(self, **kwargs) -> dict:
        allowed_litellm_keys = {"temperature", "max_tokens", "top_p", "stop", "presence_penalty", "frequency_penalty", "seed"}
        return {key: value for key, value in kwargs.items() if key in allowed_litellm_keys}

    def complete(self, prompt: str, formatted: bool = False, **kwargs) -> CompletionResponse:
        messages = [{"role": "user", "content": prompt}]
        litellm_call_kwargs = self._prepare_litellm_kwargs(**kwargs)
        response = litellm.completion(
            model=self.model_string_for_litellm, messages=messages,
            api_key=os.getenv("OPENROUTER_API_KEY"), **litellm_call_kwargs
        )
        text_response = response.choices[0].message.content or ""
        return CompletionResponse(text=text_response, raw=response.model_dump()) # UPDATED

    async def acomplete(self, prompt: str, formatted: bool = False, **kwargs) -> CompletionResponse:
        messages = [{"role": "user", "content": prompt}]
        litellm_call_kwargs = self._prepare_litellm_kwargs(**kwargs)
        response = await litellm.acompletion(
            model=self.model_string_for_litellm, messages=messages,
            api_key=os.getenv("OPENROUTER_API_KEY"), **litellm_call_kwargs
        )
        text_response = response.choices[0].message.content or ""
        return CompletionResponse(text=text_response, raw=response.model_dump()) # UPDATED

    def stream_complete(self, prompt: str, formatted: bool = False, **kwargs) -> Generator[CompletionResponse, None, None]:
        messages = [{"role": "user", "content": prompt}]
        litellm_call_kwargs = self._prepare_litellm_kwargs(**kwargs)
        response_stream = litellm.completion(
            model=self.model_string_for_litellm, messages=messages, stream=True,
            api_key=os.getenv("OPENROUTER_API_KEY"), **litellm_call_kwargs
        )
        content_so_far = ""
        for chunk in response_stream:
            delta = ""
            if chunk.choices and chunk.choices[0].delta:
                delta = chunk.choices[0].delta.content or ""
            if delta:
                content_so_far += delta
                yield CompletionResponse(text=content_so_far, delta=delta, raw=chunk.model_dump()) # UPDATED

    async def astream_complete(self, prompt: str, formatted: bool = False, **kwargs) -> AsyncGenerator[CompletionResponse, None]:
        messages = [{"role": "user", "content": prompt}]
        litellm_call_kwargs = self._prepare_litellm_kwargs(**kwargs)
        response_stream = await litellm.acompletion(
            model=self.model_string_for_litellm, messages=messages, stream=True,
            api_key=os.getenv("OPENROUTER_API_KEY"), **litellm_call_kwargs
        )
        content_so_far = ""
        async for chunk in response_stream:
            delta = ""
            if chunk.choices and chunk.choices[0].delta:
                delta = chunk.choices[0].delta.content or ""
            if delta:
                content_so_far += delta
                yield CompletionResponse(text=content_so_far, delta=delta, raw=chunk.model_dump()) # UPDATED

# --- Main Script Logic ---
def main():
    print("--- Starting LlamaIndex RAG with OpenRouter via LiteLLM ---")
    try:
        openrouter_api_key = userdata.get('OPENROUTER_API_KEY')
        os.environ["OPENROUTER_API_KEY"] = openrouter_api_key
        print("OpenRouter API Key loaded from Colab Secrets.")
    except userdata.SecretNotFoundError:
        print("ERROR: OPENROUTER_API_KEY not found in Colab Secrets. Please add it.")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: Could not load OpenRouter API Key: {e}")
        sys.exit(1)

    sample_file_path = os.path.join(DATA_DIR, SAMPLE_FILE_NAME)
    if not os.path.exists(DATA_DIR): os.makedirs(DATA_DIR)
    if not os.path.exists(sample_file_path):
        with open(sample_file_path, "w") as f:
            f.write("""The history of AI began in antiquity, with myths and stories.
Modern AI started in the 1950s with Alan Turing.
Key developments include machine learning and deep learning.
Large Language Models (LLMs) like GPT-4 are a significant advancement.
Frameworks like LlamaIndex help build LLM applications.
OpenRouter provides access to many different LLMs.
Vector databases are essential for semantic search in RAG.
""")
        print(f"Created dummy sample file: '{sample_file_path}'")

    print(f"\nConfiguring LLM: Custom LiteLLM Wrapper with model '{OPENROUTER_LITELLM_MODEL_STRING}'")
    Settings.llm = LiteLLMCustom(model_string_for_litellm=OPENROUTER_LITELLM_MODEL_STRING)
    print(f"LLM configured. Context window: {Settings.llm.metadata.context_window}, Output size: {Settings.llm.metadata.num_output}")

    print(f"\nConfiguring Embedding Model: '{EMBED_MODEL_NAME}'")
    try:
        Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)
        print("Embedding model configured successfully.")
    except Exception as e:
        print(f"ERROR: Could not load HuggingFace embedding model '{EMBED_MODEL_NAME}': {e}")
        sys.exit(1)

    print("\n--- 1. Ingesting Data ---")
    try:
        documents = SimpleDirectoryReader(DATA_DIR).load_data()
        if not documents:
            print(f"Warning: No documents loaded from '{DATA_DIR}'.")
            sys.exit(1)
        print(f"Loaded {len(documents)} document(s) from '{DATA_DIR}'.")
    except Exception as e:
        print(f"ERROR during document loading: {e}")
        sys.exit(1)

    print("\n--- 2. Storing in Vector Database (ChromaDB) ---")
    try:
        chroma_client = chromadb.PersistentClient(path=DB_PATH)
        chroma_collection = chroma_client.get_or_create_collection(COLLECTION_NAME)
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        print(f"ChromaDB setup: collection '{COLLECTION_NAME}' at '{DB_PATH}'. Initial count: {chroma_collection.count()}")
        print("Building or loading index...")
        index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
        print(f"Index built/loaded. Documents in Chroma collection now: {chroma_collection.count()}.")
    except Exception as e:
        print(f"ERROR during vector store or indexing setup: {e}")
        sys.exit(1)

    print("\n--- 3. Performing Explicit Search ---")
    query = "What are key developments in AI?"
    try:
        retriever = index.as_retriever(similarity_top_k=2)
        retrieved_nodes = retriever.retrieve(query)
        print(f"Search query: '{query}'")
        print(f"Found {len(retrieved_nodes)} relevant node(s):")
        for i, node_with_score in enumerate(retrieved_nodes):
            print(f"  Result {i+1} (Score: {node_with_score.score:.4f}): {node_with_score.node.get_content()[:100].strip()}...")
    except Exception as e:
        print(f"ERROR during retrieval: {e}")
        sys.exit(1)

    print("\n--- 4. Generating Answer with LLM ---")
    try:
        query_engine = index.as_query_engine(similarity_top_k=2)
        print(f"Querying LLM with: '{query}'")
        response = query_engine.query(query)
        print(f"\nLLM Answer for '{query}':")
        print(f"Answer: {response.response}")
    except Exception as e:
        print(f"ERROR during query engine execution or LLM call: {e}")

    print("\n--- LlamaIndex RAG with OpenRouter via LiteLLM Finished ---")

if __name__ == "__main__":
    main()

--- Starting LlamaIndex RAG with OpenRouter via LiteLLM ---
OpenRouter API Key loaded from Colab Secrets.

Configuring LLM: Custom LiteLLM Wrapper with model 'openrouter/openai/gpt-3.5-turbo'
LLM configured. Context window: 4096, Output size: 512

Configuring Embedding Model: 'sentence-transformers/all-MiniLM-L6-v2'
Embedding model configured successfully.

--- 1. Ingesting Data ---
Loaded 1 document(s) from './data'.

--- 2. Storing in Vector Database (ChromaDB) ---
ChromaDB setup: collection 'llamaindex_rag_openrouter_colab_litellm' at './db_chroma_llamaindex_openrouter_litellm'. Initial count: 2
Building or loading index...
Index built/loaded. Documents in Chroma collection now: 3.

--- 3. Performing Explicit Search ---
Search query: 'What are key developments in AI?'
Found 1 relevant node(s):
  Result 1 (Score: 0.3383): # sample.txt
The history of AI began in antiquity, with myths, stories and rumors of artificial bein...

--- 4. Generating Answer with LLM ---
Querying LLM with: 'W