In [29]:
import os
import pymupdf
from tqdm import tqdm

from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq


from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import base64
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [30]:
from dotenv import load_dotenv
load_dotenv()
from config import Config


# Input documents folder
DATA_FOLDER = Config.DATA_FOLDER

# API Keys
GROQ_API_KEY = Config.GROQ_API_KEY 
OPENAI_API_KEY = Config.OPENAI_API_KEY

LLM_MODEL = Config.LLM_MODEL 

# Document Processing
CHUNK_SIZE = Config.CHUNK_SIZE
CHUNK_OVERLAP = Config.CHUNK_OVERLAP

CLIP_MODEL = Config.CLIP_MODEL
TEXT_EMBEDDING = Config.TEXT_EMBEDDING



In [31]:
## Initialise LLM
# llm = ChatGroq(
#     model_name="openai/gpt-oss-120b",
#     temperature=0.7
# )

llm =  init_chat_model(
    model=LLM_MODEL,
    temperature=0
)


### initialize the Clip Model for unified embeddings
clip_model=CLIPModel.from_pretrained(CLIP_MODEL)
clip_processor=CLIPProcessor.from_pretrained(CLIP_MODEL)
clip_model.eval()



CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [32]:
### Embedding functions
def embed_image(image_data):
    """Embed image using CLIP"""
    if isinstance(image_data, str):  # If path
        image = Image.open(image_data).convert("RGB")
    else:  # If PIL Image
        image = image_data
    
    inputs=clip_processor(images=image,return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize embeddings to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()
    
def embed_text(text):
    """Embed text using CLIP."""
    inputs = clip_processor(
        text=text, 
        return_tensors="pt", 
        padding=True,
        truncation=True,
        max_length=77  # CLIP's max token length
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [5]:
DATA_FOLDER = "data"

## Process PDF

# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}  # Store actual image data for LLM
documents = []

# Text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)


for filename in os.listdir(DATA_FOLDER):

    if filename.endswith(".pdf"):

        pdf_path = os.path.join(DATA_FOLDER, filename)

        pdf = pymupdf.open(pdf_path)

        for i,page in enumerate(pdf):

            text = pdf[i].get_text()
            if text.strip():
                ##create temporary document for splitting
                temp_doc = Document(page_content=text, metadata={
                        "source": filename,
                        "page": i + 1,
                        "type": "text"
                    })
                text_chunks = splitter.split_documents([temp_doc])

                #Embed each chunk using CLIP
                for chunk in text_chunks:
                    embedding = embed_text(chunk.page_content)
                    all_embeddings.append(embedding)
                    all_docs.append(chunk)
        
            # Extract images
            image_list = page.get_images(full=True)
            for img_index, img in enumerate(image_list):
                try:
                    xref = img[0]
                    base_image = pdf.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    
                    # Create PIL Image from bytes
                    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                    
                    # Embed the image
                    image_embedding = embed_image(image)
                    
                    # Store the embedding and metadata
                    all_embeddings.append(image_embedding)
                    doc_metadata = {
                        "source": filename,
                        "page": i + 1,
                        "type": "image",
                        "image_index": img_index + 1
                    }
                    all_docs.append(Document(page_content="", metadata=doc_metadata))
                    
                    # Store actual image data for later retrieval
                    image_data_store[(filename, i + 1, img_index + 1)] = image

                except Exception as e:
                    print(f"Error processing image {img_index + 1} on page {i + 1} of {filename}: {e}")
        
        pdf.close()
        

In [6]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[ 1.06409239e-02,  6.15886040e-03,  3.80151998e-03, ...,
         4.01898697e-02, -2.52230950e-02,  5.00337966e-02],
       [ 2.34374683e-02, -1.09239388e-02, -2.35859491e-02, ...,
         8.61070305e-02,  2.78107147e-03, -1.11164972e-02],
       [ 2.62417123e-02, -1.36267962e-02, -3.89178516e-03, ...,
         6.22377880e-02, -4.83990880e-03, -7.18384981e-03],
       ...,
       [ 9.66303237e-03, -1.42385792e-02, -4.28498499e-02, ...,
        -1.60443522e-02, -4.77830553e-03, -6.92308182e-03],
       [ 2.86734793e-02, -1.84583501e-03, -1.77934859e-02, ...,
        -6.97790086e-02, -1.12668646e-03,  3.58959921e-02],
       [-4.70399391e-03, -3.68150868e-05, -1.22455275e-02, ...,
         1.53705652e-03, -1.91832520e-02, -3.49148400e-02]],
      shape=(1877, 512), dtype=float32)

In [7]:
len(all_docs), len(embeddings_array)

(1877, 1877)

In [8]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x128281e80>

In [9]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)
    
    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    
    return results

In [33]:
vision_cache = {}

def describe_image(image_id):

    if image_id in vision_cache:
        return vision_cache[image_id]

    image = image_data_store[image_id]

    buffered = io.BytesIO()

    image.save(buffered, format="PNG")

    base64_image = base64.b64encode(buffered.getvalue()).decode()

    response = llm.invoke([{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this chart or image"},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{base64_image}"
                }
            }
        ]
    }])

    vision_cache[image_id] = response.content

    return response.content



In [34]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for GPT-4V."""
    content = []
    
    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })
    
    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    
    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })
    
    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })
            content.append({
                "type": "text",
                "text": f"\nImage Description: {describe_image(image_id)}\n"
            })
    
    return HumanMessage(content=content)

### RAG AGENT 

In [40]:
from typing import List, Optional
from pydantic import BaseModel, Field
from typing import List, Optional

class RAGReflectionState(BaseModel):
    # Required field
    question: str
    
    # Fields with explicit defaults (making them truly optional for .invoke)
    retrieved_docs: List[Document] = Field(default_factory=list) 
    rerank_retrieved_docs: List[Document] = Field(default_factory=list)
    answer: Optional[str] = None
    reflection: Optional[str] = None
    revised: bool = False
    attempts: int = 0
    is_allowed: bool = True


In [41]:
from langgraph.graph import StateGraph, END
import json


# --- NODE 1: RETRIEVER ---
def retriever_node(state: RAGReflectionState) -> RAGReflectionState:
    query = state.question 
    # Using your unified CLIP retrieval function
    docs = retrieve_multimodal(query, k=7)
    return {"retrieved_docs": docs}


## NODE 2 : ReRANKER
def rerank_documents_node(state: RAGReflectionState):
    query = state.question
    docs = state.retrieved_docs
    
    if not docs:
        return {"retrieved_docs": [], "rerank_retrieved_docs": []}

    # Format documents for the LLM
    doc_texts = "\n".join([
        f"ID: {i}\nContent: {doc.page_content[:500]}" 
        for i, doc in enumerate(docs)
    ])

    prompt = f"""
    You are an expert search ranker. Given the user query and a list of documents, 
    determine which documents are most relevant to answer the question.
    
    Query: {query}
    
    Documents:
    {doc_texts}
    
    Return only a JSON list of IDs in order of relevance, for example: [2, 0, 3].
    Only include IDs of documents that are truly relevant.
    """

    # Get ranking from LLM
    response = llm.invoke(prompt)
    try:
        # Extract ID list from response
        # Using a simple strip in case LLM adds markdown backticks
        raw_content = response.content.replace("```json", "").replace("```", "").strip()
        relevant_ids = json.loads(raw_content)
        
        # Reorder and filter original docs based on LLM decision
        rerank_retrieved_docs = [docs[idx] for idx in relevant_ids if idx < len(docs)]
    except Exception as e:
        print(f"Reranking failed, falling back to original docs: {e}")
        rerank_retrieved_docs = docs[:3] # Fallback to top 3

    return {"rerank_retrieved_docs": rerank_retrieved_docs}



# --- NODE 3: ANSWER GENERATOR ---

from langchain_core.messages import HumanMessage, SystemMessage

def generate_answer_node(state: RAGReflectionState):
    # 1. Get the list of blocks [{type: text, ...}, {type: image_url, ...}]
    multimodal_content = create_multimodal_message(state.question, state.rerank_retrieved_docs)
    
    # print("Multimodal content for LLM:", multimodal_content)

    # 2. Check: if multimodal_content is accidentally a Message object, extract content
    if hasattr(multimodal_content, 'content'):
        multimodal_content = multimodal_content.content

    # 3. Create the messages list using raw dictionaries for the user part
    # This bypasses the HumanMessage Pydantic validation error
    messages = [
        {"role": "system", "content": "You are Adobe AI Leadership Insight and Decision Agent, who understand the financial reports and data of Adobe. Answer question using provided context. Answer should be concise and to the point."},
        {"role": "user", "content": multimodal_content} 
    ]
    
    # print(messages)
    # 4. Invoke with the clean list
    response = llm.invoke(messages)
    
    return {
        "answer": response.content.strip(),
        "attempts": state.attempts + 1
    }


# --- NODE 4: REFLECTOR ---
def reflection_node(state: RAGReflectionState)  -> RAGReflectionState:
    prompt = f"""
    Reflect on the following answer and context. State "YES" if satisfactory or "NO" if not.
    Question: {state.question}
    Answer: {state.answer}
    Context: {state.rerank_retrieved_docs}
    Respond like:
    Reflection: YES or NO. 
    Reflection score: 0-10 (10 being perfect)
    Explanation: ...
    """
    
    reflection = llm.invoke(prompt).content
    is_ok = "reflection: yes" in reflection.lower()
    
    
    return {
        "reflection": reflection, 
        "revised": not is_ok
    }

# --- NODE 5: FINALIZER ---
# def finalize_node(state: RAGReflectionState)    -> RAGReflectionState:
#     # This node just acts as a cleanup/formatting step before END
#     return state


In [42]:
# In your finalize_node, handle the blocked message
def finalize_node(state: RAGReflectionState):
    if not state.is_allowed:
        return {"answer": "I am an Adobe-specific assistant. I can only answer questions related to Adobe's reports and products."}
    # ... (rest of your finalize logic)
    return state

In [43]:
def guardrail_node(state: RAGReflectionState):
    prompt = f"""
    You are a security gatekeeper for an Adobe Financial RAG agent. 
    Your task is to determine if the following question is related to Adobe (its finances, products like Photoshop/Firefly, strategy, or reports).

    Question: {state.question}

    If the question is about Adobe, respond exactly with "ALLOWED".
    If it is NOT about Adobe (e.g., asking about cooking, other companies like Apple, or general trivia), respond with "BLOCKED".
    """
    
    response = llm.invoke(prompt).content.strip().upper()
    is_allowed = "ALLOWED" in response
    
    return {"is_allowed": is_allowed}


In [44]:
## Building graph workflow 


# Conditional Logic: Loop back to retriever if "revised" is True and attempts < 2
def route_after_reflection(state: RAGReflectionState):
    if not state.revised or state.attempts >= 2:
        return "complete"
    else:
        return "retry"
    



In [46]:
#### Rebuild with Reranking Node 

builder = StateGraph(RAGReflectionState)



# Add Nodes
builder.add_node("guardrail", guardrail_node)
builder.add_node("retriever", retriever_node)
builder.add_node("reranker", rerank_documents_node) 
builder.add_node("answer_generator", generate_answer_node)
builder.add_node("reflector", reflection_node)
builder.add_node("done", finalize_node)

# # Update Flow (Edges)
# builder.set_entry_point("retriever")



# Define the gatekeeping flow
builder.set_entry_point("guardrail")

builder.add_conditional_edges(
    "guardrail",
    lambda s: "continue" if s.is_allowed else "stop",
    {
        "continue": "retriever",
        "stop": "done" # Skip directly to finalize
    }
)
builder.add_edge("retriever", "reranker")        
builder.add_edge("reranker", "answer_generator")
builder.add_edge("answer_generator", "reflector")

# Update Conditional Logic
builder.add_conditional_edges(
    "reflector",
    route_after_reflection,
    {
        "complete": "done",
        "retry": "retriever" # Retry starts back at retrieval
    }
)

builder.add_edge("done", END)
adobe_rag_app_with_reranker = builder.compile()

In [35]:
question1="What is Adobe's revenue in 2024 and how does it compare to 2022?"
question2="What was Adobe's total revenue for Fiscal Year 2024?"
question3="Based on the 'Digital Media ARR' chart, what was the ending ARR for 2024?"
question4="How is Adobe integrating generative AI across its Creative Cloud suite?"
question5="Based on the R&D spend and product roadmap, which product is Adobe prioritizing for 2025?."


In [30]:
initial_state = RAGReflectionState(
        question=question1
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

QUESTION--   What is Adobe's revenue in 2024 and how does it compare to 2022?
--- FINAL ANSWER ---

Adobe's total revenue in 2024 is $21,505 million. Compared to 2022, when the total revenue was $17,606 million, Adobe's revenue in 2024 increased by $3,899 million.

--- ATTEMPTS: 1 ---

--- CONTEXT DOC: [Document(id='60579121-f7f2-4bb2-b879-165ab36a8c2c', metadata={'source': 'adbe-Q4 and FY24 Earnings.pdf', 'page': 2, 'type': 'text'}, page_content='The following table summarizes Adobe’s fiscal year 2025 targets1: \n \nTotal revenue \n$23.30 billion to $23.55 billion \nDigital Media segment revenue \n$17.25 billion to $17.40 billion \nDigital Media ending ARR growth \n11.0% year over year \nDigital Experience segment revenue \n$5.80 billion to $5.90 billion \nDigital Experience subscription revenue \n$5.375 billion to $5.425 billion \nEarnings per share \nGAAP: $15.80 to $16.10 \nNon-GAAP: $20.20 to $20.50 \n1 Targets assume non-GAAP operating margin of ~46 percent, non-GAAP tax rate of 

In [31]:
initial_state = RAGReflectionState(
        question=question1
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

QUESTION--   What is Adobe's revenue in 2024 and how does it compare to 2022?
--- FINAL ANSWER ---

Adobe's revenue in 2024 is $21,505 million. Compared to 2022, where the revenue was $17,606 million, there is an increase of $3,899 million.

--- ATTEMPTS: 1 ---

--- CONTEXT DOC: [Document(id='60579121-f7f2-4bb2-b879-165ab36a8c2c', metadata={'source': 'adbe-Q4 and FY24 Earnings.pdf', 'page': 2, 'type': 'text'}, page_content='The following table summarizes Adobe’s fiscal year 2025 targets1: \n \nTotal revenue \n$23.30 billion to $23.55 billion \nDigital Media segment revenue \n$17.25 billion to $17.40 billion \nDigital Media ending ARR growth \n11.0% year over year \nDigital Experience segment revenue \n$5.80 billion to $5.90 billion \nDigital Experience subscription revenue \n$5.375 billion to $5.425 billion \nEarnings per share \nGAAP: $15.80 to $16.10 \nNon-GAAP: $20.20 to $20.50 \n1 Targets assume non-GAAP operating margin of ~46 percent, non-GAAP tax rate of ~18.5 percent and dilute

In [32]:
initial_state = RAGReflectionState(
        question=question2
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

QUESTION--   What was Adobe's total revenue for Fiscal Year 2024?
--- FINAL ANSWER ---

Adobe's total revenue for Fiscal Year 2024 was $21,505 million.

--- ATTEMPTS: 1 ---

--- CONTEXT DOC: [Document(id='60579121-f7f2-4bb2-b879-165ab36a8c2c', metadata={'source': 'adbe-Q4 and FY24 Earnings.pdf', 'page': 2, 'type': 'text'}, page_content='The following table summarizes Adobe’s fiscal year 2025 targets1: \n \nTotal revenue \n$23.30 billion to $23.55 billion \nDigital Media segment revenue \n$17.25 billion to $17.40 billion \nDigital Media ending ARR growth \n11.0% year over year \nDigital Experience segment revenue \n$5.80 billion to $5.90 billion \nDigital Experience subscription revenue \n$5.375 billion to $5.425 billion \nEarnings per share \nGAAP: $15.80 to $16.10 \nNon-GAAP: $20.20 to $20.50 \n1 Targets assume non-GAAP operating margin of ~46 percent, non-GAAP tax rate of ~18.5 percent and diluted share count of ~433 \nmillion for fiscal year 2025. \nThe following table summarizes Ad

In [33]:
initial_state = RAGReflectionState(
        question=question3
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

QUESTION--   Based on the 'Digital Media ARR' chart, what was the ending ARR for 2024?
--- FINAL ANSWER ---

The ending ARR for Digital Media in 2024 was between $21.30 billion and $21.50 billion.

--- ATTEMPTS: 1 ---

--- CONTEXT DOC: [Document(id='57289b00-74ab-457b-ac4e-46d48710f4fc', metadata={'source': 'adbe-investor-update-2024.pdf', 'page': 4, 'type': 'text'}, page_content='$21.30B to $21.50B\n$21.40B to $21.50B\nDigital Media net new ARR\n~$1.90B\n~$1.95B\nDigital Media segment revenue\n$15.75B to $15.85B\n$15.80B to $15.85B\nDigital Experience segment revenue\n$5.275B to $5.375B\n$5.325B to $5.375B\nDigital Experience subscription revenue\n$4.75B to $4.80B\n$4.775B to $4.825B\nGAAP diluted earnings per share\n$13.45 to $13.85\n$11.80 to $12.00\nNon-GAAP diluted earnings per share\n$17.60 to $18.00\n$18.00 to $18.20\nGAAP tax rate\n~18.0%\n~20.5%\nNon-GAAP tax rate\n~18.5%\n~18.5%'), Document(id='e70e54d3-2b6c-4581-8e13-3fb1ebf7a092', metadata={'source': 'adbe-2024-annual-repor

In [34]:
initial_state = RAGReflectionState(
        question=question4
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

QUESTION--   How is Adobe integrating generative AI across its Creative Cloud suite?
--- FINAL ANSWER ---

Adobe is integrating generative AI across its Creative Cloud suite by embedding AI capabilities into its applications to enhance creativity and productivity. This includes features like AI-driven content creation, automated design suggestions, and enhanced editing tools that leverage machine learning to assist users in generating and refining creative content more efficiently. These integrations aim to streamline workflows and provide users with innovative tools to expand their creative possibilities.

--- ATTEMPTS: 1 ---

--- CONTEXT DOC: [Document(id='0f439768-5fe4-4947-a522-99f75b85d1bd', metadata={'source': 'adbe-2024-annual-report.pdf', 'page': 5, 'type': 'text'}, page_content='our customers can easily generate and edit visuals directly in PDFs. We are unlocking business workflows through PDF and \nAdobe Acrobat Sign Application Programming Interfaces (“APIs”); accelerating D

In [35]:
initial_state = RAGReflectionState(
        question=question5
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

QUESTION--   Based on the R&D spend and product roadmap, which product is Adobe prioritizing for 2025?.
--- FINAL ANSWER ---

I'm sorry, but there is no specific context provided regarding Adobe's R&D spend and product roadmap for 2025. Please provide the necessary details or context to assist you better.

--- ATTEMPTS: 1 ---

--- CONTEXT DOC: [Document(id='60579121-f7f2-4bb2-b879-165ab36a8c2c', metadata={'source': 'adbe-Q4 and FY24 Earnings.pdf', 'page': 2, 'type': 'text'}, page_content='The following table summarizes Adobe’s fiscal year 2025 targets1: \n \nTotal revenue \n$23.30 billion to $23.55 billion \nDigital Media segment revenue \n$17.25 billion to $17.40 billion \nDigital Media ending ARR growth \n11.0% year over year \nDigital Experience segment revenue \n$5.80 billion to $5.90 billion \nDigital Experience subscription revenue \n$5.375 billion to $5.425 billion \nEarnings per share \nGAAP: $15.80 to $16.10 \nNon-GAAP: $20.20 to $20.50 \n1 Targets assume non-GAAP operating ma

In [48]:
initial_state = RAGReflectionState(
        question="Who is the current CEO of Adobe's main competitor, Canva?"
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
# print(f"--- Reflection: {final_state['reflection']} ---")

QUESTION--   Who is the current CEO of Adobe's main competitor, Canva?
--- FINAL ANSWER ---

I am an Adobe-specific assistant. I can only answer questions related to Adobe's reports and products.

--- ATTEMPTS: 0 ---

--- CONTEXT DOC: [] ---

--- Reranked CONTEXT DOC: [] ---



In [51]:
initial_state = RAGReflectionState(
        question="Which departments of adobe are underperforming in 2024 as per annual report?"
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

QUESTION--   Which departments of adobe are underperforming in 2024 as per annual report?
--- FINAL ANSWER ---

I'm sorry, but I don't have access to the 2024 annual report or any specific data regarding the performance of Adobe's departments in 2024. My training only includes data up to October 2023. For the most accurate and up-to-date information, I recommend checking Adobe's official financial reports or press releases.

--- ATTEMPTS: 1 ---

--- CONTEXT DOC: [Document(id='8e6ec87c-55d6-4724-9d9f-45b3bbbe646c', metadata={'source': 'adbe-2025-proxy-statemnt.pdf', 'page': 64, 'type': 'text'}, page_content='•\ncompanies that compete with us for talent;\n•\npositive revenue growth; and\n•\ncompanies that list Adobe as a peer.\nBased on the factors described above, acquisition of prior peers and input from management and Compensia, the \nCommittee approved the below peer group for fiscal year 2024. \n54'), Document(id='aaa499d6-8a69-48b2-b128-81b5e4850074', metadata={'source': 'adbe-2025

In [25]:
#### List of questions -

# Financial Performance (Numerical Accuracy)
"What was Adobe's total revenue for Fiscal Year 2024?",
"How does the Q4 2024 revenue compare to Q4 2023?",
"What were the GAAP vs. Non-GAAP operating margins reported in the update?",
"Which business segment—Digital Media or Digital Experience—showed higher year-over-year growth?",
"What is Adobe's revenue guidance for the full year 2025?",

# AI & Strategy (Semantic Understanding)
"How is Adobe integrating generative AI across its Creative Cloud suite?",
"What are the primary use cases mentioned for Adobe Firefly in the enterprise sector?",
"What does the report cite as Adobe's primary competitive advantage in the AI era?",
"How is Adobe planning to monetize its new AI-driven features (e.g., 'Generative Credits')?",
"Mention any key strategic partnerships highlighted in the 2024 update.",

# Multimodal / Image-Based (Testing CLIP & Vision)
"Based on the 'Digital Media ARR' chart, what was the ending ARR for 2024?",
"Look at the slide showing the 'Content Supply Chain.' What are the four main stages of the workflow?",
"According to the Acrobat AI Assistant slide, what percentage increase in productivity was noted?",
"Which external partner logos appear on the 'Ecosystem' slide?",

# Complex Reasoning (Multi-Step Retrieval)
"Summarize the three biggest risks Adobe identifies regarding the adoption of AI.",
"Based on the R&D spend and product roadmap, which product is Adobe prioritizing for 2025?",
"The report mentions 'Record Revenue,' but are there any segments that saw a decline? If so, which ones?",
"What is the percentage change in Adobe's stock repurchase program compared to the previous year?",


# Guardrails & Robustness (Agent Reflection)
"Who is the current CEO of Adobe's main competitor, Canva?",
"What was Adobe's exact electricity bill for its San Jose headquarters in 2024?"


"What was Adobe's exact electricity bill for its San Jose headquarters in 2024?"

### Assuming the available Adobe document do not contain any scanned images - below approach is taken with better embedding model and retrival

In [14]:
import torch
from langchain_huggingface import HuggingFaceEmbeddings

# Check for Mac GPU (MPS)
device = "cpu"

# Initialize BGE-Large
# BGE v1.5 requires a specific query instruction for better retrieval
model_name = TEXT_EMBEDDING
model_kwargs = {'device': device}
encode_kwargs = {'normalize_embeddings': True} # Essential for BGE v1.5

bge_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


INFO: Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5


In [15]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_unstructured import UnstructuredLoader

# This scans the folder and uses Unstructured for each file found
loader = DirectoryLoader(
    DATA_FOLDER, 
    glob="**/*.*", # Adjust pattern to filter files (e.g., "**/*.pdf")
    loader_cls=UnstructuredLoader,
    show_progress=True,
    use_multithreading=True,
    loader_kwargs={"languages": ["eng"], "strategy": "fast"}
)

documents = loader.load()
print(f"Loaded {len(documents)} documents from the folder.")


 89%|████████▉ | 8/9 [00:10<00:01,  1.36s/it]

Loaded 9031 documents from the folder.





In [17]:
len(documents)

9031

In [18]:
### Using different embedding strategy

from langchain_unstructured import UnstructuredLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


# 2. Apply Recursive Chunking
# 1000 characters is ~250 tokens, well within BGE's 512-token limit
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    add_start_index=True,
    strip_whitespace=True
)

all_chunks = text_splitter.split_documents(documents)

# 3. Add Metadata for your RAG Agent
for i, chunk in enumerate(all_chunks):
    chunk.metadata["type"] = "text"
    chunk.metadata["chunk_id"] = i

print(f"Total chunks created: {len(all_chunks)}")


Total chunks created: 9073


In [19]:
# Create the vector store
# This will take a moment as it processes 1800+ chunks on your Mac
text_vectorstore = FAISS.from_documents(
    documents=all_chunks,
    embedding=bge_embeddings
)

# Save the index locally so you don't have to re-embed next time
text_vectorstore.save_local("./vectore_store_db/faiss_bge_index")

print("BGE Vector Store created and saved successfully!")


INFO: Loading faiss.
INFO: Successfully loaded faiss.


BGE Vector Store created and saved successfully!


In [64]:
from typing import List, Optional
from pydantic import BaseModel, Field
from typing import List, Optional

class RAGReflectionState(BaseModel):
    # Required field
    question: str
    
    # Fields with explicit defaults (making them truly optional for .invoke)
    retrieved_docs: List[Document] = Field(default_factory=list) 
    rerank_retrieved_docs: List[Document] = Field(default_factory=list)
    answer: Optional[str] = None
    reflection: Optional[str] = None
    revised: bool = False
    attempts: int = 0
    is_allowed: bool = True


In [65]:
from langgraph.graph import StateGraph, END
import json


# --- NODE 1: RETRIEVER ---
def retriever_node(state: RAGReflectionState) -> RAGReflectionState:
    query = state.question 
    # Using your unified CLIP retrieval function
    docs = text_vectorstore.similarity_search(query, k=8)
    return {"retrieved_docs": docs}


## NODE 2 : ReRANKER
def rerank_documents_node(state: RAGReflectionState):
    query = state.question
    docs = state.retrieved_docs
    
    if not docs:
        return {"retrieved_docs": [], "rerank_retrieved_docs": []}

    # Format documents for the LLM
    doc_texts = "\n".join([
        f"ID: {i}\nContent: {doc.page_content[:500]}" 
        for i, doc in enumerate(docs)
    ])

    prompt = f"""
    You are an expert search ranker. Given the user query and a list of documents, 
    determine which documents are most relevant to answer the question.
    
    Query: {query}
    
    Documents:
    {doc_texts}
    
    Return only a JSON list of IDs in order of relevance, for example: [2, 0, 3].
    Only include IDs of documents that are truly relevant.
    """

    # Get ranking from LLM
    response = llm.invoke(prompt)
    try:
        # Extract ID list from response
        # Using a simple strip in case LLM adds markdown backticks
        raw_content = response.content.replace("```json", "").replace("```", "").strip()
        relevant_ids = json.loads(raw_content)
        
        # Reorder and filter original docs based on LLM decision
        rerank_retrieved_docs = [docs[idx] for idx in relevant_ids if idx < len(docs)]
    except Exception as e:
        print(f"Reranking failed, falling back to original docs: {e}")
        rerank_retrieved_docs = docs[:3] # Fallback to top 3

    return {"rerank_retrieved_docs": rerank_retrieved_docs}



# --- NODE 3: ANSWER GENERATOR ---

from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate


def generate_answer_node(state: RAGReflectionState):
    # 1. Format the retrieved docs into a readable string for the LLM
    # Passing raw Document objects inside an f-string can be messy
    context_str = "\n\n".join([
        f"[Source: {d.metadata.get('source', 'Unknown')}, Page: {d.metadata.get('page_number', 'N/A')}]\n{d.page_content}"
        for d in state.rerank_retrieved_docs
    ])
    
    system_message = """
            You are the "Adobe AI Leadership Insight & Decision Agent," a specialized financial analyst.
            Your goal is to provide high-fidelity answers based on Adobe's internal reports, charts, and data.

            STRICT GUIDELINES:
            1. ONLY use the provided context to answer.
            2. If the answer is not in the context, state: "I'm sorry, I cannot find that information in the current Adobe reports."
            3. When referencing a chart or image, specify the Page Number.
            4. If there is a conflict between an image (chart) and text, prioritize the data found in the image.
            5. Be concise, professional, and use bullet points for financial data.
            """

    # 2. Use placeholders {} in the template instead of f-strings
    # This allows the Template object to handle the insertion correctly
    human_template = f"""
            ### CONTEXT DATA:
            {context_str}

            ### USER QUESTION:
            {state.question}

            ### YOUR ANALYSIS:
            """

    # 3. Create the Template
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("human", human_template)
    ])

    # 4. FIX: Format the template with actual data to get a list of messages
    messages = prompt_template.format_messages(
        context=context_str,
        question=state.question
    )

    # 5. Pass the formatted messages to the LLM
    response = llm.invoke(messages)
        
    return {
        "answer": response.content.strip(),
        "attempts": state.attempts + 1
    }


# --- NODE 4: REFLECTOR ---
def reflection_node(state: RAGReflectionState)  -> RAGReflectionState:
    prompt = f"""
    Reflect on the following answer and context. State "YES" if satisfactory or "NO" if not.
    Question: {state.question}
    Answer: {state.answer}
    Context: {state.rerank_retrieved_docs}
    Respond like:
    Reflection: YES or NO. 
    Reflection score: 0-10 (10 being perfect)
    Explanation: ...
    """
    
    reflection = llm.invoke(prompt).content
    is_ok = "reflection: yes" in reflection.lower()
    
    
    return {
        "reflection": reflection, 
        "revised": not is_ok
    }

# --- NODE 5: FINALIZER ---
# def finalize_node(state: RAGReflectionState)    -> RAGReflectionState:
#     # This node just acts as a cleanup/formatting step before END
#     return state


In [66]:
# In your finalize_node, handle the blocked message
def finalize_node(state: RAGReflectionState):
    if not state.is_allowed:
        return {"answer": "I am an Adobe-specific assistant. I can only answer questions related to Adobe's reports and products."}
    # ... (rest of your finalize logic)
    return state

In [67]:
def guardrail_node(state: RAGReflectionState):
    prompt = f"""
    You are a security gatekeeper for an Adobe Financial RAG agent. 
    Your task is to determine if the following question is related to Adobe (its finances, products like Photoshop/Firefly, strategy, or reports).

    Question: {state.question}

    If the question is about Adobe, respond exactly with "ALLOWED".
    If it is NOT about Adobe (e.g., asking about cooking, other companies like Apple, or general trivia), respond with "BLOCKED".
    """
    
    response = llm.invoke(prompt).content.strip().upper()
    is_allowed = "ALLOWED" in response
    
    return {"is_allowed": is_allowed}


In [68]:
## Building graph workflow 


# Conditional Logic: Loop back to retriever if "revised" is True and attempts < 2
def route_after_reflection(state: RAGReflectionState):
    if not state.revised or state.attempts >= 2:
        return "complete"
    else:
        return "retry"
    



In [69]:
#### Rebuild with Reranking Node 

builder = StateGraph(RAGReflectionState)



# Add Nodes
builder.add_node("guardrail", guardrail_node)
builder.add_node("retriever", retriever_node)
builder.add_node("reranker", rerank_documents_node) 
builder.add_node("answer_generator", generate_answer_node)
builder.add_node("reflector", reflection_node)
builder.add_node("done", finalize_node)

# # Update Flow (Edges)
# builder.set_entry_point("retriever")



# Define the gatekeeping flow
builder.set_entry_point("guardrail")

builder.add_conditional_edges(
    "guardrail",
    lambda s: "continue" if s.is_allowed else "stop",
    {
        "continue": "retriever",
        "stop": "done" # Skip directly to finalize
    }
)
builder.add_edge("retriever", "reranker")        
builder.add_edge("reranker", "answer_generator")
builder.add_edge("answer_generator", "reflector")

# Update Conditional Logic
builder.add_conditional_edges(
    "reflector",
    route_after_reflection,
    {
        "complete": "done",
        "retry": "retriever" # Retry starts back at retrieval
    }
)

builder.add_edge("done", END)
adobe_rag_app_with_reranker = builder.compile()

In [70]:
initial_state = RAGReflectionState(
        question=question5
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


QUESTION--   Based on the R&D spend and product roadmap, which product is Adobe prioritizing for 2025?.
--- FINAL ANSWER ---

I'm sorry, I cannot find that information in the current Adobe reports.

--- ATTEMPTS: 2 ---

--- CONTEXT DOC: [Document(id='5ca40677-328a-467b-b152-7dd106c7b487', metadata={'source': 'data/adbe-Q4 and FY24 Earnings.pdf', 'coordinates': {'points': ((49.464, 376.72200000000004), (49.464, 386.48784), (286.39456, 386.48784), (286.39456, 376.72200000000004)), 'system': 'PixelSpace', 'layout_width': 612.0, 'layout_height': 792.0}, 'file_directory': 'data', 'filename': 'adbe-Q4 and FY24 Earnings.pdf', 'languages': ['eng'], 'last_modified': '2026-02-20T11:51:28', 'page_number': 2, 'filetype': 'application/pdf', 'parent_id': '56a55169e1ca3108bcacf5211693b934', 'category': 'NarrativeText', 'element_id': '4e4d5aa3e89245e25c421ef477b2153b', 'start_index': 0, 'type': 'text', 'chunk_id': 31}, page_content='The following table summarizes Adobe’s fiscal year 2025 targets1:'), 

In [71]:
initial_state = RAGReflectionState(
        question="Who is the current CEO of Adobe's main competitor, Canva?"
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
# print(f"--- Reflection: {final_state['reflection']} ---")

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


QUESTION--   Who is the current CEO of Adobe's main competitor, Canva?
--- FINAL ANSWER ---

I am an Adobe-specific assistant. I can only answer questions related to Adobe's reports and products.

--- ATTEMPTS: 0 ---

--- CONTEXT DOC: [] ---

--- Reranked CONTEXT DOC: [] ---



In [72]:
initial_state = RAGReflectionState(
        question="Which departments of adobe are underperforming in 2024 as per annual report?"
    )
final_state = adobe_rag_app_with_reranker.invoke(initial_state)

print("QUESTION--  ",initial_state.question)

print(f"--- FINAL ANSWER ---\n\n{final_state['answer']}\n")
print(f"--- ATTEMPTS: {final_state['attempts']} ---\n")
print(f"--- CONTEXT DOC: {final_state['retrieved_docs']} ---\n")
print(f"--- Reranked CONTEXT DOC: {final_state['rerank_retrieved_docs']} ---\n")
print(f"--- Reflection: {final_state['reflection']} ---")

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


QUESTION--   Which departments of adobe are underperforming in 2024 as per annual report?
--- FINAL ANSWER ---

I'm sorry, I cannot find that information in the current Adobe reports.

--- ATTEMPTS: 1 ---

--- CONTEXT DOC: [Document(id='dad80a45-5917-49f4-a47e-7b18b52bbcd3', metadata={'source': 'data/adbe-Q4 and FY24 Earnings.pdf', 'coordinates': {'points': ((49.464, 274.19056), (49.464, 296.27056), (520.2594399999999, 296.27056), (520.2594399999999, 274.19056)), 'system': 'PixelSpace', 'layout_width': 612.0, 'layout_height': 792.0}, 'file_directory': 'data', 'filename': 'adbe-Q4 and FY24 Earnings.pdf', 'languages': ['eng'], 'last_modified': '2026-02-20T11:51:28', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'Title', 'element_id': '6ceb55a46a99a9636163f3c0135d237e', 'start_index': 0, 'type': 'text', 'chunk_id': 4}, page_content='Adobe Reports Record Q4 and Fiscal 2024 Revenue'), Document(id='b9db6a64-0076-4ab4-9e08-b7418822a694', metadata={'source': 'data/adbe-2025-prox