<a href="https://colab.research.google.com/github/prem-cre/Multirag/blob/main/hub1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title Install all required libraries
!pip install -qU langchain langgraph langchain_groq langchain_huggingface
!pip install -qU faiss-cpu pypdf tiktoken
!pip install -qU langchain-community

!pip install -qU langchain-google-community

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.9/134.9 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.0/55.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.7/216.7 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [20]:
# @title Configure API Keys, LLM, and FAISS Vector Store
import os
import glob
import pickle
from google.colab import userdata

os.environ["GROQ_API_KEY"] = userdata.get('groq_api_key')

from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings

llm = ChatGroq(model_name="gemma2-9b-it", temperature=0)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


def read_documents(directory_path: str):
    """Loads PDF documents from a specified directory."""
    loader = PyPDFDirectoryLoader(directory_path)
    documents = loader.load()
    print(f"Files found by PyPDFDirectoryLoader: {[doc.metadata.get('source') for doc in documents]}")
    return documents

def chunk_data(docs, chunk_size=1100, chunk_overlap=70):
    """Splits documents into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)
    return chunks

# Define the path for saving the FAISS index and document list
FAISS_DB_PATH = "faiss_index"
DOCUMENT_LIST_PATH = "indexed_documents.pkl"

# Get the list of current PDF files in the directory
current_pdf_files = sorted(glob.glob('/content/*.pdf'))
print(f"Current PDF files in /content/: {current_pdf_files}")

# Load the list of previously indexed documents if it exists
indexed_pdf_files = []
if os.path.exists(DOCUMENT_LIST_PATH):
    with open(DOCUMENT_LIST_PATH, 'rb') as f:
        indexed_pdf_files = pickle.load(f)
print(f"Previously indexed PDF files: {indexed_pdf_files}")

# Check if the FAISS index exists and if the document list has changed
if os.path.exists(FAISS_DB_PATH) and current_pdf_files == indexed_pdf_files:
    print("Document list has not changed and FAISS index exists.")
    print(f"Loading existing FAISS vector store from {FAISS_DB_PATH}...")
    vector_store = FAISS.load_local(FAISS_DB_PATH, embeddings, allow_dangerous_deserialization=True)
    retriever = vector_store.as_retriever()
    print("FAISS vector store and retriever loaded.")
else:
    print("Document  list has changed or FAISS index does not exist. Rebuilding index.")
    # Load, chunk, and index the documents
    print("Loading documents that are uploaded...")
    docs = read_documents('/content/')
    print(f"Number of documents loaded: {len(docs)}") # Added print for number of docs
    documents = chunk_data(docs)
    print(f"Total chunks created: {len(documents)}")
    print(f"First 5 chunks: {documents[:5]}") # Added print for first few chunks

    print("Creating FAISS vector store...")
    vector_store = None
    retriever = None

    if documents:
        vector_store = FAISS.from_documents(documents, embeddings)
        retriever = vector_store.as_retriever()
        print("FAISS vector store and retriever are ready.")
        # Save the FAISS index to disk
        vector_store.save_local(FAISS_DB_PATH)
        # Save the list of indexed documents
        with open(DOCUMENT_LIST_PATH, 'wb') as f:
            pickle.dump(current_pdf_files, f)
        print(f"FAISS vector store saved to {FAISS_DB_PATH}.")
        print(f"Indexed document list saved to {DOCUMENT_LIST_PATH}.")
    else:
        print("No documents loaded or chunked. Skipping FAISS vector store creation.")


# # Test the retriever
# test_query = 'How does the radical separation of law and religion contribute to the integrity crisis in Western civilization, and can reintegrating their underlying values restore societal cohesion without compromising secular governance?'
# if retriever: # Check if retriever is not None before invoking
#     retrieved_docs = retriever.invoke(test_query)
#     print(retrieved_docs)
#     print(f"\n✅ Retriever test successful. Found {len(retrieved_docs)} related docs.")
#     # print(retrieved_docs[0].page_content)
# else:
#     print("\nSkipping retriever test as no documents were loaded.")

Current PDF files in /content/: ['/content/national_policy_on_software_products-2019.pdf', '/content/personnel-policy-for-group-a-s-and-t-officers-of-meity-and-its-organizations.pdf']
Previously indexed PDF files: []
Document list has changed or FAISS index does not exist. Rebuilding index.
Loading documents that are uploaded...
Files found by PyPDFDirectoryLoader: ['/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019.pdf', '/content/national_policy_on_software_products-2019

In [30]:
# @title Create the Compliance Agent and Executor
from langchain import hub
from langchain.agents import create_react_agent, AgentExecutor
from langchain_core.tools import tool # Import tool decorator


# --- 1. Define the Agent's Core Prompt ---
# This prompt includes explicit formatting for ReAct
custom_instructions = """
You are Guardian, an expert AI compliance agent. Your sole purpose is to review a given blog post and ensure it is 100% compliant with company policies.

**Your Goal:** Rewrite the blog post to be fully compliant, providing a detailed report of your changes.

**Your Process:**
1.  **Analyze the Blog:** Read the blog post and identify potential risk areas (e.g., mentions of data, technology, external content).
2.  **Investigate with Tools:** For each risk area, use the `policy_search_tool` to find the relevant company policies. You may need to use the tool multiple times for different topics.
3.  **Synthesize Findings:** Once you have gathered all relevant policies, compare them against the blog post to identify specific violations.
4.  **Formulate Final Answer:** If there are no violations, state that the blog is compliant and no changes are needed. If violations are found, you MUST structure your final answer as follows, with no other text before or after it:

    ### ❌ Compliance Violations Found

    **Reasons and Citations:**
    - **Violation 1:** [Clear explanation of the first violation]
      - **Citation:** [Quote the specific rule from the policy document, especially Indian law references.]
    - **Violation 2:** [Clear explanation of the second violation]
      - **Citation:** [Quote the specific rule from the policy document.]
    ... (and so on for all violations)

    ---

    ### Refined (Compliant) Blog Post:
    [Provide the full text of the rewritten blog post here. Change ONLY what is necessary to fix the violations.]
"""

# --- 2. Pull a base prompt template from LangChain Hub ---
# This prompt is designed for ReAct agents and structures the thought/action/observation loop.
base_prompt = hub.pull("hwchase17/react")

# --- 3. Inject our custom instructions into the base prompt ---
# Use partial to add the custom instructions to the base prompt
prompt = base_prompt.partial(instructions=custom_instructions)


# --- 4. Define the tools (Placeholder - Replace with your actual tools) ---
# You need to define your tools here before they are used in create_react_agent
@tool
def policy_search_tool(query: str) -> str:
    """Searches company policies for relevant information based on the query."""
    # Replace this with your actual policy search logic
    return f"Searching policies for: {query}. (Placeholder result)"

tools = [policy_search_tool] # Add your actual tools to this list

# --- 5. Create the Agent ---
# This binds the LLM, the tools, and the prompt together.
# Pass the tools to the partial prompt to format it correctly
agent = create_react_agent(llm, tools, prompt.partial(tools=tools))


# --- 6. Create the Agent Executor ---
# The executor is the runtime that lets the agent think and use its tools.
# `verbose=True` shows the agent's step-by-step reasoning.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
    max_iterations=25 # Keep max iterations to give it more chances
)

print("✅ Compliance agent 'Guardian' and its executor are ready.")

✅ Compliance agent 'Guardian' and its executor are ready.


In [22]:
# @title Define Conditional Edges and Build the Graph
# This step is not required for our AI Agent architecture.
# The AgentExecutor created in the previous step contains the entire runtime loop.
# It dynamically decides which tools to call based on the agent's reasoning,
# effectively creating its own path through the logic.

print("✅ Graph construction is not needed for this agentic approach.")

✅ Graph construction is not needed for this agentic approach.


In [None]:
# @title Run Agent: Violating Blog Post Scenario
from IPython.display import display, Markdown

# --- The blog post with multiple compliance issues ---
violating_blog = """
# A Major Breakthrough by InnovateCorp

We are excited to share news about a huge leap forward! Our new QuantumLeap Engine is now available to our partners in Iran. This technology is a game-changer.

To get early access and our whitepaper, just send us your name and email. We'll add you to our list.

A famous tech blogger said our work is "truly revolutionary," and we've included a picture from their latest article to show you.
"""

# --- Define the final task for the agent ---
agent_task = f"""
Please review the following blog post, identify all compliance violations based on your tools,
and provide a final report with a rewritten version, following your instructions precisely.

**Blog Post:**
{violating_blog}
"""

# --- Invoke the agent ---
result = agent_executor.invoke({"input": agent_task})

# --- Display the final, formatted output ---
print("\n\n" + "="*80 + "\n")
display(Markdown("## Agent's Final Report (Violating Blog):"))
display(Markdown(result['output']))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to check company policy on doing business in Iran and sharing customer data. I also need to see if there are guidelines on using endorsements in marketing materials. 

Action: policy_search_tool
Action Input: Company policy on doing business in Iran[0m[36;1m[1;3mSearching policies for: Company policy on doing business in Iran. (Placeholder result)[0m[32;1m[1;3mThought: I need to check company policy on doing business in Iran and sharing customer data. I also need to see if there are guidelines on using endorsements in marketing materials. 

Action: policy_search_tool
Action Input: Company policy on doing business in Iran[0m[36;1m[1;3mSearching policies for: Company policy on doing business in Iran. (Placeholder result)[0m[32;1m[1;3mThought: I need to check company policy on doing business in Iran and sharing customer data. I also need to see if there are guidelines on using endorsements in marketin

In [25]:
# @title Run Agent: Compliant Blog Post Scenario
from IPython.display import display, Markdown

# --- A compliant blog post with no issues ---
compliant_blog = """
# Our Newest Software Update!

We are thrilled to announce the latest update to our flagship product. This new version includes performance enhancements and bug fixes.
Our team has been working hard to deliver these improvements. We believe in building great products for our customers around the world.
For more details, check out our official product page.
"""

# --- Define the final task for the agent ---
# Removed extra instructions as the prompt now tells the agent to immediately analyze the input
agent_task_compliant = compliant_blog


# --- Invoke the agent ---
# Pass the compliant_blog directly as the 'input'
# Increase max_iterations to give the agent more chances
result_compliant = agent_executor.invoke({"input": agent_task_compliant}, {"max_iterations": 25})

# --- Display the final, formatted output ---
print("\n\n" + "="*80 + "\n")
display(Markdown("## Agent's Final Report (Compliant Blog):"))
display(Markdown(result_compliant['output']))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mOkay, I'm ready to analyze the blog post. Please provide the text so I can begin my review.  
[0mInvalid Format: Missing 'Action:' after 'Thought:'[32;1m[1;3mOkay, I'm ready to analyze the blog post. Please provide the text so I can begin my review.  
[0mInvalid Format: Missing 'Action:' after 'Thought:'[32;1m[1;3mOkay, I'm ready to analyze the blog post. Please provide the text so I can begin my review.  
[0mInvalid Format: Missing 'Action:' after 'Thought:'[32;1m[1;3mOkay, I'm ready to analyze the blog post. Please provide the text so I can begin my review.  
[0mInvalid Format: Missing 'Action:' after 'Thought:'[32;1m[1;3mOkay, I'm ready to analyze the blog post. Please provide the text so I can begin my review.  
[0mInvalid Format: Missing 'Action:' after 'Thought:'[32;1m[1;3mOkay, I'm ready to analyze the blog post. Please provide the text so I can begin my review.  
[0mInvalid Format: Missing 'Action:' af

## Agent's Final Report (Compliant Blog):

Agent stopped due to iteration limit or time limit.