<a href="https://colab.research.google.com/github/saurabhvybs/AI-RAG-Chatbot/blob/main/AI_Agent_House_of_Shipping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## HOUSE OF SHIPPING AGENT

In [None]:
# All Installations
!pip install \
    langchain langchain-community langchain-huggingface langchain-google-genai langchain-chroma \
    chromadb sentence-transformers \
    selenium beautifulsoup4 \
    pandas scikit-learn seaborn matplotlib \
    tiktoken flashrank -q

# Install the browser and its driver for Selenium in Colab
!apt-get update -qq
!apt-get install -y chromium-browser chromium-chromedriver -qq

In [None]:
!pip uninstall google-generativeai google-ai-generativelanguage -y
!pip install langchain-google-genai -q

# SCRAPER SERVICE (Selenium)

In [None]:
# Scraper Service
import sys
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- Setup for Colab ---
# Prepend the chromedriver location to Python's module search path.
# NOTE(review): sys.path governs Python imports, not executable lookup —
# confirm whether Selenium actually needs this line to locate the driver.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
# Shared Chrome options; scrape_website_for_structured_html() reads this
# module-level object when it creates its driver.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # no visible browser window
# NOTE(review): the two flags below are presumably needed for Chrome to start
# inside the Colab container — confirm before removing.
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# ----------------------

def scrape_website_for_structured_html(start_url: str) -> dict:
    """
    Crawl the site rooted at ``start_url`` and capture cleaned HTML per page.

    Starting from ``start_url``, every link that stays on the same host
    (same ``netloc``, http/https scheme) is followed. For each page the
    ``<main>`` element (or ``<body>`` when there is no ``<main>``) is kept
    with ``script``, ``style``, ``nav``, ``footer`` and ``aside`` elements
    removed, so header tags (h1, h2, ...) survive for later chunking.

    Args:
        start_url: URL where the crawl begins; its host bounds the crawl.

    Returns:
        A dict mapping each successfully scraped URL to its cleaned HTML
        string (an empty string when no content area was found).

    Pages that raise while loading or parsing are reported on stdout and
    skipped; they are not retried.
    """
    to_visit = {start_url}
    visited = set()
    scraped_data = {}
    base_netloc = urlparse(start_url).netloc
    driver = webdriver.Chrome(options=chrome_options)
    print(f"🚀 Starting structured scrape at: {start_url}")

    try:
        while to_visit:
            current_url = to_visit.pop()
            if current_url in visited:
                continue
            # Mark the URL before fetching so a page that fails is not
            # re-queued (and re-fetched) every time another page links to it.
            visited.add(current_url)
            try:
                print(f"Scraping: {current_url}")
                driver.get(current_url)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Prefer the main content area to avoid irrelevant headers/footers
                content_area = soup.find('main')
                if content_area is None:
                    content_area = soup.find('body')

                # Clean *within* the HTML, keeping header tags intact
                if content_area:
                    for element in content_area(['script', 'style', 'nav', 'footer', 'aside']):
                        element.decompose()  # Remove noise, but keep h1, h2, p, etc.

                scraped_data[current_url] = str(content_area) if content_area else ""

                # Queue same-host links that have not been visited yet
                for link in soup.find_all('a', href=True):
                    absolute_link = urljoin(current_url, link['href'])
                    parsed_link = urlparse(absolute_link)
                    if (parsed_link.netloc == base_netloc and
                        parsed_link.scheme in ['http', 'https'] and
                        absolute_link not in visited and
                        # Skip in-page anchors in the last path segment
                        "#" not in absolute_link.split('/')[-1]):
                        to_visit.add(absolute_link)
            except Exception as e:
                # Best-effort crawl: report the failure and move on
                print(f"Could not process {current_url}: {e}")
    finally:
        # Always release the browser process, even on KeyboardInterrupt
        driver.quit()

    print(f"\n Scraping complete. Found {len(scraped_data)} pages.")
    return scraped_data

# --- Main Execution ---
# Crawl the company site, then persist the url -> cleaned-HTML mapping as JSON.
website_url = "https://houseofshipping.com"
output_filename = "structured_web_content.json"

structured_content = scrape_website_for_structured_html(website_url)

# ensure_ascii=False keeps non-ASCII page text readable in the UTF-8 file
serialized_content = json.dumps(structured_content, indent=4, ensure_ascii=False)
with open(output_filename, mode="w", encoding="utf-8") as f:
    f.write(serialized_content)
print(f" Structured HTML content saved to '{output_filename}'")

# EDA (Exploratory Data Analysis) N-gram Analysis

In [None]:
# Load Structured Data & Perform Full EDA
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup

# --- 1. Load the Scraped Data ---
# Reads the url -> raw HTML mapping written by the scraper cell into a DataFrame.
json_file_path = 'structured_web_content.json'
try:
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(list(data.items()), columns=['url', 'raw_html'])
    print(f" Successfully loaded {len(df)} structured documents from '{json_file_path}'.")
except Exception as e:
    print(f" Error: Could not load the file. Please ensure the scraper in Cell 1 has been run successfully. Error: {e}")
    # Stop here: every statement below needs `df`. Continuing would either hit
    # a NameError or silently reuse a stale `df` from an earlier run.
    raise


# --- 2. For EDA, extract and clean plain text from HTML ---
# This is for analysis only; the raw_html is preserved for chunking.
df['text_for_eda'] = df['raw_html'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text(separator=' ', strip=True).lower())


# --- 3. Advanced EDA: Find and Visualize Top N-grams ---
print("--- Analyzing Top 2-Word and 3-Word Phrases ---")

def get_top_ngrams(corpus, n=None, ngram_range=(1, 1)):
    """Return the ``n`` most frequent n-grams in ``corpus`` as (phrase, count) pairs.

    English stop words are excluded. Pairs are ordered by descending count;
    ``n=None`` returns every n-gram found.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    vectorizer.fit(corpus)
    # Column-wise totals: one count per vocabulary entry across all documents
    totals = vectorizer.transform(corpus).sum(axis=0)
    phrase_counts = []
    for phrase, column in vectorizer.vocabulary_.items():
        phrase_counts.append((phrase, totals[0, column]))
    phrase_counts.sort(key=lambda pair: pair[1], reverse=True)
    return phrase_counts[:n]

# Top 20 two-word and three-word phrases, each as a (phrase, count) table
top_bigrams = get_top_ngrams(df['text_for_eda'], n=20, ngram_range=(2, 2))
top_trigrams = get_top_ngrams(df['text_for_eda'], n=20, ngram_range=(3, 3))
bigram_df = pd.DataFrame(top_bigrams, columns=['phrase', 'count'])
trigram_df = pd.DataFrame(top_trigrams, columns=['phrase', 'count'])


# --- 4. Visualize the Results ---
# One horizontal bar chart per n-gram size, side by side in a single figure.
plt.figure(figsize=(20, 10))

# (data, palette, title, y-label text, y-label kwargs) for each panel;
# the right-hand panel hides its y-label for a cleaner look.
panel_specs = [
    (bigram_df, 'viridis', 'Top 20 Most Common 2-Word Phrases', "Phrase", {"fontsize": 12}),
    (trigram_df, 'plasma', 'Top 20 Most Common 3-Word Phrases', "", {}),
]
for panel_number, (frame, colors, heading, y_text, y_options) in enumerate(panel_specs, start=1):
    plt.subplot(1, 2, panel_number)
    sns.barplot(x='count', y='phrase', data=frame, palette=colors, hue='phrase', legend=False)
    plt.title(heading, fontsize=16)
    plt.xlabel("Frequency", fontsize=12)
    plt.ylabel(y_text, **y_options)

plt.suptitle("Advanced EDA: Key Phrase Analysis", fontsize=20, weight='bold')
# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# MANUAL Q&A PAIR FEEDING
## Hand-written Q&A pairs for questions that the scraped data does not specifically cover.

In [None]:
# Create a string with our high-quality, manual Q&A pairs
# Layout: each question is a line starting with "# ", followed by a blank line
# and then its answer text (plain sentences or "- " bullet items).
manual_qa_content = """
# What specific IT services are offered by House of Shipping?

House of Shipping provides a comprehensive suite of IT services tailored for the logistics and shipping industry, including:
- Custom freight management software development.
- Cybersecurity audits and solutions for logistics networks.
- Cloud migration and infrastructure management (AWS, Azure).
- Data analytics platforms and predictive maintenance dashboards for fleets.
- Implementation of digital twin technologies for supply chain optimization.

# What specific legal services does the company provide?

The legal team at House of Shipping offers several key services to ensure compliance and manage risk:
- Company Incorporation: Assisting businesses in establishing legal entities and streamlining corporate structures.
- Legal Operations Optimization: Refining legal models to align with business goals.
- Compliance and Risk Management: Developing internal audit and corporate compliance programs.
- Strategic Legal Guidance: Helping to develop and monitor a group’s legal strategy.
- Claims Handling: Covering a wide array of needs, leveraging deep knowledge of the maritime and logistics sectors.

# How did House of Shipping assist WeFreight with their expansion?

House of Shipping's marketing and HR teams acted as a strategic partner for WeFreight's global expansion. This included developing a global talent acquisition strategy to quickly staff new international offices and creating targeted marketing campaigns to establish a brand presence in new markets. The direct result was WeFreight's swift and successful launch in multiple new countries, including Mexico, and their recognition as a "Great Place to Work."
"""

# Write the content to a new file
with open("manual_qa.txt", "w") as f:
    f.write(manual_qa_content)

print("✅ Comprehensive manual Q&A file 'manual_qa.txt' created successfully.")

# Load New Knowledge and create data frames

In [None]:
# Read the content from our new manual Q&A file
with open("manual_qa.txt", "r") as f:
    new_knowledge = f.read()

# Create a new row for the DataFrame
new_row = pd.DataFrame([{
    "url": "internal_document_qa.txt",
    "raw_html": f"<html><body>{new_knowledge}</body></html>", # Wrap in basic HTML for the splitter
    "text_for_eda": new_knowledge
}])

# Add this new knowledge to our main DataFrame
df = pd.concat([df, new_row], ignore_index=True)

print(f"✅ New Q&A knowledge added. Total documents now: {len(df)}")

# DATA Preparation and Advanced Cleaning

In [None]:
# Structural Chunking and Templating
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup

# 1. Define the primary splitter that understands HTML structure
headers_to_split_on = [
    ("h1", "H1"),
    ("h2", "H2"),
    ("h3", "H3"),
    ("h4", "H4"),
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on, return_each_element=False)

# 2. Define a fallback splitter for any large chunks of text without headers
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 3. Process the data from the DataFrame
# Every kept chunk ends up in `all_documents` with its header metadata plus a
# 'source' entry holding the row's URL.
all_documents = []
for index, row in df.iterrows():
    # Ensure the content is a non-empty string
    if not isinstance(row['raw_html'], str) or not row['raw_html'].strip():
        continue

    # First, try to split by headers.
    header_chunks = html_splitter.split_text(row['raw_html'])

    # Now, iterate through these chunks and split any that are too large
    for chunk in header_chunks:
        # Clean up the text content within the chunk
        chunk_text = BeautifulSoup(chunk.page_content, 'html.parser').get_text(separator=' ', strip=True)

        # Skip chunks with very little actual text (fewer than 10 words)
        if len(chunk_text.split()) < 10:
            continue

        if len(chunk_text) > 1000:
            # If a chunk is too big, split it recursively but keep its header metadata
            sub_chunks = recursive_splitter.create_documents([chunk_text])
            for sub_chunk in sub_chunks:
                # Add the original header metadata to the new sub-chunk
                sub_chunk.metadata = chunk.metadata.copy()
                sub_chunk.metadata['source'] = row['url']
                all_documents.append(sub_chunk)
        else:
            # If the chunk is a good size, just update its content and metadata
            chunk.page_content = chunk_text
            chunk.metadata['source'] = row['url']
            all_documents.append(chunk)

print(f" Successfully created {len(all_documents)} structurally-aware documents (chunks).")

# Display a sample chunk to see the result
if all_documents:
    print("\n--- SAMPLE CHUNK ---")
    # Prefer index 5 for variety, but fall back to the last chunk when fewer
    # than six exist so a small corpus does not raise IndexError.
    sample_chunk = all_documents[min(5, len(all_documents) - 1)]
    print(sample_chunk.page_content)
    print(f"\nMetadata: {sample_chunk.metadata}")

# Vector Knowledge base for RAG.

In [None]:
# Build RAG Knowledge Base (Vector Store)
# Use the same integration packages as the cell that later reloads the store
# (langchain_huggingface / langchain_chroma, both installed above) so the
# database is written and read by the same implementation.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document # Included for clarity

print("\n--- Building RAG Knowledge Base ---")

# 1. Load the Embedding Model
# This model will convert your text chunks into numerical vectors.
print("Loading embedding model (this may take a minute)...")
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'} # Use CPU for broad compatibility
)
print(" Embedding model loaded.")


# 2. Create the Chroma Vector Store from the documents
# This is the step where your chunks are embedded and stored.
print("Storing chunks in the vector database...")
vector_store = Chroma.from_documents(
    documents=all_documents,
    embedding=embeddings,
    persist_directory="./final_chroma_db"  # This saves the database to a local folder for reuse
)
print(" Vector store created.")


print("\n--- RAG Data Preparation Complete! ---")
print("The 'vector_store' variable now holds your complete, searchable knowledge base.")

# Adding Gemini as the Powerful Generalist Model.

In [None]:
# Setup for Google Gemini
import os
from google.colab import userdata

# Copy the Colab secret into the environment variable GOOGLE_API_KEY.
try:
    _google_api_key = userdata.get('GOOGLE_API_KEY')
except userdata.SecretNotFoundError:
    print(" Error: Secret 'GOOGLE_API_KEY' not found. Please add it to your Colab secrets.")
else:
    os.environ["GOOGLE_API_KEY"] = _google_api_key
    print(" Google API Key configured successfully.")

# Loading the Knowledge back into the run-time

In [None]:
# Load the Existing Vector Database
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

print("--- Loading the Knowledge Base from Disk ---")

# Folder the vector database was saved to when it was built
persist_directory = "./final_chroma_db"

# Re-create the embedding model with the same settings used at build time
embedding_settings = {
    "model_name": "all-MiniLM-L6-v2",
    "model_kwargs": {'device': 'cpu'},
}
embeddings = HuggingFaceEmbeddings(**embedding_settings)

# Open the saved store, attaching the embedding function
vector_store = Chroma(embedding_function=embeddings, persist_directory=persist_directory)

print(" Knowledge Base loaded successfully!")

# Define and Build RAG Engine

## Prompt Stuffing File.

In [None]:
#  Create the Structured JSON Prompt Files
import json

# 1. Define the prompt structure for the history-aware retriever
# Each entry is a (role, content) pair; the "placeholder" role names the
# variable that will carry the chat history messages.
contextualize_q_prompt_data = [
    ("system", "Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is."),
    ("placeholder", "{chat_history}"),
    ("human", "{input}"),
]

# 2. Define the prompt structure for the final QA chain
# Same [role, content] layout, written as lists here. The system message
# expects a {context} variable in addition to {chat_history} and {input}.
qa_prompt_data = [
    [
        "system",
        "You are an expert assistant for the House of Shipping company. Your tone should be professional and helpful. Answer the user's latest question using both the chat history and the retrieved context.\n\n**Rules:**\n- Synthesize information from both the 'Retrieved Context' and the 'Chat History' to form a complete, conversational answer.\n- If the 'Retrieved Context' does not contain information to answer the latest question, you MUST state: \"I do not have enough information to answer that specific question.\"\n- Do not contradict information that has already been established in the 'Chat History'.\n\n**Retrieved Context:**\n{context}\n"
    ],
    [
        "placeholder",
        "{chat_history}"
    ],
    [
        "human",
        "{input}"
    ]
]

# 3. Save each prompt structure to its own pretty-printed JSON file
prompt_files = (
    ("contextualize_prompt.json", contextualize_q_prompt_data),
    ("qa_prompt.json", qa_prompt_data),
)
for prompt_path, prompt_payload in prompt_files:
    with open(prompt_path, "w") as f:
        f.write(json.dumps(prompt_payload, indent=2))

print("✅ Structured prompt files 'contextualize_prompt.json' and 'qa_prompt.json' created.")

# History-Aware Retriever

In [None]:
from langchain.memory import ChatMessageHistory

# Module-level, initially empty conversation store.
# NOTE(review): nothing else in the visible notebook reads `chat_history`; the
# test runner creates its own ChatMessageHistory per test case — confirm
# whether this object is still needed.
chat_history = ChatMessageHistory()

print("✅ Chat history object initialized.")

# Create an Advanced Retriever for Strict Answers from the Vector DB.

In [None]:
#  Define the Re-ranking Retriever
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

# 1. Define the LLM and Helper Function
# Model id fixed: the original "gemini-1.5-flask" was a typo for "flash".
# NOTE(review): confirm this model id is still offered for your API key.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
def format_docs(docs):
    """Join the page_content of each document, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)

# 2. Create the BASE retriever from your vector store.
#    It fetches 10 candidates (k=10) for the re-ranker to work on.
base_retriever = vector_store.as_retriever(search_kwargs={"k": 10})

# 3. Instantiate the FlashrankRerank compressor.
#    It's simpler and automatically uses a fast, effective model.
compressor = FlashrankRerank()

# 4. Create the final Contextual Compression Retriever.
retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=base_retriever
)

print(" Advanced re-ranking retriever created using Flashrank.")

# Create Conversational History Aware Retriever.

In [None]:
import json
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser

# Read the question-rewriting prompt definition back from disk
with open("contextualize_prompt.json", "r") as f:
    contextualize_q_prompt_data = json.loads(f.read())


# Turn each [role, content] pair into a prompt message. A "placeholder" entry
# becomes an explicit MessagesPlaceholder named after the text inside the
# curly braces (e.g. "{chat_history}" -> "chat_history").
messages = [
    MessagesPlaceholder(variable_name=content.strip('{}'))
    if role == "placeholder"
    else (role, content)
    for role, content in contextualize_q_prompt_data
]

contextualize_q_prompt = ChatPromptTemplate.from_messages(messages)

# Chaining: rewrite the question with the LLM, reduce the reply to a plain
# string, then hand that string to the retriever.
history_aware_retriever_chain = contextualize_q_prompt | llm | StrOutputParser() | retriever

print(" History-aware retriever chain created successfully.")

# Final RAG Engine and Chaining.

In [None]:
import json
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser

# Read the answer-generation prompt definition back from disk
with open("qa_prompt.json", "r") as f:
    qa_prompt_data = json.loads(f.read())

# Convert each [role, content] entry into a prompt message; "placeholder"
# entries become explicit MessagesPlaceholder objects named after the text
# inside the curly braces.
messages = [
    MessagesPlaceholder(variable_name=content.strip('{}'))
    if role == "placeholder"
    else (role, content)
    for role, content in qa_prompt_data
]
qa_prompt = ChatPromptTemplate.from_messages(messages)


def retrieve_and_format_context(inputs):
    """Retrieve documents for the current question and return them as one string.

    ``inputs`` must provide "input" and "chat_history"; both are passed to the
    history-aware retriever chain and the resulting documents are formatted
    with ``format_docs``.
    """
    retriever_payload = {key: inputs[key] for key in ("input", "chat_history")}
    return format_docs(history_aware_retriever_chain.invoke(retriever_payload))
# --------------------


# Assemble the final conversational RAG chain.
# First stage: pass the question and history through unchanged and compute the
# retrieved context, producing the three variables the QA prompt expects.
prompt_inputs = {
    "input": lambda payload: payload["input"],
    "chat_history": lambda payload: payload["chat_history"],
    "context": retrieve_and_format_context,
}

# Remaining stages: fill the prompt, call the LLM, return the reply as text.
conversational_rag_chain = prompt_inputs | qa_prompt | llm | StrOutputParser()

print(" Final conversational RAG chain is built and ready for testing.")

# TESTING

In [None]:
import time
from datetime import datetime

# Define the test suite as a list of test cases
# Each case has a display "name", a "type" label, and "steps": the questions
# asked in order within one conversation (multi-step cases exercise memory).
test_suite = [
    {
        "name": "Fact Retrieval about Services",
        "type": "Knowledge",
        "steps": ["What kind of IT services are offered by House of Shipping?"]
    },
    {
        "name": "Conceptual Question from Insights",
        "type": "Knowledge",
        "steps": ["Based on your articles, what is the importance of digital transformation for the shipping industry?"]
    },
    {
        "name": "Rule-Following - Out of Scope Question",
        "type": "Rule-Following",
        # The question embeds the runtime's local clock reading taken when this list is built.
        # NOTE(review): datetime.now() is naive local time of the runtime, which may not be Lucknow time — confirm intent.
        "steps": [f"What is the current time in Lucknow? It is currently {datetime.now().strftime('%I:%M:%S %p')}."]
    },
    {
        "name": "Conversational Memory - Simple Follow-up",
        "type": "Memory",
        "steps": [
            "How did House of Shipping assist WeFreight with their expansion?",
            "What was the direct result of that assistance?"
        ]
    },
    {
        "name": "Conversational Memory - Topic Change",
        "type": "Memory",
        "steps": [
            "Tell me about the company's legal services.",
            "Does that also include any marketing services?"
        ]
    },
    {
        "name": "Rule-Following - Plausible but Unavailable Detail",
        "type": "Rule-Following",
        "steps": ["Can you give me the direct phone number for the Global CEO, Alessandra Ronco?"]
    }
]

# --- Test Runner ---
# Runs every test case in its own conversation, printing each question and answer.
print(" STARTING COMPREHENSIVE AGENT TEST SUITE ")
print("="*50)

total_tests = len(test_suite)
for i, test in enumerate(test_suite):
    print(f"\n EXECUTING TEST {i+1}: {test['name']} ({test['type']})")
    print("-"*40)

    # Initialize a fresh, independent chat history for each test case
    test_chat_history = ChatMessageHistory()

    # Loop through the steps (questions) for the current test
    steps = test['steps']
    for step, question in enumerate(steps):
        print(f"\nStep {step + 1} of {len(steps)}")
        print(f" You: {question}")

        # Invoke the conversational chain with the current question and the test's history
        answer = conversational_rag_chain.invoke({
            "input": question,
            "chat_history": test_chat_history.messages
        })

        print(f"\n🤖 AI: {answer}")

        # Update the test's chat history for the next step
        test_chat_history.add_user_message(question)
        test_chat_history.add_ai_message(answer)

        # Pause for 20 seconds after every prompt to respect API rate limits.
        # Only the very last prompt of the whole suite skips the wait; pausing
        # merely between steps of one test would let consecutive single-step
        # tests hit the API back-to-back.
        is_final_prompt = (i == total_tests - 1) and (step == len(steps) - 1)
        if not is_final_prompt:
            print("\n Pausing for 20 seconds...")
            time.sleep(20)

    print("\n" + "="*50)

print("\n TEST SUITE COMPLETE,,, Shakalaka BOOM BOOM ")

## Next, improve the prompt rules to handle user queries more effectively.

## If needed, ingest more data to keep the agent up to date, or add a cron job that re-scrapes the site at a regular interval.