# Parser

In [9]:
# backend/sitemap_parser.py
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse, urlunparse

HEADERS = {"User-Agent": "Mozilla/5.0"}

def normalize_url(url: str) -> str:
    """Reduce *url* to its canonical site root, ``https://www.<host>``.

    The input is stripped and lower-cased, the scheme is forced to ``https``,
    a ``www.`` prefix is added to the host when it is missing, and the path,
    params, query and fragment are discarded.

    Args:
        url: A bare domain (``example.com``) or a URL that starts with
            ``http://`` or ``https://``.

    Returns:
        The normalized root URL, e.g. ``https://www.example.com``.
    """
    url = url.lower().strip()

    # Test for a complete scheme prefix; a bare "http" check would also match
    # hosts that merely begin with those letters (e.g. "httpbin.org").
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Work on the parsed host instead of string-prefixing the whole URL:
    # prefixing turned "http://example.com" into "https://www.http://example.com".
    host = urlparse(url).netloc
    if not host.startswith("www."):
        host = "www." + host

    # Scheme + netloc only (strip path, params, query, fragment).
    return urlunparse(("https", host, "", "", "", ""))

def fetch_sitemap(url):
    """Fetch a single XML sitemap and return the text of its ``<loc>`` entries.

    Only one layer is fetched; nested sitemaps are not followed here.

    Args:
        url: Address of the XML sitemap to download.

    Returns:
        A list of the non-empty, whitespace-stripped ``<loc>`` values. An empty
        list is returned when the request, the HTTP status check or the parsing
        fails (the error is printed, not raised).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml-xml')
        # Pretty-printed sitemaps put whitespace/newlines around the URL inside
        # <loc>; strip it so suffix checks and later requests see a clean URL.
        stripped = (loc.text.strip() for loc in soup.find_all('loc'))
        return [entry for entry in stripped if entry]
    except Exception as e:
        # Best-effort by design: a sitemap that cannot be read yields no URLs.
        print(f"Error: {e}")
        return []
def parse_sitemap(url, _ancestors=frozenset()):
    """Parse a sitemap layer by layer, following nested ``.xml`` sitemaps.

    Applies ``fetch_sitemap`` to *url* and recursively to every entry that ends
    with ``.xml``.

    Args:
        url: Address of the sitemap to parse.
        _ancestors: Internal. Sitemap URLs already being parsed on the current
            recursion path; callers should not pass it.

    Returns:
        One of four shapes, depending on what the sitemap contains:

        * ``{url: []}`` when nothing could be fetched from *url*;
        * a list of page URLs when the sitemap lists only pages;
        * ``{url: {nested_sitemap: subtree, ...}}`` when it lists only
          nested sitemaps;
        * ``{nested_sitemap: subtree, ..., "_final_urls": [pages]}`` when it
          lists both.
    """
    locs = fetch_sitemap(url)
    if not locs:
        return {url: []}

    lineage = _ancestors | {url}
    tree = {}
    urls = []

    for loc in locs:
        if loc.endswith('.xml'):
            if loc in lineage:
                # A sitemap index that references itself or one of its parents
                # would recurse forever; record it like an unreadable sitemap.
                tree[loc] = {loc: []}
            else:
                tree[loc] = parse_sitemap(loc, lineage)
        else:
            urls.append(loc)

    if tree and urls:
        tree["_final_urls"] = urls
        return tree
    elif urls:
        return urls
    else:
        return {url: tree}

def extract_final_urls(url):
    """List every page URL reachable from a site's ``/sitemap.xml``.

    Args:
        url: Website address; it is passed through ``normalize_url`` first.

    Returns:
        A ``(final_urls, tree)`` tuple. ``final_urls`` starts with the
        normalized site root, followed by each page URL found in the sitemap
        tree, without duplicates and in first-seen order. ``tree`` is the raw
        structure returned by ``parse_sitemap``.
    """
    url = normalize_url(url)
    final_urls = [url]
    if not url.endswith('/sitemap.xml'):
        url += '/sitemap.xml'
    tree = parse_sitemap(url)

    def _walk_tree(node):
        # Dicts hold nested sitemaps (plus the "_final_urls" page list);
        # lists hold page URLs directly.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "_final_urls" and isinstance(value, list):
                    final_urls.extend(value)
                else:
                    _walk_tree(value)
        elif isinstance(node, list):
            final_urls.extend(v for v in node if not v.endswith('.xml'))

    _walk_tree(tree)

    # The same page can be listed by several sitemaps; drop repeats (keeping
    # order) so callers do not process one URL more than once.
    return list(dict.fromkeys(final_urls)), tree

# Scraper

In [10]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time


HEADERS = {"User-Agent": "Mozilla/5.0"}

def scrape(url):
    """Return the visible text of the page at *url*, or ``""`` on failure.

    A plain HTTP request is tried first. If it raises or yields no text, the
    page is loaded in headless Chrome and the text of its ``<p>`` elements is
    returned instead.

    Args:
        url: Address of the page to scrape.

    Returns:
        The extracted text, or an empty string when both attempts fail.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        if text:
            return text
    except Exception:
        # Deliberate best-effort: any failure here falls through to the
        # browser-based attempt below. (Not a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still propagate.)
        pass

    driver = None
    try:
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        time.sleep(3)  # fixed wait before reading the DOM
        elems = driver.find_elements(By.TAG_NAME, "p")
        return ' '.join(elem.text for elem in elems).strip()
    except Exception:
        return ""
    finally:
        # Always shut the browser down, even when loading or reading the page
        # raised; otherwise every failed page leaves a Chrome process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass

# The text returned by scrape() is the input to the RAG model.

# Visualizer

In [11]:
import networkx as nx
from pyvis.network import Network
import json

def tree_to_edges(tree, parent=None):
    """Flatten a nested sitemap tree into ``(parent, child)`` edge tuples.

    A list contributes one edge from *parent* to each item. A dict contributes
    an edge from *parent* to each key (only when *parent* is truthy) followed by
    the edges of that key's value, with the key acting as the new parent.
    Any other value contributes no edges.
    """
    if isinstance(tree, list):
        return [(parent, leaf) for leaf in tree]

    if not isinstance(tree, dict):
        return []

    collected = []
    for child, subtree in tree.items():
        if parent:
            collected.append((parent, child))
        collected.extend(tree_to_edges(subtree, parent=child))
    return collected

def generate_graph(sitemap_url, output_file="sitemap_network.html", json_filename=None):
    """Parse a sitemap and write it out as an interactive HTML network graph.

    The parsed tree is also dumped as JSON. After pyvis writes the HTML file,
    a small script is injected that hides the loading bar and shows a node's
    URL as its label when the node is clicked.

    Args:
        sitemap_url: Address of the root sitemap; used as the root node id.
        output_file: Path of the HTML file to write.
        json_filename: Path of the JSON dump; defaults to ``sitemap_tree.json``.

    Returns:
        ``output_file``.
    """
    tree = parse_sitemap(sitemap_url)

    # Default name if not provided
    if json_filename is None:
        json_filename = "sitemap_tree.json"

    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(tree, f, indent=2)

    # parse_sitemap only wraps its result under the root URL in some cases; it
    # returns a plain list for a flat sitemap and an unwrapped dict for a mixed
    # one. Indexing with tree[sitemap_url] unconditionally raised for those.
    if isinstance(tree, dict) and sitemap_url in tree:
        subtree = tree[sitemap_url]
    else:
        subtree = tree
    edges = tree_to_edges(subtree, parent=sitemap_url)

    G = nx.DiGraph()
    # Register the root explicitly so it is drawn even when there are no edges.
    G.add_node(sitemap_url)
    G.add_edges_from(edges)
    net = Network(height="750px", width="100%", directed=True, notebook=False)

    for node in G.nodes():
        if node == sitemap_url:
            net.add_node(node, label=str(urlparse(sitemap_url).netloc), title=node, shape='dot', size=30,
                         color={"background": "white", "border": "blue"}, borderWidth=4,
                         font={"color": "black", "size": 35, "bold": True})
        elif node.endswith('.xml'):
            net.add_node(node, label="Sitemap", title=node, shape='dot', size=25)
        else:
            net.add_node(node, label=" ", title=node, shape='dot', size=15,
                         color={"background": "#ccffcc", "border": "#009933"})

    for source, target in G.edges():
        net.add_edge(source, target)

    net.force_atlas_2based()
    net.set_options("""
    { "physics": { "stabilization": false }, "interaction": { "dragNodes": true } }
    """)
    net.write_html(output_file)

    # Read the generated HTML
    with open(output_file, "r", encoding="utf-8") as f:
        html = f.read()

    # Serialize the root id as a JS string literal instead of pasting the raw
    # URL between quotes, so quotes/backslashes in the URL cannot break the
    # script; "</" is escaped so the value cannot close the <script> element.
    root_node_js = json.dumps(sitemap_url).replace("</", "<\\/")

    # Inject JS
    inject_js = f"""
    <script type="text/javascript">
    window.addEventListener("load", function () {{
        // ✅ Hide the Pyvis loading bar if it exists
        const loader = document.getElementById("loadingBar");
        if (loader) loader.style.display = "none";

        const rootNodeId = {root_node_js};
        const titleNodeId = "graph_title";
        const originalLabels = {{}};

        network.on("click", function (params) {{
        if (params.nodes.length > 0) {{
            let clickedNodeId = params.nodes[0];

            nodes.get().forEach(function (node) {{
            if (!(node.id in originalLabels)) {{
                originalLabels[node.id] = node.label;
            }}

            if (node.id === rootNodeId || node.id === titleNodeId) {{
                return;
            }}

            if (node.id === clickedNodeId) {{
                nodes.update({{id: node.id, label: node.title}});
            }} else {{
                nodes.update({{id: node.id, label: ""}});
            }}
            }});
        }} else {{
            nodes.get().forEach(function (node) {{
            if (node.id !== rootNodeId && node.id !== titleNodeId && originalLabels[node.id] === "") {{
                nodes.update({{id: node.id, label: ""}});
            }}
            }});
        }}
        }});
    }});
    </script>
    """

    # Inject before </body>. A real newline is intended here; the previous
    # "\\n" wrote a literal backslash-n into the page.
    html = html.replace("</body>", inject_js + "\n</body>")

    # Save it back
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"Output files successfully generated: {output_file} and {json_filename}")
    return output_file


# RAG model (using Langchain)
## Database loader (run every day to refresh the database with the latest website content)

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain_community.vectorstores import SupabaseVectorStore
from supabase import create_client, Client
from tqdm import tqdm
import json
import getpass
import os
import dotenv
from langchain_core.documents import Document
# Load environment variables from .env file
dotenv.load_dotenv()

# Ask for the OpenAI key interactively when it is not already in the environment.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")
else:
  # NOTE: this branch runs whenever the variable is set, whether it came from
  # the .env file or from the process environment.
  print(f"Openai API key successfully imported from .env file.")
  # Debug only (prints the secret key): print(f"Key: {os.environ.get('OPENAI_API_KEY')}")

# Supabase credentials; os.environ.get returns None when a variable is unset.
# NOTE(review): presumably create_client rejects a None URL/key - confirm.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Embedding model, shared by the loader and by the vector store below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Chat model used to generate answers.
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

# Vector store used for retrieval; it shares the Supabase client and the
# embedding model defined above.
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embeddings,
    table_name="prime_ready.webcontent",  # Table holding the chunks; change to match your database
    query_name="match_documents"  # Needs to be created in Supabase SQL
)

def RAG_scraper_loader(company_name, website):
    """Re-scrape a company's website and store its embedded chunks in Supabase.

    Steps: remove the company's existing data, list the site's URLs from its
    sitemap, insert the company row, then scrape, chunk, embed and insert the
    content of each URL.

    Args:
        company_name: Display name stored with the company row.
        website: Website URL; used as the deletion key and stored as the link.

    Returns:
        None. All results are written to Supabase through RPC calls.
    """
    # Delete company data from the database (both tables) - to avoid outdated
    # data the entire company data is removed from both tables.
    supabase.rpc("delete_company_by_url", {
        "target_url": website
    }).execute()

    # Extract sitemap URLs
    url_list, tree = extract_final_urls(website)

    # Add company into company table in Supabase
    company_id = supabase.rpc("insert_company", {
        "company_name": company_name,
        "link": website,  # ✅ must match SQL function param name
        "sitemap": json.dumps(tree)  # Add the sitemap data
    }).execute().data

    # The splitter configuration never changes, so build it once instead of
    # once per page.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=500)

    for link in tqdm(url_list, desc=f"Scraping & indexing ({company_name})", unit="link"):
        # Scrape the URL
        text = scrape(link)
        if not text or not text.strip():
            continue  # skip empty pages

        # Create metadata and document content
        metadata = {
            "source": str(link),
            "website": str(website),
            "company_id": str(company_id)
        }
        page = Document(page_content=text, metadata=metadata)

        # Chunking
        chunks = text_splitter.split_documents([page])
        if not chunks:
            continue

        # Embed all chunks of the page in one batched call. embed_documents is
        # the indexing-side API; embed_query is meant for search queries.
        vectors = embeddings.embed_documents([chunk.page_content for chunk in chunks])

        # Index chunks and store in Supabase
        for chunk, vector in zip(chunks, vectors):
            supabase.rpc("insert_webcontent", {
                "company_id": company_id,
                "source": link,
                "content": chunk.page_content,
                "metadata": metadata,
                "embedding": vector  # list of floats; pgvector input accepted here
            }).execute()

# Rebuild the stored data for one company. The loader first deletes the
# company's existing rows, then scrapes and indexes the site again.
company_name = "Northlight AI"
website = "https://northlightai.com"
RAG_scraper_loader(company_name, website)

Openai API key successfully imported from .env file.


Scraping & indexing (Northlight AI): 100%|██████████| 34/34 [00:37<00:00,  1.09s/link]


## Step 2) Retrieval and Generation

### Prompts

In [13]:
from langchain import hub
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
from langgraph.graph import START, StateGraph
from IPython.display import Image, display

## Prompt - custom
# The template has two placeholders, {context} and {question}; generate() fills
# both. NOTE: the company name is hard-coded in the greeting instruction.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use five sentences maximum and keep the answer as concise as possible.
Always start the answer with a sentence like "Thanks for asking question about North Light AI!"; but be innovative and each time use a similar welcoming message.

{context}

Question: {question}

Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

# Render the template once with dummy values; the result is not referenced
# again in the visible code.
example_messages = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()

## State and Nodes
class State(TypedDict):
    """State passed between the graph nodes (``retrieve`` then ``generate``)."""

    # The user's question; supplied when the graph is invoked.
    question: str
    # Documents returned by retrieve() for the question.
    context: List[Document]
    # Model reply text produced by generate().
    answer: str

def retrieve(state: State):
    """Look up documents similar to the question and return them as ``context``."""
    query = state["question"]
    matches = vector_store.similarity_search(query)
    return {"context": matches}

def generate(state: State):
    """Answer the question from the retrieved context and return it as ``answer``."""
    # Join the retrieved passages, separated by blank lines, into one context string.
    passages = [doc.page_content for doc in state["context"]]
    filled_prompt = prompt.invoke(
        {"question": state["question"], "context": "\n\n".join(passages)}
    )
    reply = llm.invoke(filled_prompt)
    return {"answer": reply.content}

## Compile the graph
# Two-node pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
# Start the graph at the "retrieve" node.
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()


### Usage

In [14]:
def chatbot():
    """Prompt for one question, run it through the graph and print the answer."""
    user_question = input("What would you like to know? ")
    result = graph.invoke({"question": user_question})
    print(result["answer"])


In [15]:
# Run one interactive question/answer round (waits for keyboard input).
chatbot()

Thanks for asking about Dr. Roozbeh Ghasemi at North Light AI! He is the Lead Data & AI Analyst with a multidisciplinary background, currently pursuing a Ph.D. in Civil and Environmental Engineering. Dr. Ghasemi holds an MBA in Information Systems and Business Analytics, and his expertise includes machine learning, AI, and sustainable energy systems. He has professional experience in engineering supervision and project management, focusing on environmental systems. His work at North Light AI bridges theoretical concepts with practical applications to promote sustainable technology.
