# Parser

In [3]:
# backend/sitemap_parser.py
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse, urlunparse

HEADERS = {"User-Agent": "Mozilla/5.0"}

In [4]:
def normalize_url(url: str) -> str:
    """Reduce a user-supplied address to a canonical ``scheme://www.host`` root.

    The input is trimmed and lower-cased, given an ``https://`` scheme when it
    has no ``http://``/``https://`` prefix, and its host is prefixed with
    ``www.`` when that prefix is missing. Path, params, query and fragment are
    discarded.

    Args:
        url: A bare domain (``"example.com"``) or a full http(s) URL.

    Returns:
        The site root, e.g. ``"https://www.example.com"``. An explicit
        ``http://`` scheme is kept as given.
    """
    url = url.strip().lower()

    # Test for a real scheme prefix: a bare "http" test would also match
    # hosts that merely start with those letters (e.g. "httpbin.org").
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    parsed = urlparse(url)

    # Add "www." to the parsed host rather than by string replacement on the
    # whole URL; the string approach only handled "https://" and turned
    # "http://example.com" into "https://www.http:".
    netloc = parsed.netloc
    if not netloc.startswith("www."):
        netloc = "www." + netloc

    # Normalize to scheme + netloc only (strip path, params, query, fragment)
    return urlunparse((parsed.scheme, netloc, '', '', '', ''))

# Every spelling below is expected to normalise to the same site root.
# The argument is bound to a name first: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
for _candidate in (
    "nortlightai.com",
    "www.nortlightai.com",
    "https://www.nortlightai.com",
    "https://nortlightai.com",
    "https://nortlightai.com/",
    "https://nortlightai.com/content",
):
    print(f"Normalized URL: {normalize_url(_candidate)}")

Normalized URL: https://www.nortlightai.com
Normalized URL: https://www.nortlightai.com
Normalized URL: https://www.nortlightai.com
Normalized URL: https://www.nortlightai.com
Normalized URL: https://www.nortlightai.com
Normalized URL: https://www.nortlightai.com


In [5]:
def fetch_sitemap(url):
    """Download one sitemap document and return its ``<loc>`` values.

    Only the given document is read; nested sitemaps are not followed here.

    Args:
        url: Address of a sitemap (``.xml``) document.

    Returns:
        list[str]: Text of every ``<loc>`` element, whitespace-trimmed and with
        empty entries dropped. An empty list is returned when the request or
        the parsing fails; the error is printed, not raised.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml-xml')
        # Pretty-printed sitemaps can wrap the URL in spaces/newlines; trim
        # them so callers that test the URL suffix see the bare address.
        locs = (loc.text.strip() for loc in soup.find_all('loc'))
        return [loc for loc in locs if loc]
    except Exception as e:
        # Deliberate best-effort: a broken or missing sitemap yields no URLs
        # instead of aborting the whole crawl.
        print(f"Error: {e}")
        return []
# fetch_sitemap should receive .xml URLs; the bare homepage is tried as well
# to show what comes back for a non-sitemap address.
for _sitemap_candidate in ('https://northlightai.com/sitemap.xml', 'https://northlightai.com'):
    print(f"Fetched Sitemap: {fetch_sitemap(_sitemap_candidate)}")

Fetched Sitemap: ['https://northlightai.com/sitemap.website.xml', 'https://northlightai.com/sitemap.blog.xml', 'https://northlightai.com/sitemap.ols.xml']
Fetched Sitemap: []


In [6]:
def parse_sitemap(url, _ancestors=frozenset()):
    """Recursively expand a sitemap into a nested structure of URLs.

    ``fetch_sitemap`` is applied to ``url``; every returned entry ending in
    ``.xml`` is treated as a child sitemap and expanded the same way, every
    other entry is treated as a page URL.

    Args:
        url: Address of the sitemap to expand.
        _ancestors: Internal. Sitemap URLs on the current recursion path, used
            to break reference cycles. Callers should not pass it.

    Returns:
        The shape depends on what the sitemap contained:

        * nothing fetched: ``{url: []}``
        * only page URLs: a plain ``list`` of those URLs
        * only child sitemaps: ``{url: {child_url: child_result, ...}}``
        * both: ``{child_url: child_result, ..., "_final_urls": [pages]}``
          (this case is not wrapped under ``url``)
    """
    lineage = _ancestors | {url}

    locs = fetch_sitemap(url)
    if not locs:
        return {url: []}

    tree = {}
    urls = []

    for loc in locs:
        if loc.endswith('.xml'):
            # A sitemap that lists itself or one of its ancestors would
            # otherwise recurse until RecursionError; skip such back-references.
            if loc in lineage:
                continue
            tree[loc] = parse_sitemap(loc, lineage)
        else:
            urls.append(loc)

    if tree and urls:
        # "_final_urls" is a reserved key holding the page URLs that sit next
        # to child sitemaps at this level.
        tree["_final_urls"] = urls
        return tree
    elif urls:
        return urls
    else:
        return {url: tree}
# parse_sitemap should receive .xml URLs; the bare homepage is tried as well
# to show what comes back for a non-sitemap address.
for _sitemap_candidate in ('https://northlightai.com/sitemap.xml', 'https://northlightai.com'):
    print(f"Parsed Sitemap: {parse_sitemap(_sitemap_candidate)}")

Parsed Sitemap: {'https://northlightai.com/sitemap.xml': {'https://northlightai.com/sitemap.website.xml': ['https://northlightai.com/publications', 'https://northlightai.com/faq', 'https://northlightai.com/our-customers-1', 'https://northlightai.com/', 'https://northlightai.com/ai-strategic-consulting', 'https://northlightai.com/ai-innovation-lab', 'https://northlightai.com/our-team', 'https://northlightai.com/higher-education', 'https://northlightai.com/product-development', 'https://northlightai.com/north-light-values', 'https://northlightai.com/ai-training-%26-development-1', 'https://northlightai.com/technical-partners', 'https://northlightai.com/ai-literacy-sponsorships', 'https://northlightai.com/m/login', 'https://northlightai.com/m/reset', 'https://northlightai.com/m/create', 'https://northlightai.com/m/create-account'], 'https://northlightai.com/sitemap.blog.xml': ['https://northlightai.com/publications/f/north-light-ai-partners-with-unh-to-launch-affordable-ai-training', 'htt

In [7]:
def extract_final_urls(url):
    """Return the site root followed by every page URL found in its sitemap.

    The address is first passed through ``normalize_url``; ``/sitemap.xml`` is
    appended unless the normalised value already ends with it, and the result
    is expanded with ``parse_sitemap``.

    Args:
        url: Any address accepted by ``normalize_url``.

    Returns:
        list[str]: The normalised root URL, then the page URLs in the order a
        depth-first walk of the parsed tree encounters them. Entries ending in
        ``.xml`` inside plain lists are left out. Duplicates are not removed.
    """
    root = normalize_url(url)
    sitemap_address = root if root.endswith('/sitemap.xml') else root + '/sitemap.xml'

    def _leaves(node):
        # Plain lists hold page URLs (plus possibly stray sitemap links).
        if isinstance(node, list):
            yield from (entry for entry in node if not entry.endswith('.xml'))
            return
        if not isinstance(node, dict):
            return
        for name, child in node.items():
            if name == "_final_urls" and isinstance(child, list):
                # Reserved key: its list is taken as-is, without filtering.
                yield from child
            else:
                yield from _leaves(child)

    return [root, *_leaves(parse_sitemap(sitemap_address))]

# Three spellings of the same site, each run through the full pipeline.
for _seed in (
    'https://northlightai.com',
    'https://www.northlightai.com/',
    'https://northlightai.com/sitemap.xml',
):
    print(f"Final URLs: {extract_final_urls(_seed)}")

print(f"There are {len(extract_final_urls('https://northlightai.com/sitemap.xml'))} links ready to be scraped.")

# The output here is an input for the scraper or the visualizer.

Final URLs: ['https://www.northlightai.com', 'https://northlightai.com/publications', 'https://northlightai.com/faq', 'https://northlightai.com/our-customers-1', 'https://northlightai.com/', 'https://northlightai.com/ai-strategic-consulting', 'https://northlightai.com/ai-innovation-lab', 'https://northlightai.com/our-team', 'https://northlightai.com/higher-education', 'https://northlightai.com/product-development', 'https://northlightai.com/north-light-values', 'https://northlightai.com/ai-training-%26-development-1', 'https://northlightai.com/technical-partners', 'https://northlightai.com/ai-literacy-sponsorships', 'https://northlightai.com/m/login', 'https://northlightai.com/m/reset', 'https://northlightai.com/m/create', 'https://northlightai.com/m/create-account', 'https://northlightai.com/publications/f/north-light-ai-partners-with-unh-to-launch-affordable-ai-training', 'https://northlightai.com/publications/f/openais-new-o1-strawberry-model-released', 'https://northlightai.com/pub

# Scraper

In [8]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time


HEADERS = {"User-Agent": "Mozilla/5.0"}

def scrape(url):
    """Return the visible text of a page, or ``""`` when nothing can be read.

    Two strategies are tried in order:

    1. A plain ``requests`` GET parsed with BeautifulSoup; used when it yields
       any non-empty text. The HTTP status is not checked, so the body of an
       error page counts as text.
    2. Headless Chrome via Selenium, joining the text of all ``<p>`` elements
       after a fixed 3-second wait.

    Args:
        url: Page address to scrape.

    Returns:
        str: Extracted text; an empty string if both strategies fail.
    """
    # Strategy 1: static HTML.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        if text:
            return text
    except Exception:
        # Deliberate best-effort: any failure falls through to the browser.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        pass

    # Strategy 2: headless browser.
    driver = None
    try:
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        time.sleep(3)
        elems = driver.find_elements(By.TAG_NAME, "p")
        return ' '.join(elem.text for elem in elems).strip()
    except Exception:
        return ""
    finally:
        # Always shut the browser down; previously a failure in get() or
        # find_elements() skipped quit() and left the driver running.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
    
text = scrape("https://northlightai.com")
# Show a 100-character preview and the total length instead of the full text.
print(
    f"Scraped text: {text[:100]}...",
    f"len(text): {len(text)} characters",
    sep="\n",
)

# The output here is an input for the RAG model.

Scraped text: AI Strategic Consulting Services | North Light AI Home Our Services AI Training & Development AI Lit...
len(text): 2972 characters


# Visualizer

In [9]:
import networkx as nx
from pyvis.network import Network
import json

def tree_to_edges(tree, parent=None):
    """Flatten a nested sitemap structure into ``(parent, child)`` edges.

    Args:
        tree: A list of leaf items, or a dict mapping a node to its subtree
            (itself a list or dict). Any other value produces no edges.
        parent: Node the top level of ``tree`` hangs from. For a dict, no edge
            is emitted for top-level keys when ``parent`` is falsy.

    Returns:
        list[tuple]: Edges in depth-first, insertion order. Items stored under
        the reserved ``"_final_urls"`` key are attached directly to ``parent``.
    """
    edges = []
    if isinstance(tree, list):
        edges.extend((parent, item) for item in tree)
    elif isinstance(tree, dict):
        for key, value in tree.items():
            if key == "_final_urls" and isinstance(value, list):
                # "_final_urls" is a bookkeeping key, not a real URL: link its
                # pages to the enclosing node instead of drawing the key.
                edges.extend(tree_to_edges(value, parent=parent))
                continue
            if parent:
                edges.append((parent, key))
            edges.extend(tree_to_edges(value, parent=key))
    return edges

def generate_graph(sitemap_url, output_file="sitemap_network.html", json_filename=None):
    """Render a sitemap as an interactive network page and dump its tree to JSON.

    The sitemap is expanded with ``parse_sitemap``, written to ``json_filename``
    as JSON, converted to edges with ``tree_to_edges`` and drawn with pyvis. A
    small script is then appended to the generated HTML that hides the loading
    bar and changes node labels when the graph is clicked.

    Args:
        sitemap_url: Root sitemap (``.xml``) URL; also used as the root node id.
        output_file: Path of the HTML file to write.
        json_filename: Path of the JSON dump; defaults to ``"sitemap_tree.json"``.

    Returns:
        str: ``output_file``.

    Side effects:
        Creates missing parent directories of both output paths, writes both
        files, and prints a confirmation line.
    """
    # Local import keeps the function self-contained without touching the
    # notebook's import cell.
    from pathlib import Path

    tree = parse_sitemap(sitemap_url)

    # Default name if not provided
    if json_filename is None:
        json_filename = "sitemap_tree.json"

    # Make sure e.g. "output/" exists so a fresh checkout can run this cell.
    for target in (json_filename, output_file):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(tree, f, indent=2)

    # parse_sitemap only wraps its result under the root URL in some cases; it
    # can also return a plain list or an unwrapped dict, for which
    # tree[sitemap_url] would raise. Fall back to the whole structure then.
    if isinstance(tree, dict) and sitemap_url in tree:
        subtree = tree[sitemap_url]
    else:
        subtree = tree
    edges = tree_to_edges(subtree, parent=sitemap_url)

    G = nx.DiGraph()
    G.add_edges_from(edges)
    net = Network(height="750px", width="100%", directed=True, notebook=False)

    for node in G.nodes():
        if node == sitemap_url:
            # Root node: labelled with the site's host name.
            net.add_node(node, label=str(urlparse(sitemap_url).netloc), title=node, shape='dot', size=30,
                         color={"background": "white", "border": "blue"}, borderWidth=4,
                         font={"color": "black", "size": 35, "bold": True})
        elif node.endswith('.xml'):
            net.add_node(node, label="Sitemap", title=node, shape='dot', size=25)
        else:
            # Page node: blank label, the URL is kept in the title.
            net.add_node(node, label=" ", title=node, shape='dot', size=15,
                         color={"background": "#ccffcc", "border": "#009933"})

    for source, target in G.edges():
        net.add_edge(source, target)

    net.force_atlas_2based()
    net.set_options("""
    { "physics": { "stabilization": false }, "interaction": { "dragNodes": true } }
    """)
    net.write_html(output_file)

    # Read the generated HTML
    with open(output_file, "r", encoding="utf-8") as f:
        html = f.read()

    # Emit the root id as a properly escaped JS string literal (quotes,
    # backslashes, non-ASCII); "</" is broken up so the value cannot close
    # the surrounding <script> element.
    root_id_js = json.dumps(sitemap_url).replace("</", "<\\/")

    # Inject JS
    inject_js = f"""
    <script type="text/javascript">
    window.addEventListener("load", function () {{
        // ✅ Hide the Pyvis loading bar if it exists
        const loader = document.getElementById("loadingBar");
        if (loader) loader.style.display = "none";

        const rootNodeId = {root_id_js};
        const titleNodeId = "graph_title";
        const originalLabels = {{}};

        network.on("click", function (params) {{
        if (params.nodes.length > 0) {{
            let clickedNodeId = params.nodes[0];

            nodes.get().forEach(function (node) {{
            if (!(node.id in originalLabels)) {{
                originalLabels[node.id] = node.label;
            }}

            if (node.id === rootNodeId || node.id === titleNodeId) {{
                return;
            }}

            if (node.id === clickedNodeId) {{
                nodes.update({{id: node.id, label: node.title}});
            }} else {{
                nodes.update({{id: node.id, label: ""}});
            }}
            }});
        }} else {{
            nodes.get().forEach(function (node) {{
            if (node.id !== rootNodeId && node.id !== titleNodeId && originalLabels[node.id] === "") {{
                nodes.update({{id: node.id, label: ""}});
            }}
            }});
        }}
        }});
    }});
    </script>
    """

    # Inject before </body>. A real newline is used here: the previous "\\n"
    # wrote a literal backslash-n into the page.
    html = html.replace("</body>", inject_js + "\n</body>")

    # Save it back
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"Output files successfully generated: {output_file} and {json_filename}")
    return output_file

# Root sitemap to visualise. This should receive .xml URLs.
sitemap_url = 'https://northlightai.com/sitemap.xml'
output_file = generate_graph(
    sitemap_url,
    output_file="output/sitemap_network.html",
    json_filename="output/sitemap_tree.json",
)



Output files successfully generated: output/sitemap_network.html and output/sitemap_tree.json


# RAG model (using LangChain)
## Step 1) Indexing
### 1.1) Load

In [19]:
import getpass
import os
import dotenv
# Pull variables from a local .env file into the process environment.
dotenv.load_dotenv()

# Use the key if one is already set; otherwise ask for it interactively so it
# is never written into the notebook. The key itself is never printed.
if os.environ.get("OPENAI_API_KEY"):
    print("Openai API key successfully imported from .env file.")
else:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

Openai API key successfully imported from .env file.


### Setup

In [20]:
# Setup cell: creates the chat model, two embedding objects and the vector
# store used by later cells. Relies on `os` imported in an earlier cell.

# Chat Model
from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o-mini", model_provider="openai")

# Embedding Model
from langchain_openai import OpenAIEmbeddings

# In this cell `embeddings` (explicit model name) is only passed to the
# in-memory store below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Vector Store-In memory
from langchain_core.vectorstores import InMemoryVectorStore

# NOTE(review): `vector_store` is rebound to the Supabase store further down,
# so after this cell runs the in-memory store is no longer reachable by name.
vector_store = InMemoryVectorStore(embeddings)

# Vector Store-Supabase
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_openai import OpenAIEmbeddings  # repeated import (already imported above); or any other embedding model
from supabase import create_client, Client

# Initialize Supabase client
# Both values are read with .get(), so they are None when the variables are
# unset; nothing here validates them before create_client is called.

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Embeddings
# Separate object from `embeddings` above: constructed without a model
# argument, so it uses whatever model the library defaults to.
# NOTE(review): confirm the two embedding models are meant to differ.
embedding = OpenAIEmbeddings()

# Create or connect to Supabase vector store
# This assignment replaces the InMemoryVectorStore bound to `vector_store` above.
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embedding,
    table_name="documents",  # Table name; you can change this
    query_name="match_documents"  # Needs to be created in Supabase SQL
)
