In [None]:
from bs4 import BeautifulSoup
import os
import requests

In [None]:
def save_html(url: str, filename: str, timeout: float = 30.0) -> None:
    """
    Saves the HTML content of a URL to a file under ../data directory.

    The response body is written as raw bytes. The HTTP status code is not
    inspected, so the body of an error page is saved like any other response.

    Args:
    url (str): The URL of the webpage to save.
    filename (str): The name of the file to save the HTML content to.
    timeout (float): Seconds to wait for the server before giving up.
        Without a timeout a stalled server would block the caller forever.

    Returns:
    None

    Raises:
    requests.exceptions.RequestException: If the request fails, including
        when the timeout expires.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs('../data', exist_ok=True)

    # Get the HTML content of the URL
    response = requests.get(url, timeout=timeout)

    # Save the HTML content to a file
    with open(f'../data/{filename}', 'wb') as f:
        f.write(response.content)

In [None]:
def get_links(filename: str) -> list:
    """
    Collects the href value of every <a> tag in an HTML file.

    Args:
    filename (str): Path of the HTML file to parse.

    Returns:
    list: The non-empty href values, in document order. Anchors without an
    href (or with an empty one) are skipped.
    """
    # latin-1 maps every byte to a character, so reading never fails on decoding
    with open(filename, 'r', encoding="latin-1") as html_file:
        soup = BeautifulSoup(html_file.read(), 'html.parser')

    # Keep only anchors that actually carry a non-empty href
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    return [href for href in hrefs if href]

In [None]:
class PageNode:
    """
    A node of the crawl graph, identified by its link string.

    Each node holds two sets: `_page_from` and `_page_to`. Only `_page_to`
    is ever modified by this class (through `add_pages_to`).
    """

    def __init__(self, name: str, page_from: "set | None" = None, page_to: "set | None" = None) -> None:
        """
        Args:
        name (str): Identifier of the page (the link it was found under).
        page_from (set | None): Initial set stored as `_page_from`; a new
            empty set is created when omitted.
        page_to (set | None): Initial set stored as `_page_to`; a new empty
            set is created when omitted.
        """
        self._name = name
        # Build a fresh set per instance: a `set()` default in the signature is
        # evaluated once and would be shared (and mutated) by every node.
        self._page_from = page_from if page_from is not None else set()
        self._page_to = page_to if page_to is not None else set()

    @property
    def name(self) -> str:
        """The identifier this node was created with."""
        return self._name

    def __str__(self) -> str:
        return self.__repr__()

    def __repr__(self) -> str:
        return self._name

    def add_pages_to(self, pages: set) -> None:
        """
        Adds every element of `pages` to this node's `_page_to` set.

        Args:
        pages (set): Links to merge into `_page_to`.
        """
        self._page_to.update(pages)

In [None]:
# --- Crawl configuration ---
# Host prepended to site-relative links before they are fetched
prefix = "http://www.cibc.com"
# Page the crawl starts from
initial_url = "/en/personal-banking.html"
# Only links containing this substring are followed
filter_str = "/en/personal-banking"

# --- Crawl state ---
# One PageNode per discovered link, keyed by the link string
graph = {initial_url: PageNode(initial_url)}
# Links that have been fetched, and links still waiting to be fetched
visited = set()
to_visit = {initial_url}

In [None]:
# NOTE(review): repeats the `prefix` assignment from the configuration cell above
# with the same value; this cell looks redundant.
prefix = "http://www.cibc.com"

In [None]:

# Process a single page from the queue (one manual iteration of the crawl).
# to_visit.pop() raises KeyError when the queue is empty.
current = to_visit.pop()
local_name = current.replace('/', '_')

# Only site-relative links need the host prefix; prepending it to an absolute
# link would produce a malformed URL. Same rule as the crawl loop further down.
url = current if current.startswith("http") else f'{prefix}{current}'
save_html(url, local_name)

visited.add(current)

# Keep links that are in scope, point at an .html page (same filter as the
# crawl loop further down), and have not been seen or queued yet.
page_links = {
    link
    for link in get_links(f'../data/{local_name}')
    if filter_str in link
    and link.endswith('.html')
    and link not in visited
    and link not in to_visit
}
graph[current].add_pages_to(page_links)


In [None]:
# Create a node for, and queue, every outgoing link that is neither fetched nor queued yet
for link in graph[current]._page_to:
    if link in visited or link in to_visit:
        continue
    graph[link] = PageNode(link)
    to_visit.add(link)

In [None]:
# Display how many links are still queued and how many have been fetched so far
len(to_visit), len(visited)

In [None]:
# Display the link most recently popped from the queue
current

In [None]:
# Main crawl loop: keep fetching queued pages until the queue is empty
while to_visit:
    current = to_visit.pop()
    print("Visiting:", current)

    # Absolute links are fetched as-is; site-relative ones get the host prefix
    url = current if current.startswith("http") else prefix + current
    local_name = current.replace('/', '_')
    save_html(url, local_name)

    visited.add(current)

    # In-scope .html links that have been neither fetched nor queued yet
    page_links = {
        link
        for link in get_links(f'../data/{local_name}')
        if filter_str in link
        and link not in visited
        and link not in to_visit
        and link.endswith('.html')
    }
    graph[current].add_pages_to(page_links)

    # Create a node for each newly discovered link and queue it
    for i in graph[current]._page_to:
        if i in visited or i in to_visit:
            continue
        graph[i] = PageNode(i)
        to_visit.add(i)

    print("Number of pages to visit:", len(to_visit), "\nNumber of Pages Visited:", len(visited))
    