# Document Splitting

In [19]:
import os
import openai
import sys
sys.path.append('../..')

from py3810.myUtils import pickle_dump, pickle_load
path_lumen_docs = '..\langchain\docs\lumen\\docs\\'

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv('.env\my_api_key.env')) # read local .env file

SECRET_KEY1 = os.environ.get("SECRET_KEY")
DATABASE_PASSWORD2 = os.environ.get("DATABASE_PASSWORD")
print(f"SECRET_KEY = {SECRET_KEY1}")
print(f"DATABASE_PASSWORD = {DATABASE_PASSWORD2}")

openai.api_key  = os.environ['openai_api']

SECRET_KEY = my_secret_key
DATABASE_PASSWORD = my_db_pwd


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [None]:
chunk_size =26
chunk_overlap = 4

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

Why doesn't this split the string below?

In [None]:
text1 = 'abcdefghijklmnopqrstuvwxyz'

In [None]:
r_splitter.split_text(text1)

In [None]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'

In [None]:
r_splitter.split_text(text2)

Ok, this splits the string but we have an overlap specified as 5, but it looks like 3? (try an even number)

In [None]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [None]:
r_splitter.split_text(text3)

In [None]:
c_splitter.split_text(text3)

In [None]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

Try your own examples!

## Recursive splitting details

`RecursiveCharacterTextSplitter` is recommended for generic text. 

In [None]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [None]:
len(some_text)

In [None]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""]
)

In [None]:
c_splitter.split_text(some_text)

In [None]:
r_splitter.split_text(some_text)

Let's reduce the chunk size a bit and add a period to our separators:

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
r_splitter.split_text(some_text)

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

In [None]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()

In [None]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [None]:
docs = text_splitter.split_documents(pages)

In [None]:
len(docs)

In [None]:
len(pages)

In [None]:
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("docs/Notion_DB")
notion_db = loader.load()

In [None]:
docs = text_splitter.split_documents(notion_db)

In [None]:
len(notion_db)

In [None]:
len(docs)

## Token splitting

We can also split on token count explicity, if we want.

This can be useful because LLMs often have context windows designated in tokens.

Tokens are often ~4 characters.

In [None]:
from langchain.text_splitter import TokenTextSplitter

In [None]:
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [None]:
text1 = "foo bar bazzyfoo"

In [None]:
text_splitter.split_text(text1)

In [None]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [None]:
docs = text_splitter.split_documents(pages)

In [None]:
docs[0]

In [None]:
pages[0].metadata

## Context aware splitting

Chunking aims to keep text with common context together.

A text splitting often uses sentences or other delimiters to keep related text together but many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting.

We can use `MarkdownHeaderTextSplitter` to preserve header metadata in our chunks, as show below.

In [None]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [None]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [None]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [None]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [None]:
md_header_splits[0]

In [None]:
md_header_splits[1]

Try on a real Markdown file, like a Notion database.

In [None]:
loader = NotionDirectoryLoader("docs/Notion_DB")
docs = loader.load()
txt = ' '.join([d.page_content for d in docs])

In [None]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

In [None]:
md_header_splits = markdown_splitter.split_text(txt)

In [None]:
md_header_splits[0]

In [1]:
from urllib.parse import urlparse

def url_validator(x):
    try:
        result = urlparse(x)
        # print(f'result: {result}, {type(result)}')
        return all([result.scheme, result.netloc])
    except AttributeError:
        return False

In [2]:
def remove_duplicates(my_list):
  """
  Removes duplicate items from a list while preserving order.

  Args:
      my_list: A list with potential duplicates.

  Returns:
      A new list with duplicates removed, preserving the original order.
  """
  seen = set()
  unique_list = []
  for item in my_list:
    if item not in seen:
      seen.add(item)
      unique_list.append(item)
  return unique_list

# Example usage
my_list = [1, 2, 2, 3, 4, 1]
unique_list = remove_duplicates(my_list)
print(unique_list)  # Output: [1, 2, 3, 4] (Order may vary)

[1, 2, 3, 4]


In [3]:
from pyrsistent import v
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def url_validator(x):
    try:
        result = urlparse(x)
        # print(f'result: {result}, {type(result)}')
        return all([result.scheme, result.netloc])
    except AttributeError:
        return False

def list_website_links(url, verbose=False):
  l_hrefs = []
  l_valid_hrefs = []  
  """Fetches the HTML content of a website and extracts all links."""
  response = requests.get(url)
  soup = BeautifulSoup(response.content, "html.parser")
  links = soup.find_all('a')
  
  for link in links:
    # Get the href attribute which contains the link URL
    href = link.get('href')
    # Check if the link exists and avoid empty links
    if href:
      # print(f'href: {href}')
      l_hrefs.append(href)      
      if url_validator(href):
        l_valid_hrefs.append(href)
        # print(f'valid_urls: {href}')

  if verbose:
    print(f'valid hrefs, len={len(l_valid_hrefs)}:')
    for valid_href in l_valid_hrefs:
      print(valid_href)
    print(f'\nall hrefs, len={len(l_hrefs)}:')
    for href in l_hrefs:
      print(href)       

  return l_valid_hrefs, l_hrefs       

# # Example usage
# website_url = "https://www.lumenoptometric.com"
# l_valid_hrefs, l_hrefs  = list_website_links(website_url, verbose=True)


In [50]:
import requests
from bs4 import BeautifulSoup

def find_youtube_links(url):
  """
  This function takes a URL as input and returns a list of YouTube video links found on the webpage.

  Args:
      url: The URL of the webpage to scrape.

  Returns:
      A list of YouTube video links found on the webpage.
  """
  try:
    # Get the webpage content
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for bad status codes

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all iframe tags (common for embedding videos)
    iframes = soup.find_all('iframe')

    # Extract YouTube video links from iframes
    youtube_links = []
    for iframe in iframes:
      # Check if iframe source points to YouTube
      if 'youtube.com/embed/' in iframe.get('src', ''):
        youtube_links.append(iframe['src'])

    return youtube_links

  except requests.exceptions.RequestException as e:
    print(f"Error: {e}")
    return []

# Example usage
target_url = "https://www.lumenoptometric.com/"  # Replace with the target webpage URL
youtube_links = find_youtube_links(target_url)

if youtube_links:
  print("Found YouTube links:")
  for link in youtube_links:
    print(link)
else:
  print("No YouTube links found on the webpage.")


No YouTube links found on the webpage.


In [59]:
valid_urls = []
yt_links = []
website_url = "https://www.lumenoptometric.com"
_urls, _  = list_website_links(website_url, verbose=False)
valid_urls = valid_urls + _urls
valid_urls = remove_duplicates(valid_urls)
print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')

_yt_links = find_youtube_links(_url)
yt_links = yt_links + _yt_links
yt_links = remove_duplicates(yt_links)
print(f'yt_links, len:{len(yt_links)}:\n{yt_links}')

valid_urls, len:34:
['https://www.lumenoptometric.com', 'https://scheduleyourexam.com/v3/index.php/4784/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/', 'https://www.lumenoptometric.com/scleral-lenses/', 'https://www.lumenoptometric.com/poseyedon-lens/', 'https://www.lumenoptometric.com/neurolens-therapy/', 'https://www.lumenoptometric.com/orthokeratology/', 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/', 'https://www.lumenoptometric.com/

In [33]:
# website_url = "https://www.lumenoptometric.com/blog/"
# _urls, _  = list_website_links(website_url, verbose=False)

In [62]:
for _url in _urls:
  _urls = find_substring(_urls, 'https://www.lumenoptometric.com')  
  _tmp_urls, _ = list_website_links(_url, verbose=False)
  valid_urls = valid_urls + _tmp_urls
  valid_urls = find_substring(valid_urls, 'https://www.lumenoptometric.com')    
  valid_urls = remove_duplicates(valid_urls)

  _yt_links = find_youtube_links(_url)
  yt_links = yt_links + _yt_links
  yt_links = remove_duplicates(yt_links)    

print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')
print(f'yt_links, len:{len(yt_links)}:\n{yt_links}')
_urls = _tmp_urls

# print(f'_tmp_urls, len:{len(_tmp_urls)}:\n{_tmp_urls}')

# _valid_urls  = [item for sublist in _valid_urls for item in sublist]  # flatten
# _valid_urls

valid_urls, len:105:
['https://www.lumenoptometric.com', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/', 'https://www.lumenoptometric.com/scleral-lenses/', 'https://www.lumenoptometric.com/poseyedon-lens/', 'https://www.lumenoptometric.com/neurolens-therapy/', 'https://www.lumenoptometric.com/orthokeratology/', 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/', 'https://www.lumenoptometric.com/orthokeratology/is-ortho-k-safe/', 'https://www.lu

In [66]:
sorted_yt_links = sorted(yt_links)
sorted_yt_links 

['https://www.youtube.com/embed/-UMk6KKcT_c?feature=oembed&wmode=opaque',
 'https://www.youtube.com/embed/6xgYtUvPpX0',
 'https://www.youtube.com/embed/CoukXsJKjv8',
 'https://www.youtube.com/embed/DhtuaIazLIw',
 'https://www.youtube.com/embed/H4GfHJCOcAk?rel=0',
 'https://www.youtube.com/embed/HfdIB-5bP-0',
 'https://www.youtube.com/embed/IRHtQQVethk',
 'https://www.youtube.com/embed/JHNzfnt9xZM?feature=oembed&wmode=opaque',
 'https://www.youtube.com/embed/KOpBVoqeeTE?feature=oembed&wmode=opaque',
 'https://www.youtube.com/embed/Lq7LmBjatMA',
 'https://www.youtube.com/embed/Lq7LmBjatMA?rel=0',
 'https://www.youtube.com/embed/Nb_PLkwmTl8',
 'https://www.youtube.com/embed/Nb_PLkwmTl8?rel=0',
 'https://www.youtube.com/embed/RIoA_bIo8hE?feature=oembed&wmode=opaque',
 'https://www.youtube.com/embed/UcZrM2L1h8w',
 'https://www.youtube.com/embed/UmMYehZNlc0',
 'https://www.youtube.com/embed/UmMYehZNlc0?rel=0',
 'https://www.youtube.com/embed/abUT6U9cXrI',
 'https://www.youtube.com/embed/dk96

In [42]:
for _url in _urls:
  _tmp_urls, _ = list_website_links(_url, verbose=False)
  valid_urls = valid_urls + _tmp_urls
  valid_urls = remove_duplicates(valid_urls)

print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')
_urls = _tmp_urls

# print(f'_tmp_urls, len:{len(_tmp_urls)}:\n{_tmp_urls}')

# _valid_urls  = [item for sublist in _valid_urls for item in sublist]  # flatten
# _valid_urls

valid_urls, len:1432:
['https://www.lumenoptometric.com', 'https://scheduleyourexam.com/v3/index.php/4784/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/', 'https://www.lumenoptometric.com/scleral-lenses/', 'https://www.lumenoptometric.com/poseyedon-lens/', 'https://www.lumenoptometric.com/neurolens-therapy/', 'https://www.lumenoptometric.com/orthokeratology/', 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/', 'https://www.lumenoptometric.co

In [43]:
sorted_valid_urls = sorted(valid_urls)
print(f'len(sorted_valid_urls): {len(sorted_valid_urls)}')
sorted_valid_urls

len(sorted_valid_urls): 1432


['http://scheduleyourexam.com',
 'http://stackoverflow.com/questions/ask?tags=youtube-api',
 'http://twitter.com/share?url=https://about.google/intl/en_us/philosophy/%3Futm_source%3Dtwitter.com%26utm_medium%3Dsocial%26utm_campaign%3Dstory-share&text=%40Google',
 'http://www.facebook.com/sharer.php?u=https://about.google/intl/en_us/philosophy/%3Futm_source%3Dfacebook.com%26utm_medium%3Dsocial%26utm_campaign%3Dstory-share',
 'http://www.google.com/ads/coupons/terms.html',
 'http://www.google.com/appsstatus',
 'http://www.google.com/history/optout?hl=en',
 'http://www.linkedin.com/shareArticle?mini=true&url=https://about.google/intl/en_us/philosophy/%3Futm_source%3Dlinkedin.com%26utm_medium%3Dsocial%26utm_campaign%3Dstory-share',
 'http://www.youtube.com/movies',
 'http://www.youtube.com/yt/copyright/',
 'http://yelp-ir.com/',
 'http://youtube.com/t/terms',
 'http://youtube.com/trends',
 'https://abc.xyz/investor/',
 'https://abc.xyz/investor/?utm_source=about&utm_medium=referral&utm_camp

In [None]:
[item.replace(old_text, new_text) for item in text_list]

In [13]:
def find_substring(text_list, substring):
  """
  Searches a list of text strings for a given substring and returns a new list with matches.

  Args:
      text_list: A list of strings.
      substring: The substring to search for.

  Returns:
      A new list containing all strings from text_list that have the specified substring.
  """
  return [text for text in text_list if substring in text]  # Case sensitive search

# Example usage
text_list = ["This has abc", "another string", "no xyz here", "AbCdEf", "fdsabcieojdsl", "fdsab cieojdsl"]
matches_abc = find_substring(text_list, substring="abc")
print(matches_abc)  # Output: ["This has abc", "AbCdEf"]

matches_xyz = find_substring(text_list, substring="xyz")
print(matches_xyz)  # Output: ["no xyz here"]



['This has abc', 'fdsabcieojdsl']
['no xyz here']


In [44]:
lumen_urls = find_substring(sorted_valid_urls, 'https://www.lumenoptometric.com')
print(f'lumen_urls, len={len(lumen_urls)}')
lumen_urls

lumen_urls, len=266


['https://www.lumenoptometric.com',
 'https://www.lumenoptometric.com/',
 'https://www.lumenoptometric.com/about-us/',
 'https://www.lumenoptometric.com/about-us/appointments-forms/',
 'https://www.lumenoptometric.com/about-us/gallery/',
 'https://www.lumenoptometric.com/about-us/office/',
 'https://www.lumenoptometric.com/about-us/testimonials/',
 'https://www.lumenoptometric.com/about-us/testimonials/page/2/',
 'https://www.lumenoptometric.com/blog/',
 'https://www.lumenoptometric.com/blog/2020/',
 'https://www.lumenoptometric.com/blog/2020/03/',
 'https://www.lumenoptometric.com/blog/2020/04/',
 'https://www.lumenoptometric.com/blog/2020/05/',
 'https://www.lumenoptometric.com/blog/2020/06/',
 'https://www.lumenoptometric.com/blog/2020/07/',
 'https://www.lumenoptometric.com/blog/2020/08/',
 'https://www.lumenoptometric.com/blog/2020/09/',
 'https://www.lumenoptometric.com/blog/2020/10/',
 'https://www.lumenoptometric.com/blog/2020/11/',
 'https://www.lumenoptometric.com/blog/2020/1

In [45]:
scraped_lumen_urls = pickle_load(filename_pickle='scraped_lumen_urls', path_pickle_dump=path_lumen_docs)
print(f'len(scraped_lumen_urls): {len(scraped_lumen_urls)}')
scraped_lumen_urls = find_substring(scraped_lumen_urls, 'https://www.lumenoptometric.com')
print(f'len(scraped_lumen_urls): {len(scraped_lumen_urls)}')

len(scraped_lumen_urls): 431
len(scraped_lumen_urls): 431


In [46]:
url_dif = list(set(scraped_lumen_urls) - set(lumen_urls))
sorted_url_dif = sorted(url_dif)
print(f'len(sorted_url_dif): {len(sorted_url_dif)}')
sorted_url_dif

len(sorted_url_dif): 165


['https://www.lumenoptometric.com/blog/2020/page/2/',
 'https://www.lumenoptometric.com/blog/2020/page/3/',
 'https://www.lumenoptometric.com/blog/2020/page/5/',
 'https://www.lumenoptometric.com/blog/2021/page/2/',
 'https://www.lumenoptometric.com/blog/2021/page/3/',
 'https://www.lumenoptometric.com/blog/2021/page/6/',
 'https://www.lumenoptometric.com/blog/2022/page/2/',
 'https://www.lumenoptometric.com/blog/2022/page/3/',
 'https://www.lumenoptometric.com/blog/2022/page/6/',
 'https://www.lumenoptometric.com/blog/2023/page/2/',
 'https://www.lumenoptometric.com/blog/2023/page/3/',
 'https://www.lumenoptometric.com/blog/2023/page/6/',
 'https://www.lumenoptometric.com/blog/2024/page/2/',
 'https://www.lumenoptometric.com/blog/author/alyssab/',
 'https://www.lumenoptometric.com/blog/author/lumen-optometrics/page/2/',
 'https://www.lumenoptometric.com/blog/author/lumen-optometrics/page/23/',
 'https://www.lumenoptometric.com/blog/author/lumen-optometrics/page/3/',
 'https://www.lume

In [None]:
_urls = lumen_urls
for _url in _urls:
  _tmp_urls, _ = list_website_links(_url, verbose=False)
  lumen_urls = lumen_urls + _tmp_urls
  lumen_urls = remove_duplicates(lumen_urls)

print(f'lumen_urls, len={len(lumen_urls)}')
lumen_urls
# print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')
# _urls = _tmp_urls

In [None]:
_lumen_urls = find_substring(lumen_urls, 'https://www.lumenoptometric.com')
_lumen_urls = remove_duplicates(_lumen_urls)
print(f'_lumen_urls, len={len(_lumen_urls)}')
_lumen_urls

In [None]:
pickle_dump(file_to_pickle=_lumen_urls, filename_pickle='scraped_lumen_urls', path_pickle_dump=path_lumen_docs)

In [None]:
_urls = lumen_urls
for _url in _urls:
  _tmp_urls, _ = list_website_links(_url, verbose=False)
  lumen_urls = lumen_urls + _tmp_urls
  lumen_urls = remove_duplicates(lumen_urls)

print(f'lumen_urls, len={len(lumen_urls)}')
lumen_urls
# print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')
# _urls = _tmp_urls

In [None]:
print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')

In [None]:
_valid_urls  = [item for sublist in _valid_urls for item in sublist]  # flatten
_valid_urls

In [None]:
flatten_valid_hrefs = [item for sublist in valid_hrefs for item in sublist]
flatten_valid_hrefs 

In [None]:
len(flatten_valid_hrefs)

In [None]:
flatten_valid_hrefs = flatten_valid_hrefs.append(l_valid_hrefs_0)

In [None]:
len(flatten_valid_hrefs)

flatten_valide_hrefs = flatten_valide_hrefs.append(l_valid_hrefs_0)

In [None]:
unique_valid_hrefs = list(set(flatten_valid_hrefs))
len(unique_valid_hrefs)
unique_valid_hrefs

In [None]:
unique_valid_hrefs.sort()
unique_valid_hrefs

In [None]:
for _href in l_valid_hrefs:
  l_valid_hrefs1, l_hrefs1  = list_website_links(_href, verbose=True)

In [None]:
# Example usage
website_url = "https://www.lumenoptometric.com"
l_valid_hrefs, l_hrefs  = list_website_links(website_url)

In [None]:
l_hrefs

In [None]:
l_valid_hrefs

In [None]:
import requests
from bs4 import BeautifulSoup

def list_website_links(url):
  l_links = []
  """Fetches the HTML content of a website and extracts all links."""
  response = requests.get(url)
  soup = BeautifulSoup(response.content, "html.parser")
  links = soup.find_all('a')
  
  for link in links:
    # Get the href attribute which contains the link URL
    href = link.get('href')
    # Check if the link exists and avoid empty links
    if href:
      print(href)
      l_links.append(href)
      
  return l_links      

# # Example usage
# website_url = "https://www.lumenoptometric.com/"
# result_links = list_website_links(website_url)


In [None]:
# Example usage
website_url = "https://www.lumenoptometric.com"
l_links = list_website_links(website_url)

In [None]:
from urllib.parse import urlparse

def url_validator(x):
    try:
        result = urlparse(x)
        # print(f'result: {result}, {type(result)}')
        return all([result.scheme, result.netloc])
    except AttributeError:
        return False

In [None]:
valid_urls = []
for i, url in enumerate(l_links):
  # print(f'{i}, url')
  if url_validator(url):
    valid_urls.append(url)
  # else:
  #   print(f'not a valid url: {url}')  

In [None]:
valid_urls

In [None]:
# Example usage
website_url = "https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/"
list_website_links(website_url)

In [None]:
website_url = "https://www.lumenoptometric.com/blog/"
list_website_links(website_url)