# Document Splitting

In [37]:
import os
import openai
import sys
sys.path.append('../..')

from py3810.myUtils import pickle_dump, pickle_load
path_lumen_docs = '..\langchain\docs\lumen\\docs\\'

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv('.env\my_api_key.env')) # read local .env file

SECRET_KEY1 = os.environ.get("SECRET_KEY")
DATABASE_PASSWORD2 = os.environ.get("DATABASE_PASSWORD")
print(f"SECRET_KEY = {SECRET_KEY1}")
print(f"DATABASE_PASSWORD = {DATABASE_PASSWORD2}")

openai.api_key  = os.environ['openai_api']

SECRET_KEY = my_secret_key
DATABASE_PASSWORD = my_db_pwd


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [None]:
chunk_size =26
chunk_overlap = 4

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

Why doesn't this split the string below?

In [None]:
text1 = 'abcdefghijklmnopqrstuvwxyz'

In [None]:
r_splitter.split_text(text1)

In [None]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'

In [None]:
r_splitter.split_text(text2)

Ok, this splits the string but we have an overlap specified as 5, but it looks like 3? (try an even number)

In [None]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [None]:
r_splitter.split_text(text3)

In [None]:
c_splitter.split_text(text3)

In [None]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

Try your own examples!

## Recursive splitting details

`RecursiveCharacterTextSplitter` is recommended for generic text. 

In [None]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [None]:
len(some_text)

In [None]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""]
)

In [None]:
c_splitter.split_text(some_text)

In [None]:
r_splitter.split_text(some_text)

Let's reduce the chunk size a bit and add a period to our separators:

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
r_splitter.split_text(some_text)

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

In [None]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()

In [None]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [None]:
docs = text_splitter.split_documents(pages)

In [None]:
len(docs)

In [None]:
len(pages)

In [None]:
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("docs/Notion_DB")
notion_db = loader.load()

In [None]:
docs = text_splitter.split_documents(notion_db)

In [None]:
len(notion_db)

In [None]:
len(docs)

## Token splitting

We can also split on token count explicity, if we want.

This can be useful because LLMs often have context windows designated in tokens.

Tokens are often ~4 characters.

In [None]:
from langchain.text_splitter import TokenTextSplitter

In [None]:
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [None]:
text1 = "foo bar bazzyfoo"

In [None]:
text_splitter.split_text(text1)

In [None]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [None]:
docs = text_splitter.split_documents(pages)

In [None]:
docs[0]

In [None]:
pages[0].metadata

## Context aware splitting

Chunking aims to keep text with common context together.

A text splitting often uses sentences or other delimiters to keep related text together but many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting.

We can use `MarkdownHeaderTextSplitter` to preserve header metadata in our chunks, as show below.

In [None]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [None]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [None]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [None]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [None]:
md_header_splits[0]

In [None]:
md_header_splits[1]

Try on a real Markdown file, like a Notion database.

In [None]:
loader = NotionDirectoryLoader("docs/Notion_DB")
docs = loader.load()
txt = ' '.join([d.page_content for d in docs])

In [None]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

In [None]:
md_header_splits = markdown_splitter.split_text(txt)

In [None]:
md_header_splits[0]

In [5]:
from urllib.parse import urlparse

def url_validator(x):
    try:
        result = urlparse(x)
        # print(f'result: {result}, {type(result)}')
        return all([result.scheme, result.netloc])
    except AttributeError:
        return False

In [4]:
def remove_duplicates(my_list):
  """
  Removes duplicate items from a list while preserving order.

  Args:
      my_list: A list with potential duplicates.

  Returns:
      A new list with duplicates removed, preserving the original order.
  """
  seen = set()
  unique_list = []
  for item in my_list:
    if item not in seen:
      seen.add(item)
      unique_list.append(item)
  return unique_list

# Example usage
my_list = [1, 2, 2, 3, 4, 1]
unique_list = remove_duplicates(my_list)
print(unique_list)  # Output: [1, 2, 3, 4] (Order may vary)

[1, 2, 3, 4]


In [2]:
from pyrsistent import v
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def url_validator(x):
    try:
        result = urlparse(x)
        # print(f'result: {result}, {type(result)}')
        return all([result.scheme, result.netloc])
    except AttributeError:
        return False

def list_website_links(url, verbose=False):
  l_hrefs = []
  l_valid_hrefs = []  
  """Fetches the HTML content of a website and extracts all links."""
  response = requests.get(url)
  soup = BeautifulSoup(response.content, "html.parser")
  links = soup.find_all('a')
  
  for link in links:
    # Get the href attribute which contains the link URL
    href = link.get('href')
    # Check if the link exists and avoid empty links
    if href:
      # print(f'href: {href}')
      l_hrefs.append(href)      
      if url_validator(href):
        l_valid_hrefs.append(href)
        # print(f'valid_urls: {href}')

  if verbose:
    print(f'valid hrefs, len={len(l_valid_hrefs)}:')
    for valid_href in l_valid_hrefs:
      print(valid_href)
    print(f'\nall hrefs, len={len(l_hrefs)}:')
    for href in l_hrefs:
      print(href)       

  return l_valid_hrefs, l_hrefs       

# # Example usage
# website_url = "https://www.lumenoptometric.com"
# l_valid_hrefs, l_hrefs  = list_website_links(website_url, verbose=True)


In [6]:
valid_urls = []
website_url = "https://www.lumenoptometric.com"
_urls, _  = list_website_links(website_url, verbose=False)
valid_urls = valid_urls + _urls
valid_urls = remove_duplicates(valid_urls)
print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')

valid_urls, len:34:
['https://www.lumenoptometric.com', 'https://scheduleyourexam.com/v3/index.php/4784/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/', 'https://www.lumenoptometric.com/scleral-lenses/', 'https://www.lumenoptometric.com/poseyedon-lens/', 'https://www.lumenoptometric.com/neurolens-therapy/', 'https://www.lumenoptometric.com/orthokeratology/', 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/', 'https://www.lumenoptometric.com/

In [9]:
for _url in _urls:
  _tmp_urls, _ = list_website_links(_url, verbose=False)
  valid_urls = valid_urls + _tmp_urls
  valid_urls = remove_duplicates(valid_urls)

print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')
_urls = _tmp_urls

# print(f'_tmp_urls, len:{len(_tmp_urls)}:\n{_tmp_urls}')

# _valid_urls  = [item for sublist in _valid_urls for item in sublist]  # flatten
# _valid_urls

valid_urls, len:536:
['https://www.lumenoptometric.com', 'https://scheduleyourexam.com/v3/index.php/4784/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/', 'https://www.lumenoptometric.com/scleral-lenses/', 'https://www.lumenoptometric.com/poseyedon-lens/', 'https://www.lumenoptometric.com/neurolens-therapy/', 'https://www.lumenoptometric.com/orthokeratology/', 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/', 'https://www.lumenoptometric.com

In [10]:
valid_urls

['https://www.lumenoptometric.com',
 'https://scheduleyourexam.com/v3/index.php/4784/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/',
 'https://www.lumenoptometric.com/scleral-lenses/',
 'https://www.lumenoptometric.com/poseyedon-lens/',
 'https://www.lumenoptometric.com/neurolens-therapy/',
 'https://www.lumenoptometric.com/orthokeratology/',
 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/',
 'https://www.lumenoptometric.com/orthoke

In [11]:
sorted_valid_urls = sorted(valid_urls)
sorted_valid_urls

['http://scheduleyourexam.com',
 'http://stackoverflow.com/questions/ask?tags=youtube-api',
 'http://www.google.com/ads/coupons/terms.html',
 'http://www.youtube.com/movies',
 'http://yelp-ir.com/',
 'http://youtube.com/t/terms',
 'http://youtube.com/trends',
 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=http://support.google.com/youtube/answer/12770587%3Fsjid%3D5082629036510608231-NA%26visit_id%3D638492711911084891-4026828709%26rd%3D1&ec=GAZAdQ',
 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=http://support.google.com/youtube/answer/7582560%3Fhl%3Den&ec=GAZAdQ',
 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=http://support.google.com/youtube/answer/78358%3Fvisit_id%3D638288444132439115-554885077%26p%3DNFL_Sunday_Ticket_YT_Devices%26rd%3D1&ec=GAZAdQ',
 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=http://support.google.com/youtubetv/answer/10057534&ec=GAZAdQ',
 'https://accounts.google.com/

In [12]:
def find_substring(text_list, substring):
  """
  Searches a list of text strings for a given substring and returns a new list with matches.

  Args:
      text_list: A list of strings.
      substring: The substring to search for.

  Returns:
      A new list containing all strings from text_list that have the specified substring.
  """
  return [text for text in text_list if substring in text]  # Case sensitive search

# Example usage
text_list = ["This has abc", "another string", "no xyz here", "AbCdEf", "fdsabcieojdsl", "fdsab cieojdsl"]
matches_abc = find_substring(text_list, substring="abc")
print(matches_abc)  # Output: ["This has abc", "AbCdEf"]

matches_xyz = find_substring(text_list, substring="xyz")
print(matches_xyz)  # Output: ["no xyz here"]



['This has abc', 'fdsabcieojdsl']
['no xyz here']


In [13]:
lumen_urls = find_substring(sorted_valid_urls, 'https://www.lumenoptometric.com')
print(f'lumen_urls, len={len(lumen_urls)}')
lumen_urls

lumen_urls, len=105


['https://www.lumenoptometric.com',
 'https://www.lumenoptometric.com/',
 'https://www.lumenoptometric.com/about-us/',
 'https://www.lumenoptometric.com/about-us/appointments-forms/',
 'https://www.lumenoptometric.com/about-us/gallery/',
 'https://www.lumenoptometric.com/about-us/office/',
 'https://www.lumenoptometric.com/about-us/testimonials/',
 'https://www.lumenoptometric.com/about-us/testimonials/page/2/',
 'https://www.lumenoptometric.com/blog/',
 'https://www.lumenoptometric.com/blog/2020/03/',
 'https://www.lumenoptometric.com/blog/2020/04/',
 'https://www.lumenoptometric.com/blog/2020/05/',
 'https://www.lumenoptometric.com/blog/2020/06/',
 'https://www.lumenoptometric.com/blog/2020/07/',
 'https://www.lumenoptometric.com/blog/2020/08/',
 'https://www.lumenoptometric.com/blog/2020/09/',
 'https://www.lumenoptometric.com/blog/2020/10/',
 'https://www.lumenoptometric.com/blog/2020/11/',
 'https://www.lumenoptometric.com/blog/2020/12/',
 'https://www.lumenoptometric.com/blog/202

In [94]:
_urls = lumen_urls
for _url in _urls:
  _tmp_urls, _ = list_website_links(_url, verbose=False)
  lumen_urls = lumen_urls + _tmp_urls
  lumen_urls = remove_duplicates(lumen_urls)

print(f'lumen_urls, len={len(lumen_urls)}')
lumen_urls
# print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')
# _urls = _tmp_urls

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


lumen_urls, len=377


['https://www.instagram.com/lumenoptometric/',
 'https://www.lumenoptometric.com',
 'https://www.lumenoptometric.com/',
 'https://www.lumenoptometric.com/about-us/',
 'https://www.lumenoptometric.com/about-us/appointments-forms/',
 'https://www.lumenoptometric.com/about-us/gallery/',
 'https://www.lumenoptometric.com/about-us/office/',
 'https://www.lumenoptometric.com/about-us/testimonials/',
 'https://www.lumenoptometric.com/about-us/testimonials/page/2/',
 'https://www.lumenoptometric.com/blog/',
 'https://www.lumenoptometric.com/blog/2020/03/',
 'https://www.lumenoptometric.com/blog/2020/04/',
 'https://www.lumenoptometric.com/blog/2020/05/',
 'https://www.lumenoptometric.com/blog/2020/06/',
 'https://www.lumenoptometric.com/blog/2020/07/',
 'https://www.lumenoptometric.com/blog/2020/08/',
 'https://www.lumenoptometric.com/blog/2020/09/',
 'https://www.lumenoptometric.com/blog/2020/10/',
 'https://www.lumenoptometric.com/blog/2020/11/',
 'https://www.lumenoptometric.com/blog/2020/1

In [100]:
_lumen_urls = find_substring(lumen_urls, 'https://www.lumenoptometric.com')
_lumen_urls = remove_duplicates(_lumen_urls)
print(f'_lumen_urls, len={len(_lumen_urls)}')
_lumen_urls

_lumen_urls, len=431


['https://www.lumenoptometric.com',
 'https://www.lumenoptometric.com/',
 'https://www.lumenoptometric.com/about-us/',
 'https://www.lumenoptometric.com/about-us/appointments-forms/',
 'https://www.lumenoptometric.com/about-us/gallery/',
 'https://www.lumenoptometric.com/about-us/office/',
 'https://www.lumenoptometric.com/about-us/testimonials/',
 'https://www.lumenoptometric.com/about-us/testimonials/page/2/',
 'https://www.lumenoptometric.com/blog/',
 'https://www.lumenoptometric.com/blog/2020/03/',
 'https://www.lumenoptometric.com/blog/2020/04/',
 'https://www.lumenoptometric.com/blog/2020/05/',
 'https://www.lumenoptometric.com/blog/2020/06/',
 'https://www.lumenoptometric.com/blog/2020/07/',
 'https://www.lumenoptometric.com/blog/2020/08/',
 'https://www.lumenoptometric.com/blog/2020/09/',
 'https://www.lumenoptometric.com/blog/2020/10/',
 'https://www.lumenoptometric.com/blog/2020/11/',
 'https://www.lumenoptometric.com/blog/2020/12/',
 'https://www.lumenoptometric.com/blog/202

In [101]:
pickle_dump(file_to_pickle=_lumen_urls, filename_pickle='scraped_lumen_urls', path_pickle_dump=path_lumen_docs)

In [96]:
_urls = lumen_urls
for _url in _urls:
  _tmp_urls, _ = list_website_links(_url, verbose=False)
  lumen_urls = lumen_urls + _tmp_urls
  lumen_urls = remove_duplicates(lumen_urls)

print(f'lumen_urls, len={len(lumen_urls)}')
lumen_urls
# print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')
# _urls = _tmp_urls

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

InvalidSchema: No connection adapters were found for 'mailto:?subject=Can%20Myopia%20Worsen%20Despite%20Corrected%20Visual%20Habits&body=https%3A%2F%2Fwww.lumenoptometric.com%2Fblog%2Feye-care%2Fcan-myopia-worsen-despite-corrected-visual-habits%2F'

In [74]:
print(f'valid_urls, len:{len(valid_urls)}:\n{valid_urls}')

valid_urls, len:2:
[['https://www.lumenoptometric.com', 'https://scheduleyourexam.com/v3/index.php/4784/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/', 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/', 'https://www.lumenoptometric.com/scleral-lenses/', 'https://www.lumenoptometric.com/poseyedon-lens/', 'https://www.lumenoptometric.com/neurolens-therapy/', 'https://www.lumenoptometric.com/orthokeratology/', 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/', 'https://www.lumenoptometric.com/

In [68]:
_valid_urls  = [item for sublist in _valid_urls for item in sublist]  # flatten
_valid_urls

['https://www.lumenoptometric.com',
 'https://scheduleyourexam.com/v3/index.php/4784/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/',
 'https://www.lumenoptometric.com/scleral-lenses/',
 'https://www.lumenoptometric.com/poseyedon-lens/',
 'https://www.lumenoptometric.com/neurolens-therapy/',
 'https://www.lumenoptometric.com/orthokeratology/',
 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/',
 'https://www.lumenoptometric.com/orthoke

In [44]:
flatten_valid_hrefs = [item for sublist in valid_hrefs for item in sublist]
flatten_valid_hrefs 

['https://www.lumenoptometric.com',
 'https://scheduleyourexam.com/v3/index.php/4784/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/',
 'https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/',
 'https://www.lumenoptometric.com/scleral-lenses/',
 'https://www.lumenoptometric.com/poseyedon-lens/',
 'https://www.lumenoptometric.com/neurolens-therapy/',
 'https://www.lumenoptometric.com/orthokeratology/',
 'https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/',
 'https://www.lumenoptometric.com/orthoke

In [55]:
len(flatten_valid_hrefs)

TypeError: object of type 'NoneType' has no len()

In [53]:
flatten_valid_hrefs = flatten_valid_hrefs.append(l_valid_hrefs_0)

In [54]:
len(flatten_valid_hrefs)

TypeError: object of type 'NoneType' has no len()

flatten_valide_hrefs = flatten_valide_hrefs.append(l_valid_hrefs_0)

In [48]:
unique_valid_hrefs = list(set(flatten_valid_hrefs))
len(unique_valid_hrefs)
unique_valid_hrefs

['https://www.lumenoptometric.com/wp-content/uploads/2020/05/Image4.jpg',
 'https://www.google.com/preferences?hl=en-PH&fg=1&sa=X&ved=0ahUKEwjC_-uils2FAxXXLEQIHYCtCmEQ5fUCCFY',
 'https://www.lumenoptometric.com/covid-19-protocols/',
 'https://www.yelp.com/static?p=tos#third-parties',
 'https://www.lumenoptometric.com/neurolens-therapy/',
 'http://yelp-ir.com/',
 'https://restaurants.yelp.com/',
 'https://www.lumenoptometric.com/blog/category/information/',
 'https://www.lumenoptometric.com/wp-content/uploads/2020/05/LumenSignage.jpg',
 'https://www.youtube.com/watch?v=DhtuaIazLIw',
 'https://www.lumenoptometric.com/blog/2022/04/',
 'https://trust.yelp.com',
 'https://www.lumenoptometric.com/blog/2023/01/',
 'https://www.yelp-support.com/article/Yelp-Accessibility-Statement',
 'https://business.yelp.com?utm_source=footer&utm_medium=www&utm_content=claim_footer_link&utm_campaign=claim_business',
 'https://www.yelpblog.com/section/business',
 'https://treehouseeyes.com/',
 'https://www.lu

In [50]:
unique_valid_hrefs.sort()
unique_valid_hrefs

['http://scheduleyourexam.com',
 'http://yelp-ir.com/',
 'https://biz.yelp.com/login',
 'https://blog.yelp.com/',
 'https://business.yelp.com',
 'https://business.yelp.com/products/yelp-ads',
 'https://business.yelp.com/resources/case-studies',
 'https://business.yelp.com?utm_source=footer&utm_medium=www&utm_content=claim_footer_link&utm_campaign=claim_business',
 'https://developers.google.com/youtube',
 'https://policies.google.com/privacy?hl=en-PH&fg=1',
 'https://policies.google.com/terms?hl=en-PH&fg=1',
 'https://restaurants.yelp.com/',
 'https://restaurants.yelp.com/products/waitlist-table-management-software/',
 'https://scheduleyourexam.com/sye_privacy_policy.pdf',
 'https://scheduleyourexam.com/v3/index.php/4784/',
 'https://treehouseeyes.com/',
 'https://trust.yelp.com',
 'https://tv.youtube.com/learn/nflsundayticket',
 'https://www.crystalpm.com/index.jsp?crystalpmid=4784&pageid=2',
 'https://www.facebook.com/LumenOptometric/',
 'https://www.google.com/preferences?hl=en-PH&f

In [39]:
for _href in l_valid_hrefs:
  l_valid_hrefs1, l_hrefs1  = list_website_links(_href, verbose=True)

valid hrefs, len=44:
https://www.lumenoptometric.com
https://scheduleyourexam.com/v3/index.php/4784/
https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/
https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/
https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/is-myopia-unhealthy/
https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/nature-versus-nurture/
https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/myopia-treatments/
https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/our-technology/
https://www.lumenoptometric.com/scleral-lenses/
https://www.lumenoptometric.com/poseyedon-lens/
https://www.lumenoptometric.com/neurolens-therapy/
https://www.lumenoptometric.com/orthokeratology/
https://www.lumenoptometric.com/orthokeratology/hows-ortho-k-work/
https://www.lumenoptometric.com/orthokeratology/is-ortho-k-safe/
https:/

In [None]:
# Example usage
website_url = "https://www.lumenoptometric.com"
l_valid_hrefs, l_hrefs  = list_website_links(website_url)

In [None]:
l_hrefs

In [None]:
l_valid_hrefs

In [None]:
import requests
from bs4 import BeautifulSoup

def list_website_links(url):
  l_links = []
  """Fetches the HTML content of a website and extracts all links."""
  response = requests.get(url)
  soup = BeautifulSoup(response.content, "html.parser")
  links = soup.find_all('a')
  
  for link in links:
    # Get the href attribute which contains the link URL
    href = link.get('href')
    # Check if the link exists and avoid empty links
    if href:
      print(href)
      l_links.append(href)
      
  return l_links      

# # Example usage
# website_url = "https://www.lumenoptometric.com/"
# result_links = list_website_links(website_url)


In [None]:
# Example usage
website_url = "https://www.lumenoptometric.com"
l_links = list_website_links(website_url)

In [None]:
from urllib.parse import urlparse

def url_validator(x):
    try:
        result = urlparse(x)
        # print(f'result: {result}, {type(result)}')
        return all([result.scheme, result.netloc])
    except AttributeError:
        return False

In [None]:
valid_urls = []
for i, url in enumerate(l_links):
  # print(f'{i}, url')
  if url_validator(url):
    valid_urls.append(url)
  # else:
  #   print(f'not a valid url: {url}')  

In [None]:
valid_urls

In [None]:
# Example usage
website_url = "https://www.lumenoptometric.com/treehouse-eyes-myopia-management-for-children/what-is-myopia/"
list_website_links(website_url)

In [None]:
website_url = "https://www.lumenoptometric.com/blog/"
list_website_links(website_url)