### Scrape Website's URLs.  
#### This notebook use recursive function get_links to find website's URLs. Required_string and ignored_strings can be used as filters. 
#### A better alternative is to go to a website and type in sitemap.xml, e.g. https://treehouseeyes.com/sitemap.xml.


In [None]:
def write_to_file(links):
  """
  This function appends a list of links to a text file named 'data.txt'.

  Args:
      links (list): A list of strings representing the links to be written.
  """

  # Open the file in append mode (a)
  with open('data.txt', 'a') as f:
    # Write each link to the file, adding a newline character
    f.writelines(links)


In [None]:
def read_text_to_list(filename):
  """
  Reads the contents of a text file into a list, where each line is an item
  with the newline character removed.

  Args:
    filename: The name of the text file to read.

  Returns:
    A list containing the lines of the text file without the newline character.
  """

  # Open the file in read mode
  with open(filename, 'r') as f:
    # Read all lines from the file
    lines = f.readlines()

    # Remove newline characters from each line
    lines = [line.rstrip('\n') for line in lines]

  # Return the list of cleaned lines
  return lines


In [None]:
def is_empty_string(a):
  """
  This function checks if the input string is empty.

  Args:
      a (str): The string to check.

  Returns:
      bool: True if the string is empty, False otherwise.
  """
  return a == ""

# Example usage
a = ""
if is_empty_string(a):
  print("The string is empty")


In [None]:
def found_no_ignored_strings(ignored_strings, link_url):
  """
  This function checks if any of the strings in 'ignored_strings' are present in 'link_url'.

  Args:
      ignored_strings: A list of strings to be ignored.
      link_url: A string containing the URL to check.

  Returns:
      A tuple containing two elements:
          - A list of booleans indicating whether each string in 'ignored_strings' is found in 'link_url'.
          - A boolean indicating if none of the ignored strings are found in the link URL.
  """

  has_ignored_string = []
  for _str in ignored_strings:
    # Check if the current string is empty
    if is_empty_string(_str):
      has_ignored_string.append(False)
    else:
      # Check if the current string is present in the link URL
      has_ignored_string.append(_str in link_url)

  # Check for no ignored strings
  no_ignored_strings_found = not any(has_ignored_string)

  # Return results as a tuple
  return no_ignored_strings_found

# Replace is_empty_string with your function to check if a string is empty
# Example usage assuming is_empty_string is defined elsewhere
ignored_strings = ["ex", ".pdf"]
link_url = "https://www.example.com/important_information"

no_ignored_strings_found = found_no_ignored_strings(ignored_strings, link_url)

print(f'link_url: {link_url}')
print(f'ignored_strings: {ignored_strings}')
# print(f"has_ignored_string: {results}")
print(f"link_url does not have any ignored_strings: {no_ignored_strings_found}")
print(f"type(no_ignored_strings_found): {type(no_ignored_strings_found)}")

In [None]:
def get_links(url, counter, verbose=False):
  """
  This function recursively crawls web pages and extracts links that meet specific criteria.

  Args:
    url (str): The URL of the web page to crawl.
    counter (int): The current depth of the crawl (starts at 0).
    verbose (bool, optional): Whether to print detailed information during crawling. Defaults to False.

  Returns:
    None
  """

  # Check if maximum crawl depth is reached
  if counter >= max_depth:
    if verbose:
      print(f'get_links_counter:  {counter}, url: {url}')
  else:
    # Fetch the web page content
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract links from the page
    for link in soup.find_all('a'):
      link_url = link.get('href')

      # Filter links based on criteria, exclude all items in ignored_strings
      if link_url is not None and \
        link_url.startswith('http') and \
        required_string in link_url and \
        found_no_ignored_strings(ignored_strings, link_url):
        if verbose:
          print(f'get_links_for-loop: {counter}, url: {link_url}')

        # Write the filtered link to a file
        write_to_file(link_url + '\n')

        # Recursively crawl the extracted link (incrementing depth)
        get_links(link_url, counter=counter+1, verbose=verbose)

In [None]:
import sys
import requests
sys.path.append('../..')  # need this to call function from directories above
from bs4 import BeautifulSoup
from py3810.myUtils import pickle_dump, pickle_load

verbose = True
path_lumen_docs = '..\langchain\docs\lumen\\docs\\'

# required_string = "https://www.lumenoptometric.com"
# filename_pickle='lumen_unique_urls'

# required_string = "https://treehouseeyes.com"
# filename_pickle='treehouseeyes_unique_urls'

# # ignore urls with these list items
# # ignored_strings = ['.jpg', '.pdf']
# ignored_strings = ['.jpg']

# url = required_string
# max_depth = 3

In [250]:
required_string = ""

ignored_strings = [
  'twitter.com',
  'developers.google.com',
  'https://www.youtube.com/about/',
  'https://www.youtube.com/about/',
  'https://www.youtube.com/ads/',
  'https://www.youtube.com/creators/',
  'https://www.youtube.com/howyoutubeworks',
  'tv.youtube.com/',
  'https://www.facebook.com/sharer/sharer',
  'https://www.linkedin.com/shareArticle'
]

url = "https://treehouseeyes.com/in-the-news/"
max_depth = 2  # crawl depth for recursive get_links function

In [251]:
# Loop exits on TypeError: 'NoneType' object is not iterable
for link in get_links(url, counter=0, verbose=verbose):
  if verbose:
    print(f'link_initial_for-loop: {link}')
  get_links(link, counter=0, verbose=verbose)

get_links_for-loop: 0, url: https://treehouseeyes.com/
get_links_for-loop: 1, url: https://treehouseeyes.com/
get_links_counter:  2, url: https://treehouseeyes.com/
get_links_for-loop: 1, url: https://treehouseeyes.com/why-myopia-matters/
get_links_counter:  2, url: https://treehouseeyes.com/why-myopia-matters/
get_links_for-loop: 1, url: https://treehouseeyes.com/parents-faq/
get_links_counter:  2, url: https://treehouseeyes.com/parents-faq/
get_links_for-loop: 1, url: https://treehouseeyes.com/myopia-resource-center/
get_links_counter:  2, url: https://treehouseeyes.com/myopia-resource-center/
get_links_for-loop: 1, url: https://treehouseeyes.com/what-is-my-childs-risk/
get_links_counter:  2, url: https://treehouseeyes.com/what-is-my-childs-risk/
get_links_for-loop: 1, url: https://treehouseeyes.com/our-myopia-treatments/
get_links_counter:  2, url: https://treehouseeyes.com/our-myopia-treatments/
get_links_for-loop: 1, url: https://treehouseeyes.com/our-myopia-treatments/daytime-sof

SSLError: HTTPSConnectionPool(host='sanfrancisco.momcollective.com', port=443): Max retries exceeded with url: /mom-life/health/6-things-all-parents-need-to-know-about-myopia/ (Caused by SSLError(CertificateError("hostname 'sanfrancisco.momcollective.com' doesn't match either of '*.sucuri.net', 'sucuri.net'")))

In [252]:
text_file = "data.txt"
lines_list = read_text_to_list(text_file)

unique_urls = sorted(set(lines_list))
print(f'unique links in data.txt: {len(unique_urls)}')
unique_urls

unique links in data.txt: 1177


['http://2020mag.com',
 'http://MeTVNewMexico.com',
 'http://corporate.hubbardradio.com/',
 'http://corporate.hubbardradio.com/dmca-notice/',
 'http://corporate.hubbardradio.com/privacy-policy/',
 'http://corporate.hubbardradio.com/terms-of-use/',
 'http://fdhfairfax.us-east-2.elasticbeanstalk.com',
 'http://framesdata.com',
 'http://games.wtop.com/',
 'http://https://treehouseeyes.com/',
 'http://its.txdot.gov/ITS_WEB/FrontEnd/default.html?r=DAL1&p=Dallas&t=map',
 'http://jobs.modernmedicine.com/index.php',
 'http://jobs.modernmedicine.com/index.php?action=advanced_search&page=search&keywords=&mult_industries%5B%5D=Ophthalmology&country=&zip_radius=&zip=&position_type=&min_salary=&max_salary=&salary_type=',
 'http://jobs.modernmedicine.com/index.php?action=advanced_search&page=search&keywords=&mult_industries%5B%5D=Optometry&country=&zip_radius=&zip=&position_type=&min_salary=&max_salary=&salary_type=',
 'http://jobsoninteractive.com',
 'http://jobsonresearch.com',
 'http://ktre.thund

In [249]:
import os

# Specify the file path to be deleted
file_path = "data.txt"
# Check if the file exists before deleting
if os.path.exists(file_path):
    try:
        os.remove(file_path)
        print("File deleted successfully!")
    except OSError as e:
        print("Error deleting file:", e)
else:
    print("File not found:", file_path)


File deleted successfully!


In [253]:
pickle_dump(file_to_pickle=unique_urls, path_pickle_dump=path_lumen_docs, filename_pickle='treehouseeyes-in-the-news_max_depth_2')
unique_urls

['http://2020mag.com',
 'http://MeTVNewMexico.com',
 'http://corporate.hubbardradio.com/',
 'http://corporate.hubbardradio.com/dmca-notice/',
 'http://corporate.hubbardradio.com/privacy-policy/',
 'http://corporate.hubbardradio.com/terms-of-use/',
 'http://fdhfairfax.us-east-2.elasticbeanstalk.com',
 'http://framesdata.com',
 'http://games.wtop.com/',
 'http://https://treehouseeyes.com/',
 'http://its.txdot.gov/ITS_WEB/FrontEnd/default.html?r=DAL1&p=Dallas&t=map',
 'http://jobs.modernmedicine.com/index.php',
 'http://jobs.modernmedicine.com/index.php?action=advanced_search&page=search&keywords=&mult_industries%5B%5D=Ophthalmology&country=&zip_radius=&zip=&position_type=&min_salary=&max_salary=&salary_type=',
 'http://jobs.modernmedicine.com/index.php?action=advanced_search&page=search&keywords=&mult_industries%5B%5D=Optometry&country=&zip_radius=&zip=&position_type=&min_salary=&max_salary=&salary_type=',
 'http://jobsoninteractive.com',
 'http://jobsonresearch.com',
 'http://ktre.thund