In [None]:
# Install Selenium
!pip install selenium
# Install the webdriver_manager
!pip install webdriver_manager
# Update and install ChromeDriver
!apt-get update
!apt install chromium-chromedriver

# Copy chromedriver to a known path
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

# Set Chrome options for Selenium
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Initialize WebDriver with the specified options
driver = webdriver.Chrome(options=chrome_options)




Collecting selenium
  Downloading selenium-4.18.1-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.25.0-py3-none-any.whl (467 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.2/467.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?2

In [None]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is made available
DRIVE_MOUNT_POINT = '/content/drive'

drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv

# Create a session so connections (and the settings below) are reused across requests
session = requests.Session()

# Define the number of connections you want
max_connections = 10

# Retry transient failures a few times with exponential backoff instead of
# failing on the first dropped connection or 5xx response.
retry_policy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=(500, 502, 503, 504),
)

# Create an HTTPAdapter with the desired connection pool size and retry policy
adapter = HTTPAdapter(
    pool_connections=max_connections,
    pool_maxsize=max_connections,
    max_retries=retry_policy,
)

# Mount the adapter to the session for both HTTP and HTTPS
session.mount('http://', adapter)
session.mount('https://', adapter)

# Smoke-test the session. requests has no default timeout, so pass one
# explicitly; otherwise an unresponsive host would hang this cell forever.
response = session.get('https://example.com', timeout=10)
print(response.status_code)

class AIWebCrawler:
    """Depth-limited, same-domain crawler that collects visible page text.

    Pages are rendered with a Selenium WebDriver, parsed with BeautifulSoup,
    and accumulated in ``self.corpus`` as ``{'url': ..., 'content': ...}``
    dicts, which ``save_corpus_to_csv`` writes out.

    Args:
        start_url: URL at which ``start_crawling`` begins.
        max_depth: Number of link levels to follow; the start page counts as
            one level. Values <= 0 crawl nothing.
        visit_strategy: 'preorder' (default), 'inorder' or 'postorder'
            (case-insensitive); see ``get_ordered_links``.
        driver: Optional Selenium WebDriver used by ``start_crawling``. When
            omitted, ``start_crawling`` falls back to the module-level
            ``driver`` variable, as the original notebook did.
    """

    def __init__(self, start_url, max_depth, visit_strategy='preorder', driver=None):
        self.start_url = start_url
        self.max_depth = max_depth
        self.visit_strategy = visit_strategy.lower()
        self.driver = driver
        self.visited_urls = set()  # URLs already attempted (successfully or not)
        self.corpus = []  # List of {'url', 'content'} dicts, one per crawled page

    def get_internal_links(self, page_url, content):
        """Return the set of absolute same-domain links found in ``content``.

        Args:
            page_url: URL the HTML was loaded from; relative hrefs are
                resolved against it and its netloc defines "internal".
            content: HTML source of the page.

        Returns:
            A set of absolute URLs with any ``#fragment`` removed, so anchors
            pointing into the same document are not treated as separate pages.
        """
        soup = BeautifulSoup(content, 'html.parser')
        page_domain = urlparse(page_url).netloc
        internal_links = set()

        for a_tag in soup.find_all('a', href=True):
            parsed_link = urlparse(urljoin(page_url, a_tag['href']))

            # Keep only links within the same domain as the current page
            if parsed_link.netloc == page_domain:
                internal_links.add(parsed_link._replace(fragment='').geturl())

        return internal_links

    def extract_text_content(self, content):
        """Return the text of an HTML document with scripts and styles removed.

        Text nodes are stripped and joined with single spaces.
        """
        soup = BeautifulSoup(content, 'html.parser')
        # Remove script and style tags so their source does not pollute the text
        for script_or_style in soup(['script', 'style']):
            script_or_style.decompose()
        return soup.get_text(separator=' ', strip=True)

    def crawl_page(self, driver, url, depth):
        """Crawl ``url`` and, recursively, the internal links it contains.

        Args:
            driver: Selenium WebDriver used to load pages.
            url: Page to load.
            depth: Remaining levels to crawl; nothing happens when it is <= 0
                or when ``url`` has already been attempted.

        Errors while loading or parsing a page are printed and that page's
        links are skipped; they do not abort the rest of the crawl.
        """
        if depth <= 0 or url in self.visited_urls:
            return

        # Mark the URL before fetching it: a page that fails (e.g. a renderer
        # timeout) must not be fetched again from every other page linking to it.
        self.visited_urls.add(url)

        try:
            # Navigate to the page using Selenium
            driver.get(url)
            # Wait for <body> so the rendered document is available
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

            # Extract text and links from the rendered page source
            content = driver.page_source
            text_content = self.extract_text_content(content)
            self.corpus.append({'url': url, 'content': text_content})
            internal_links = self.get_internal_links(url, content)
        except Exception as e:
            print(f"Error crawling {url}: {str(e)}")
            return

        # Recursively crawl the internal links (each call handles its own errors)
        for link in self.get_ordered_links(internal_links):
            self.crawl_page(driver, link, depth - 1)

    def get_ordered_links(self, internal_links):
        """Return ``internal_links`` as a list ordered for the visit strategy.

        Links are sorted so the crawl order is reproducible between runs
        (set iteration order is not). 'inorder' additionally puts
        ``start_url`` first and 'postorder' puts it last; any other strategy
        value is treated as 'preorder' and returns just the sorted links.
        """
        ordered = sorted(internal_links)
        if self.visit_strategy == 'inorder':
            return [self.start_url] + ordered
        elif self.visit_strategy == 'postorder':
            return ordered + [self.start_url]
        else:  # Default to preorder
            return ordered

    def save_corpus_to_csv(self, csv_filename='corpus.csv'):
        """Write the collected corpus to ``csv_filename`` with url/content columns."""
        with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
            fieldnames = ['url', 'content']
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.corpus)

    def start_crawling(self):
        """Crawl from ``start_url`` down to ``max_depth`` and save the corpus.

        Uses the driver passed to the constructor, or the module-level
        ``driver`` when none was given. The corpus is written to the default
        CSV file even if the crawl is interrupted, so partial results of a
        long run are not lost.
        """
        active_driver = self.driver if self.driver is not None else driver
        try:
            self.crawl_page(active_driver, self.start_url, self.max_depth)
        finally:
            self.save_corpus_to_csv()

200


# Evaluation of Other Visiting Strategies

## Preorder Traversal

In [None]:
%%time
if __name__ == "__main__":
    start_url = "https://www.stjohns.edu/"
    max_depth = 3  # Set the maximum depth of crawling

    # Instantiate the WebCrawler with the desired visiting strategy
    crawler = AIWebCrawler(start_url, max_depth, visit_strategy='preorder')

    # Start crawling and save the corpus to a CSV file
    crawler.start_crawling()

## In-Order Traversal

In [None]:
%%time
if __name__ == "__main__":
    start_url = "https://www.stjohns.edu/"
    max_depth = 3  # Set the maximum depth of crawling

    # Instantiate the WebCrawler with the desired visiting strategy
    crawler = AIWebCrawler(start_url, max_depth, visit_strategy='inorder')

    # Start crawling and save the corpus to a CSV file
    crawler.start_crawling()

Error crawling https://www.stjohns.edu/queens-residential-campus/queens-campus-life/campus-activities: Message: timeout: Timed out receiving message from renderer: 298.486
  (Session info: chrome-headless-shell=123.0.6312.58)
Stacktrace:
#0 0x598ea42c2993 <unknown>
#1 0x598ea3fbd136 <unknown>
#2 0x598ea3fa5020 <unknown>
#3 0x598ea3fa4d43 <unknown>
#4 0x598ea3fa2d96 <unknown>
#5 0x598ea3fa341f <unknown>
#6 0x598ea3fb36e5 <unknown>
#7 0x598ea3fc8ebc <unknown>
#8 0x598ea3fce48b <unknown>
#9 0x598ea3fa3aa2 <unknown>
#10 0x598ea3fc8c34 <unknown>
#11 0x598ea4048671 <unknown>
#12 0x598ea4029a73 <unknown>
#13 0x598ea3ffac93 <unknown>
#14 0x598ea3ffb65e <unknown>
#15 0x598ea428708b <unknown>
#16 0x598ea428b005 <unknown>
#17 0x598ea4275491 <unknown>
#18 0x598ea428bb92 <unknown>
#19 0x598ea425a9ef <unknown>
#20 0x598ea42b1df8 <unknown>
#21 0x598ea42b1fcb <unknown>
#22 0x598ea42c1ae4 <unknown>
#23 0x79b2f9abeac3 <unknown>

Error crawling https://www.stjohns.edu/admission/connect-us/undergraduate-p

## Post-Order Traversal

In [None]:
%%time
if __name__ == "__main__":
    start_url = "https://www.stjohns.edu/"
    max_depth = 3  # Set the maximum depth of crawling

    # Instantiate the WebCrawler with the desired visiting strategy
    crawler = AIWebCrawler(start_url, max_depth, visit_strategy='postorder')

    # Start crawling and save the corpus to a CSV file
    crawler.start_crawling()