In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import heapq

class WebCrawler:
    """Recursive link crawler that labels each discovered link with a strategy.

    Every link found on a page is classified by :meth:`heuristic` as "A*",
    "DFS" or "BFS" and handed to the matching ``crawl_*`` method.  Each of
    those methods increments its own counter and then delegates to
    :meth:`crawl`, so the traversal itself is always a depth-limited
    recursion; the label only decides which counter is incremented (and, for
    "A*", that the URL passes through ``priority_queue`` first).

    Attributes:
        base_url: URL the crawl starts from.
        visited: Set of URLs that :meth:`crawl` has already processed.
        max_depth: Deepest recursion level that is still crawled.
        timeout: Seconds passed to ``requests.get`` as its timeout.
        priority_queue: Heap of ``(priority, url)`` tuples used by
            :meth:`crawl_a_star`.
        indexed_links: Every link extracted from crawled pages (may contain
            duplicates).
        bfs_count / dfs_count / a_star_count: Number of pages dispatched
            through the respective ``crawl_*`` method.
    """

    def __init__(self, base_url, max_depth=3, timeout=10):
        """Store the crawl configuration and initialise empty crawl state.

        Args:
            base_url: URL the crawl starts from.
            max_depth: Maximum link depth (the start page is depth 0).
            timeout: Per-request timeout in seconds for :meth:`get_html`.
        """
        self.base_url = base_url
        self.visited = set()
        self.max_depth = max_depth
        self.timeout = timeout
        self.priority_queue = []
        self.indexed_links = []  # All links seen on crawled pages

        # Counters for each strategy label
        self.bfs_count = 0
        self.dfs_count = 0
        self.a_star_count = 0

    def get_html(self, url):
        """Fetch ``url`` and return the response body as text.

        Returns:
            The response text, or ``None`` if the request failed (the error
            is printed rather than raised).
        """
        try:
            # Without a timeout a single unresponsive host blocks the whole
            # crawl indefinitely; timeouts surface as RequestException.
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()  # Treat 4xx/5xx as failures
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_links(self, html, base_url):
        """Return the set of absolute URLs of all ``<a href>`` tags in ``html``.

        Relative hrefs are resolved against ``base_url``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        return {
            urljoin(base_url, anchor.get('href'))
            for anchor in soup.find_all('a', href=True)
        }

    def heuristic(self, url):
        """Return the strategy label ("A*", "DFS" or "BFS") for ``url``.

        URLs longer than 50 characters map to "A*"; otherwise URLs with more
        than two ``/`` characters map to "DFS"; everything else is "BFS".
        """
        if len(url) > 50:
            return "A*"  # Use A* for longer URLs
        if url.count('/') > 2:
            return "DFS"  # Use DFS for deeper paths
        return "BFS"  # Use BFS for shallow paths

    def crawl(self, url, depth=0):
        """Fetch ``url``, index its links and dispatch each link by strategy.

        Does nothing if ``depth`` exceeds ``max_depth`` or ``url`` was already
        visited.  This is the only method that marks a URL as visited.
        """
        if depth > self.max_depth or url in self.visited:
            return

        print(f"Crawling: {url} at depth {depth} (Visited: {len(self.visited)})")
        self.visited.add(url)

        html = self.get_html(url)
        if not html:
            return

        links = self.extract_links(html, url)
        self.indexed_links.extend(links)  # Store all indexed links

        # Sorted so that the crawl order is reproducible between runs
        # (set iteration order of strings varies with hash randomisation).
        for link in sorted(links):
            algo = self.heuristic(link)

            if algo == "A*":
                print(f"[Using A* Algorithm] Crawling: {link}")
                self.crawl_a_star(link, depth + 1)
            elif algo == "DFS":
                print(f"[Using DFS Algorithm] Crawling: {link}")
                self.crawl_dfs(link, depth + 1)
            else:
                print(f"[Using BFS Algorithm] Crawling: {link}")
                self.crawl_bfs(link, depth + 1)

    def crawl_dfs(self, url, depth):
        """Count ``url`` as a DFS page and crawl it via :meth:`crawl`."""
        if depth > self.max_depth or url in self.visited:
            return
        # Do NOT add url to self.visited here: crawl() does that itself and
        # would otherwise see the URL as already visited and skip it.
        self.dfs_count += 1
        self.crawl(url, depth)

    def crawl_bfs(self, url, depth):
        """Count ``url`` as a BFS page and crawl it via :meth:`crawl`."""
        if depth > self.max_depth or url in self.visited:
            return
        # See crawl_dfs: marking visited is left to crawl().
        self.bfs_count += 1
        self.crawl(url, depth)

    def crawl_a_star(self, url, depth):
        """Count ``url`` as an A* page and crawl it through the priority queue.

        The URL is pushed with its length as priority (shorter first) and the
        queue is then drained, crawling each popped URL at ``depth``.
        """
        if depth > self.max_depth or url in self.visited:
            return
        # See crawl_dfs: marking visited is left to crawl().
        self.a_star_count += 1

        priority = len(url)  # Example heuristic: prioritize shorter URLs
        heapq.heappush(self.priority_queue, (priority, url))

        while self.priority_queue:
            priority, current_url = heapq.heappop(self.priority_queue)
            print(f"[A*] Processing URL with priority {priority}: {current_url}")
            self.crawl(current_url, depth)

    def extract_content(self, link):
        """Fetch ``link`` and print the first 500 characters of its body."""
        content = self.get_html(link)
        if content:
            print("\nContent extracted from the selected link:")
            print(content[:500])  # Print the first 500 characters

    def run(self):
        """Crawl from ``base_url``, print a report, then prompt for links.

        After the crawl, the unique indexed links and the per-strategy
        counters are printed.  The user can then repeatedly enter a link
        number to preview that page, or ``exit`` to stop.
        """
        print("Starting crawl...")
        self.crawl(self.base_url)

        print("\nIndexed Links:")
        # Deduplicate and sort so link numbers are stable between runs.
        unique_links = sorted(set(self.indexed_links))
        for idx, link in enumerate(unique_links):
            print(f"{idx + 1}: {link}")

        print(f"\nCrawling Summary:\n"
              f"Total pages crawled using BFS: {self.bfs_count}\n"
              f"Total pages crawled using DFS: {self.dfs_count}\n"
              f"Total pages crawled using A*: {self.a_star_count}\n")

        while True:
            choice = input("\nEnter the number of the link you want to extract info from (or 'exit' to quit): ")
            if choice.lower() == 'exit':
                break
            try:
                link_index = int(choice) - 1
            except ValueError:
                print("Invalid input. Please enter a number or 'exit'.")
                continue
            if 0 <= link_index < len(unique_links):
                self.extract_content(unique_links[link_index])
            else:
                print("Invalid choice. Please select a valid link number.")

if __name__ == '__main__':
    # Script entry point: ask for a start URL and run an interactive crawl.
    # Surrounding whitespace is stripped so the start URL is stored in the
    # visited set under the same string as that URL appears in page links.
    base_url = input("Enter the starting URL: ").strip()
    if not base_url:
        raise SystemExit("No starting URL provided.")
    crawler = WebCrawler(base_url)

    crawler.run()


Enter the starting URL:   https://github.com/


Starting crawl...
Crawling:  https://github.com/ at depth 0 (Visited: 0)
[Using A* Algorithm] Crawling: https://github.com/organizations/enterprise_plan?ref_cta=Start+a+free+enterprise+trial&ref_loc=Home+campaign+footer&ref_page=%2F
[A*] Processing URL with priority 128: https://github.com/organizations/enterprise_plan?ref_cta=Start+a+free+enterprise+trial&ref_loc=Home+campaign+footer&ref_page=%2F
[Using DFS Algorithm] Crawling: https://github.com/features/copilot#enterprise
[Using DFS Algorithm] Crawling: https://github.com/#productivity
[Using A* Algorithm] Crawling: https://docs.github.com/get-started/exploring-integrations/about-building-integrations
[A*] Processing URL with priority 86: https://docs.github.com/get-started/exploring-integrations/about-building-integrations
[Using A* Algorithm] Crawling: https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax
[A*] Processing URL with priority 96: https://docs.github.com/search-github/github-c