In [3]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

class WebCrawler:
    def __init__(self, start_url, max_pages):
        self.start_url = start_url
        self.max_pages = max_pages
        self.visited = set()
        self.queue = deque()
        self.stack = []
        self.use_bfs = True
        self.total_time = 0
        self.num_crawls = 0

    def crawl(self):
        start_time = time.time()
        if self.use_bfs:
            self.queue.append(self.start_url)
            while self.queue and len(self.visited) < self.max_pages:
                current_url = self.queue.popleft()
                if current_url not in self.visited:
                    self.visited.add(current_url)
                    print("Crawling:", current_url)
                    self.parse_url(current_url)
        else:
            self.stack.append(self.start_url)
            while self.stack and len(self.visited) < self.max_pages:
                current_url = self.stack.pop()
                if current_url not in self.visited:
                    self.visited.add(current_url)
                    print("Crawling:", current_url)
                    self.parse_url(current_url)
        end_time = time.time()
        crawl_time = end_time - start_time
        self.total_time += crawl_time
        self.num_crawls += 1
        return crawl_time

    def parse_url(self, url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                # Extract links from the page
                links = [link.get("href") for link in soup.find_all("a", href=True)]
                self.enqueue_links(links) if self.use_bfs else self.push_links(links)
        except Exception as e:
            print("Failed to crawl:", url)
            print(e)

    def enqueue_links(self, links):
        for link in links:
            if link and link not in self.visited:
                self.queue.append(link)

    def push_links(self, links):
        for link in links:
            if link and link not in self.visited:
                self.stack.append(link)

    def switch_algorithm(self):
        self.use_bfs = not self.use_bfs

    def average_crawl_time(self):
        return self.total_time / self.num_crawls if self.num_crawls > 0 else 0

# Example usage:
crawler = WebCrawler("https://kayak.com", 20)

# Crawling using BFS
for _ in range(5):
    crawl_time = crawler.crawl()
    print(f"Time taken for BFS crawl: {crawl_time:.2f} seconds")

# After crawling with BFS, switch to DFS
crawler.switch_algorithm()

# Crawling using DFS
for _ in range(5):
    crawl_time = crawler.crawl()
    print(f"Time taken for DFS crawl: {crawl_time:.2f} seconds")

print(f"Average time taken to crawl: {crawler.average_crawl_time():.2f} seconds")


Crawling: https://kayak.com
Crawling: #Vp0X-pageContent
Failed to crawl: #Vp0X-pageContent
Invalid URL '#Vp0X-pageContent': No scheme supplied. Perhaps you meant https://#Vp0X-pageContent?
Crawling: /
Failed to crawl: /
Invalid URL '/': No scheme supplied. Perhaps you meant https:///?
Crawling: /flights
Failed to crawl: /flights
Invalid URL '/flights': No scheme supplied. Perhaps you meant https:///flights?
Crawling: /stays
Failed to crawl: /stays
Invalid URL '/stays': No scheme supplied. Perhaps you meant https:///stays?
Crawling: /cars
Failed to crawl: /cars
Invalid URL '/cars': No scheme supplied. Perhaps you meant https:///cars?
Crawling: /packages
Failed to crawl: /packages
Invalid URL '/packages': No scheme supplied. Perhaps you meant https:///packages?
Crawling: /trains
Failed to crawl: /trains
Invalid URL '/trains': No scheme supplied. Perhaps you meant https:///trains?
Crawling: #
Failed to crawl: #
Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
Crawling: ht

In [5]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time
from urllib.parse import urlparse, urljoin

class WebCrawler:
    def __init__(self, start_url, max_pages):
        self.start_url = start_url
        self.max_pages = max_pages
        self.visited = set()
        self.queue = deque()
        self.stack = []
        self.use_bfs = True
        self.total_time = 0
        self.num_crawls = 0

    def crawl(self):
        start_time = time.time()
        if self.use_bfs:
            self.queue.append(self.start_url)
            while self.queue and len(self.visited) < self.max_pages:
                current_url = self.queue.popleft()
                if current_url not in self.visited:
                    self.visited.add(current_url)
                    print("Crawling:", current_url)
                    self.parse_url(current_url)
        else:
            self.stack.append(self.start_url)
            while self.stack and len(self.visited) < self.max_pages:
                current_url = self.stack.pop()
                if current_url not in self.visited:
                    self.visited.add(current_url)
                    print("Crawling:", current_url)
                    self.parse_url(current_url)
        end_time = time.time()
        crawl_time = end_time - start_time
        self.total_time += crawl_time
        self.num_crawls += 1
        return crawl_time

    def parse_url(self, url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                # Extract links from the page
                links = [link.get("href") for link in soup.find_all("a", href=True)]
                valid_links = self.filter_valid_links(links, url)
                self.enqueue_links(valid_links) if self.use_bfs else self.push_links(valid_links)
        except Exception as e:
            print("Failed to crawl:", url)
            print(e)

    def filter_valid_links(self, links, base_url):
        valid_links = []
        for link in links:
            parsed = urlparse(link)
            if parsed.scheme and parsed.netloc:  # Absolute URL
                valid_links.append(link)
            else:  # Relative URL - construct absolute URL
                absolute_url = urljoin(base_url, link)
                valid_links.append(absolute_url)
        return valid_links

    def enqueue_links(self, links):
        for link in links:
            if link and link not in self.visited:
                self.queue.append(link)

    def push_links(self, links):
        for link in links:
            if link and link not in self.visited:
                self.stack.append(link)

    def switch_algorithm(self):
        self.use_bfs = not self.use_bfs

    def average_crawl_time(self):
        return self.total_time / self.num_crawls if self.num_crawls > 0 else 0

# Example usage:
crawler = WebCrawler("https://kayak.com", 20)

# Crawling using BFS
for _ in range(5):
    crawl_time = crawler.crawl()
    print(f"Time taken for BFS crawl: {crawl_time:.2f} seconds")

# After crawling with BFS, switch to DFS
crawler.switch_algorithm()

# Crawling using DFS
for _ in range(5):
    crawl_time = crawler.crawl()
    print(f"Time taken for DFS crawl: {crawl_time:.2f} seconds")

print(f"Average time taken to crawl: {crawler.average_crawl_time():.2f} seconds")


Crawling: https://kayak.com
Crawling: https://kayak.com#PFTl-pageContent
Crawling: https://kayak.com/
Crawling: https://kayak.com/flights
Crawling: https://kayak.com/stays
Crawling: https://kayak.com/cars
Crawling: https://kayak.com/packages
Crawling: https://kayak.com/trains
Crawling: https://www.kayak.com.ar/
Crawling: https://www.kayak.com.au/
Crawling: https://www.be.kayak.com/
Crawling: https://www.kayak.bo/
Crawling: https://www.kayak.com.br/
Crawling: https://www.ca.kayak.com/
Crawling: https://www.kayak.cl/
Crawling: https://www.cn.kayak.com/
Crawling: https://www.kayak.com.co/
Crawling: https://www.kayak.co.cr/
Crawling: https://www.kayak.dk/
Crawling: https://www.kayak.de/
Time taken for BFS crawl: 24.77 seconds
Time taken for BFS crawl: 0.00 seconds
Time taken for BFS crawl: 0.00 seconds
Time taken for BFS crawl: 0.00 seconds
Time taken for BFS crawl: 0.00 seconds
Time taken for DFS crawl: 0.00 seconds
Time taken for DFS crawl: 0.00 seconds
Time taken for DFS crawl: 0.00 sec

In [10]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time
from urllib.parse import urlparse, urljoin

In [12]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time
from urllib.parse import urlparse, urljoin

class WebCrawler:
    def __init__(self, start_url, max_pages):
        self.start_url = start_url
        self.max_pages = max_pages
        self.visited = set()
        self.queue = deque()
        self.use_bfs = True
        self.total_time = 0
        self.num_crawls = 0

    def crawl(self):
        start_time = time.time()
        while len(self.visited) < self.max_pages:
            if self.use_bfs and self.queue:
                current_url = self.queue.popleft()
            elif not self.use_bfs and self.queue:
                current_url = self.queue.pop()
            else:
                break
            if current_url not in self.visited:
                try:
                    self.visited.add(current_url)
                    print("Crawling:", current_url)
                    self.parse_url(current_url)
                except Exception as e:
                    print(f"Failed to crawl: {current_url} - {e}")
        end_time = time.time()
        crawl_time = end_time - start_time
        self.total_time += crawl_time
        self.num_crawls += 1
        return crawl_time

    def parse_url(self, url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                # Extract links from the page
                links = [link.get("href") for link in soup.find_all("a", href=True)]
                valid_links = self.filter_valid_links(links, url)
                self.enqueue_links(valid_links)
        except Exception as e:
            print("Failed to crawl:", url)
            print(e)

    def filter_valid_links(self, links, base_url):
        valid_links = []
        for link in links:
            parsed = urlparse(link)
            if parsed.scheme and parsed.netloc:  # Absolute URL
                valid_links.append(link)
            else:  # Relative URL - construct absolute URL
                absolute_url = urljoin(base_url, link)
                valid_links.append(absolute_url)
        return valid_links

    def enqueue_links(self, links):
        for link in links:
            if link and link not in self.visited:
                self.queue.append(link)

    def switch_algorithm(self):
        self.use_bfs = not self.use_bfs
        if not self.use_bfs:
            print("Switching from BFS to DFS")

    def average_crawl_time(self):
        return self.total_time / self.num_crawls if self.num_crawls > 0 else 0

# Example usage:
crawler = WebCrawler("https://kayak.com", 20)

# Crawling
for _ in range(10):
    crawl_time = crawler.crawl()
    print(f"Time taken for crawl: {crawl_time:.2f} seconds")
    if crawler.num_crawls % 5 == 0:
        crawler.switch_algorithm()

print(f"Average time taken to crawl the full website: {crawler.average_crawl_time():.2f} seconds")


Time taken for crawl: 0.00 seconds
Time taken for crawl: 0.00 seconds
Time taken for crawl: 0.00 seconds
Time taken for crawl: 0.00 seconds
Time taken for crawl: 0.00 seconds
Switching from BFS to DFS
Time taken for crawl: 0.00 seconds
Time taken for crawl: 0.00 seconds
Time taken for crawl: 0.00 seconds
Time taken for crawl: 0.00 seconds
Time taken for crawl: 0.00 seconds
Average time taken to crawl the full website: 0.00 seconds


In [8]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

# Function to get all links from a page
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return links, soup.title.text if soup.title else "No Title Found"
    except Exception as e:
        print(f"Error getting links from {url}: {e}")
        return [], "No Title Found"

# Function for dynamically adaptive crawling
def dynamic_crawl(seed_url, max_depth):
    visited = set()
    pages_to_visit = deque([(seed_url, 1)])  # Queue to track URL and depth
    total_time = 0
    num_pages = 0
    use_bfs = True  # Start with BFS

    while pages_to_visit:
        current_url, depth = pages_to_visit.popleft()
        if current_url not in visited and depth <= max_depth:
            visited.add(current_url)
            start_time = time.time()
            links, title = get_links(current_url)
            end_time = time.time()
            elapsed_time = end_time - start_time
            total_time += elapsed_time
            num_pages += 1

            print(f"Crawling {current_url} at depth {depth} - Elapsed Time: {elapsed_time:.2f} seconds - Title: {title}")

            # Switch to DFS if the number of links exceeds a threshold
            if use_bfs and len(links) > 10:
                use_bfs = False
                print("Switching from BFS to DFS")

            # You might perform specific actions here based on the page content
            # For example, print the title of the page
            print(f"Title of {current_url}: {title}")

            # Add links to visit based on the chosen algorithm (BFS/DFS)
            for link in links:
                if link and link.startswith('http') and link not in visited:
                    if use_bfs:
                        pages_to_visit.append((link, depth + 1))
                    else:
                        pages_to_visit.appendleft((link, depth + 1))

    return total_time / num_pages if num_pages > 0 else 0

# Example usage
seed_url = 'https://kayak.com'
max_crawl_depth = 3  # Set the maximum depth to crawl

start_time_total = time.time()
avg_time = dynamic_crawl(seed_url, max_crawl_depth)
end_time_total = time.time()
total_elapsed_time = end_time_total - start_time_total

print(f"Total time taken: {total_elapsed_time:.2f} seconds")
print(f"Average time per page: {avg_time:.2f} seconds")


Crawling https://kayak.com at depth 1 - Elapsed Time: 1.74 seconds - Title: Search Flights, Hotels & Rental Cars | KAYAK
Switching from BFS to DFS
Title of https://kayak.com: Search Flights, Hotels & Rental Cars | KAYAK
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535 at depth 2 - Elapsed Time: 1.19 seconds - Title: 
      âKAYAK: Flights, Hotels & Cars on the AppÂ Store
    
Title of https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535: 
      âKAYAK: Flights, Hotels & Cars on the AppÂ Store
    
Crawling https://www.apple.com/sitemap/ at depth 3 - Elapsed Time: 1.20 seconds - Title: Site Map - Apple
Title of https://www.apple.com/sitemap/: Site Map - Apple
Crawling https://www.apple.com/legal/ at depth 3 - Elapsed Time: 0.91 seconds - Title: Apple - Legal
Title of https://www.apple.com/legal/: Apple - Legal
Crawling https://www.apple.com/us/shop/goto/help/sales_refunds at depth 3 - Elapsed Time: 3.04 seconds - Title: Sales Policies - App

In [9]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

# Function to get all links from a page
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return links, soup.title.text if soup.title else "No Title Found"
    except Exception as e:
        print(f"Error getting links from {url}: {e}")
        return [], "No Title Found"

# Function for dynamically adaptive crawling with prioritized URL ordering
def prioritized_crawl(seed_url, max_depth):
    visited = set()
    pages_to_visit = deque([(seed_url, 1)])  # Queue to track URL and depth
    total_time = 0
    num_pages = 0
    use_bfs = True  # Start with BFS

    while pages_to_visit:
        current_url, depth = pages_to_visit.popleft()
        if current_url not in visited and depth <= max_depth:
            visited.add(current_url)
            start_time = time.time()
            links, title = get_links(current_url)
            end_time = time.time()
            elapsed_time = end_time - start_time
            total_time += elapsed_time
            num_pages += 1

            print(f"Crawling {current_url} at depth {depth} - Elapsed Time: {elapsed_time:.2f} seconds - Title: {title}")

            # Switch to DFS if the number of links exceeds a threshold
            if use_bfs and len(links) > 10:
                use_bfs = False
                print("Switching from BFS to DFS")

            # Sort links based on URL length before adding to the queue
            sorted_links = sorted(links, key=lambda x: len(x))

            # Add links to visit based on the chosen algorithm (BFS/DFS)
            for link in sorted_links:
                if link and link.startswith('http') and link not in visited:
                    if use_bfs:
                        pages_to_visit.append((link, depth + 1))
                    else:
                        pages_to_visit.appendleft((link, depth + 1))

    return total_time / num_pages if num_pages > 0 else 0

# Example usage with prioritized URL ordering
seed_url = 'https://kayak.com'
max_crawl_depth = 3  # Set the maximum depth to crawl

start_time_total = time.time()
avg_time = prioritized_crawl(seed_url, max_crawl_depth)
end_time_total = time.time()
total_elapsed_time = end_time_total - start_time_total

print(f"Total time taken: {total_elapsed_time:.2f} seconds")
print(f"Average time per page: {avg_time:.2f} seconds")


Crawling https://kayak.com at depth 1 - Elapsed Time: 1.50 seconds - Title: Search Flights, Hotels & Rental Cars | KAYAK
Switching from BFS to DFS
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535 at depth 2 - Elapsed Time: 1.10 seconds - Title: 
      âKAYAK: Flights, Hotels & Cars on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535?eventid=6472814118 at depth 3 - Elapsed Time: 0.85 seconds - Title: 
      âKAYAK Price Alerts: Stop the what-ifs and start your happy dance.
    
Crawling https://apps.apple.com/us/app/checkfelix-fl%C3%BCge-hotels-autos/id492728058 at depth 3 - Elapsed Time: 1.07 seconds - Title: 
      âcheckfelix: FlÃ¼ge Hotels Autos on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/mundi-voos-hot%C3%A9is-e-carros/id600097979 at depth 3 - Elapsed Time: 1.06 seconds - Title: 
      âMundi: Voos, HotÃ©is e Carros on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/swoo

TEMP

In [10]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

# Function to get all links from a page
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return links, soup.title.text if soup.title else "No Title Found", soup.get_text()
    except Exception as e:
        print(f"Error getting links from {url}: {e}")
        return [], "No Title Found", ""

# Function for dynamically adaptive crawling with prioritized URL ordering and search frequency
def prioritized_crawl_with_frequency(seed_url, max_depth, search_keywords):
    visited = set()
    pages_to_visit = deque([(seed_url, 1)])  # Queue to track URL and depth
    total_time = 0
    num_pages = 0
    use_bfs = True  # Start with BFS
    keyword_frequency = {keyword: 0 for keyword in search_keywords}

    while pages_to_visit:
        current_url, depth = pages_to_visit.popleft()
        if current_url not in visited and depth <= max_depth:
            visited.add(current_url)
            start_time = time.time()
            links, title, page_text = get_links(current_url)
            end_time = time.time()
            elapsed_time = end_time - start_time
            total_time += elapsed_time
            num_pages += 1

            print(f"Crawling {current_url} at depth {depth} - Elapsed Time: {elapsed_time:.2f} seconds - Title: {title}")

            # Count keyword frequencies in page content
            for keyword in search_keywords:
                keyword_frequency[keyword] += page_text.lower().count(keyword.lower())

            # Switch to DFS if the number of links exceeds a threshold
            if use_bfs and len(links) > 10:
                use_bfs = False
                print("Switching from BFS to DFS")

            # Sort links based on URL length before adding to the queue
            sorted_links = sorted(links, key=lambda x: len(x))

            # Add links to visit based on the chosen algorithm (BFS/DFS)
            for link in sorted_links:
                if link and link.startswith('http') and link not in visited:
                    if use_bfs:
                        pages_to_visit.append((link, depth + 1))
                    else:
                        pages_to_visit.appendleft((link, depth + 1))

    return total_time / num_pages if num_pages > 0 else 0, keyword_frequency

# Example usage with prioritized URL ordering and search frequency
seed_url = 'https://kayak.com'
max_crawl_depth = 3  # Set the maximum depth to crawl
search_keywords = ['travel', 'flight', 'hotel', 'price', 'search', 'holiday', 'destination']  # Keywords to search and track frequencies

start_time_total = time.time()
avg_time, keyword_freq = prioritized_crawl_with_frequency(seed_url, max_crawl_depth, search_keywords)
end_time_total = time.time()
total_elapsed_time = end_time_total - start_time_total

print(f"Total time taken: {total_elapsed_time:.2f} seconds")
print(f"Average time per page: {avg_time:.2f} seconds")
print("Keyword Frequencies:")
for keyword, frequency in keyword_freq.items():
    print(f"{keyword.capitalize()}: {frequency}")


Crawling https://kayak.com at depth 1 - Elapsed Time: 1.61 seconds - Title: Search Flights, Hotels & Rental Cars | KAYAK
Switching from BFS to DFS
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535 at depth 2 - Elapsed Time: 0.74 seconds - Title: 
      âKAYAK: Flights, Hotels & Cars on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535?eventid=6472814118 at depth 3 - Elapsed Time: 0.64 seconds - Title: 
      âKAYAK Price Alerts: Stop the what-ifs and start your happy dance.
    
Crawling https://apps.apple.com/us/app/checkfelix-fl%C3%BCge-hotels-autos/id492728058 at depth 3 - Elapsed Time: 0.74 seconds - Title: 
      âcheckfelix: FlÃ¼ge Hotels Autos on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/mundi-voos-hot%C3%A9is-e-carros/id600097979 at depth 3 - Elapsed Time: 0.93 seconds - Title: 
      âMundi: Voos, HotÃ©is e Carros on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/swoo

In [11]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

# Function to get all links from a page
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return links, soup.title.text if soup.title else "No Title Found", soup.get_text()
    except Exception as e:
        print(f"Error getting links from {url}: {e}")
        return [], "No Title Found", ""

# Function for dynamically adaptive crawling with prioritized URL ordering, search frequency, and page ranking
def prioritized_crawl_with_frequency_and_ranking(seed_url, max_depth, search_keywords):
    visited = set()
    pages_to_visit = deque([(seed_url, 1)])  # Queue to track URL and depth
    total_time = 0
    num_pages = 0
    use_bfs = True  # Start with BFS
    keyword_frequency = {keyword: 0 for keyword in search_keywords}
    page_ranks = {}  # Dictionary to hold page ranks

    while pages_to_visit:
        current_url, depth = pages_to_visit.popleft()
        if current_url not in visited and depth <= max_depth:
            visited.add(current_url)
            start_time = time.time()
            links, title, page_text = get_links(current_url)
            end_time = time.time()
            elapsed_time = end_time - start_time
            total_time += elapsed_time
            num_pages += 1

            print(f"Crawling {current_url} at depth {depth} - Elapsed Time: {elapsed_time:.2f} seconds - Title: {title}")

            # Count keyword frequencies in page content
            for keyword in search_keywords:
                keyword_frequency[keyword] += page_text.lower().count(keyword.lower())

            # Assign page rank based on depth
            if depth not in page_ranks:
                page_ranks[depth] = []
            page_ranks[depth].append(current_url)

            # Switch to DFS if the number of links exceeds a threshold
            if use_bfs and len(links) > 10:
                use_bfs = False
                print("Switching from BFS to DFS")

            # Sort links based on URL length before adding to the queue
            sorted_links = sorted(links, key=lambda x: len(x))

            # Add links to visit based on the chosen algorithm (BFS/DFS)
            for link in sorted_links:
                if link and link.startswith('http') and link not in visited:
                    if use_bfs:
                        pages_to_visit.append((link, depth + 1))
                    else:
                        pages_to_visit.appendleft((link, depth + 1))

    # Assign scores based on page depth (page ranking)
    rank_score = max_depth
    ranked_pages = {}
    for depth in range(max_depth, 0, -1):
        if depth in page_ranks:
            for url in page_ranks[depth]:
                ranked_pages[url] = rank_score
                rank_score -= 1

    return total_time / num_pages if num_pages > 0 else 0, keyword_frequency, ranked_pages

# Example usage with prioritized URL ordering, search frequency, and page ranking
seed_url = 'https://kayak.com'
max_crawl_depth = 3  # Set the maximum depth to crawl
search_keywords = ['travel', 'flight', 'hotel', 'price', 'search', 'holiday', 'destination']  # Keywords to search and track frequencies

start_time_total = time.time()
avg_time, keyword_freq, page_ranks = prioritized_crawl_with_frequency_and_ranking(seed_url, max_crawl_depth, search_keywords)
end_time_total = time.time()
total_elapsed_time = end_time_total - start_time_total

print(f"Total time taken: {total_elapsed_time:.2f} seconds")
print(f"Average time per page: {avg_time:.2f} seconds")
print("Keyword Frequencies:")
for keyword, frequency in keyword_freq.items():
    print(f"{keyword.capitalize()}: {frequency}")

print("\nPage Ranks (Based on Depth):")
for url, rank in page_ranks.items():
    print(f"{url} - Rank: {rank}")


Crawling https://kayak.com at depth 1 - Elapsed Time: 1.65 seconds - Title: Search Flights, Hotels & Rental Cars | KAYAK
Switching from BFS to DFS
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535 at depth 2 - Elapsed Time: 1.12 seconds - Title: 
      âKAYAK: Flights, Hotels & Cars on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535?eventid=6472814118 at depth 3 - Elapsed Time: 0.85 seconds - Title: 
      âKAYAK Price Alerts: Stop the what-ifs and start your happy dance.
    
Crawling https://apps.apple.com/us/app/checkfelix-fl%C3%BCge-hotels-autos/id492728058 at depth 3 - Elapsed Time: 0.88 seconds - Title: 
      âcheckfelix: FlÃ¼ge Hotels Autos on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/mundi-voos-hot%C3%A9is-e-carros/id600097979 at depth 3 - Elapsed Time: 0.87 seconds - Title: 
      âMundi: Voos, HotÃ©is e Carros on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/swoo