In [1]:
pip install beautifulsoup4 requests networkx


Note: you may need to restart the kernel to use updated packages.


BFS WEB CRAWLER     

In [3]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

def bfs_crawler(start_url):
    visited = set()
    queue = deque([(start_url, 0)])
    total_time = 0
    num_pages = 0

    while queue:
        url, depth = queue.popleft()
        if url not in visited:
            try:
                start_time = time.time()
                response = requests.get(url)
                end_time = time.time()
                elapsed_time = end_time - start_time

                # Simulating processing time
                time.sleep(0.5)

                if response.status_code == 200:
                    print(f"Crawling {url} at depth {depth} - Elapsed Time: {elapsed_time:.2f} seconds")
                    total_time += elapsed_time
                    num_pages += 1

                    # Extract links from the page
                    links = extract_links(response.text)

                    # Add links from the page to the queue for further crawling
                    if depth < 2:  # Limiting depth to 2 for demonstration
                        for link in links:
                            queue.append((link, depth + 1))

                    visited.add(url)

            except Exception as e:
                print(f"Error crawling {url}: {e}")

    return total_time / num_pages if num_pages > 0 else 0


def extract_links(html):
    links = []
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            links.append(href)
    return links

if __name__ == "__main__":
    start_time_total = time.time()

    start_url = "https://kayak.com"  # Replace with your starting URL
    avg_time = bfs_crawler(start_url)

    end_time_total = time.time()
    total_elapsed_time = end_time_total - start_time_total

    print(f"Total time taken: {total_elapsed_time:.2f} seconds")
    print(f"Average time per page: {avg_time:.2f} seconds")


Crawling https://kayak.com at depth 0 - Elapsed Time: 1.75 seconds
Error crawling #rezq-pageContent: Invalid URL '#rezq-pageContent': No scheme supplied. Perhaps you meant https://#rezq-pageContent?
Error crawling /: Invalid URL '/': No scheme supplied. Perhaps you meant https:///?
Error crawling /flights: Invalid URL '/flights': No scheme supplied. Perhaps you meant https:///flights?
Error crawling /stays: Invalid URL '/stays': No scheme supplied. Perhaps you meant https:///stays?
Error crawling /cars: Invalid URL '/cars': No scheme supplied. Perhaps you meant https:///cars?
Error crawling /packages: Invalid URL '/packages': No scheme supplied. Perhaps you meant https:///packages?
Error crawling /trains: Invalid URL '/trains': No scheme supplied. Perhaps you meant https:///trains?
Error crawling #: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
Error crawling #: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
Error crawling #: Invalid URL '#': No s

DFS WEB CRAWLER

In [7]:
import requests
from bs4 import BeautifulSoup
import time

def dfs_crawler(start_url, max_depth=2):
    visited = set()
    stack = [(start_url, 0)]
    total_time = 0
    num_pages = 0

    while stack:
        url, depth = stack.pop()
        if url not in visited:
            try:
                start_time = time.time()
                response = requests.get(url)
                end_time = time.time()
                elapsed_time = end_time - start_time

                # Simulating processing time
                time.sleep(0.5)

                if response.status_code == 200:
                    print(f"Crawling {url} at depth {depth} - Elapsed Time: {elapsed_time:.2f} seconds")
                    total_time += elapsed_time
                    num_pages += 1

                    # Extract links from the page
                    links = extract_links(response.text)

                    # Add links from the page to the stack for further crawling
                    if depth < max_depth:
                        for link in links:
                            stack.append((link, depth + 1))

                    visited.add(url)
            except Exception as e:
                print(f"Error crawling {url}: {e}")

    return total_time / num_pages if num_pages > 0 else 0

def extract_links(html):
    links = []
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            links.append(href)
    return links

if __name__ == "__main__":
    start_time_total = time.time()

    start_url = "https://kayak.com"  # Replace with your starting URL
    avg_time = dfs_crawler(start_url)

    end_time_total = time.time()
    total_elapsed_time = end_time_total - start_time_total

    print(f"Total time taken: {total_elapsed_time:.2f} seconds")
    print(f"Average time per page: {avg_time:.2f} seconds")


Crawling https://kayak.com at depth 0 - Elapsed Time: 3.39 seconds
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535 at depth 1 - Elapsed Time: 2.45 seconds
Crawling https://www.apple.com/sitemap/ at depth 2 - Elapsed Time: 1.48 seconds
Crawling https://www.apple.com/legal/ at depth 2 - Elapsed Time: 1.28 seconds
Crawling https://www.apple.com/us/shop/goto/help/sales_refunds at depth 2 - Elapsed Time: 4.13 seconds
Crawling https://www.apple.com/legal/internet-services/terms/site.html at depth 2 - Elapsed Time: 1.01 seconds
Crawling https://www.apple.com/legal/privacy/ at depth 2 - Elapsed Time: 1.46 seconds
Crawling https://www.apple.com/choose-country-region/ at depth 2 - Elapsed Time: 3.47 seconds
Crawling https://locate.apple.com/ at depth 2 - Elapsed Time: 1.43 seconds
Crawling https://www.apple.com/retail/ at depth 2 - Elapsed Time: 4.66 seconds
Crawling https://apps.apple.com/us/app/cheap-flights-wayaway/id606870241 at depth 2 - Elapsed Time: 2.40 secon

DYNAMIC WEB CRAWLER

In [9]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

# Function to get all links from a page
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return links
    except Exception as e:
        print(f"Error getting links from {url}: {e}")
        return []

# Function for dynamically adaptive crawling
def dynamic_crawl(seed_url, max_depth):
    visited = set()
    pages_to_visit = deque([(seed_url, 1)])  # Queue to track URL and depth
    total_time = 0
    num_pages = 0

    while pages_to_visit:
        current_url, depth = pages_to_visit.popleft()
        if current_url not in visited and depth <= max_depth:
            visited.add(current_url)
            start_time = time.time()
            links = get_links(current_url)
            end_time = time.time()
            elapsed_time = end_time - start_time
            total_time += elapsed_time
            num_pages += 1

            print(f"Crawling {current_url} at depth {depth} - Elapsed Time: {elapsed_time:.2f} seconds")

            # You might perform specific actions here based on the page content

            for link in links:
                if link and link.startswith('http') and link not in visited:
                    pages_to_visit.append((link, depth + 1))

    return total_time / num_pages if num_pages > 0 else 0

# Example usage
seed_url = 'https://kayak.com'
max_crawl_depth = 3  # Set the maximum depth to crawl

start_time_total = time.time()
avg_time = dynamic_crawl(seed_url, max_crawl_depth)
end_time_total = time.time()
total_elapsed_time = end_time_total - start_time_total

print(f"Total time taken: {total_elapsed_time:.2f} seconds")
print(f"Average time per page: {avg_time:.2f} seconds")


Crawling https://kayak.com at depth 1 - Elapsed Time: 3.13 seconds
Crawling https://www.kayak.com.ar/ at depth 2 - Elapsed Time: 1.64 seconds
Crawling https://www.kayak.com.au/ at depth 2 - Elapsed Time: 2.23 seconds
Crawling https://www.be.kayak.com/ at depth 2 - Elapsed Time: 1.58 seconds
Crawling https://www.kayak.bo/ at depth 2 - Elapsed Time: 1.92 seconds
Crawling https://www.kayak.com.br/ at depth 2 - Elapsed Time: 1.70 seconds
Crawling https://www.ca.kayak.com/ at depth 2 - Elapsed Time: 2.60 seconds
Crawling https://www.kayak.cl/ at depth 2 - Elapsed Time: 1.69 seconds
Crawling https://www.cn.kayak.com/ at depth 2 - Elapsed Time: 1.73 seconds
Crawling https://www.kayak.com.co/ at depth 2 - Elapsed Time: 1.63 seconds
Crawling https://www.kayak.co.cr/ at depth 2 - Elapsed Time: 1.81 seconds
Crawling https://www.kayak.dk/ at depth 2 - Elapsed Time: 1.87 seconds
Crawling https://www.kayak.de/ at depth 2 - Elapsed Time: 1.73 seconds
Crawling https://www.kayak.com.ec/ at depth 2 - El

ADD SOME FUNCTIONALITIES LIKE FREQUENCY SEARCH AND PAGE RANKING 

In [11]:

import requests
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
from collections import deque

# Function to get all links from a page
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return soup.get_text(), links
    except Exception as e:
        print(f"Error getting links from {url}: {e}")
        return '', []

# Function for dynamically adaptive crawling with keyword frequency analysis
def crawl_with_keyword_analysis(seed_url, max_depth, keywords):
    visited = set()
    pages_to_visit = deque([(seed_url, 1)])  # Queue to track URL and depth
    keyword_count = defaultdict(int)

    while pages_to_visit:
        current_url, depth = pages_to_visit.popleft()
        if current_url not in visited and depth <= max_depth:
            visited.add(current_url)
            page_text, links = get_links(current_url)
            print(f"Crawling {current_url} at depth {depth}")

            # Analyze page text for keyword frequency
            for keyword in keywords:
                keyword_count[keyword] += page_text.lower().count(keyword.lower())

            for link in links:
                if link and link.startswith('http') and link not in visited:
                    pages_to_visit.append((link, depth + 1))

    return keyword_count

# Example usage for keyword frequency analysis
seed_url = 'https://kayak.com'
max_crawl_depth = 3  # Set the maximum depth to crawl
search_keywords = ['example', 'website', 'analysis']  # Keywords to search for

keyword_freq = crawl_with_keyword_analysis(seed_url, max_crawl_depth, search_keywords)

# Display keyword frequencies
for keyword, count in keyword_freq.items():
    print(f"Keyword: '{keyword}' - Frequency: {count}")


Crawling https://kayak.com at depth 1
Crawling https://www.kayak.com.ar/ at depth 2
Crawling https://www.kayak.com.au/ at depth 2
Crawling https://www.be.kayak.com/ at depth 2
Crawling https://www.kayak.bo/ at depth 2
Crawling https://www.kayak.com.br/ at depth 2
Crawling https://www.ca.kayak.com/ at depth 2
Crawling https://www.kayak.cl/ at depth 2
Crawling https://www.cn.kayak.com/ at depth 2
Crawling https://www.kayak.com.co/ at depth 2
Crawling https://www.kayak.co.cr/ at depth 2
Crawling https://www.kayak.dk/ at depth 2
Crawling https://www.kayak.de/ at depth 2
Crawling https://www.kayak.com.ec/ at depth 2
Crawling https://www.kayak.com.sv/ at depth 2
Crawling https://www.kayak.es/ at depth 2
Crawling https://www.kayak.fr/ at depth 2
Crawling https://www.gr.kayak.com/ at depth 2
Crawling https://www.kayak.com.gt/ at depth 2
Crawling https://www.kayak.com.hn/ at depth 2
Crawling https://www.kayak.com.hk/ at depth 2
Crawling https://www.kayak.co.in/ at depth 2
Crawling https://www.k

PAGE RANKING FEATURE

In [13]:
import requests
from bs4 import BeautifulSoup
from collections import deque, defaultdict

# Function to get all links from a page
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return links
    except Exception as e:
        print(f"Error getting links from {url}: {e}")
        return []

# Function for dynamically adaptive crawling with basic page ranking
def crawl_with_page_rank(seed_url, max_depth):
    visited = set()
    pages_to_visit = deque([(seed_url, 1)])  # Queue to track URL and depth
    inbound_links = defaultdict(int)

    while pages_to_visit:
        current_url, depth = pages_to_visit.popleft()
        if current_url not in visited and depth <= max_depth:
            visited.add(current_url)
            links = get_links(current_url)
            print(f"Crawling {current_url} at depth {depth}")

            # Count inbound links
            for link in links:
                if link and link.startswith('http') and link not in visited:
                    inbound_links[link] += 1
                    pages_to_visit.append((link, depth + 1))

    return inbound_links

# Example usage for basic page ranking
seed_url = 'https://kayak.com'
max_crawl_depth = 3  # Set the maximum depth to crawl

page_rank = crawl_with_page_rank(seed_url, max_crawl_depth)

# Display page ranks
for page, rank in sorted(page_rank.items(), key=lambda x: x[1], reverse=True):
    print(f"Page: {page} - Inbound Links: {rank}")


Crawling https://kayak.com at depth 1
Crawling https://www.kayak.com.ar/ at depth 2
Crawling https://www.kayak.com.au/ at depth 2
Crawling https://www.be.kayak.com/ at depth 2
Crawling https://www.kayak.bo/ at depth 2
Crawling https://www.kayak.com.br/ at depth 2
Crawling https://www.ca.kayak.com/ at depth 2
Crawling https://www.kayak.cl/ at depth 2
Crawling https://www.cn.kayak.com/ at depth 2
Crawling https://www.kayak.com.co/ at depth 2
Crawling https://www.kayak.co.cr/ at depth 2
Crawling https://www.kayak.dk/ at depth 2
Crawling https://www.kayak.de/ at depth 2
Crawling https://www.kayak.com.ec/ at depth 2
Crawling https://www.kayak.com.sv/ at depth 2
Crawling https://www.kayak.es/ at depth 2
Crawling https://www.kayak.fr/ at depth 2
Crawling https://www.gr.kayak.com/ at depth 2
Crawling https://www.kayak.com.gt/ at depth 2
Crawling https://www.kayak.com.hn/ at depth 2
Crawling https://www.kayak.com.hk/ at depth 2
Crawling https://www.kayak.co.in/ at depth 2
Crawling https://www.k

TAKING INPUT FROM DICTIONARY.TXT

In [5]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time
import csv

# Function to get all links from a page
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return links, soup.title.text if soup.title else "No Title Found", soup.get_text()
    except Exception as e:
        print(f"Error getting links from {url}: {e}")
        return [], "No Title Found", ""

# Function for dynamically adaptive crawling with prioritized URL ordering, search frequency, and page ranking
def prioritized_crawl_with_frequency_and_ranking(seed_url, max_depth, search_keywords):
    visited = set()
    pages_to_visit = deque([(seed_url, 1)])  # Queue to track URL and depth
    total_time = 0
    num_pages = 0
    use_bfs = True  # Start with BFS
    keyword_frequency = {keyword: 0 for keyword in search_keywords}
    page_ranks = {}  # Dictionary to hold page ranks

    while pages_to_visit:
        current_url, depth = pages_to_visit.popleft()
        if current_url not in visited and depth <= max_depth:
            visited.add(current_url)
            start_time = time.time()
            links, title, page_text = get_links(current_url)
            end_time = time.time()
            elapsed_time = end_time - start_time
            total_time += elapsed_time
            num_pages += 1

            print(f"Crawling {current_url} at depth {depth} - Elapsed Time: {elapsed_time:.2f} seconds - Title: {title}")

            # Count keyword frequencies in page content
            for keyword in search_keywords:
                keyword_frequency[keyword] += page_text.lower().count(keyword.lower())

            # Assign page rank based on depth
            if depth not in page_ranks:
                page_ranks[depth] = []
            page_ranks[depth].append(current_url)

            # Switch to DFS if the number of links exceeds a threshold
            if use_bfs and len(links) > 10:
                use_bfs = False
                print("Switching from BFS to DFS")

            # Sort links based on URL length before adding to the queue
            sorted_links = sorted(links, key=lambda x: len(x))

            # Add links to visit based on the chosen algorithm (BFS/DFS)
            for link in sorted_links:
                if link and link.startswith('http') and link not in visited:
                    if use_bfs:
                        pages_to_visit.append((link, depth + 1))
                    else:
                        pages_to_visit.appendleft((link, depth + 1))

    # Assign scores based on page depth (page ranking)
    rank_score = max_depth
    ranked_pages = {}
    for depth in range(max_depth, 0, -1):
        if depth in page_ranks:
            for url in page_ranks[depth]:
                ranked_pages[url] = rank_score
                rank_score -= 1

    return total_time / num_pages if num_pages > 0 else 0, keyword_frequency

# Function to read keywords from a file
def read_keywords_from_file(file_name):
    with open(file_name, 'r') as file:
        keywords = [line.strip() for line in file.readlines()]
    return keywords

# Example usage with prioritized URL ordering, search frequency, and page ranking
seed_url = 'https://kayak.com'
max_crawl_depth = 3  # Set the maximum depth to crawl

# Read keywords from a file
search_keywords = read_keywords_from_file('dictionary.txt')

start_time_total = time.time()
avg_time, keyword_freq = prioritized_crawl_with_frequency_and_ranking(seed_url, max_crawl_depth, search_keywords)
end_time_total = time.time()
total_elapsed_time = end_time_total - start_time_total

print(f"Total time taken: {total_elapsed_time:.2f} seconds")
print(f"Average time per page: {avg_time:.2f} seconds")

# Writing keyword frequencies to a CSV file
with open('keyword_frequencies.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Keyword', 'Frequency'])
    for keyword, frequency in keyword_freq.items():
        csv_writer.writerow([keyword.capitalize(), frequency])
    print("Keyword frequencies saved to 'keyword_frequencies.csv'")


Crawling https://kayak.com at depth 1 - Elapsed Time: 1.77 seconds - Title: Search Flights, Hotels & Rental Cars | KAYAK
Switching from BFS to DFS
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535 at depth 2 - Elapsed Time: 0.84 seconds - Title: 
      âKAYAK: Flights, Hotels & Cars on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/kayak-flights-hotels-cars/id305204535?eventid=6472814118 at depth 3 - Elapsed Time: 0.64 seconds - Title: 
      âKAYAK Price Alerts: Stop the what-ifs and start your happy dance.
    
Crawling https://apps.apple.com/us/app/checkfelix-fl%C3%BCge-hotels-autos/id492728058 at depth 3 - Elapsed Time: 0.69 seconds - Title: 
      âcheckfelix: FlÃ¼ge Hotels Autos on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/mundi-voos-hot%C3%A9is-e-carros/id600097979 at depth 3 - Elapsed Time: 0.73 seconds - Title: 
      âMundi: Voos, HotÃ©is e Carros on the AppÂ Store
    
Crawling https://apps.apple.com/us/app/swoo