<a href="https://colab.research.google.com/github/sinchu71/website-downloader-project/blob/main/website%20project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install requests beautifulsoup4




In [6]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# Function to download the page content
def download_page(url, depth, base_dir, visited_urls, crawl_depth_limit, timeout=10):
    """Download one HTML page, its images/scripts, and the pages it links to.

    The page is fetched, written to disk with ``save_html``, parsed, and
    handed to ``download_resources``. Every ``<a href>`` found on it is then
    visited recursively with ``depth + 1``. Links are followed regardless of
    which host they point to.

    Args:
        url: Address of the page. Any ``#fragment`` is dropped before use.
        depth: Crawl depth of this page (the start page is 0).
        base_dir: Directory that receives the downloaded files.
        visited_urls: Set of URLs already handled; updated in place.
        crawl_depth_limit: Pages with ``depth`` above this are not fetched.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Request failures are printed and the page is skipped.
    """
    parsed = urlparse(url)
    # "page#top" and "page" are the same document, so they must share a
    # single entry in visited_urls instead of being fetched twice.
    url = parsed._replace(fragment="").geturl()

    if depth > crawl_depth_limit or url in visited_urls:
        return

    # Mark this URL as visited (also de-duplicates the skip message below)
    visited_urls.add(url)

    # mailto:, javascript:, tel: and scheme-less strings cannot be fetched
    # over HTTP, so do not hand them to requests at all.
    if parsed.scheme not in ("http", "https"):
        print(f"Skipping non-HTTP URL {url}")
        return

    # Fetch the page content; the timeout keeps one stalled server from
    # hanging the whole crawl.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        page_content = response.text
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return

    # Save HTML content to the local directory
    save_html(url, page_content, base_dir)

    # Parse the HTML to extract links, images, and scripts
    soup = BeautifulSoup(page_content, 'html.parser')

    # Download the images and scripts referenced by this page
    download_resources(url, soup, base_dir, depth, visited_urls, crawl_depth_limit)

    # Follow every link on the page, one level deeper
    for link_tag in soup.find_all("a", href=True):
        link_url = urljoin(url, link_tag['href'])
        download_page(link_url, depth + 1, base_dir, visited_urls, crawl_depth_limit, timeout)

# Function to download resources (images, js)
def download_resources(url, soup, base_dir, depth, visited_urls, crawl_depth_limit):
    """Fetch every image and external script referenced by a parsed page.

    Args:
        url: Address of the page; relative ``src`` values are resolved
            against it.
        soup: Parsed page, queried with ``find_all``.
        base_dir: Root output directory passed on to ``download_resource``.
        depth: Crawl depth of the page, passed on unchanged.
        visited_urls: Set of already handled URLs, passed on unchanged.
        crawl_depth_limit: Maximum crawl depth, passed on unchanged.
    """
    # (tag name, resource type) pairs, handled in this order: images first,
    # then JavaScript files.
    targets = (("img", 'images'), ("script", 'js'))
    for tag_name, resource_type in targets:
        for tag in soup.find_all(tag_name, src=True):
            absolute_url = urljoin(url, tag['src'])
            download_resource(absolute_url, base_dir, resource_type, visited_urls, crawl_depth_limit, depth)

# Helper function to download individual resources (like images, js)
def download_resource(resource_url, base_dir, resource_type, visited_urls, crawl_depth_limit, depth, timeout=10):
    """Download a single resource (image, script) into ``base_dir/resource_type``.

    Args:
        resource_url: Absolute URL of the resource.
        base_dir: Root output directory.
        resource_type: Sub-directory name under ``base_dir`` (e.g. 'images').
        visited_urls: Set of URLs already handled; updated in place.
        crawl_depth_limit: Resources are skipped when ``depth`` exceeds this.
        depth: Crawl depth of the page that references the resource.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Request failures are printed and the resource is skipped.
    """
    if resource_url in visited_urls or depth > crawl_depth_limit:
        return

    # Mark this resource as visited
    visited_urls.add(resource_url)

    parsed = urlparse(resource_url)
    # Inline "data:" URIs and other non-HTTP schemes cannot be fetched with
    # requests; the URL is truncated because data URIs can be very long.
    if parsed.scheme not in ("http", "https"):
        print(f"Skipping non-HTTP resource {resource_url[:80]}")
        return

    try:
        response = requests.get(resource_url, timeout=timeout)
        response.raise_for_status()
        content = response.content
    except requests.RequestException as e:
        print(f"Error downloading resource {resource_url}: {e}")
        return

    # Determine the resource directory
    resource_dir = os.path.join(base_dir, resource_type)
    os.makedirs(resource_dir, exist_ok=True)

    # Save the resource locally
    resource_name = os.path.basename(parsed.path)
    if not resource_name:
        # A path ending in "/" has no basename; opening the directory itself
        # would raise, so derive a flat name from host and path instead.
        resource_name = (parsed.netloc + parsed.path).replace('/', '_')
    resource_path = os.path.join(resource_dir, resource_name)
    with open(resource_path, 'wb') as f:
        f.write(content)
    print(f"Downloaded {resource_url} to {resource_path}")

# Function to save the HTML page
def save_html(url, page_content, base_dir):
    """Write a page's HTML to a flat file inside ``base_dir``.

    The file name is the URL's host followed by its path with every ``/``
    replaced by ``_``. ``.html`` is appended unless the name already ends in
    ``.html`` or ``.htm``, so ``/index.html`` is not saved as
    ``index.html.html``. The query string and fragment are not part of the
    name.

    Args:
        url: Address the page was fetched from.
        page_content: HTML text to write (stored as UTF-8).
        base_dir: Output directory; created if missing. An empty string
            means the current working directory.
    """
    parsed_url = urlparse(url)
    file_name = parsed_url.netloc + parsed_url.path.replace('/', '_')
    if not file_name.lower().endswith((".html", ".htm")):
        file_name += ".html"
    file_path = os.path.join(base_dir, file_name)

    # os.makedirs("") raises, so only create a directory when there is one.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(page_content)
    print(f"Downloaded HTML for {url} to {file_path}")

# Main function to run the downloader
def download_website(url, crawl_depth, base_dir="offline_site"):
    """Crawl ``url`` and store the downloaded pages and resources locally.

    Args:
        url: Address of the start page (crawl depth 0).
        crawl_depth: Maximum link depth to follow from the start page.
        base_dir: Directory that receives the downloaded files. Defaults to
            "offline_site".
    """
    # A fresh set per run, so repeated calls do not share crawl state.
    visited_urls = set()
    download_page(url, 0, base_dir, visited_urls, crawl_depth)
    print("Download completed.")

# Example usage - Change these variables
url_to_download = "https://www.example.com"  # Replace with the website you want to download
crawl_depth = 2  # Replace with your desired crawl depth

# Run the download only when executed directly (script or notebook cell), so
# that importing this module does not start a crawl as a side effect.
if __name__ == "__main__":
    download_website(url_to_download, crawl_depth)



Downloaded HTML for https://www.example.com to offline_site/www.example.com.html
Downloaded HTML for https://www.iana.org/domains/example to offline_site/www.iana.org_domains_example.html
Downloaded https://www.iana.org/_img/2022/iana-logo-header.svg to offline_site/images/iana-logo-header.svg
Downloaded https://www.iana.org/_js/jquery.js to offline_site/js/jquery.js
Downloaded https://www.iana.org/_js/iana.js to offline_site/js/iana.js
Downloaded HTML for https://www.iana.org/ to offline_site/www.iana.org_.html
Downloaded HTML for https://www.iana.org/domains to offline_site/www.iana.org_domains.html
Downloaded HTML for https://www.iana.org/protocols to offline_site/www.iana.org_protocols.html
Downloaded HTML for https://www.iana.org/numbers to offline_site/www.iana.org_numbers.html
Downloaded https://www.iana.org/_img/2013.1/rir-map.svg to offline_site/images/rir-map.svg
Downloaded HTML for https://www.iana.org/about to offline_site/www.iana.org_about.html
Downloaded HTML for https:/