In [None]:
!pip install selenium
!apt-get update
!apt-get install chromium-browser -y

Current Page Only


In [None]:
!pip install selenium -q
!apt-get update -q
!apt-get install chromium-browser -y

# Enhanced Offline Webpage Archiver for Google Colab
# Handles lazy loading, loading screens, and dynamic content

import os
import re
import json
import time
import zipfile
import requests
from urllib.parse import urljoin, urlparse, quote
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import base64
from pathlib import Path
import shutil
from datetime import datetime

# Install required packages
!pip install selenium beautifulsoup4 requests pillow -q

# Setup Chrome WebDriver for Colab
!apt-get update >/dev/null 2>&1
!apt-get install -y chromium-browser >/dev/null 2>&1

class EnhancedWebpageArchiver:
    def __init__(self, base_url, wait_time=10, scroll_pause=2):
        """Record settings, create the archive folder tree and start the browser.

        Args:
            base_url: URL of the page to archive.
            wait_time: Timeout in seconds handed to the explicit page-load waits.
            scroll_pause: Seconds to sleep after each scroll step.
        """
        self.base_url = base_url
        self.domain = urlparse(base_url).netloc
        # The timestamp suffix keeps repeated runs against one domain apart.
        self.archive_dir = f"archive_{self.domain}_{int(time.time())}"
        self.assets_dir = os.path.join(self.archive_dir, "assets")
        self.downloaded_urls = set()
        self.url_mapping = {}
        self.wait_time = wait_time
        self.scroll_pause = scroll_pause

        # Archive root first, then the assets folder, then one bucket per
        # asset category.
        for folder in (self.archive_dir, self.assets_dir):
            os.makedirs(folder, exist_ok=True)
        for bucket in ("css", "js", "images", "fonts", "other"):
            os.makedirs(os.path.join(self.assets_dir, bucket), exist_ok=True)

        # Launch Selenium last so the directories exist before any page work.
        self.setup_webdriver()

    def setup_webdriver(self):
        """Start headless Chrome and store it on ``self.driver``.

        The browser is configured with a 1920x1080 window, relaxed web
        security (so cross-origin assets render), and flags that hide the
        usual automation markers.  Page loads time out after 60 seconds.
        """
        chrome_options = Options()
        for flag in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--disable-web-security",
            "--allow-running-insecure-content",
            "--disable-blink-features=AutomationControlled",
        ):
            chrome_options.add_argument(flag)
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # Content-setting values: 1 = allow, 2 = block.  Images stay enabled
        # because they are needed for the screenshots and lazy-load triggers;
        # notification prompts are blocked.
        prefs = {
            "profile.managed_default_content_settings.images": 1,
            "profile.default_content_setting_values.notifications": 2
        }
        chrome_options.add_experimental_option("prefs", prefs)

        self.driver = webdriver.Chrome(options=chrome_options)

        hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        try:
            # Register the override for every new document; a plain
            # execute_script only patches the blank start page and is lost as
            # soon as driver.get() navigates away.
            self.driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": hide_webdriver_js},
            )
        except Exception:
            # Driver without CDP support: fall back to the one-shot patch.
            self.driver.execute_script(hide_webdriver_js)

        self.driver.set_page_load_timeout(60)  # Generous timeout for heavy pages

    def wait_for_page_load(self):
        """Wait until the page, its loading indicators and its requests settle.

        In order: wait for ``<body>``, wait for ``document.readyState`` to be
        ``"complete"``, look for common loading-indicator selectors and wait
        (up to 15 s) for the first one found to become invisible, sleep 3 s
        for scripts, then call ``wait_for_network_idle``.  Failures are
        reported and swallowed so archiving continues with the current state.
        """
        print("⏳ Waiting for initial page load...")

        try:
            # Wait for basic DOM to be ready
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Wait for document ready state
            WebDriverWait(self.driver, self.wait_time).until(
                lambda driver: driver.execute_script("return document.readyState") == "complete"
            )

            print("✅ Basic page load complete")

            # Common class/id fragments used by loading screens and spinners
            loading_selectors = [
                '[class*="loading"]',
                '[class*="loader"]',
                '[class*="spinner"]',
                '[id*="loading"]',
                '[id*="loader"]',
                '.loading-screen',
                '.loader-container',
                '.spinner-container',
                '[class*="preloader"]',
                '[class*="load-mask"]',
                '.loading-overlay'
            ]

            print("🔍 Checking for loading screens...")
            for selector in loading_selectors:
                try:
                    loading_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    if loading_elements:
                        print(f"Found loading element: {selector}, waiting for it to disappear...")
                        WebDriverWait(self.driver, 15).until(
                            EC.invisibility_of_element_located((By.CSS_SELECTOR, selector))
                        )
                        print(f"✅ Loading element {selector} disappeared")
                        # One indicator going away is treated as "page ready".
                        break
                except Exception as e:
                    # Best effort: a timeout or lookup error on one selector
                    # must not stop the others.  ``Exception`` (not a bare
                    # except) lets KeyboardInterrupt abort the run.
                    print(f"⚠️ Skipping loading selector {selector}: {type(e).__name__}")
                    continue

            # Additional wait for JavaScript to finish
            print("⏳ Waiting for JavaScript to finish...")
            time.sleep(3)

            # Wait for any pending network requests (check for active XHR)
            self.wait_for_network_idle()

        except Exception as e:
            print(f"⚠️ Page load wait completed with some issues: {str(e)}")
            print("Continuing with current page state...")

    def wait_for_network_idle(self):
        """Pause until jQuery reports no active requests, then settle briefly.

        Pages without jQuery are treated as having zero pending requests.  Any
        error is printed and otherwise ignored.
        """
        try:
            # Ask the page how many jQuery AJAX calls are in flight (0 if the
            # page does not load jQuery).
            pending = self.driver.execute_script("""
                if (typeof jQuery !== 'undefined') {
                    return jQuery.active;
                }
                return 0;
            """)

            if pending > 0:
                print(f"⏳ Waiting for {pending} jQuery requests to complete...")

                def jquery_is_quiet(driver):
                    remaining = driver.execute_script(
                        "return typeof jQuery !== 'undefined' ? jQuery.active : 0"
                    )
                    return remaining == 0

                WebDriverWait(self.driver, 10).until(jquery_is_quiet)

            # Grace period for async work that jQuery does not track
            time.sleep(2)
            print("✅ Network appears idle")

        except Exception as e:
            print(f"⚠️ Network idle check completed: {str(e)}")

    def handle_lazy_loading(self, max_scrolls=500):
        """Scroll down the page in steps so lazily loaded content is fetched.

        The page is scrolled 500 px at a time, pausing ``self.scroll_pause``
        seconds per step.  On reaching the bottom the method waits 2 s and
        stops if the page height did not grow.  Afterwards it scrolls back to
        the top and calls ``trigger_lazy_images``.

        Args:
            max_scrolls: Upper bound on scroll steps.  Protects against
                infinite-scroll pages whose height keeps growing, which would
                otherwise keep this loop running forever.
        """
        print("🔄 Handling lazy loading by scrolling...")
        scroll_increment = 500

        try:
            scroll_position = 0

            for _ in range(max_scrolls):
                # Scroll down one increment
                scroll_position += scroll_increment
                self.driver.execute_script(f"window.scrollTo(0, {scroll_position});")
                time.sleep(self.scroll_pause)

                current_height = self.driver.execute_script("return document.body.scrollHeight")
                window_height = self.driver.execute_script("return window.innerHeight")

                if scroll_position < current_height - window_height:
                    # Not at the bottom yet
                    continue

                # At the bottom: give lazy content a moment, then re-measure
                time.sleep(2)
                new_height = self.driver.execute_script("return document.body.scrollHeight")

                if new_height == current_height:
                    # No more content loaded, we're done
                    break
                print(f"📈 Page height increased to {new_height}px, continuing...")
            else:
                print(f"⚠️ Stopped scrolling after {max_scrolls} steps; page may still be growing")

            # Scroll back to top
            self.driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(1)

            # Try to trigger any remaining lazy-loaded images
            self.trigger_lazy_images()

            print("✅ Lazy loading handling complete")

        except Exception as e:
            print(f"⚠️ Lazy loading handling completed with issues: {str(e)}")

    def trigger_lazy_images(self):
        """Scroll to and hover over elements that carry lazy-loading markers.

        Each element matching one of the lazy-loading selectors is scrolled
        into view and hovered, with a 0.5 s pause after each action, followed
        by a final 3 s wait.  Per-element failures are skipped; any other
        error is printed and swallowed.
        """
        try:
            # Common lazy loading attributes
            lazy_selectors = [
                'img[data-src]',
                'img[data-lazy]',
                'img[data-original]',
                'img[loading="lazy"]',
                'img[data-srcset]',
                '[data-bg]',
                '[data-background]'
            ]

            for selector in lazy_selectors:
                for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                    try:
                        # Scroll to element to trigger loading
                        self.driver.execute_script("arguments[0].scrollIntoView();", element)
                        time.sleep(0.5)

                        # Try to trigger loading by hovering
                        ActionChains(self.driver).move_to_element(element).perform()
                        time.sleep(0.5)

                    except Exception:
                        # Stale or non-interactable element: move on.  Using
                        # ``Exception`` keeps Ctrl-C able to stop the run.
                        continue

            # Final wait for images to load
            time.sleep(3)
            print("✅ Lazy image loading triggered")

        except Exception as e:
            print(f"⚠️ Lazy image triggering completed: {str(e)}")

    def wait_for_specific_content(self, content_selectors=None):
        """Wait for one of the given selectors to yield an element with text.

        Args:
            content_selectors: CSS selectors to try in order.  When empty or
                ``None`` a default list of common content containers is used.

        Returns:
            ``True`` as soon as a selector matches an element whose text is
            non-blank, ``False`` if none does (each selector is given 5 s).
        """
        if not content_selectors:
            # Default selectors for common content
            content_selectors = [
                'main',
                'article',
                '.content',
                '#content',
                '.main-content',
                '.post-content',
                '.article-body'
            ]

        print("🎯 Waiting for main content to appear...")

        for selector in content_selectors:
            try:
                element = WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                if element and element.text.strip():
                    print(f"✅ Found content with selector: {selector}")
                    return True
            except Exception:
                # Timeout or invalid selector: try the next one.  Not a bare
                # except, so KeyboardInterrupt still propagates.
                continue

        print("⚠️ Specific content selectors not found, proceeding anyway...")
        return False

    def sanitize_filename(self, filename):
        """Return *filename* made safe for the file system.

        Reserved characters are replaced by ``_`` and the result is cut to at
        most 200 characters; an empty result becomes ``"unnamed_file"``.
        """
        cleaned = re.sub(r'[<>:"/\\|?*]', '_', filename)[:200]
        if cleaned:
            return cleaned
        return "unnamed_file"

    def get_file_extension(self, url, content_type=None):
        """Pick a file extension for *url*.

        The URL path wins when it ends in a known asset suffix (compared
        case-insensitively, returned lower-cased).  Otherwise the content type
        is matched against a table of substrings.

        Args:
            url: Asset URL; only its path component is inspected.
            content_type: Optional ``Content-Type`` header value.

        Returns:
            An extension including the leading dot; ``'.html'`` when nothing
            matches.
        """
        known_suffixes = (
            '.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
            '.woff', '.woff2', '.ttf', '.eot', '.ico', '.pdf', '.mp4', '.mp3',
        )
        path = urlparse(url).path.lower()
        if path.endswith(known_suffixes):
            return os.path.splitext(path)[1]

        if content_type:
            content_type = content_type.lower()
            # Order matters: the first substring hit wins, so the more
            # specific "woff2" must be tested before "woff".
            mime_markers = (
                ('css', '.css'),
                ('javascript', '.js'),
                ('image/png', '.png'),
                ('image/jpeg', '.jpg'),
                ('image/gif', '.gif'),
                ('image/svg', '.svg'),
                ('image/webp', '.webp'),
                ('image/x-icon', '.ico'),
                ('image/vnd.microsoft.icon', '.ico'),
                ('woff2', '.woff2'),
                ('woff', '.woff'),
                ('font/ttf', '.ttf'),
            )
            for marker, extension in mime_markers:
                if marker in content_type:
                    return extension

        return '.html'

    def download_asset(self, url, referer_url=None):
        """Download an asset and return the local path"""
        if url in self.downloaded_urls:
            return self.url_mapping.get(url)

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            if referer_url:
                headers['Referer'] = referer_url

            response = requests.get(url, headers=headers, timeout=30, stream=True)
            response.raise_for_status()

            # Determine file type and directory
            content_type = response.headers.get('content-type', '').lower()
            extension = self.get_file_extension(url, content_type)

            if extension in ['.css']:
                subdir = "css"
            elif extension in ['.js']:
                subdir = "js"
            elif extension in ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.ico']:
                subdir = "images"
            elif extension in ['.woff', '.woff2', '.ttf', '.eot']:
                subdir = "fonts"
            else:
                subdir = "other"

            # Create filename
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path) or "index"
            filename = self.sanitize_filename(filename)

            if not filename.endswith(extension):
                filename += extension

            # Ensure unique filename
            counter = 1
            original_filename = filename
            while os.path.exists(os.path.join(self.assets_dir, subdir, filename)):
                name, ext = os.path.splitext(original_filename)
                filename = f"{name}_{counter}{ext}"
                counter += 1

            local_path = os.path.join(self.assets_dir, subdir, filename)

            # Download file
            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            # Store mapping
            relative_path = f"assets/{subdir}/{filename}"
            self.downloaded_urls.add(url)
            self.url_mapping[url] = relative_path

            print(f"Downloaded: {url} -> {relative_path}")

            # If it's CSS, process it for additional assets
            if extension == '.css':
                self.process_css_file(local_path, url)

            return relative_path

        except Exception as e:
            print(f"Failed to download {url}: {str(e)}")
            return None

    def process_css_file(self, css_path, css_url):
        """Process CSS file to download referenced assets"""
        try:
            with open(css_path, 'r', encoding='utf-8', errors='ignore') as f:
                css_content = f.read()

            # Find URLs in CSS
            url_pattern = r'url\(["\']?(.*?)["\']?\)'
            urls = re.findall(url_pattern, css_content, re.IGNORECASE)

            modified_css = css_content

            for asset_url in urls:
                # Skip data URLs
                if asset_url.startswith('data:'):
                    continue

                # Convert to absolute URL
                absolute_url = urljoin(css_url, asset_url)

                # Download asset
                local_path = self.download_asset(absolute_url, css_url)
                if local_path:
                    # Update CSS content
                    modified_css = modified_css.replace(f'url({asset_url})', f'url(../{local_path})')
                    modified_css = modified_css.replace(f'url("{asset_url}")', f'url("../{local_path}")')
                    modified_css = modified_css.replace(f"url('{asset_url}')", f"url('../{local_path}')")

            # Write modified CSS
            with open(css_path, 'w', encoding='utf-8') as f:
                f.write(modified_css)

        except Exception as e:
            print(f"Error processing CSS file {css_path}: {str(e)}")

    def process_html_content(self, html_content, base_url):
        """Download every asset the HTML references and point the markup at it.

        Handles asset-bearing tag attributes (including ``srcset`` lists and
        the ``data-src``/``data-original`` lazy-loading attributes), inline
        ``style`` attributes and ``<style>`` elements.  References whose
        download fails keep their original value.

        Args:
            html_content: HTML source of the page.
            base_url: URL the page was loaded from; relative references are
                resolved against it.

        Returns:
            The rewritten HTML as a string.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        css_url_re = re.compile(r'url\(["\']?(.*?)["\']?\)', re.IGNORECASE)
        # References that are not fetchable resources.
        skipped_prefixes = ('data:', '#', 'javascript:', 'about:', 'blob:')

        def localize_css_urls(css_text):
            """Download url(...) targets in *css_text* and substitute local paths."""
            for asset_url in css_url_re.findall(css_text):
                # An empty url() must be skipped: str.replace('', x) would
                # insert x between every character of the style text.
                if not asset_url or asset_url.startswith(skipped_prefixes):
                    continue
                local = self.download_asset(urljoin(base_url, asset_url), base_url)
                if local:
                    css_text = css_text.replace(asset_url, local)
            return css_text

        def localize_srcset(srcset_value):
            """Rewrite a srcset list; candidates that fail to download are dropped."""
            rewritten = []
            for candidate in srcset_value.split(','):
                parts = candidate.split()
                if not parts:
                    # Stray or trailing comma produces an empty candidate.
                    continue
                local = self.download_asset(urljoin(base_url, parts[0]), base_url)
                if local:
                    rewritten.append(' '.join([local] + parts[1:]))
            return ', '.join(rewritten)

        # (tag, attribute, accepted rel values or None)
        asset_tags = [
            ('link', 'href', ['stylesheet', 'icon', 'shortcut icon']),
            ('script', 'src', None),
            ('img', 'src', None),
            ('img', 'srcset', None),  # otherwise browsers prefer the online srcset
            ('img', 'data-src', None),  # Lazy loaded images
            ('img', 'data-original', None),  # Another lazy loading pattern
            ('source', 'src', None),
            ('source', 'srcset', None),
            ('video', 'src', None),
            ('audio', 'src', None),
            ('embed', 'src', None),
            ('object', 'data', None),
            ('iframe', 'src', None),
        ]

        for tag_name, attr_name, rel_types in asset_tags:
            for tag in soup.find_all(tag_name):
                # Check rel attribute if specified
                if rel_types and tag_name == 'link':
                    rel = tag.get('rel', [])
                    if isinstance(rel, str):
                        rel = [rel]
                    if not any(r.lower() in rel_types for r in rel):
                        continue

                asset_url = tag.get(attr_name)
                if not asset_url or asset_url.startswith(skipped_prefixes):
                    continue

                # Handle srcset attribute (multiple URLs)
                if attr_name == 'srcset':
                    new_srcset = localize_srcset(asset_url)
                    if new_srcset:
                        tag[attr_name] = new_srcset
                    continue

                local_path = self.download_asset(urljoin(base_url, asset_url), base_url)
                if not local_path:
                    continue
                tag[attr_name] = local_path

                # For lazy-loaded images, also set the src attribute so the
                # image shows without the page's lazy-loading script.
                if tag_name == 'img' and attr_name in ('data-src', 'data-original'):
                    tag['src'] = local_path

        # Process inline styles
        for tag in soup.find_all(style=True):
            tag['style'] = localize_css_urls(tag['style'])

        # Process style tags
        for style_tag in soup.find_all('style'):
            if style_tag.string:
                style_tag.string = localize_css_urls(style_tag.string)

        return str(soup)

    def take_screenshot(self):
        """Write ``screenshot.png`` and ``screenshot_full.png`` to the archive.

        The first image is the current viewport.  For the second the window is
        resized to the document's scroll size (at least 1920 px wide), then
        restored to 1920x1080.  Errors are printed, not raised.
        """
        try:
            print("📸 Taking screenshots...")

            driver = self.driver
            viewport_png = os.path.join(self.archive_dir, "screenshot.png")
            full_page_png = os.path.join(self.archive_dir, "screenshot_full.png")

            # Viewport capture first, while the window still has its
            # original size.
            driver.save_screenshot(viewport_png)

            # Measure the document, then stretch the window over all of it.
            page_height = driver.execute_script("return document.body.scrollHeight")
            page_width = driver.execute_script("return document.body.scrollWidth")
            driver.set_window_size(max(1920, page_width), page_height)
            time.sleep(2)

            driver.save_screenshot(full_page_png)

            # Back to the standard window size
            driver.set_window_size(1920, 1080)

            print(f"✅ Screenshots saved: {viewport_png}, {full_page_png}")

        except Exception as e:
            print(f"⚠️ Failed to take screenshot: {str(e)}")

    def create_archive(self, content_selectors=None):
        """Run the whole archive pipeline for ``self.base_url``.

        Loads the page, waits for it to settle, triggers lazy loading, takes
        screenshots, downloads and relinks assets, then writes ``index.html``,
        ``metadata.json`` and a ZIP of the archive folder.  The browser is
        always closed afterwards, so an instance can be used only once.

        Args:
            content_selectors: Optional CSS selectors to wait for before
                scrolling; skipped when empty or ``None``.

        Returns:
            ``(archive_dir, zip_filename)`` on success, ``(None, None)`` when
            any step raises.
        """
        print(f"🚀 Starting enhanced archive of: {self.base_url}")
        print(f"📁 Archive directory: {self.archive_dir}")

        try:
            # Load page with Selenium first for dynamic content
            print("🌐 Loading page with Selenium...")
            self.driver.get(self.base_url)

            # Enhanced waiting mechanism
            self.wait_for_page_load()

            # Wait for specific content if selectors provided
            if content_selectors:
                self.wait_for_specific_content(content_selectors)

            # Handle lazy loading
            self.handle_lazy_loading()

            # Snapshot the DOM before the screenshot step resizes the window
            final_html = self.driver.page_source

            # Take screenshot after everything is loaded
            self.take_screenshot()

            # Process HTML and download assets
            print("⚙️ Processing HTML and downloading assets...")
            processed_html = self.process_html_content(final_html, self.base_url)

            # Save main HTML file
            html_filename = os.path.join(self.archive_dir, "index.html")
            with open(html_filename, 'w', encoding='utf-8') as f:
                f.write(processed_html)

            # Page title: ask the live DOM first, fall back to the saved HTML
            page_title = "No Title"
            try:
                title_element = self.driver.find_element(By.TAG_NAME, "title")
                page_title = title_element.get_attribute("text") or title_element.text or "No Title"
            except Exception:
                # No <title> element (or driver error).  ``Exception`` rather
                # than a bare except so Ctrl-C is not swallowed here.
                soup = BeautifulSoup(final_html, 'html.parser')
                if soup.title:
                    page_title = soup.title.string or "No Title"

            metadata = {
                'url': self.base_url,
                'archived_at': datetime.now().isoformat(),
                'title': page_title,
                'total_assets': len(self.downloaded_urls),
                'page_height': self.driver.execute_script("return document.body.scrollHeight"),
                'page_width': self.driver.execute_script("return document.body.scrollWidth"),
                'wait_time_used': self.wait_time,
                'scroll_pause_used': self.scroll_pause,
                'assets': list(self.downloaded_urls)
            }

            metadata_file = os.path.join(self.archive_dir, "metadata.json")
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Create ZIP archive; entries are stored relative to the archive
            # folder so the ZIP unpacks without the timestamped directory name.
            print("📦 Creating ZIP archive...")
            zip_filename = f"{self.archive_dir}.zip"
            with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _dirs, files in os.walk(self.archive_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arc_path = os.path.relpath(file_path, self.archive_dir)
                        zipf.write(file_path, arc_path)

            print("\n🎉 Archive completed successfully!")
            print(f"📁 Archive folder: {self.archive_dir}")
            print(f"📦 ZIP file: {zip_filename}")
            print("🖼️  Screenshots: screenshot.png, screenshot_full.png")
            print("📄 Main file: index.html")
            print(f"📊 Total assets downloaded: {len(self.downloaded_urls)}")
            print(f"📏 Page dimensions: {metadata['page_width']}x{metadata['page_height']}px")

            return self.archive_dir, zip_filename

        except Exception as e:
            print(f"❌ Error creating archive: {str(e)}")
            return None, None
        finally:
            # setup_webdriver may have failed before self.driver was assigned
            if hasattr(self, 'driver'):
                self.driver.quit()

# Enhanced input section: prompt for the URL and optional settings, run the
# archiver, then print a summary and a shortened listing of the result.
print("🌐 Enhanced Offline Webpage Archiver")
print("📋 Features: Lazy loading support, loading screen detection, dynamic content")
print("=" * 70)

url = input("Enter the URL to archive: ").strip()

if url:
    # Assume HTTPS when the scheme is omitted
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    # Get optional parameters
    print("\n⚙️ Optional Settings:")
    wait_time_input = input("Max wait time for page load (default 10s): ").strip()
    # isdecimal() accepts exactly the digit strings int() can parse
    wait_time = int(wait_time_input) if wait_time_input.isdecimal() else 10

    scroll_pause_input = input("Pause between scrolls (default 2s): ").strip()
    # Strip at most ONE dot before the digit check: removing every dot let
    # input such as "1.2.3" through, which then crashed float().
    if scroll_pause_input.replace('.', '', 1).isdecimal():
        scroll_pause = float(scroll_pause_input)
    else:
        scroll_pause = 2.0

    content_selectors_input = input("Specific content selectors to wait for (comma-separated, optional): ").strip()
    # Drop empty entries ("a,,b" or a trailing comma); an empty string is not
    # a valid CSS selector.
    content_selectors = [s.strip() for s in content_selectors_input.split(',') if s.strip()] or None

    print(f"\n🚀 Starting enhanced archive process for: {url}")
    print(f"⏰ Wait time: {wait_time}s, Scroll pause: {scroll_pause}s")

    # Create archiver and start process
    archiver = EnhancedWebpageArchiver(url, wait_time, scroll_pause)
    archive_dir, zip_file = archiver.create_archive(content_selectors)

    if archive_dir:
        print("\n📋 Archive Summary:")
        print("   • Open 'index.html' in the archive folder to view offline")
        print("   • All assets (CSS, JS, images, fonts) are included")
        print("   • Lazy-loaded content has been triggered and captured")
        print("   • Loading screens were detected and waited for")
        print("   • Screenshots show the final loaded state")
        print("   • ZIP file created for easy sharing")

        # List contents as an indented tree, at most 10 files per folder
        print("\n📂 Archive contents:")
        for root, dirs, files in os.walk(archive_dir):
            level = root.replace(archive_dir, '').count(os.sep)
            indent = ' ' * 2 * level
            print(f"{indent}{os.path.basename(root)}/")
            subindent = ' ' * 2 * (level + 1)
            for file in files[:10]:  # Limit display
                print(f"{subindent}{file}")
            if len(files) > 10:
                print(f"{subindent}... and {len(files) - 10} more files")
    else:
        print("❌ Archive process failed. Please check the URL and try again.")
else:
    print("❌ Please provide a valid URL.")

Full Page

In [None]:
#@title # Webpage Archiver
#@markdown Enter a URL to save it as a complete offline archive with all assets, rewritten links, and screenshots.
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from zipfile import ZipFile
from IPython.display import HTML, display
import threading
import queue

# Configuration
# The trailing "#@param {...}" markers are Colab form annotations; the plain
# "#" text after them describes each setting.
# URL is empty by default — presumably it has to be filled in before the cell
# is run (TODO confirm against the code that starts the crawl).
# MAX_DEPTH: process_page() returns without doing anything once its depth
# argument is greater than this value.
URL = ""  #@param {type:"string"}
MAX_DEPTH = 1  #@param {type:"integer"}
DELAY = 1  #@param {type:"number"}  # Delay between requests in seconds
WAIT_TIME = 3  #@param {type:"number"}  # Time to wait for page to load in seconds
INCLUDE_EXTERNAL = False  #@param {type:"boolean"}  # Include external domains
SCREENSHOT_WIDTH = 1280  #@param {type:"integer"}  # Screenshot width in pixels
SCREENSHOT_HEIGHT = 900  #@param {type:"integer"}  # Screenshot height in pixels

# Setup directories
!mkdir -p /content/archive
!mkdir -p /content/archive/assets
!mkdir -p /content/archive/screenshots

# Setup Chrome options for Selenium: headless, sandbox disabled, /dev/shm
# usage disabled, and a window sized from the SCREENSHOT_* settings so that
# save_screenshot() produces images of that size.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument(f'--window-size={SCREENSHOT_WIDTH},{SCREENSHOT_HEIGHT}')

# Initialize the browser: one module-level instance, used by process_page()
browser = webdriver.Chrome(options=chrome_options)

# Track visited URLs to avoid duplicates (process_page() checks and fills it)
visited_urls = set()
# Work queue of (asset_url, asset_path, folder) tuples consumed by
# download_assets(); a None item tells that worker to stop.
assets_queue = queue.Queue()
# NOTE(review): not referenced in the code visible here — presumably a queue
# of pages still to crawl; confirm against the rest of the cell.
pages_to_process = queue.Queue()

# Function to sanitize filenames
def sanitize_filename(filename):
    """Return *filename* with the characters ``\\ / * ? : " < > |`` removed."""
    forbidden_chars = re.compile(r'[\\/*?:"<>|]')
    return forbidden_chars.sub("", filename)

# Function to get domain from URL
def get_domain(url):
    """Return the ``scheme://host/`` prefix of *url* (always ends with ``/``)."""
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}/"

# Function to download assets
def download_assets():
    """Worker loop: download queued assets until a ``None`` sentinel arrives.

    Each queue item is an ``(asset_url, asset_path, asset_folder)`` tuple; the
    response body is streamed to ``asset_path`` (parent folders are created as
    needed).  ``task_done()`` is called exactly once for every item taken,
    including the sentinel, so ``assets_queue.join()`` cannot hang on a
    failed item.
    """
    while True:
        asset_info = assets_queue.get()
        try:
            if asset_info is None:  # Sentinel value to stop the thread
                break

            # The third field (folder label) is not needed for the download.
            asset_url, asset_path, _asset_folder = asset_info
            try:
                # Context manager releases the streamed connection in all cases
                with requests.get(asset_url, stream=True, timeout=10) as response:
                    if response.status_code != 200:
                        # Previously dropped silently, leaving broken links
                        # in the archive with no hint why.
                        print(f"Skipped asset {asset_url}: HTTP {response.status_code}")
                        continue

                    # Create directory if it doesn't exist
                    os.makedirs(os.path.dirname(asset_path), exist_ok=True)

                    # Save the asset
                    with open(asset_path, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                print(f"Downloaded asset: {asset_url}")
            except Exception as e:
                print(f"Error downloading asset {asset_url}: {str(e)}")
        finally:
            assets_queue.task_done()

# (asset_url, asset_path) pairs already handed to the download workers, so an
# asset referenced many times is queued only once per destination.
_queued_assets = set()

# One rule per asset family:
# (tag names, extra find_all() filters, URL attribute, assets sub-folder,
#  fallback file-name stem, fallback file-name extension)
_ASSET_RULES = (
    (('link',), {'rel': 'stylesheet'}, 'href', 'css', 'style', '.css'),
    (('script',), {}, 'src', 'js', 'script', '.js'),
    (('img',), {}, 'src', 'images', 'image', '.png'),
    (('video', 'audio', 'source'), {}, 'src', 'media', 'media', ''),
)


def _local_page_path(parsed_url):
    """Return the file path under /content/archive where a page is saved."""
    page_path = parsed_url.path
    if page_path == '' or page_path.endswith('/'):
        page_path += 'index.html'
    if not page_path.endswith('.html'):
        page_path += '.html'
    # Guarantee exactly one leading slash (an empty URL path used to glue
    # "index.html" straight onto the host name) and collapse "." / ".."
    # segments so the result cannot leave the host's folder.
    page_path = os.path.normpath('/' + page_path.lstrip('/'))
    return f"/content/archive/{sanitize_filename(parsed_url.netloc)}{page_path}"


def _rewrite_asset(tag, attr, page_url, site_dir, folder, fallback_name, page_dir):
    """Point ``tag[attr]`` at a local copy of the asset and queue its download."""
    from urllib.parse import quote, unquote

    reference = tag.get(attr)
    if not reference:
        return

    asset_url = urljoin(page_url, reference)
    parts = urlparse(asset_url)
    # data:, blob:, javascript: and similar references cannot be downloaded;
    # leave them exactly as they appear in the page.
    if parts.scheme not in ('http', 'https'):
        return

    # Name the file after the URL *path* only: a query string or fragment in
    # the name would never match the reference the browser resolves locally.
    filename = sanitize_filename(unquote(os.path.basename(parts.path)))
    if filename in ('', '.', '..'):
        filename = fallback_name

    asset_path = f"/content/archive/assets/{site_dir}/{folder}/{filename}"

    # Express the reference relative to the saved page so it also resolves
    # for pages stored in nested folders, and percent-encode it for HTML.
    local_ref = quote(os.path.relpath(asset_path, page_dir).replace(os.sep, '/'))
    if parts.fragment:
        # Keep fragments such as "sprite.svg#icon" working.
        local_ref += '#' + parts.fragment
    tag[attr] = local_ref

    # NOTE(review): two different URLs with the same file name still map to
    # the same local file (the later download overwrites the earlier one).
    queue_key = (asset_url, asset_path)
    if queue_key not in _queued_assets:
        _queued_assets.add(queue_key)
        assets_queue.put((asset_url, asset_path, folder))


def _enqueue_links(soup, page_url, depth):
    """Queue the pages linked from *soup* for crawling at ``depth + 1``."""
    domain = get_domain(page_url)
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        if not href or href.startswith('#'):
            continue

        parts = urlparse(urljoin(page_url, href))
        # Skips javascript:, mailto:, tel: and other links a browser page
        # load cannot archive.
        if parts.scheme not in ('http', 'https'):
            continue

        # A fragment only selects a position inside the same document.
        new_url = parts._replace(fragment='').geturl()
        if new_url in visited_urls:
            continue

        # Only follow links from the same domain unless INCLUDE_EXTERNAL is True
        if get_domain(new_url) == domain or INCLUDE_EXTERNAL:
            pages_to_process.put((new_url, depth + 1))


def process_page(url, depth=0):
    """Archive one page: screenshot it, save its HTML and queue its assets.

    The page is loaded in the shared Selenium ``browser``, given
    ``WAIT_TIME`` seconds to settle, screenshotted into
    ``/content/archive/screenshots`` and parsed with BeautifulSoup.
    Stylesheet, script, image and media references are rewritten to relative
    paths under ``/content/archive/assets/<host>/`` and queued on
    ``assets_queue`` for the download workers. When ``depth`` is below
    ``MAX_DEPTH``, linked pages are put on ``pages_to_process``. The modified
    HTML is written under ``/content/archive/<host>/`` and the function then
    sleeps ``DELAY`` seconds.

    Args:
        url: Absolute URL of the page to archive.
        depth: Link distance from the start URL.

    Returns:
        None. Nothing is done when ``url`` was already visited or ``depth``
        exceeds ``MAX_DEPTH``. Errors are printed, not raised.
    """
    if url in visited_urls or depth > MAX_DEPTH:
        return

    visited_urls.add(url)
    print(f"Processing: {url} (Depth: {depth})")

    try:
        # Load the page with Selenium for screenshot and JavaScript rendering
        browser.get(url)
        time.sleep(WAIT_TIME)

        # Take screenshot
        parsed_url = urlparse(url)
        screenshot_path = f"/content/archive/screenshots/{sanitize_filename(parsed_url.netloc + parsed_url.path)}.png"
        browser.save_screenshot(screenshot_path)
        print(f"Saved screenshot: {screenshot_path}")

        # Parse the DOM as it stands after JavaScript execution
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        # Create a local path for the HTML file
        local_path = _local_page_path(parsed_url)
        page_dir = os.path.dirname(local_path)
        os.makedirs(page_dir, exist_ok=True)

        # Rewrite asset references and queue the downloads
        site_dir = sanitize_filename(parsed_url.netloc)
        for tag_names, filters, attr, folder, stem, extension in _ASSET_RULES:
            fallback_name = f"{stem}_{len(visited_urls)}{extension}"
            for tag in soup.find_all(list(tag_names), **filters):
                _rewrite_asset(tag, attr, url, site_dir, folder, fallback_name, page_dir)

        # Process internal links for crawling
        if depth < MAX_DEPTH:
            _enqueue_links(soup, url, depth)

        # Save the modified HTML
        with open(local_path, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print(f"Saved page: {local_path}")

        # Add delay to avoid overwhelming the server
        time.sleep(DELAY)

    except Exception as e:
        print(f"Error processing {url}: {str(e)}")

# Launch the pool of asset download workers
num_download_threads = 5
download_threads = [
    threading.Thread(target=download_assets)
    for _ in range(num_download_threads)
]
for worker in download_threads:
    worker.start()

# Seed the crawl with the start URL at depth 0
pages_to_process.put((URL, 0))

# Work through the page queue; process_page() adds newly found links to it
try:
    while not pages_to_process.empty():
        next_url, next_depth = pages_to_process.get()
        process_page(next_url, next_depth)
        pages_to_process.task_done()

        # Brief pause between queue checks
        time.sleep(0.1)

        # Hard cap on visited URLs so the crawl cannot run forever
        if len(visited_urls) > 1000:
            print("Reached maximum number of URLs (1000). Stopping.")
            break
finally:
    # One stop signal per worker, queued behind any pending assets
    for _ in download_threads:
        assets_queue.put(None)

    # Block until every worker has drained the queue and exited
    for worker in download_threads:
        worker.join()

    # Shut down the Selenium browser
    browser.quit()

# Bundle everything under /content/archive into one ZIP named after the host
domain_name = sanitize_filename(urlparse(URL).netloc)
zip_path = f"/content/{domain_name}_archive.zip"
with ZipFile(zip_path, 'w') as zipf:
    for folder, _subfolders, filenames in os.walk("/content/archive"):
        for filename in filenames:
            absolute_path = os.path.join(folder, filename)
            # Store entries relative to the archive root
            zipf.write(absolute_path, os.path.relpath(absolute_path, "/content/archive"))

print(f"Archive created: {zip_path}")

# Hand the ZIP to the Colab file-download helper
from google.colab import files
files.download(zip_path)

# Render a short HTML summary in the notebook output
summary_html = f"""
<h3>Archive Summary</h3>
<p><strong>Original URL:</strong> {URL}</p>
<p><strong>Pages Archived:</strong> {len(visited_urls)}</p>
<p><strong>Archive File:</strong> {domain_name}_archive.zip</p>
<p>The archive includes:</p>
<ul>
  <li>All HTML pages with rewritten links</li>
  <li>All assets (CSS, JavaScript, images, etc.)</li>
  <li>Screenshots of each page</li>
</ul>
"""
display(HTML(summary_html))