In [11]:
import requests
from urllib.parse import urlparse, urljoin
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from tqdm import tqdm

def guess_sitemap_url(base_url):
    """Locate a site's sitemap by probing progressively weaker sources.

    The input is first reduced to ``scheme://netloc``. Three strategies are
    then tried in order, stopping at the first that yields a result:

    1. well-known sitemap file names at the site root,
    2. a sitemap reference in ``robots.txt``,
    3. a sitemap link on the site's main page.

    Args:
        base_url: Any URL on the target site; path and query are discarded.

    Returns:
        The first sitemap URL found, or ``None`` when every strategy fails.
    """
    parts = urlparse(base_url)
    base_url = parts.scheme + "://" + parts.netloc

    # Strategy 1: conventional file names, probed lazily so we stop at the first hit.
    well_known = ('sitemap.xml', 'sitemap_index.xml', 'sitemap1.xml', 'sitemap_index.xml.gz')
    candidates = (urljoin(base_url, name) for name in well_known)
    reachable = next((candidate for candidate in candidates if check_url(candidate)), None)
    if reachable is not None:
        return reachable

    # Strategy 2, then strategy 3; `or` keeps the HTML lookup from running
    # when robots.txt already produced a sitemap.
    from_robots = parse_robots_txt(urljoin(base_url, 'robots.txt'))
    return from_robots or find_sitemap_in_html(base_url) or None

def check_url(url):
    """Report whether a GET request to ``url`` answers with HTTP 200.

    Network failures (DNS errors, refused connections, timeouts, malformed
    URLs) are treated as "not accessible" rather than propagated, so callers
    probing several candidate URLs can simply move on to the next one.

    Args:
        url: The URL to probe.

    Returns:
        ``True`` if the request completed with status code 200, else ``False``.
    """
    try:
        # Bounded wait: without a timeout a stalled server would block forever.
        response = requests.get(url, timeout=5)
    except requests.exceptions.RequestException:
        return False
    return response.status_code == 200

def parse_robots_txt(robots_url):
    """Find the first reachable sitemap URL declared in a robots.txt file.

    Each line is split on its first colon only, so the ``://`` inside the
    sitemap URL itself is preserved. The directive name is matched
    case-insensitively.

    Args:
        robots_url: Full URL of the robots.txt file to download.

    Returns:
        The first ``Sitemap:`` URL for which ``check_url`` succeeds, or
        ``None`` if the file is not served with status 200, declares no
        reachable sitemap, or any error occurs (the error is printed).
    """
    try:
        response = requests.get(robots_url, timeout=5)
        if response.status_code == 200:
            for line in response.text.splitlines():
                # Split once: "Sitemap: https://host/s.xml" -> ("Sitemap", ":", " https://host/s.xml")
                directive, separator, value = line.partition(':')
                if not separator or directive.strip().lower() != 'sitemap':
                    continue
                sitemap_url = value.strip()
                if sitemap_url and check_url(sitemap_url):
                    return sitemap_url
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller fall back.
        print(f"Failed to parse robots.txt: {str(e)}")
    return None

def find_sitemap_in_html(url):
    """Scan a page's anchor tags for a link that points at a sitemap.

    Every ``<a href=...>`` whose href contains "sitemap" (case-insensitive)
    is resolved against ``url`` and probed with ``check_url``; the first one
    that responds successfully is returned.

    Args:
        url: URL of the page to download and inspect.

    Returns:
        The absolute URL of the first reachable sitemap link, or ``None`` if
        there is none or any error occurs (the error is printed).
    """
    try:
        # Bounded wait, matching fetch_url_content, so a stalled server cannot hang the run.
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            if 'sitemap' not in href.lower():
                continue
            # hrefs are often relative ("/sitemap.xml"); resolve against the page URL.
            full_sitemap_url = urljoin(url, href)
            if check_url(full_sitemap_url):
                return full_sitemap_url
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller fall back.
        print(f"Failed to find sitemap in HTML: {str(e)}")
    return None

def extract_urls_from_sitemap(sitemap_url):
    """Collect the page URLs listed in a sitemap, following sitemap indexes.

    A plain ``<urlset>`` document yields its ``<url><loc>`` entries. A
    ``<sitemapindex>`` document lists further sitemaps in
    ``<sitemap><loc>`` entries; those are downloaded in turn (each at most
    once) and their page URLs are appended in document order.

    Args:
        sitemap_url: URL of a sitemap or sitemap index in the sitemaps.org
            0.9 XML namespace.

    Returns:
        A list of page URL strings with surrounding whitespace removed.
        Empty ``<loc>`` elements are skipped.

    Raises:
        requests.exceptions.RequestException: If a download fails.
        xml.etree.ElementTree.ParseError: If a response is not well-formed XML.
    """
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    def _loc_texts(root, path):
        # <loc> values are frequently padded with newlines/indentation, and an
        # empty element has text None.
        texts = ((loc.text or '').strip() for loc in root.findall(path, namespaces=ns))
        return [text for text in texts if text]

    urls = []
    pending = [sitemap_url]
    visited = set()
    while pending:
        current = pending.pop(0)
        if current in visited:
            # Guards against an index that (directly or indirectly) lists itself.
            continue
        visited.add(current)

        response = requests.get(current, timeout=5)
        # Parse the raw bytes so the parser honours the encoding named in the
        # XML declaration instead of assuming UTF-8.
        root = ET.fromstring(response.content)

        urls.extend(_loc_texts(root, './/ns:url/ns:loc'))
        pending.extend(_loc_texts(root, './/ns:sitemap/ns:loc'))
    return urls

def fetch_url_content(url):
    """Download a page and flatten its main content into one Markdown-ish string.

    Three kinds of content are collected, in this order:

    * text of ``p``/``h1``/``h2``/``h3``/``li`` elements found inside
      ``article``, ``section`` or ``div.main-content`` containers;
    * code from ``pre``/``code`` elements, wrapped in fenced code blocks
      labelled with the element's first CSS class (``python`` if it has none);
    * ``img``/``video`` sources and links to ``.pdf``/``.zip``/``.docx``
      files, rendered as Markdown image/link syntax.

    Args:
        url: The page to download.

    Returns:
        A ``(text, length)`` tuple where ``length == len(text)``. If the
        request fails or the server answers with an HTTP error status,
        returns ``("Error", 0)``.
    """
    try:
        response = requests.get(url, timeout=5)
        # Treat 4xx/5xx answers as failures so error pages are not scraped as content.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return "Error", 0

    soup = BeautifulSoup(response.content, 'html.parser')
    content = []

    # Regular prose content.
    for element in soup.select('article, section, div.main-content'):
        for para in element.find_all(['p', 'h1', 'h2', 'h3', 'li']):
            # A space separator keeps words apart across inline tags
            # ("Hello <b>world</b>" must not become "Helloworld").
            text = para.get_text(' ', strip=True)
            if text:
                content.append(text.replace('\n', ' ').strip())

    # Code blocks.
    for block in soup.find_all(['pre', 'code']):
        if block.name == 'code' and block.find_parent('pre') is not None:
            # <pre><code>...</code></pre> is emitted once, via the <pre>.
            continue
        # Only trim the outer edges: inner whitespace is significant in code.
        code_text = block.get_text().strip()
        if not code_text:
            continue
        classes = block.get('class')
        if not classes and block.name == 'pre':
            # The language class usually sits on the nested <code>, not the <pre>.
            inner = block.find('code')
            classes = inner.get('class') if inner is not None else None
        language = classes[0] if classes else 'python'
        content.append(f"```{language}\n{code_text}\n```")

    # Images, videos and downloadable files.
    downloadable = ('.pdf', '.zip', '.docx')
    for media in soup.find_all(['img', 'video', 'a']):
        if media.name == 'img' and media.get('src'):
            content.append(f"![Image]({media['src']})")
        elif media.name == 'video' and media.get('src'):
            content.append(f"![Video]({media['src']})")
        elif media.name == 'a' and media.get('href') and media['href'].endswith(downloadable):
            content.append(f"[Download File]({media['href']})")

    joined = ' '.join(content)
    return joined, len(joined)



def fetch_sitemap_content(sitemap_url):
    """Download every page listed in a sitemap, one at a time.

    Pages are fetched sequentially, in sitemap order, under a ``tqdm``
    progress bar.

    Args:
        sitemap_url: URL of the sitemap whose pages should be fetched.

    Returns:
        A ``(page_texts, url_count, total_length)`` tuple: the text returned
        by ``fetch_url_content`` for each URL, the number of URLs in the
        sitemap, and the sum of the reported text lengths.
    """
    urls = extract_urls_from_sitemap(sitemap_url)
    print(f"Starting to fetch content from {len(urls)} URLs...")

    # One (text, length) pair per URL, collected in sitemap order.
    fetched = [fetch_url_content(page_url) for page_url in tqdm(urls, desc="Fetching URLs")]

    texts = [text for text, _ in fetched]
    combined_length = sum(length for _, length in fetched)
    return texts, len(urls), combined_length

def extract_stats(sitemap_url, save_to_file=False):
    """Crawl a sitemap, print summary counts, and optionally dump the text.

    Args:
        sitemap_url: URL of the sitemap to crawl.
        save_to_file: When true, write each page's text on its own line to
            ``pages.txt`` in the current working directory, skipping pages
            whose text is the ``"Error"`` marker.

    Returns:
        None. Results are reported via ``print`` and, optionally, the file.
    """
    fetched_pages, num_urls, total_text_length = fetch_sitemap_content(sitemap_url)
    print(f"Total URLs fetched: {num_urls}")
    print(f"Total length of text fetched: {total_text_length}")

    if not save_to_file:
        return

    with open('pages.txt', 'w', encoding='utf-8') as out:
        # Failed fetches are represented by the literal text "Error"; leave them out.
        out.writelines(f"{content}\n" for content in fetched_pages if content != "Error")
    print("Content saved to 'pages.txt'.")

# Example usage:
# base_url = 'https://www.example.com'
# sitemap_url = guess_sitemap_url(base_url)
# if sitemap_url:
#     extract_stats(sitemap_url, save_to_file=True)
# else:
#     print("Sitemap not found.")



In [13]:
base_url = 'https://kraftful.com'
sitemap_url = guess_sitemap_url(base_url)

# Bail out with a message when no sitemap could be located; otherwise crawl
# it and write the collected page text to pages.txt.
if not sitemap_url:
    print('Sitemap not found')
else:
    extract_stats(sitemap_url, save_to_file=True)

Starting to fetch content from 37 URLs...


Fetching URLs: 100%|██████████| 37/37 [00:24<00:00,  1.53it/s]

Total URLs fetched: 37
Total length of text fetched: 0
Content saved to 'pages.txt'.



