In [1]:
# Install URL2MD4AI if not already installed.
# Prefer the %pip magic (it targets this kernel's environment) and pin a version if needed:
# %pip install url2md4ai

# Import required modules.
# NOTE(review): asyncio, json and Path are not referenced by the cells visible in this
# section — presumably used further down; confirm before removing.
import asyncio
import json
from pathlib import Path
from url2md4ai import URLToMarkdownConverter, URLHasher, Config

# Simple confirmation that the import cell ran without errors.
print("🚀 URL2MD4AI imported successfully!")
print("📦 Ready for LLM-optimized markdown conversion")


🚀 URL2MD4AI imported successfully!
📦 Ready for LLM-optimized markdown conversion


In [3]:
# Initialize converter with default LLM-optimized configuration
config = Config.from_env()
converter = URLToMarkdownConverter(config)

# Convert a URL to markdown
url = "https://www.satispay.com/en-it/work-at-satispay/open-positions/ffe0b42e-9119-4861-945b-e849e24da206/"
print(f"🔗 Converting: {url}")

# Use async conversion
result = await converter.convert_url(url, output_path=None)

if result.success:
    print("✅ Conversion successful!")
    print(f"📄 Title: {result.title}")
    print(f"📊 Size: {result.file_size:,} characters")
    print(f"⚡ Method: {result.extraction_method}")
    print(f"🎯 Hash: {URLHasher.generate_hash(url)}")
    print(f"⏱️ Time: {result.processing_time:.2f}s")
    
    # Show content preview
    print("\n📄 Content Preview (first 500 chars):")
    print("-" * 60)
    preview = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown
    print(preview)
    print("-" * 60)
else:
    print(f"❌ Conversion failed: {result.error}")


[32m2025-06-29 12:16:15.873[0m | [1mINFO    [0m | [36murl2md4ai.converter[0m:[36m__init__[0m:[36m262[0m - [1mConverter initialized with config[0m
[32m2025-06-29 12:16:15.874[0m | [1mINFO    [0m | [36murl2md4ai.converter[0m:[36mconvert_url[0m:[36m279[0m - [1mStarting conversion: https://www.satispay.com/en-it/work-at-satispay/open-positions/ffe0b42e-9119-4861-945b-e849e24da206/[0m


🔗 Converting: https://www.satispay.com/en-it/work-at-satispay/open-positions/ffe0b42e-9119-4861-945b-e849e24da206/


[32m2025-06-29 12:16:19.971[0m | [1mINFO    [0m | [36murl2md4ai.converter[0m:[36mconvert_url[0m:[36m332[0m - [1mMarkdown saved to: output/6eb626801db91c91.md[0m
[32m2025-06-29 12:16:19.972[0m | [1mINFO    [0m | [36murl2md4ai.converter[0m:[36mconvert_url[0m:[36m351[0m - [1mConversion completed: 8966 chars via trafilatura[0m


✅ Conversion successful!
📄 Title: 
📊 Size: 8,966 characters
⚡ Method: trafilatura
🎯 Hash: 6eb626801db91c91
⏱️ Time: 4.10s

📄 Content Preview (first 500 chars):
------------------------------------------------------------
When you visit this website, Satispay may store or retrieve information on your browser by using cookies provided by us or third parties. Such information might be about you, your preferences or your device. If you have consented to, we may also retrieve analytics statistics on your usage of the website and provide you with targeted advertising. You can choose to accept or deny all cookies (other than those necessary for the functioning of this website) by clicking the appropriate button below. ...
------------------------------------------------------------


In [None]:
# Configure for clean LLM-optimized extraction
clean_config = Config.from_env()
clean_config.llm_optimized = True
clean_config.clean_content = True
clean_config.use_trafilatura = True
clean_config.remove_cookie_banners = True
clean_config.remove_ads = True
clean_config.remove_social_media = True

# Configure for raw extraction (no cleaning)
raw_config = Config.from_env()
raw_config.llm_optimized = False
raw_config.clean_content = False
raw_config.use_trafilatura = False

# Create converters
clean_converter = URLToMarkdownConverter(clean_config)
raw_converter = URLToMarkdownConverter(raw_config)

# Test URL with potentially noisy content
test_url = "https://example.com"
print(f"🧪 Testing extraction methods on: {test_url}")

# Clean extraction
print("\n🧹 Clean extraction...")
clean_result = await clean_converter.convert_url(test_url, output_path=None)

# Raw extraction  
print("📄 Raw extraction...")
raw_result = await raw_converter.convert_url(test_url, output_path=None)

# Compare results
if clean_result.success and raw_result.success:
    print("\n📊 Extraction Comparison:")
    print(f"🧹 Clean: {clean_result.file_size:,} chars via {clean_result.extraction_method}")
    print(f"📄 Raw:   {raw_result.file_size:,} chars via {raw_result.extraction_method}")
    
    # Calculate noise reduction
    if raw_result.file_size > 0:
        reduction = ((raw_result.file_size - clean_result.file_size) / raw_result.file_size) * 100
        print(f"🎯 Noise reduction: {reduction:.1f}%")
        
        if reduction > 50:
            print("✅ Significant noise reduction - perfect for LLM processing!")
        elif reduction > 20:
            print("✅ Good noise reduction - suitable for LLM processing")
        else:
            print("ℹ️ Minimal noise reduction - content may be naturally clean")
    
    # Show side-by-side preview
    print("\n🔍 Content Preview Comparison:")
    print("\n🧹 CLEAN VERSION (first 300 chars):")
    clean_preview = clean_result.markdown[:300] + "..." if len(clean_result.markdown) > 300 else clean_result.markdown
    print(clean_preview)
    
    print("\n📄 RAW VERSION (first 300 chars):")
    raw_preview = raw_result.markdown[:300] + "..." if len(raw_result.markdown) > 300 else raw_result.markdown
    print(raw_preview)
else:
    print("❌ One or both extractions failed")
    if not clean_result.success:
        print(f"Clean error: {clean_result.error}")
    if not raw_result.success:
        print(f"Raw error: {raw_result.error}")


In [None]:
# Demonstrate URL hashing for consistent filename generation.
# Note: the loop below rebinds the notebook-level name `url`, and this cell rebinds
# `test_url`, both of which are also assigned by earlier cells.
sample_urls = [
    "https://example.com/article/how-to-use-ai",
    "https://docs.python.org/3/tutorial/",
    "https://github.com/microsoft/vscode",
    "https://stackoverflow.com/questions/12345/python-help",
    "https://news.ycombinator.com/item?id=12345678",
]

print("🔗 URL Hashing Examples:")
print("=" * 80)

# For each sample URL, show the hash and the ".md" filename generated from it.
for i, url in enumerate(sample_urls, 1):
    hash_value = URLHasher.generate_hash(url)
    filename = URLHasher.generate_filename(url, ".md")

    print(f"{i}. URL: {url}")
    print(f"   Hash: {hash_value}")
    print(f"   Filename: {filename}")
    print()

# Demonstrate hash consistency: the same URL hashed twice, plus one different URL.
test_url = "https://example.com/test"
hash1 = URLHasher.generate_hash(test_url)
hash2 = URLHasher.generate_hash(test_url)  # Same URL
hash3 = URLHasher.generate_hash(test_url + "/different")  # Different URL

same_url_matches = hash1 == hash2
different_url_differs = hash1 != hash3

print("🔄 Hash Consistency Test:")
print(f"Same URL twice:    {hash1} == {hash2} -> {hash1 == hash2}")
print(f"Different URL:     {hash1} == {hash3} -> {hash1 == hash3}")

# Only claim success when the comparisons above actually support it.
if same_url_matches and different_url_differs:
    print("✅ Hashes are consistent and unique!")
else:
    print("❌ Hash check failed: expected equal hashes for the same URL and different hashes for different URLs")
