In [None]:
import requests
from bs4 import BeautifulSoup
import json

In [None]:
# Root URL of the documentation site. It is fetched as-is to obtain the product
# index, and extended with "/<formatted product name>/<version>" to reach the
# page for one product.
BASE_DOCUMENTATION_URL = "https://docs.redhat.com/en/documentation"

def format_product_name(product_name):
    """Build the URL slug for a product: lower-case it and turn every space into an underscore."""
    lowered = product_name.lower()
    # Splitting on a literal " " (not on arbitrary whitespace) keeps one
    # underscore per space, so consecutive spaces stay consecutive underscores.
    return "_".join(lowered.split(" "))

In [None]:
def get_products(timeout=30):
    """Scrape the documentation index page for product links.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without one, a stalled connection would block the notebook
            indefinitely.

    Returns:
        dict: Maps each link's visible text (the product's full name) to the
        last path segment of that link's href. ``save_product_data`` uses this
        value as the initial version segment of the product URL. If two links
        carry the same text, the later one overwrites the earlier one.

    Raises:
        Exception: If the index page answers with a non-200 status code.
        requests.RequestException: If the request fails or times out.
    """
    link_prefix = "/en/documentation/"

    response = requests.get(BASE_DOCUMENTATION_URL, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve products page: {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")
    products = {}

    for link in soup.select("a"):  # Adjust the selector based on site structure
        href = link.get("href")
        if not href or not href.startswith(link_prefix):
            continue

        # Look only at what follows the prefix, ignoring surrounding slashes,
        # so a trailing "/" does not produce an empty last segment and a link
        # back to the index itself is skipped.
        remainder = href[len(link_prefix):].strip("/")
        if not remainder:
            continue
        last_segment = remainder.split("/")[-1]

        # Links without visible text (e.g. icon-only anchors) would create an
        # empty product name, which later turns into a file named ".json".
        product_full_name = link.text.strip()
        if not product_full_name:
            continue

        products[product_full_name] = last_segment

    return products

In [None]:
def get_versions(product_name, initial_version, timeout=30):
    """Scrape a product's documentation page for the versions in its version dropdown.

    Args:
        product_name: Product display name; converted to a URL slug with
            ``format_product_name``.
        initial_version: Version path segment used to build the page URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: Stripped option labels of the ``<select id="product_version">``
        element, ordered from highest to lowest by their numeric dot-separated
        parts. An empty list is returned (after printing a message) when the
        page cannot be fetched or contains no such dropdown.
    """
    formatted_name = format_product_name(product_name)
    doc_url = f"{BASE_DOCUMENTATION_URL}/{formatted_name}/{initial_version}"

    try:
        response = requests.get(doc_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report connection errors and timeouts the same way as a bad status
        # code, so one unreachable page does not abort a whole batch run.
        print(f"Failed to fetch {doc_url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch {doc_url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Identify the dropdown element containing versions
    version_dropdown = soup.find("select", {"id": "product_version"})  # Adjust selector if needed
    if version_dropdown is None:
        print(f"No version dropdown found for {formatted_name}")
        return []

    versions = [option.text.strip() for option in version_dropdown.find_all("option")]

    def numeric_parts(version):
        # Non-numeric parts are ignored for ordering. str.isdecimal() is used
        # rather than str.isdigit() because isdigit() also accepts characters
        # such as superscript digits, which int() cannot parse.
        return [int(part) for part in version.split(".") if part.isdecimal()]

    return sorted(versions, key=numeric_parts, reverse=True)


In [None]:
def save_product_data(products, keep_latest=2):
    """Write one JSON descriptor file per product and return what was written.

    For every product, the available versions are looked up with
    ``get_versions``; products for which no versions are found are skipped.
    Each remaining product is saved to ``<formatted product name>.json`` in
    the current working directory.

    Args:
        products: Mapping of product full name to the initial version segment
            used to reach that product's documentation page.
        keep_latest: How many entries at the front of the version list get the
            ``"create_or_keep"`` directive; all later entries get ``"delete"``.

    Returns:
        dict: Maps each saved product's formatted name to the data structure
        that was written to its JSON file. Empty if nothing was saved.
    """
    saved = {}

    for product_name, initial_version in products.items():
        versions = get_versions(product_name, initial_version)
        if not versions:
            continue

        # The position in the list decides the directive: the first
        # `keep_latest` versions are kept, the rest are marked for deletion.
        version_data = [
            {
                "version_number": version,
                "store_directive": "create_or_keep" if index < keep_latest else "delete",
                "sources": [{"ingestion_type": "redhat_doc", "language": "en-US"}],
            }
            for index, version in enumerate(versions)
        ]

        base_name = format_product_name(product_name)
        product_data = {
            "collection_base_name": base_name,
            "collection_full_name": product_name,
            "common_sources": [],
            "versions": version_data,
        }

        # Save to a JSON file named after the formatted product name
        file_name = f"{base_name}.json"
        with open(file_name, "w", encoding="utf-8") as f:
            json.dump(product_data, f, indent=2)

        print(f"Saved {file_name}")
        saved[base_name] = product_data

    return saved

In [None]:
# Run the whole pipeline: scrape the product index, then write one JSON file
# for every product whose version list could be scraped.
product_json = save_product_data(get_products())
