In [32]:
%pip install markdownify

Note: you may need to restart the kernel to use updated packages.


In [39]:
import os
import json
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import time
from datetime import datetime, timezone
from markdownify import markdownify as md
import re
from bs4 import BeautifulSoup

In [34]:
# Page fetched first; article links are extracted from its HTML.
START_URL = "https://cloud.ru/docs/tutorials-evolution/list/index"
# Site root used to turn hrefs found on the start page into absolute URLs.
BASE_URL = "https://cloud.ru"
# Only hrefs beginning with this path prefix are treated as article links.
TOPICS_PREFIX = "/docs/tutorials-evolution/list/topics/"
# Output file: one JSON object per line, one line per downloaded article.
OUTPUT_FILE = "cloud_ru_docs.jsonl"


In [35]:
def get_page_html(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: URL of the page to fetch.

    Returns:
        The decoded response body, or ``None`` if the request failed
        (connection problem, timeout, or an HTTP error status). Failures are
        printed instead of raised so that a crawl can carry on with the
        next page.
    """
    headers = {
        # Browser-like User-Agent sent with every request.
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        # If a text/* response declares no charset, requests falls back to
        # ISO-8859-1 and UTF-8 pages (e.g. Cyrillic text) come out garbled.
        # In that case decode with the encoding detected from the body.
        content_type = resp.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            resp.encoding = resp.apparent_encoding
        return resp.text
    except Exception as e:
        # Deliberately broad: any failure for one page is reported and
        # turned into ``None`` rather than aborting the caller.
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∑–∞–≥—Ä—É–∑–∫–µ {url}: {e}")
        return None

In [36]:
def extract_topic_links(html):
    """Collect the absolute URLs of topic articles linked from a page.

    Every ``<a href>`` is resolved against ``BASE_URL`` first, and the
    resolved URL is kept when it points at the same host and its path starts
    with ``TOPICS_PREFIX``. Resolving before filtering means that absolute
    and protocol-relative links to the topics section are recognised too,
    not only root-relative hrefs.

    Args:
        html: HTML source of the page to scan.

    Returns:
        A sorted list of unique absolute URLs (query strings are preserved).
    """
    soup = BeautifulSoup(html, 'html.parser')
    site_host = urlparse(BASE_URL).netloc
    links = set()

    for a in soup.find_all('a', href=True):
        # Strip stray whitespace that sometimes surrounds attribute values.
        full_url = urljoin(BASE_URL, a['href'].strip())
        parsed = urlparse(full_url)
        if parsed.netloc == site_host and parsed.path.startswith(TOPICS_PREFIX):
            links.add(full_url)
    return sorted(links)

In [43]:
def extract_title_and_content(html):
    """Extract an article's title and a Markdown-like rendering of its body.

    Steps:
      1. Parse ``html`` and remove elements matching a fixed list of CSS
         selectors (navigation, scripts, footer links, ...).
      2. Take the text of the first remaining ``<h1>`` as the title.
      3. Pick the content root: ``<article>``, else ``<main>``, else the
         first ``.content`` element, else the whole document.
      4. Convert each direct element child of the root with ``convert_tag``
         and join the non-empty results with blank lines.
      5. Drop lines matching a noise pattern and apply a final regex
         substitution to the text.

    NOTE(review): the title is looked up after the cleanup in step 1, so an
    ``<h1>`` located inside a removed element (e.g. ``<header>``) is not
    found and the placeholder title is returned.

    Args:
        html: HTML source of the article page.

    Returns:
        A ``(title, markdown_text)`` tuple of strings.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for selector in [
        'header', 'nav', 'footer', '.sidebar', 'script', 'style',
        '.breadcrumbs', '.toc', '.header-anchor', '.edit-page-link',
        '.feedback-form', '.article-meta', '.pagination',
        # Also remove the footer with legal links and the copyright notice.
        # ``:-soup-contains`` is the supported spelling of the text-match
        # pseudo-class; the bare ``:contains`` alias is deprecated in
        # Soup Sieve and triggers a FutureWarning.
        'p:-soup-contains("¬© 2025 Cloud.ru")',
        'a[href*="documents"]', 'a[href*="technical-support"]'
    ]:
        for tag in soup.select(selector):
            tag.decompose()

    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else "–ë–µ–∑ –∑–∞–≥–æ–ª–æ–≤–∫–∞"

    # First match wins; fall back to the whole document.
    main_content = (
        soup.find('article') or
        soup.find('main') or
        soup.select_one('.content') or
        soup
    )

    def convert_tag(tag):
        """Recursively render one node as Markdown-like text."""
        # Objects without a ``name`` attribute are stringified as-is.
        if not hasattr(tag, 'name'):
            return str(tag).strip()

        if tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            # Heading level N becomes N leading '#' characters.
            level = int(tag.name[1])
            text = tag.get_text().strip()
            return f"{'#' * level} {text}" if text else ""

        elif tag.name == 'p':
            # Paragraphs are flattened to plain text (inline markup is lost).
            return tag.get_text().strip()

        elif tag.name in ['ul', 'ol']:
            items = []
            # Only direct <li> children; nested lists are reached through
            # the recursive call on each item's children.
            for li in tag.find_all('li', recursive=False):
                prefix = '- ' if tag.name == 'ul' else f"{len(items)+1}. "
                inner = ''.join(convert_tag(child) for child in li.children)
                items.append(prefix + inner.strip())
            return '\n'.join(items)

        elif tag.name == 'pre':
            # Preformatted text becomes a fenced code block.
            code_text = tag.get_text()
            if code_text.strip():
                return f"```\n{code_text.rstrip()}\n```"
            return ""

        elif tag.name == 'code':
            # Inline code is wrapped in backticks.
            code_text = tag.get_text()
            if code_text.strip():
                return f"`{code_text}`"
            return ""

        elif tag.name == 'a':
            href = tag.get('href', '').strip()
            text = tag.get_text().strip()
            if href and text:
                # Root-relative links are made absolute.
                if href.startswith('/'):
                    href = 'https://cloud.ru' + href
                return f"[{text}]({href})"
            return text

        elif hasattr(tag, 'children'):
            # Any other container: concatenate the rendering of its children.
            return ''.join(convert_tag(child) for child in tag.children)

        else:
            return tag.get_text() if tag.string else ""

    blocks = []
    for child in main_content.children:
        # Skip children whose ``name`` is missing or falsy; only named
        # elements directly under the root are converted.
        if hasattr(child, 'name') and child.name:
            block = convert_tag(child).strip()
            if block:
                blocks.append(block)

    markdown_text = '\n\n'.join(blocks)

    lines = markdown_text.splitlines()
    cleaned_lines = []
    for line in lines:
        # Remove obvious noise lines.
        if re.search(r'\$!\.\$|—Ä—É–∫–æ–≤–æ–¥—Å—Ç–≤–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è –æ–±–ª–∞–∫–æ|–¥–æ–∫—É–º–µ–Ω—Ç–∞—Ü–∏—è –ü—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏–µ', line, re.IGNORECASE):
            continue
        cleaned_lines.append(line)

    markdown_text = '\n'.join(cleaned_lines).strip()

    # Insert a space between the two captured groups; presumably meant to
    # separate words that were glued together when tags were flattened.
    # NOTE(review): this runs over the whole text, code blocks and link
    # targets included.
    markdown_text = re.sub(r'([–∞-—è–ê-–Øa-zA-Z])([–ê-–Ø][–∞-—è])', r'\1 \2', markdown_text)

    return title, markdown_text

In [45]:
def main():
    """Crawl the tutorial index and dump every article to ``OUTPUT_FILE``.

    The start page is fetched, article links are extracted from it, and each
    article is downloaded, converted and written as one JSON line. Pages that
    fail to download are skipped. Progress is reported with ``print``.
    """
    print("üì• –ó–∞–≥—Ä—É–∂–∞–µ–º —Å—Ç–∞—Ä—Ç–æ–≤—É—é —Å—Ç—Ä–∞–Ω–∏—Ü—É...")
    start_page = get_page_html(START_URL)
    if not start_page:
        print("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å —Å—Ç–∞—Ä—Ç–æ–≤—É—é —Å—Ç—Ä–∞–Ω–∏—Ü—É.")
        return

    print("–ò–∑–≤–ª–µ–∫–∞–µ–º —Å—Å—ã–ª–∫–∏ –Ω–∞ —Å—Ç–∞—Ç—å–∏...")
    article_urls = extract_topic_links(start_page)
    if not article_urls:
        print("–ù–µ –Ω–∞–π–¥–µ–Ω–æ –Ω–∏ –æ–¥–Ω–æ–π —Å—Å—ã–ª–∫–∏ –Ω–∞ —Å—Ç–∞—Ç—å–∏.")
        return

    total = len(article_urls)
    print(f"–ù–∞–π–¥–µ–Ω–æ {total} —Å—Ç–∞—Ç–µ–π. –ù–∞—á–∏–Ω–∞–µ–º —Å–∫–∞—á–∏–≤–∞–Ω–∏–µ...\n")

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as sink:
        position = 0
        for page_url in article_urls:
            position += 1
            print(f"[{position}/{total}] {page_url}")

            page_html = get_page_html(page_url)
            if page_html:
                heading, body = extract_title_and_content(page_html)

                # Last path segment, dashes turned into spaces, title-cased.
                slug = urlparse(page_url).path.strip('/').split('/')[-1]
                section_name = slug.replace('-', ' ').title()

                entry = {
                    "url": page_url,
                    "title": heading,
                    "content": body,
                    "section": section_name,
                    "source": "cloud.ru",
                    # UTC timestamp with a trailing 'Z' instead of '+00:00'.
                    "timestamp": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
                }

                sink.write(f"{json.dumps(entry, ensure_ascii=False)}\n")
                # Short pause between successful downloads.
                time.sleep(0.3)

    print(f"\n–ì–æ—Ç–æ–≤–æ! –í—Å–µ —Å—Ç–∞—Ç—å–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {OUTPUT_FILE}")


In [46]:
# Run the crawl when this code is executed directly; the guard keeps
# main() from running if the code is imported as a module instead.
if __name__ == "__main__":
    main()

üì• –ó–∞–≥—Ä—É–∂–∞–µ–º —Å—Ç–∞—Ä—Ç–æ–≤—É—é —Å—Ç—Ä–∞–Ω–∏—Ü—É...
–ò–∑–≤–ª–µ–∫–∞–µ–º —Å—Å—ã–ª–∫–∏ –Ω–∞ —Å—Ç–∞—Ç—å–∏...
–ù–∞–π–¥–µ–Ω–æ 108 —Å—Ç–∞—Ç–µ–π. –ù–∞—á–∏–Ω–∞–µ–º —Å–∫–∞—á–∏–≤–∞–Ω–∏–µ...

[1/108] https://cloud.ru/docs/tutorials-evolution/list/topics/arenadata-db__adbc-backup?source-platform=Evolution
[2/108] https://cloud.ru/docs/tutorials-evolution/list/topics/arenadata-db__dbeaver?source-platform=Evolution
[3/108] https://cloud.ru/docs/tutorials-evolution/list/topics/arenadata-db__managed-bi?source-platform=Evolution
[4/108] https://cloud.ru/docs/tutorials-evolution/list/topics/arenadata-db__vm-local-ip?source-platform=Evolution
[5/108] https://cloud.ru/docs/tutorials-evolution/list/topics/bare-metal__1c_deploy?source-platform=Evolution
[6/108] https://cloud.ru/docs/tutorials-evolution/list/topics/bare-metal__highload_app?source-platform=Evolution
[7/108] https://cloud.ru/docs/tutorials-evolution/list/topics/bare-metal__k3s?source-platform=Evolution
[8/108] https://cloud.ru/docs/t