In [119]:
import os
import re
from typing import Dict
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
from pyhtml2pdf import converter




**Scrape Documentation**

In [None]:
def _page(url, html_class, css_class, css_class_name, sub_urls=False):
    """Build one scraping config entry.

    ``html_class`` is the tag name, ``css_class`` the attribute name and
    ``css_class_name`` the attribute value that ``scrape`` hands to
    ``soup.find`` to locate the documentation body. ``sub_urls`` tells
    ``scrape`` whether to also follow the links found on ``url``.
    """
    return {
        "url": url,
        "sub_urls": sub_urls,
        "scraping": {
            "html_class": html_class,
            "css_class": css_class,
            "css_class_name": css_class_name,
        },
    }


# Maps a technology name to either a single page config or a list of them.
technology_scraping_dict = {
    "docker-compose": _page(
        "https://docs.docker.com/compose/compose-file/",
        "article", "class", "prose max-w-none dark:prose-invert",
        sub_urls=True,
    ),
    "docker": _page(
        "https://docs.docker.com/reference/dockerfile/",
        "article", "class", "prose max-w-none dark:prose-invert",
    ),
    "spring-boot": [
        _page(
            "https://docs.spring.io/spring-boot/appendix/application-properties/index.html",
            "article", "class", "doc",
        ),
        _page(
            "https://docs.spring.io/spring-boot/docs/1.0.1.RELEASE/reference/html/howto-properties-and-configuration.html",
            "div", "class", "chapter",
        ),
    ],
    # All Maven reference pages share the same body container.
    "maven": [
        _page(maven_url, "main", "id", "bodyColumn")
        for maven_url in (
            "https://maven.apache.org/pom.html",
            "https://maven.apache.org/ref/3.9.7/maven-model/maven.html",
            "https://maven.apache.org/ref/3.9.7/maven-settings/settings.html",
            "https://maven.apache.org/ref/3.9.7/maven-core/toolchains.html",
        )
    ],
    "nodejs": [
        _page(
            "https://nodejs.org/docs/latest/api/packages.html",
            "div", "id", "apicontent",
        ),
        _page(
            "https://docs.npmjs.com/cli/v10/configuring-npm/package-json",
            "main", "class", "Box-sc-g0xbh4-0 jrNUvm",
        ),
    ],
    "tsconfig": _page(
        "https://www.typescriptlang.org/tsconfig/",
        "main", "role", "main",
    ),
    # Both MySQL reference pages share the same body container.
    "mysql": [
        _page(mysql_url, "div", "id", "docs-main-inner")
        for mysql_url in (
            "https://dev.mysql.com/doc/refman/8.4/en/server-system-variables.html",
            "https://dev.mysql.com/doc/refman/8.4/en/innodb-parameters.html",
        )
    ],
}

In [120]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_sub_urls(url, timeout=30):
    """Collect the links on ``url`` that point at or below ``url`` itself.

    Args:
        url: Page to fetch; also the prefix every returned link must start
            with.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A set of absolute URLs, with any ``#fragment`` removed, that start
        with ``url``.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    sub_urls = set()
    for link in soup.find_all('a', href=True):
        # Resolve relative links against the page, then drop the fragment so
        # in-page anchors (".../page/#section") collapse into one page URL
        # instead of being returned once per anchor.
        full_url, _fragment = urldefrag(urljoin(url, link['href']))
        if full_url.startswith(url):
            sub_urls.add(full_url)
    return sub_urls

In [121]:
from xhtml2pdf import pisa
from io import BytesIO

# Minimal HTML page used as the PDF source; the single "{}" placeholder is
# filled in with the scraped element through str.format.
html_content = (
    "\n"
    "<!DOCTYPE html>\n"
    "<html>\n"
    "<body>\n"
    "    {}\n"
    "</body>\n"
    "</html>\n"
)


def _render_page_to_pdf(url: str, output_file: str, selector: Dict, timeout: float = 30) -> None:
    """Fetch ``url``, extract the element named by ``selector`` and save it as a PDF."""
    # Check before doing any work: there is no point downloading and
    # converting a page whose output would be discarded.
    if os.path.exists(output_file):
        print("File already exists.")
        return

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    main_body = soup.find(
        selector["html_class"],
        {selector["css_class"]: selector["css_class_name"]},
    )
    if main_body is None:
        # soup.find returns None when nothing matches; formatting that into
        # the template would yield a PDF containing the literal text "None".
        print(f"No matching element found at {url}; skipping.")
        return

    pdf_output = BytesIO()
    status = pisa.CreatePDF(html_content.format(main_body), dest=pdf_output, encoding='utf-8')
    if status.err:
        # Do not persist the output of a conversion that reported errors.
        print(f"PDF conversion failed for {url}; skipping.")
        return

    with open(output_file, "wb") as pdf_file:
        pdf_file.write(pdf_output.getvalue())


def scrape(scraping_values: Dict, tech_dir_name: str):
    """Save the documentation page(s) described by ``scraping_values`` as PDFs.

    Args:
        scraping_values: Config with the keys ``"url"``, ``"sub_urls"`` and
            ``"scraping"``; the latter holds ``"html_class"`` (tag name),
            ``"css_class"`` (attribute name) and ``"css_class_name"``
            (attribute value) used to locate the element to export.
        tech_dir_name: Existing directory the PDF files are written into.

    When ``"sub_urls"`` is truthy, every link returned by ``scrape_sub_urls``
    for the main URL is exported; otherwise only the main URL is. Files that
    already exist are left untouched and are not downloaded again. Pages
    without a matching element, or whose conversion fails, are reported and
    skipped.

    Raises:
        requests.HTTPError: If a page answers with an error status.
    """
    main_url = scraping_values["url"]
    selector = scraping_values["scraping"]

    if scraping_values["sub_urls"]:
        # Sorted only to make the order of the progress log reproducible.
        for url in sorted(scrape_sub_urls(main_url)):
            print(url)
            # NOTE: the second-to-last "/"-separated piece is the page's own
            # segment only when the URL ends with "/".
            name = url.split("/")[-2]
            _render_page_to_pdf(url, tech_dir_name + f"/{name}.pdf", selector)
    else:
        print(main_url)
        name = "_".join(main_url.split("/")[-2:])
        _render_page_to_pdf(main_url, tech_dir_name + f"/{name}.pdf", selector)



In [122]:
# Scrape every configured technology into its own folder, skipping any
# technology whose folder already contains files.
for technology, scraping_values in technology_scraping_dict.items():
    print("Scrape: ", technology)

    tech_dir_name = f"../../data/tech_docs/{technology}"
    os.makedirs(tech_dir_name, exist_ok=True)

    if os.listdir(tech_dir_name):
        print(f"Technology {technology} already scraped.")
        continue

    # A technology maps to either one page config or a list of them.
    if isinstance(scraping_values, list):
        page_configs = scraping_values
    else:
        page_configs = [scraping_values]

    for page_config in page_configs:
        scrape(scraping_values=page_config, tech_dir_name=tech_dir_name)

Scrape:  docker-compose
Technology docker-compose already scraped.
Scrape:  docker
Technology docker already scraped.
Scrape:  spring-boot
Technology spring-boot already scraped.
Scrape:  maven
Technology maven already scraped.
Scrape:  nodejs
Technology nodejs already scraped.
Scrape:  tsconfig
Technology tsconfig already scraped.
Scrape:  mysql
https://dev.mysql.com/doc/refman/8.4/en/server-system-variables.html
Name:  en_server-system-variables.html
https://dev.mysql.com/doc/refman/8.4/en/innodb-parameters.html
Name:  en_innodb-parameters.html


**Scrape Stack Overflow Posts**

In [128]:
import itertools

# Technologies for which pairwise combinations are enumerated.
technologies = ["docker", "docker-compose", "maven", "spring"]

# Every unordered pair, in the order of the list above.
combinations = [
    (first, second)
    for first, second in itertools.combinations(technologies, 2)
]

combinations


[('docker', 'docker-compose'),
 ('docker', 'maven'),
 ('docker', 'spring'),
 ('docker-compose', 'maven'),
 ('docker-compose', 'spring'),
 ('maven', 'spring')]

**Scrape Blog Posts**