In [None]:
import itertools
import os
import re
from typing import Dict
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
from pyhtml2pdf import converter




**Scrape Documentation**

In [None]:
def _doc_source(url, html_class, css_class, css_class_name, sub_urls=False):
    """Build one scraping entry in the layout the scraping code reads.

    Args:
        url: Page to download.
        html_class: Tag name handed to ``soup.find`` (e.g. ``"article"``).
        css_class: Attribute name used to select the tag (``"class"``, ``"id"``, ``"role"``).
        css_class_name: Value the attribute must have.
        sub_urls: When true, the links found below ``url`` are scraped instead of
            ``url`` alone.

    Returns:
        A fresh dict with the keys ``"url"``, ``"sub_urls"`` and ``"scraping"``.
    """
    return {
        "url": url,
        "sub_urls": sub_urls,
        "scraping": {
            "html_class": html_class,
            "css_class": css_class,
            "css_class_name": css_class_name,
        },
    }


# Selectors shared by several pages of the same documentation site.
_DOCKER_ARTICLE = ("article", "class", "prose max-w-none dark:prose-invert")
_MAVEN_BODY = ("main", "id", "bodyColumn")
_MYSQL_BODY = ("div", "id", "docs-main-inner")

# Registry of documentation pages per technology. A technology maps either to a
# single entry or to a list of entries when its documentation spans several pages.
technology_scraping_dict = {
    "docker-compose": _doc_source(
        "https://docs.docker.com/compose/compose-file/",
        *_DOCKER_ARTICLE,
        sub_urls=True,
    ),
    "docker": _doc_source(
        "https://docs.docker.com/reference/dockerfile/",
        *_DOCKER_ARTICLE,
    ),
    "spring-boot": [
        _doc_source(
            "https://docs.spring.io/spring-boot/appendix/application-properties/index.html",
            "article", "class", "doc",
        ),
        _doc_source(
            "https://docs.spring.io/spring-boot/docs/1.0.1.RELEASE/reference/html/howto-properties-and-configuration.html",
            "div", "class", "chapter",
        ),
    ],
    "maven": [
        _doc_source("https://maven.apache.org/pom.html", *_MAVEN_BODY),
        _doc_source("https://maven.apache.org/ref/3.9.7/maven-model/maven.html", *_MAVEN_BODY),
        _doc_source("https://maven.apache.org/ref/3.9.7/maven-settings/settings.html", *_MAVEN_BODY),
        _doc_source("https://maven.apache.org/ref/3.9.7/maven-core/toolchains.html", *_MAVEN_BODY),
    ],
    "nodejs": [
        _doc_source(
            "https://nodejs.org/docs/latest/api/packages.html",
            "div", "id", "apicontent",
        ),
        _doc_source(
            "https://docs.npmjs.com/cli/v10/configuring-npm/package-json",
            "main", "class", "Box-sc-g0xbh4-0 jrNUvm",
        ),
    ],
    "tsconfig": _doc_source(
        "https://www.typescriptlang.org/tsconfig/",
        "main", "role", "main",
    ),
    "mysql": [
        _doc_source(
            "https://dev.mysql.com/doc/refman/8.4/en/server-system-variables.html",
            *_MYSQL_BODY,
        ),
        _doc_source(
            "https://dev.mysql.com/doc/refman/8.4/en/innodb-parameters.html",
            *_MYSQL_BODY,
        ),
    ],
}

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_sub_urls(url, timeout=30):
    """Collect the links on ``url`` that point to pages located below ``url``.

    Args:
        url: Page to download; also the prefix every returned URL starts with.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A set of absolute URLs without ``#fragment`` parts.

    Raises:
        requests.exceptions.RequestException: If the download fails, times out
            or the server answers with an error status.
    """
    # Fetch the main page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensures we notice if something goes wrong

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    sub_urls = set()
    for link in soup.find_all('a', href=True):
        # Resolve relative links, then drop the "#fragment": anchors into the
        # same page would otherwise show up as separate URLs and the page
        # would be downloaded once per anchor.
        full_url, _fragment = urldefrag(urljoin(url, link['href']))
        # Keep only links that live underneath the start URL.
        if full_url.startswith(url):
            sub_urls.add(full_url)
    return sub_urls

In [None]:
from xhtml2pdf import pisa
from io import BytesIO

# HTML skeleton that wraps the scraped element before it is rendered to PDF.
html_content = """
<!DOCTYPE html>
<html>
<body>
    {}
</body>
</html>
"""

# Private copy for the documentation scraper. Other cells rebind `html_content`
# to templates with a different number of placeholders; reading the global at
# call time would make `scrape` depend on cell execution order.
_DOC_PAGE_TEMPLATE = html_content


def _save_page_as_pdf(url: str, output_file: str, selector: Dict, timeout: float = 30) -> None:
    """Download ``url``, cut out the element described by ``selector`` and store it as a PDF."""
    # Check first so a re-run neither downloads nor renders pages it would not write.
    if os.path.exists(output_file):
        print("File already exists.")
        return

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    main_body = soup.find(selector["html_class"], {selector["css_class"]: selector["css_class_name"]})
    if main_body is None:
        # Formatting None into the template would produce a PDF that only says "None".
        print(f"No matching element found, skipping: {url}")
        return

    pdf_output = BytesIO()
    pisa.CreatePDF(_DOC_PAGE_TEMPLATE.format(main_body), dest=pdf_output, encoding='utf-8')

    # Open a PDF file for writing in binary mode
    with open(output_file, "wb") as pdf_file:
        pdf_file.write(pdf_output.getvalue())


def scrape(scraping_values: Dict, tech_dir_name: str):
    """Save the documentation described by ``scraping_values`` as PDF files.

    Args:
        scraping_values: Entry with the keys ``"url"``, ``"sub_urls"`` and
            ``"scraping"``; the latter holds ``"html_class"`` (tag name),
            ``"css_class"`` (attribute name) and ``"css_class_name"``
            (attribute value) of the element to extract.
        tech_dir_name: Existing directory the PDF files are written to.

    Raises:
        requests.exceptions.RequestException: If a page cannot be downloaded.

    Existing output files are left untouched, and pages without a matching
    element are reported and skipped.
    """
    main_url = scraping_values["url"]
    selector = scraping_values["scraping"]

    if scraping_values["sub_urls"]:
        for url in scrape_sub_urls(main_url):
            print(url)
            # File name: the second-to-last "/"-separated part of the URL.
            name = url.split("/")[-2]
            _save_page_as_pdf(url, tech_dir_name + f"/{name}.pdf", selector)
    else:
        print(main_url)
        # File name: the last two "/"-separated parts of the URL, joined by "_".
        name = "_".join(main_url.split("/")[-2:])
        _save_page_as_pdf(main_url, tech_dir_name + f"/{name}.pdf", selector)



In [None]:
# Scrape the documentation of every registered technology into its own folder.
for technology, scraping_values in technology_scraping_dict.items():
    print("Scrape: ", technology)

    tech_dir_name = f"../../data/tech_docs/{technology}"
    os.makedirs(tech_dir_name, exist_ok=True)

    # A folder that already holds any entry counts as done.
    if os.listdir(tech_dir_name):
        print(f"Technology {technology} already scraped.")
        continue

    # A technology is described by one entry or by a list of entries;
    # wrap the single case so one loop serves both.
    sources = scraping_values if isinstance(scraping_values, list) else [scraping_values]
    for source in sources:
        scrape(scraping_values=source, tech_dir_name=tech_dir_name)

**Scrape Stack Overflow Posts**

In [18]:
from xhtml2pdf import pisa
from io import BytesIO
import glob
import os
import json

# HTML skeleton for one Stack Overflow post: title, body, accepted answer and
# highest rated answer, in that order.
html_content = """
<!DOCTYPE html>
<html>
<body>
<div>Title: {}</div>
<div>Post Body: {}</div>
<div>Accepted Answer: {}</div>
<div>Highest Rated Answer: {}</div>
</body>
</html>
"""


def title_to_filename(title: str) -> str:
    """Return ``title`` with path separators replaced by ``_``.

    A title containing a separator would otherwise be interpreted as a path
    into a directory that does not exist, so the file could never be written.
    All other characters are kept, so titles without separators map to the
    same file name as before.
    """
    for separator in {"/", os.sep}:
        title = title.replace(separator, "_")
    return title


for post_file in glob.glob("../../stackoverflow/**"):

    # Directory name: the file name up to ".json". basename() also copes with
    # platforms where glob returns a separator other than "/".
    dir_name = os.path.basename(post_file).split(".json")[0]
    os.makedirs(f"../../data/so_posts/{dir_name}", exist_ok=True)

    with open(post_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    for elem in data:
        title = elem["Title"]

        output_file = f"../../data/so_posts/{dir_name}/{title_to_filename(title)}.pdf"

        # Check before rendering so a re-run does not rebuild PDFs it will not write.
        if os.path.exists(output_file):
            print("File already exists: ", output_file)
            continue

        try:
            pdf_output = BytesIO()
            pisa.CreatePDF(
                html_content.format(
                    elem["Title"],
                    elem["Body"],
                    elem["accepted_answer"],
                    elem["highest_rated_answer"]),
                dest=pdf_output,
                encoding='utf-8'
            )

            # Open a PDF file for writing in binary mode
            with open(output_file, "wb") as pdf_file:
                pdf_file.write(pdf_output.getvalue())
        except Exception as error:
            # Still best effort, but report what was skipped and let
            # KeyboardInterrupt / SystemExit through.
            print(f"Skipping post {title!r}: {error!r}")
            continue

Could not get image data from src attribute: https://i.stack.imgur.com/HfECj.png
'<img src="https://i.stack.imgur.com/HfECj.png" alt="project tree"/>'
Could not get image data from src attribute: https://i.stack.imgur.com/MFx0A.png
'<img src="https://i.stack.imgur.com/MFx0A.png" alt="enter image description here"/>'
Could not get image data from src attribute: https://i.stack.imgur.com/iuYly.png
'<img src="https://i.stack.imgur.com/iuYly.png" alt="enter image description here"/>'
Could not get image data from src attribute: https://i.stack.imgur.com/WG6C0.png
'<img src="https://i.stack.imgur.com/WG6C0.png" alt="enter image description here"/>'
Could not get image data from src attribute: https://i.stack.imgur.com/s0c3K.png
'<img src="https://i.stack.imgur.com/s0c3K.png" alt="project structure"/>'
Could not get image data from src attribute: https://i.stack.imgur.com/hudtk.png
'<img src="https://i.stack.imgur.com/hudtk.png" alt="enter image description here"/>'
Could not get image data 

In [19]:
# Print each Stack Overflow output folder together with its number of entries.
for so_dir in glob.glob("../../data/so_posts/**"):
    entries = glob.glob(so_dir + "/**")
    print(so_dir, len(entries))

../../data/so_posts/docker-compose_maven 95
../../data/so_posts/docker-compose_spring-boot 92
../../data/so_posts/docker_docker-compose 95
../../data/so_posts/docker_maven 92
../../data/so_posts/docker_spring-boot 92
../../data/so_posts/spring-boot_maven 93


**Scrape Blog Posts**

In [None]:
import requests
import re
import backoff
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from xhtml2pdf import pisa
from io import BytesIO


# HTML skeleton that wraps the scraped article before it is rendered to PDF.
html_content = """
<!DOCTYPE html>
<html>
<body>
    {}
</body>
</html>
"""


# Technologies to combine; every unordered pair is searched once.
# `itertools` comes from the notebook's import cell.
technologies = ["spring-boot", "maven", "docker-compose", "docker"]
tech_pairs = list(itertools.combinations(technologies, r=2))

# Prefix of a Bing search URL ending in the "site:" operator; the site name
# and the search terms are appended where the search URL is built.
base_url = "https://www.bing.com/search?q=site:"
# User-Agent header value obtained from fake_useragent for all requests below.
headers  = {'User-Agent':  UserAgent().chrome}


@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=8,
    jitter=None)
def get_urls(search_url: str):
    """Return the links of up to five medium.com hits on a search result page.

    Args:
        search_url: Complete search URL to download.

    Returns:
        A non-empty list of link targets, in page order.

    Raises:
        requests.exceptions.RequestException: If the download keeps failing
            after the retries configured in the decorator.
        RuntimeError: If the page contains no usable medium.com result.
    """
    # A timeout raises a RequestException, so a stalled request is retried
    # by the decorator instead of blocking forever.
    response = requests.get(search_url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Result headings that mention medium.com anywhere in their markup.
    search_results = [result for result in soup.find_all("h2") if "medium.com" in str(result)][:5]

    urls = []
    for elem in search_results:
        # Read the link from the anchor itself. Taking "the second quoted
        # string" of the serialized heading depends on attribute order, keeps
        # HTML entities such as "&amp;" in the URL and raises IndexError when
        # the heading has fewer than two quoted values.
        anchor = elem.find("a", href=True)
        if anchor is not None:
            urls.append(anchor["href"])

    if not urls:
        # NOTE(review): this is not a RequestException, so the backoff decorator
        # does not retry it - confirm whether an empty result page should be retried.
        raise RuntimeError(f"No medium.com results found for {search_url!r}")

    return urls


# For each technology pair: search for blog posts and save every hit as a PDF.
for pair in tech_pairs:
    pair_dir = f"../../data/blog_posts/{pair[0]}_{pair[1]}"
    os.makedirs(pair_dir, exist_ok=True)

    print("Pair: ", pair)

    search_url = base_url + "medium.com" + " " + f"{pair[0]} {pair[1]}"

    urls = get_urls(search_url=search_url)

    print(urls)

    for url in urls:
        print(url)

        # File name: last path segment without its final "-"-separated part.
        # Fall back to the whole segment when it contains no "-", otherwise
        # the file would be called ".pdf".
        slug = url.rstrip("/").split("/")[-1]
        blog_post_name = "-".join(slug.split("-")[0:-1]) or slug

        output_file = f"{pair_dir}/{blog_post_name}.pdf"

        # Check before downloading so a re-run skips finished articles cheaply.
        if os.path.exists(output_file):
            print("File already exists.")
            continue

        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as error:
            # One unreachable article should not abort the remaining pairs.
            print(f"Could not fetch {url}: {error}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        article = soup.find("article")
        if article is None:
            # Formatting None into the template would produce a PDF that only says "None".
            print(f"No <article> element found, skipping: {url}")
            continue

        pdf_output = BytesIO()
        pisa.CreatePDF(html_content.format(article), dest=pdf_output, encoding='utf-8')

        # Open a PDF file for writing in binary mode
        with open(output_file, "wb") as pdf_file:
            pdf_file.write(pdf_output.getvalue())

In [None]:
# Print every Stack Overflow output folder, followed by its number of entries.
# NOTE(review): this cell sits in the blog-post section but inspects
# so_posts - confirm whether blog_posts was intended.
for so_dir in glob.glob("../../data/so_posts/**"):
    entries = glob.glob(so_dir + "/**")
    print(so_dir)
    print(len(entries))
