In [None]:
# !sudo apt-get install xvfb
# !pip install bs4 selenium pyvirtualdisplay PyPDF2 requests pillow
# !pip install playwright bs4 pillow requests pypdf2
# !playwright install
# !playwright install-deps

In [1]:
import json
import os
import re
import time
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from PIL import Image
from PyPDF2 import PdfMerger
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [2]:
# Start a local Chrome session through Selenium. This single `driver` object is
# reused by the scraping cells below and is shut down by a later `driver.quit()` cell.
driver = webdriver.Chrome()

In [2]:
# Display name of the series; also used as the download folder and PDF file name.
anime_name = "Vinland Saga"

# Series landing page that lists every chapter.
url = "https://weebcentral.com/series/01J76XY7FQY59WRK2YWX5T4E5N/Vinland-Saga"

# The chapter/image index is cached as JSON under a lowercase, underscore-separated title.
_series_slug = anime_name.lower().replace(' ', '_')
json_path = f"../json/{_series_slug}_chapters.json"

In [4]:
# Open the series landing page.
driver.get(url)

# Click the button that targets "#chapter-list" (presumably it expands the list to
# every chapter -- confirm against the live page).
show_all_selector = 'button[hx-target="#chapter-list"]'
button = driver.find_element(By.CSS_SELECTOR, show_all_selector)
button.click()

# Fixed pause so the page can finish updating before its source is read.
time.sleep(5)

In [5]:
# Snapshot the browser's current DOM and parse it with the built-in HTML parser.
html_content = driver.page_source
soup = BeautifulSoup(markup=html_content, features='html.parser')

In [6]:
# Build {chapter name: {"link": ..., "images": []}} from the anchors inside #chapter-list.
chapters_map = {}
chapter_anchors = soup.find(id="chapter-list").find_all("a")

# Walk the anchors in reverse document order so the dict (and the JSON written from
# it) follows the opposite order of the page listing.
for chapter in reversed(chapter_anchors):
    link = chapter['href']

    # Only anchors pointing at chapter reader pages are kept.
    if link.startswith("https://weebcentral.com/chapters"):
        # The chapter label is read from the third <span> of the anchor.
        name = chapter.find_all("span")[2].text.strip()
        chapters_map[name] = {"link": link, "images": []}

In [None]:
# Persist the freshly built chapter index WITHOUT throwing away image links that
# were scraped on earlier runs. `chapters_map` is built with empty "images" lists,
# so blindly overwriting the file would wipe the cache that the scraping cell
# relies on to skip chapters it has already processed.

# Make sure the target folder exists; open(..., "w") does not create it.
os.makedirs(os.path.dirname(json_path) or ".", exist_ok=True)

if os.path.exists(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        previous_map = json.load(f)

    # Carry over cached image links for chapters that are still listed.
    for chapter_name, chapter_info in chapters_map.items():
        cached_images = previous_map.get(chapter_name, {}).get("images", [])
        if cached_images and not chapter_info["images"]:
            chapter_info["images"] = cached_images

with open(json_path, "w", encoding="utf-8") as f:
    json.dump(chapters_map, f, ensure_ascii=False, indent=2)

In [7]:
def get_image_links(driver, chapter_link, wait_seconds=5):
    """Open a chapter page and collect the URLs of its page images.

    Args:
        driver: Selenium WebDriver used to load the page.
        chapter_link: URL of the chapter reader page.
        wait_seconds: Fixed pause (in seconds) after navigation, giving the page
            time to load before its source is read. Defaults to 5.

    Returns:
        A tuple ``(name, chapter, image_links)``. ``name`` is the second
        ``" | "``-separated field of the page ``<title>``; ``chapter`` is the
        second whitespace-separated word of the first field; ``image_links`` is
        the list of non-empty ``src`` values of the ``<img>`` tags inside the
        reader section.

    Raises:
        AttributeError: If the page has no ``<title>`` or no reader section.
        IndexError: If the ``<title>`` text does not have the expected shape.
    """
    driver.get(chapter_link)
    time.sleep(wait_seconds)  # Wait for the page to load completely
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    title = soup.find("title").text.split(" | ")
    name, chapter = title[1], title[0].split()[1]

    image_section = soup.select_one('section[hx-include="[name=\'reading_style\']"]')
    # Skip <img> tags with a missing or empty src instead of failing with KeyError.
    image_links = [img["src"] for img in image_section.find_all("img") if img.get("src")]
    print(f"Found {len(image_links)} images for {name} - {chapter}")

    return name, chapter, image_links

In [10]:
# Load the cached index BEFORE entering the try block. If the file is missing or
# corrupt, the error surfaces here and the `finally` below never runs, so a stale
# in-memory `chapters_map` can no longer be written over the cache.
with open(json_path, "r", encoding="utf-8") as f:
    chapters_map = json.load(f)

try:
    for chapter_name, chapter_info in chapters_map.items():
        chapter_link = chapter_info["link"]

        # Chapters that already have image links are not scraped again.
        if len(chapter_info["images"]) > 0:
            print(f"Skipping {chapter_name}, already has {len(chapter_info['images'])} images.")
            continue

        name, chapter, image_links = get_image_links(driver, chapter_link)
        chapters_map[chapter_name]["images"] = image_links

finally:
    # Always save progress, including after an exception or a manual interrupt,
    # so the next run resumes where this one stopped.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(chapters_map, f, ensure_ascii=False, indent=2)

Skipping Chapter 1, already has 88 images.
Skipping Chapter 2, already has 42 images.
Skipping Chapter 3, already has 37 images.
Skipping Chapter 4, already has 26 images.
Skipping Chapter 5, already has 25 images.
Skipping Chapter 6, already has 23 images.
Skipping Chapter 7, already has 20 images.
Skipping Chapter 8, already has 21 images.
Skipping Chapter 9, already has 20 images.
Skipping Chapter 10, already has 20 images.
Skipping Chapter 11, already has 20 images.
Skipping Chapter 12, already has 20 images.
Skipping Chapter 13, already has 19 images.
Skipping Chapter 14, already has 19 images.
Skipping Chapter 15, already has 19 images.
Skipping Chapter 16, already has 41 images.
Skipping Chapter 17, already has 49 images.
Skipping Chapter 18, already has 39 images.
Skipping Chapter 19, already has 39 images.
Skipping Chapter 20, already has 38 images.
Skipping Chapter 21, already has 34 images.
Skipping Chapter 21.5, already has 17 images.
Skipping Chapter 22, already has 36 ima

In [9]:
# Write the in-memory chapter index back to the JSON cache (UTF-8, human-readable).
serialized_map = json.dumps(chapters_map, ensure_ascii=False, indent=2)
with open(json_path, "w", encoding="utf-8") as f:
    f.write(serialized_map)

In [3]:
def download_images(payload, endpoint="http://localhost:8080/download", timeout=30):
    """Submit one download job to the local download service and print the HTTP status.

    Args:
        payload: JSON-serialisable dict sent as the request body. The caller in
            this notebook sends the keys ``"folder"`` and ``"image_links"``.
        endpoint: URL of the download service. Defaults to the local instance
            on port 8080.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive server.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    res = requests.post(endpoint, json=payload, timeout=timeout)
    print(res.status_code)

In [5]:
# Reload the cached index; one download job is submitted per chapter that has images.
with open(json_path, "r", encoding="utf-8") as f:
    chapters_map = json.load(f)

for chapter_name, chapter_info in chapters_map.items():
    print("Downloading images for chapter:", chapter_name)
    image_links = chapter_info["images"]

    # Nothing to request for chapters whose image links were never collected.
    if len(image_links) == 0:
        continue

    # Folder name is the chapter name with spaces and dots turned into underscores.
    folder_name = chapter_name.replace(' ', '_').replace('.', '_')
    payload = {
        "folder": f"{anime_name}/{folder_name}",
        "image_links": image_links
    }
    download_images(payload)

Downloading images for chapter: Chapter 1
202
Downloading images for chapter: Chapter 2
202
Downloading images for chapter: Chapter 3
202
Downloading images for chapter: Chapter 4
202
Downloading images for chapter: Chapter 5
202
Downloading images for chapter: Chapter 6
202
Downloading images for chapter: Chapter 7
202
Downloading images for chapter: Chapter 8
202
Downloading images for chapter: Chapter 9
202
Downloading images for chapter: Chapter 10
202
Downloading images for chapter: Chapter 11
202
Downloading images for chapter: Chapter 12
202
Downloading images for chapter: Chapter 13
202
Downloading images for chapter: Chapter 14
202
Downloading images for chapter: Chapter 15
202
Downloading images for chapter: Chapter 16
202
Downloading images for chapter: Chapter 17
202
Downloading images for chapter: Chapter 18
202
Downloading images for chapter: Chapter 19
202
Downloading images for chapter: Chapter 20
202
Downloading images for chapter: Chapter 21
202
Downloading images for

In [None]:
def _natural_key(entry_name):
    """Sort key that compares digit runs numerically, so "Chapter_2" < "Chapter_10"."""
    # re.split with a capture group yields alternating text / digit pieces and
    # always starts with a text piece, so odd positions are the digit runs and
    # pieces at the same position are always of the same type.
    pieces = re.split(r"(\d+)", entry_name)
    return [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]


# Load every downloaded page, chapter by chapter, in reading order.
# Plain sorted() is lexicographic and would put "Chapter_10" before "Chapter_2".
images = []

for chapter_folder in sorted(os.listdir(anime_name), key=_natural_key):
    chapter_path = os.path.join(anime_name, chapter_folder)

    if os.path.isdir(chapter_path):
        for image_file in sorted(os.listdir(chapter_path), key=_natural_key):
            image_path = os.path.join(chapter_path, image_file)
            # convert() returns an independent in-memory image, so the source
            # file can be closed as soon as the conversion is done.
            with Image.open(image_path) as page:
                images.append(page.convert("RGB"))

In [None]:
# Write all collected pages into a single multi-page PDF (skipped when no pages were loaded).
if images:
    pdf_path = f"../pdfs/{anime_name}.pdf"
    # Create the output folder if needed; saving into a missing folder fails.
    os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
    # The first image is the document root; the remaining images are appended as pages.
    images[0].save(pdf_path, save_all=True, append_images=images[1:])
    print(f"Saved PDF: {pdf_path}")

In [11]:
# Shut down the Selenium-driven Chrome session created earlier in the notebook.
driver.quit()

In [None]:
import json

# Summarise the cached index: total image links and number of chapters.
with open(json_path, "r", encoding="utf-8") as f:
    chapters_map = json.load(f)

total_images = sum(len(chapter_info['images']) for chapter_info in chapters_map.values())

print(f"Total images: {total_images}")
print(f"Total chapters: {len(chapters_map)}")