## Import libraries

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from PIL import Image
import requests
import time
from io import BytesIO
from PyPDF2 import PdfMerger
import os
import json
from natsort import natsorted
import zipfile

## Data Variables

In [None]:
# Series page to scrape and the display name used for output files.
url = "https://weebcentral.com/series/01J76XY7FQY59WRK2YWX5T4E5N/Vinland-Saga"
anime_name = "Vinland Saga"

# Everything for this series is stored under <base_dir>/<lower-cased name>,
# with dots in the name replaced by underscores.
base_dir = "../data"
anime_path = os.path.join(base_dir, anime_name.lower().replace('.', '_'))

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(anime_path, exist_ok=True)

## Image Link Scrape

In [None]:
# Start a Chrome WebDriver session and load the series page.
driver = webdriver.Chrome()
driver.get(url)

In [None]:
# Click the button whose hx-target is "#chapter-list", then pause for five
# seconds so the page has time to update before its source is read.
# NOTE(review): presumably this button expands the full chapter list — confirm.
button = driver.find_element(By.CSS_SELECTOR, 'button[hx-target="#chapter-list"]')
button.click()
time.sleep(5)

In [None]:
# Take the page source as currently held by the browser and parse it with
# BeautifulSoup's built-in HTML parser.
html_content = driver.page_source
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
chapters_file = os.path.join(anime_path, "chapters.json")

# Load the result of a previous run, if any, so that re-running this cell does
# not throw away image links that were already scraped into chapters.json.
try:
    with open(chapters_file, "r", encoding="utf-8") as f:
        previous_map = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    previous_map = {}

chapters_map = {}

# Anchors are visited in the reverse of their order on the page.
for chapter in reversed(soup.find(id="chapter-list").find_all("a")):
    # Anchors without an href are treated like any other non-chapter link.
    link = chapter.get("href", "")
    if not link.startswith("https://weebcentral.com/chapters"):
        continue

    # The chapter title is the text of the third <span> inside the anchor.
    name = chapter.find_all("span")[2].text.strip()

    # Keep cached image links only when they belong to the same chapter URL.
    cached = previous_map.get(name, {})
    images = cached.get("images", []) if cached.get("link") == link else []

    chapters_map[name] = {
        "link": link,
        "images": images
    }

with open(chapters_file, "w", encoding="utf-8") as f:
    json.dump(chapters_map, f, ensure_ascii=False, indent=2)

In [None]:
def get_image_links(driver, chapter_link):
    """Load a chapter page and collect the URLs of its page images.

    Args:
        driver: Selenium WebDriver used to open the page.
        chapter_link: URL of the chapter page.

    Returns:
        A ``(name, chapter, image_links)`` tuple. ``name`` and ``chapter``
        are parsed from the page ``<title>``, which is expected to look like
        ``"<word> <chapter> | <name> | ..."``; both are ``"unknown"`` when the
        title does not have that shape. ``image_links`` holds the ``src`` of
        every ``<img>`` inside the reader section, and is empty when that
        section is missing from the page.
    """
    driver.get(chapter_link)
    time.sleep(5)  # Fixed pause to give the page time to load completely

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The title is only used for reporting, so a malformed one must not abort
    # the scrape.
    title_tag = soup.find("title")
    title_parts = title_tag.text.split(" | ") if title_tag is not None else []
    try:
        name, chapter = title_parts[1], title_parts[0].split()[1]
    except IndexError:
        name, chapter = "unknown", "unknown"

    image_section = soup.select_one('section[hx-include="[name=\'reading_style\']"]')
    if image_section is None:
        # select_one returns None when nothing matches (e.g. the page did not
        # finish loading). Returning no links lets the caller retry later.
        print(f"No image section found for {chapter_link}")
        return name, chapter, []

    # Skip <img> tags that carry no src attribute instead of raising KeyError.
    image_links = [img["src"] for img in image_section.find_all("img") if img.get("src")]
    print(f"Found {len(image_links)} images for {name} - {chapter}")

    return name, chapter, image_links

In [None]:
chapters_file = os.path.join(anime_path, "chapters.json")

try:
    with open(chapters_file, "r", encoding="utf-8") as f:
        chapters_map = json.load(f)

    try:
        for chapter_name, chapter_info in chapters_map.items():
            # Chapters that already have image links are not scraped again.
            if chapter_info["images"]:
                print(f"Skipping {chapter_name}, already has {len(chapter_info['images'])} images.")
                continue
            chapter_info["images"] = get_image_links(driver, chapter_info["link"])[2]
    finally:
        # Persist whatever was collected, even when scraping is interrupted.
        # This only runs once the file was loaded successfully, so a failed
        # load can never overwrite chapters.json with stale data.
        with open(chapters_file, "w", encoding="utf-8") as f:
            json.dump(chapters_map, f, ensure_ascii=False, indent=2)

finally:
    # Always release the browser, including when saving the file fails.
    driver.quit()

## Download images in the background using the API server

In [None]:
def download_images(payload, endpoint="http://localhost:8080/download", timeout=None):
    """POST a download request to the download server and print the HTTP status.

    Args:
        payload: JSON-serialisable request body.
        endpoint: URL the request is posted to.
        timeout: Timeout in seconds handed to ``requests.post``. The default
            ``None`` waits indefinitely for the server to answer.
    """
    res = requests.post(endpoint, json=payload, timeout=timeout)
    print(res.status_code)

In [None]:
with open(os.path.join(anime_path, "chapters.json"), "r", encoding="utf-8") as f:
    chapters_map = json.load(f)

# The file is only needed for loading; the requests are sent after it is closed.
for chapter_name, chapter_info in chapters_map.items():
    image_links = chapter_info["images"]
    if not image_links:
        print("Skipping chapter with no image links:", chapter_name)
        continue

    # NOTE(review): the folder uses anime_name as-is, while anime_path is
    # lower-cased — confirm the server stores files where later cells look.
    payload = {
        "folder": f"{anime_name}/images/{chapter_name.replace(' ', '_').replace('.', '_')}",
        "image_links": image_links
    }
    # download_images prints the status code; the separator keeps it from
    # running straight into the chapter name on the same output line.
    print("Downloading images for chapter:", chapter_name, end=" -> ")
    download_images(payload)

## Generate CBZ (zip) files

In [None]:
def _write_cbz(cbz_path, entries):
    """Create a CBZ archive at ``cbz_path`` from ``(source_path, archive_name)`` pairs."""
    with zipfile.ZipFile(cbz_path, "w", zipfile.ZIP_DEFLATED) as cbz:
        for source_path, archive_name in entries:
            cbz.write(source_path, archive_name)


def generate_cbz_files(img_folders_path, out_dir, name, batch_size=0):
    """Pack the images found in the sub-folders of a directory into CBZ archives.

    Sub-folders and the files inside them are processed in natural sort
    order. Only regular files ending in ``.png`` or ``.jpg`` are included.
    Each image is stored as ``<sub-folder>/<file name>`` so that files with
    the same name in different sub-folders do not collide in the archive.

    Args:
        img_folders_path: Directory containing one sub-folder of images per
            chapter.
        out_dir: Directory the archives are written to; created if missing.
        name: Base name of the archives. Files are named ``"<name> <i>.cbz"``
            with ``i`` starting at 1.
        batch_size: When greater than 0, an archive is closed after the first
            sub-folder that pushes the pending image count above this value,
            so a sub-folder is never split across archives. With 0, all
            images go into a single archive.
    """
    os.makedirs(out_dir, exist_ok=True)
    entries, i = [], 1

    for folder in natsorted(os.scandir(img_folders_path), key=lambda e: e.name):
        if not folder.is_dir():
            continue

        for file in natsorted(os.scandir(folder.path), key=lambda e: e.name):
            # A tuple suffix keeps the is_file() check applied to both
            # extensions (the former "a and b or c" form skipped it for .jpg).
            if file.is_file() and file.name.endswith((".png", ".jpg")):
                entries.append((file.path, f"{folder.name}/{file.name}"))

        if batch_size > 0 and len(entries) > batch_size:
            _write_cbz(os.path.join(out_dir, name + f" {i}.cbz"), entries)
            entries, i = [], i + 1

    # Do not leave an empty archive behind when nothing is pending.
    if entries:
        _write_cbz(os.path.join(out_dir, name + f" {i}.cbz"), entries)


In [None]:
# Arguments are (image folders, output directory, base name). The archives are
# written into the series directory; batch_size=0 produces a single archive.
generate_cbz_files(os.path.join(anime_path, "images"), anime_path, anime_name, batch_size=0)

## Generate PDF

In [None]:
def _save_images_as_pdf(image_paths, pdf_path):
    """Write the images at ``image_paths`` to ``pdf_path``, one image per page."""
    pages = []
    try:
        for image_path in image_paths:
            with Image.open(image_path) as img:
                # Normalise to RGB: Pillow's PDF writer does not accept every
                # image mode (e.g. RGBA on older releases).
                pages.append(img.convert("RGB"))
        pages[0].save(pdf_path, save_all=True, append_images=pages[1:])
    finally:
        for page in pages:
            page.close()


def generate_pdf_files(img_folders_path, out_dir, name, batch_size=0):
    """Combine the images found in the sub-folders of a directory into PDF files.

    Sub-folders and the files inside them are processed in natural sort
    order. Only regular files ending in ``.png`` or ``.jpg`` are included,
    and every image becomes one PDF page.

    Args:
        img_folders_path: Directory containing one sub-folder of images per
            chapter.
        out_dir: Directory the PDFs are written to; created if missing.
        name: Base name of the PDFs. Files are named ``"<name> <i>.pdf"``
            with ``i`` starting at 1.
        batch_size: When greater than 0, a PDF is written after the first
            sub-folder that pushes the pending image count above this value,
            so a sub-folder is never split across files. With 0, all images
            go into a single PDF.
    """
    os.makedirs(out_dir, exist_ok=True)
    image_paths, i = [], 1

    for folder in natsorted(os.scandir(img_folders_path), key=lambda e: e.name):
        if not folder.is_dir():
            continue

        for file in natsorted(os.scandir(folder.path), key=lambda e: e.name):
            # A tuple suffix keeps the is_file() check applied to both
            # extensions (the former "a and b or c" form skipped it for .jpg).
            if file.is_file() and file.name.endswith((".png", ".jpg")):
                image_paths.append(file.path)

        if batch_size > 0 and len(image_paths) > batch_size:
            _save_images_as_pdf(image_paths, os.path.join(out_dir, name + f" {i}.pdf"))
            image_paths, i = [], i + 1

    if image_paths:
        _save_images_as_pdf(image_paths, os.path.join(out_dir, name + f" {i}.pdf"))

In [None]:
# Arguments are (image folders, output directory, base name). The PDFs are
# written into the series directory; batch_size=0 produces a single PDF.
generate_pdf_files(os.path.join(anime_path, "images"), anime_path, anime_name, batch_size=0)