<a href="https://colab.research.google.com/github/rahim215/rahimhalani/blob/main/comicAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install aiohttp
!pip install nest_asyncio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiosignal>=1.

In [2]:
import aiohttp
import numpy as np
import cv2
import os
from google.colab import drive

# Mount Google Drive at /content/drive; download_comic_books (below) writes
# its output underneath this mount point, so this must run first.
drive.mount('/content/drive')

def _save_page_and_panels(image, page_path, page_num, min_panel_area):
    """Write the full page image plus one crop per large contour into ``page_path``."""
    # Otsu-thresholded, inverted binary image: external contours of this mask
    # are treated as candidate panels.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    os.makedirs(page_path, exist_ok=True)

    for i, contour in enumerate(contours):
        # Small contours are skipped; the panel number is the contour index,
        # so file numbering can have gaps.
        if cv2.contourArea(contour) > min_panel_area:
            x, y, w, h = cv2.boundingRect(contour)
            panel = image[y:y+h, x:x+w]
            cv2.imwrite(os.path.join(page_path, f"panel{i+1:02d}.jpg"), panel)

    cv2.imwrite(os.path.join(page_path, f"page{page_num:02d}.jpg"), image)


async def download_comic_books(comic_urls,
                               output_root='/content/drive/MyDrive/Colab Notebooks',
                               min_panel_area=5000):
    """Download every page of each chapter URL and split the pages into panels.

    For each URL in ``comic_urls`` the pages ``01.jpg``, ``02.jpg``, ... are
    requested in order until the server answers 404. Each page is saved to
    ``<output_root>/comics/<comic>/<chapter>/pageNN/pageNN.jpg`` together with
    ``panelMM.jpg`` crops for every external contour whose area exceeds
    ``min_panel_area``.

    Args:
        comic_urls: Iterable of chapter base URLs containing ``chapters/<name>``
            (a trailing ``/`` is tolerated).
        output_root: Directory under which the ``comics`` tree is created.
        min_panel_area: Minimum contour area (in pixels) for a crop to be saved.

    Raises:
        IndexError: If a URL does not contain ``chapters/``.
    """
    # One session is reused for all chapters instead of reconnecting per chapter.
    async with aiohttp.ClientSession() as session:
        for comic_url in comic_urls:
            # Parse relative to the "chapters/" marker so a trailing slash does
            # not shift the comic name.
            parts = comic_url.split('chapters/')
            chapter_name = parts[1].split('/')[0]
            comic_name = parts[0].rstrip('/').split('/')[-1]
            chapter_base = comic_url.rstrip('/')

            comic_path = os.path.join(output_root, 'comics', comic_name, chapter_name)
            os.makedirs(comic_path, exist_ok=True)

            page_num = 1
            while True:
                page_url = f"{chapter_base}/{page_num:02d}.jpg"
                print(page_url)

                print(f"Downloading image from {page_url}...")
                async with session.get(page_url) as response:
                    # 404 marks the end of the chapter.
                    if response.status == 404:
                        break
                    # Any other error status would yield a non-image body; stop
                    # this chapter instead of crashing while decoding it.
                    if response.status >= 400:
                        print(f"Stopping chapter: HTTP {response.status} for {page_url}")
                        break
                    payload = await response.read()

                image = None
                if payload:
                    image_array = np.asarray(bytearray(payload), dtype=np.uint8)
                    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)

                # imdecode returns None for data it cannot decode. Stop rather
                # than skip, so a server answering 200 with non-image content
                # for missing pages cannot cause an endless loop.
                if image is None:
                    print(f"Stopping chapter: could not decode image from {page_url}")
                    break

                page_path = os.path.join(comic_path, f"page{page_num:02d}")
                _save_page_and_panels(image, page_path, page_num, min_panel_area)

                page_num += 1

    print("Done!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import aiohttp
import asyncio
import cv2
import numpy as np
import os
import requests
from bs4 import BeautifulSoup

async def get_chapter_urls(comic_url, timeout=30):
    """Return the image base URL of every chapter listed on a comic's page.

    The page at ``comic_url`` is fetched and each ``<h5 class="chapter-title-rtl">``
    element's first link is read; the last path segment of that link is used
    as the chapter name. URLs are returned in page order as
    ``<base>/chapters/<chapter_name>``, where ``<base>`` is ``comic_url`` with
    the ``/comic/`` prefix swapped for ``/uploads/manga/``.

    Args:
        comic_url: Comic page URL under ``https://readcomicsonline.ru/comic/``.
            Other prefixes are left unchanged by the rewrite.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        list[str]: One chapter base URL per chapter link found (possibly empty).

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # requests is blocking; run it in the default executor so the event loop
    # is not stalled while waiting on the network.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None, lambda: requests.get(comic_url, timeout=timeout)
    )
    # Fail loudly instead of parsing an error page as if it were the comic page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Strip a trailing slash so the joined URL never contains "//chapters".
    base_url = comic_url.rstrip('/').replace(
        'https://readcomicsonline.ru/comic/',
        'https://readcomicsonline.ru/uploads/manga/',
    )

    chapter_urls = []
    for chapter_heading in soup.find_all('h5', {'class': 'chapter-title-rtl'}):
        a_tag = chapter_heading.find('a')
        # Skip headings that carry no usable link rather than raising TypeError.
        if a_tag is None or not a_tag.get('href'):
            continue
        chapter_name = a_tag['href'].rstrip('/').split('/')[-1]
        chapter_urls.append(f"{base_url}/chapters/{chapter_name}")

    return chapter_urls


In [4]:
import asyncio

async def download_comic_book_from_url(comic_url):
    """Download all chapters of the comic at ``comic_url``.

    The chapter URLs are resolved with ``get_chapter_urls`` and the resulting
    list is passed straight to ``download_comic_books``. Returns ``None``.
    """
    await download_comic_books(await get_chapter_urls(comic_url))

# Example usage
# NOTE: the bare top-level `await` below works in a notebook cell (IPython runs
# it on its event loop); it is a SyntaxError in a plain Python script.
comic_url = "https://readcomicsonline.ru/comic/2020-force-works-2020"
await download_comic_book_from_url(comic_url)


https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/01.jpg
Downloading image from https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/01.jpg...
https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/02.jpg
Downloading image from https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/02.jpg...
https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/03.jpg
Downloading image from https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/03.jpg...
https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/04.jpg
Downloading image from https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/04.jpg...
https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/05.jpg
Downloading image from https://readcomicsonline.ru/uploads/manga/2020-force-works-2020/chapters/3/05.jpg...
https://readcomicsonline.ru/uploads/manga/2020-for