In [43]:
#%pip install epub-metadata
#%pip install imagekitio

In [44]:
import epub_metadata
import base64
import os
import json
from PIL import Image
from io import BytesIO
from dotenv import load_dotenv
from imagekitio import ImageKit
from imagekitio.models.UploadFileRequestOptions import UploadFileRequestOptions
from bs4 import BeautifulSoup
from time import sleep
load_dotenv("./.env")

True

In [45]:
# Shared ImageKit client used by the upload helpers defined further down.
# The private key is read from the IMAGEKIT_PRIVATE_KEY environment variable
# (load_dotenv("./.env") runs in the import cell); os.getenv returns None when
# the variable is unset, so check the .env file if uploads are rejected.
imagekit = ImageKit(
    private_key=os.getenv('IMAGEKIT_PRIVATE_KEY'),
    public_key='public_8BuiQ9jYB1nLhO/S5XsdphxpnOY=',
    url_endpoint='https://ik.imagekit.io/boundlessbooks'
)

In [None]:
# Input/output locations, resolved against the directory the kernel was
# started in (os.getcwd()).
# Directory scanned for *.epub files to process.
EPUB_IN_DIR = os.path.join(os.getcwd(), "data/in/epubs")
# Directory where extracted cover images are written before upload.
COVER_OUT_DIR = os.path.join(os.getcwd(), "data/out/covers")
# Base name (without extension) of the JSON file holding the collected metadata.
METADATA_FILE_NAME = 'books_metadata'

In [47]:
def get_epub_files() -> list:
    """Return the paths of every EPUB file directly inside ``EPUB_IN_DIR``.

    A file qualifies when its name ends in ``.epub`` (case-insensitive).
    The names are sorted because ``os.listdir`` yields entries in arbitrary
    order; without sorting, the processing order -- and therefore the order of
    the records in the generated metadata JSON -- could differ between runs.

    Returns:
        list: ``EPUB_IN_DIR`` joined with each matching file name, in
        ascending name order.

    Raises:
        OSError: If ``EPUB_IN_DIR`` cannot be listed (e.g. it does not exist).
    """
    return [
        os.path.join(EPUB_IN_DIR, name)
        for name in sorted(os.listdir(EPUB_IN_DIR))
        if name.lower().endswith('.epub')
    ]

def get_metadata_from_epub(epub: epub_metadata.epub) -> dict:
    """Build the metadata record for an already-opened EPUB.

    Title, author and publisher are copied from ``epub.metadata``. The
    synopsis is the book description with its HTML markup stripped. The
    remaining fields are placeholders: ``category`` is empty, ``releaseDate``
    is ``"1970/01/01"``, ``pages`` is ``-1`` and both URL fields are empty
    (this function does not fill them in).

    Args:
        epub: Parsed EPUB exposing ``metadata.title``, ``metadata.creator``,
            ``metadata.publisher`` and ``metadata.description``.

    Returns:
        dict: The metadata record with the keys listed above.
    """
    metadata = {
        "title": "",
        "author": "",
        "publisher": "",
        "category": "",
        "synopsis": "",
        "releaseDate": "1970/01/01",
        "pages": -1,
        "coverImageUrl": "",
        "epubUrl": ""
    }
    metadata["title"] = epub.metadata.title
    metadata["author"] = epub.metadata.creator
    metadata["publisher"] = epub.metadata.publisher

    # Fall back to an empty string so a missing (None) description produces
    # an empty synopsis instead of a TypeError inside BeautifulSoup.
    description_html = epub.metadata.description or ""
    soup = BeautifulSoup(description_html, 'html.parser')
    metadata["synopsis"] = soup.get_text()

    return metadata

def save_cover_image_from_epub(epub: epub_metadata.epub) -> str:
    """Decode the EPUB's base64 cover and save it as a JPEG in ``COVER_OUT_DIR``.

    The file is named ``"<title> - <creator>_cover.jpeg"``. Path separators in
    the title or creator are replaced with ``_`` so the name always refers to
    a file directly inside ``COVER_OUT_DIR``.

    Args:
        epub: Parsed EPUB exposing ``metadata.cover`` (base64-encoded image),
            ``metadata.title`` and ``metadata.creator``.

    Returns:
        str: The file name (not the full path) of the saved cover image.
    """
    cover_data = base64.b64decode(epub.metadata.cover)
    cover_image = Image.open(BytesIO(cover_data))

    # The JPEG writer rejects images with an alpha channel or a palette
    # (typical for PNG covers), so convert those to plain RGB first.
    if cover_image.mode not in ("L", "RGB", "CMYK"):
        cover_image = cover_image.convert("RGB")

    cover_image_name = f"{epub.metadata.title} - {epub.metadata.creator}_cover.jpeg"
    # Title/creator come from the book itself; a separator in either would
    # make the save target a (probably non-existent) sub-directory.
    for separator in ("/", os.sep):
        cover_image_name = cover_image_name.replace(separator, "_")

    # Make sure the output directory exists on a fresh checkout.
    os.makedirs(COVER_OUT_DIR, exist_ok=True)
    cover_image_path = f'{COVER_OUT_DIR}/{cover_image_name}'
    cover_image.save(cover_image_path)

    print(f"Cover image saved at {cover_image_path}")

    return cover_image_name

def upload_cover_image_to_imagekit(cover_image_name: str) -> str:
    """Upload a saved cover image to the ``/cover-images`` ImageKit folder.

    Args:
        cover_image_name: File name of the cover inside ``COVER_OUT_DIR``.

    Returns:
        str: The ``url`` attribute of the ImageKit upload result.
    """
    options = UploadFileRequestOptions(
        folder='/cover-images',
        is_private_file=False,
        overwrite_file=True,
        is_published=True
    )

    # Context manager so the file handle is closed even if the upload raises;
    # previously the handle was opened and never closed.
    with open(f'{COVER_OUT_DIR}/{cover_image_name}', 'rb') as file:
        result = imagekit.upload_file(file=file, file_name='cover.jpg', options=options)

    print(f"Cover image uploaded to ImageKit at {result.url}")

    return result.url

def upload_epub_to_imagekit(epub_file: str) -> str:
    """Upload an EPUB file to the ``/epubs`` ImageKit folder.

    Args:
        epub_file: Path of the EPUB file on disk.

    Returns:
        str: The ``url`` attribute of the ImageKit upload result.
    """
    options = UploadFileRequestOptions(
        folder='/epubs',
        use_unique_file_name=True,
        is_private_file=False,
        overwrite_file=True,
        is_published=True
    )

    # Context manager so the file handle is closed even if the upload raises;
    # previously the handle was opened and never closed.
    with open(epub_file, 'rb') as file:
        result = imagekit.upload_file(file=file, file_name='epub.epub', options=options)

    print(f"EPUB file uploaded to ImageKit at {result.url}")

    return result.url

def process_epub(epub_file: str) -> dict:
    """Run the full pipeline for a single EPUB and return its metadata record.

    Steps, in order: parse the EPUB, build its metadata record, save the
    cover image locally, upload the cover, upload the EPUB itself, and store
    both resulting URLs in the record.

    Args:
        epub_file: Path of the EPUB file to process.

    Returns:
        dict: The metadata record with ``coverImageUrl`` and ``epubUrl`` set.
    """
    book = epub_metadata.epub(epub_file)
    record = get_metadata_from_epub(book)

    # save_cover_image_from_epub returns a file name, not a path.
    cover_name = save_cover_image_from_epub(book)

    record["coverImageUrl"] = upload_cover_image_to_imagekit(cover_name)
    record["epubUrl"] = upload_epub_to_imagekit(epub_file)
    return record

def process_epubs(epub_files: list) -> list:
    """Process every EPUB and write the collected metadata to a JSON file.

    Each file goes through ``process_epub``, followed by a one-second pause.
    The resulting records are written to
    ``data/out/metadata/<METADATA_FILE_NAME>.json`` (relative to the current
    working directory) and also returned.

    Args:
        epub_files: Paths of the EPUB files to process.

    Returns:
        list: One metadata dict per input file, in input order.
    """
    metadata_list = []
    for epub_file in epub_files:
        metadata_list.append(process_epub(epub_file))
        # NOTE(review): presumably throttles the upload requests -- confirm.
        sleep(1)

    metadata_path = f"data/out/metadata/{METADATA_FILE_NAME}.json"
    # Create the output directory on a fresh checkout instead of failing
    # after all the uploads have already been done.
    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
    # ensure_ascii=False writes accented characters verbatim, so pin UTF-8
    # rather than relying on the platform default encoding; the context
    # manager guarantees the file is flushed and closed.
    with open(metadata_path, "w", encoding="utf-8") as metadata_file:
        json.dump(metadata_list, metadata_file, indent=4, ensure_ascii=False)
    return metadata_list

In [48]:
# Run the whole pipeline on every EPUB in EPUB_IN_DIR. This uploads files to
# ImageKit and rewrites the metadata JSON, so re-running the cell repeats the uploads.
process_epubs(get_epub_files())

Cover image saved at /Users/rodrigo/Dev/infnet/extension/boundless-books/notebook/data/out/covers/Cinquenta Tons Mais Escuros - vol.2 - E.L. James_cover.jpeg
Cover image uploaded to ImageKit at https://ik.imagekit.io/boundlessbooks/cover-images/cover_yuwWgHj94.jpg
EPUB file uploaded to ImageKit at https://ik.imagekit.io/boundlessbooks/epubs/epub_MJaJnn3I98.epub
Cover image saved at /Users/rodrigo/Dev/infnet/extension/boundless-books/notebook/data/out/covers/Cinquenta Tons de Cinza - E L James_cover.jpeg
Cover image uploaded to ImageKit at https://ik.imagekit.io/boundlessbooks/cover-images/cover_kb2HVA-OO.jpg
EPUB file uploaded to ImageKit at https://ik.imagekit.io/boundlessbooks/epubs/epub_N3rAKXoY4.epub
Cover image saved at /Users/rodrigo/Dev/infnet/extension/boundless-books/notebook/data/out/covers/Cinquenta Tons De Liberdade - vol.3 - E.L. James_cover.jpeg
Cover image uploaded to ImageKit at https://ik.imagekit.io/boundlessbooks/cover-images/cover_hlNU0wXSD.jpg
EPUB file uploaded to

[{'title': 'Cinquenta Tons Mais Escuros - vol.2',
  'author': 'E.L. James',
  'publisher': 'Top Livros',
  'category': '',
  'synopsis': '',
  'releaseDate': '1970/01/01',
  'pages': -1,
  'coverImageUrl': 'https://ik.imagekit.io/boundlessbooks/cover-images/cover_yuwWgHj94.jpg',
  'epubUrl': 'https://ik.imagekit.io/boundlessbooks/epubs/epub_MJaJnn3I98.epub'},
 {'title': 'Cinquenta Tons de Cinza',
  'author': 'E L James',
  'publisher': 'LeLivros.com',
  'category': '',
  'synopsis': 'by\xa0LeLivros.comQuando Anastasia Steele entrevista o jovem empresário Christian Grey, descobre nele um homem atraente, brilhante e profundamente dominador. Ingênua e inocente, Ana se surpreende ao perceber que, a despeito da enigmática reserva de Grey, está desesperadamente atraída por ele. Incapaz de resistir à beleza discreta, à timidez e ao espírito independente de Ana, Grey admite que também a deseja - mas em seu próprios termos...',
  'releaseDate': '1970/01/01',
  'pages': -1,
  'coverImageUrl': 'h