In [1]:
from collections import defaultdict
from dataclasses import dataclass
import json
import requests
import gzip
import io
from tqdm import tqdm

# Open a streaming download of the latest Open Library editions dump (gzip file).
# stream=True keeps the body on the socket so it can be consumed incrementally
# via response.raw instead of being loaded into memory up front.
# The timeout is (connect seconds, seconds between received bytes); without it a
# stalled connection would block this cell indefinitely.
response = requests.get(
    "https://openlibrary.org/data/ol_dump_editions_latest.txt.gz",
    stream=True,
    timeout=(10, 60),
)
response.raise_for_status()

# Size of the download in bytes as reported by the Content-Length header
# (0 when the server does not send one). Used as the progress-bar total.
total_size = int(response.headers.get('content-length', 0))

@dataclass(frozen=True)
class CountClass:
    """Immutable, hashable bucket key describing one edition record.

    Each distinct combination of flags is used as a dictionary key when
    tallying editions, which is why the dataclass is frozen.
    """

    # The record has an "ocaid" key.
    ocaid: bool
    # The record has a "table_of_contents" key.
    toc: bool
    # At least one table-of-contents entry carries a truthy "pagenum".
    toc_paginated: bool
    # The first table-of-contents entry is a plain string.
    toc_str: bool
    # The table of contents is present but has no entries.
    toc_empty: bool


# Number of editions seen for each distinct CountClass flag combination.
counts = defaultdict(int)


def _classify_edition(data):
    """Return the CountClass bucket for one parsed edition record.

    Args:
        data: The decoded JSON object of a single dump line.
            NOTE(review): assumed to be a dict whose "table_of_contents",
            when present, is a list of strings or of dict-like entries --
            confirm against the dump schema.

    Returns:
        A CountClass whose flags describe the presence of "ocaid" and the
        shape of "table_of_contents".

    Raises:
        Whatever the lookups raise on an unexpected record shape (e.g. a
        non-string first entry followed by entries without ``.get``); the
        caller reports the offending record and re-raises.
    """
    has_toc = "table_of_contents" in data
    toc_empty = toc_str = toc_paginated = False
    if has_toc:
        toc = data['table_of_contents']
        toc_empty = len(toc) == 0
        if not toc_empty:
            # Only the first entry decides whether the TOC counts as string-style.
            toc_str = isinstance(toc[0], str)
            if not toc_str:
                toc_paginated = any(chapter.get('pagenum') for chapter in toc)

    return CountClass(
        ocaid="ocaid" in data,
        toc=has_toc,
        toc_paginated=toc_paginated,
        toc_str=toc_str,
        toc_empty=toc_empty,
    )


with (
    response,  # close the streamed HTTP connection when done or on error
    tqdm(total=total_size, unit="B", unit_scale=True, desc="Processing") as pbar,
    gzip.GzipFile(fileobj=response.raw, mode='rb') as editions_gzip,
):
    for line in io.TextIOWrapper(editions_gzip, encoding='utf-8'):
        # Each dump line is tab-separated; the JSON document is the last of
        # (at most) five fields.
        data = json.loads(line.split('\t', 4)[-1])
        try:
            counts[_classify_edition(data)] += 1
        except Exception:
            # Show which record broke the classifier before propagating.
            pbar.set_postfix_str(data['key'].split('/', 2)[-1])
            raise
        finally:
            # total_size is the *compressed* size, so progress must be measured
            # in compressed bytes pulled from the socket. The gzip stream's own
            # tell() reports the decompressed offset and would overshoot.
            pbar.update(response.raw.tell() - pbar.n)

counts

Processing:   2%|▏         | 242M/10.1G [00:34<23:09, 7.08MB/s]  


KeyboardInterrupt: 