In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import concurrent.futures
import json
from pathlib import Path

from tqdm.notebook import tqdm

from docile.dataset import Dataset


def get_page_sizes(dataset: Dataset):
    """Collect the image size of every page of every document in ``dataset``.

    For each document the page images are requested one by one through
    ``doc.page_image(page)`` and the ``size`` attribute of each returned image
    is recorded.

    Returns:
        A dict mapping ``doc.docid`` to a list holding one ``size`` value per
        page, in page order.
    """
    # The bar is constructed with disable=True, so it is only bookkeeping and is
    # never rendered.
    progress = tqdm(total=dataset.total_page_count(), desc=f"Generating images for {dataset}", disable=True)
    sizes_by_docid = {}
    with progress:
        for document in dataset:
            sizes = []
            for page_index in range(document.page_count):
                sizes.append(document.page_image(page_index).size)
                progress.update()
            sizes_by_docid[document.docid] = sizes
    return sizes_by_docid

def store_page_sizes(dataset: Dataset):
    """Compute the page sizes of ``dataset`` and cache them as a JSON file.

    The target file lives under
    ``<dataset path>/additional_resources/page_sizes/`` and is named after
    ``dataset.split_name`` with its suffix set to ``.json``. If that file already
    exists nothing is computed and the function returns immediately, so the call
    can be repeated safely.

    The JSON content is the dict returned by ``get_page_sizes`` (docid -> list of
    page sizes), serialised with ``indent=2``.
    """
    page_sizes_path = (
        dataset.data_paths.dataset_path.full_path / "additional_resources" / "page_sizes" / dataset.split_name
    ).with_suffix(".json")
    if page_sizes_path.exists():
        return
    page_sizes = get_page_sizes(dataset)
    # parents=True: "additional_resources" itself may not exist yet on a fresh dataset.
    page_sizes_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temporary file and rename it into place. Because an
    # existing target file short-circuits the function, a half-written file left
    # behind by an interrupted run would otherwise be treated as complete forever.
    tmp_path = page_sizes_path.with_name(page_sizes_path.name + ".tmp")
    tmp_path.write_text(json.dumps(page_sizes, indent=2))
    tmp_path.replace(page_sizes_path)
    
def store_page_sizes_parallel(dataset: Dataset, pages_per_chunk: int, processes: int) -> None:
    """Run ``store_page_sizes`` over chunks of ``dataset`` in a process pool.

    The dataset is split with ``dataset.chunk(max_pages_per_chunk=pages_per_chunk)``.
    The chunks are iterated once only to count them (used as progress bar totals)
    and a second time to submit one ``store_page_sizes`` job per chunk.

    Jobs are submitted lazily: ``2 * processes`` jobs are queued up front and one
    further job is submitted each time a finished job is collected successfully.

    If any job raises, every future that is still pending is cancelled and the
    exception is re-raised.
    """
    chunk_total = sum(
        1 for _ in tqdm(dataset.chunk(max_pages_per_chunk=pages_per_chunk), desc="Counting chunks")
    )
    # Unique marker returned by next() once all chunks have been handed out.
    exhausted = object()
    with concurrent.futures.ProcessPoolExecutor(processes) as executor:
        chunk_source = iter(
            tqdm(
                dataset.chunk(max_pages_per_chunk=pages_per_chunk),
                total=chunk_total,
                desc="Chunks submitted",
                position=0,
            )
        )

        def submit_next_chunk():
            """Submit the next chunk; return a set with its future, or an empty set when none is left."""
            chunk = next(chunk_source, exhausted)
            if chunk is exhausted:
                return set()
            return {executor.submit(store_page_sizes, chunk)}

        with tqdm(total=chunk_total, desc="Page sizes computed for chunks", position=1) as finished_bar:
            pending = set()
            # Prime the pool with twice as many jobs as workers.
            for _ in range(2 * processes):
                pending.update(submit_next_chunk())
            while pending:
                finished, pending = concurrent.futures.wait(
                    pending,
                    return_when=concurrent.futures.FIRST_COMPLETED,
                )
                for future in finished:
                    try:
                        # Surfaces any exception raised inside the worker.
                        future.result()
                    except Exception as error:
                        for other in pending:
                            other.cancel()
                        raise error
                    # Refill: one new job for each job collected.
                    pending.update(submit_next_chunk())
                finished_bar.update(len(finished))

In [3]:
from docile.dataset import CachingConfig, Dataset

docile_final = Dataset("all", "/datasets/docile221221-0", load_annotations=False, load_ocr=False, cache_images=CachingConfig.OFF)

In [4]:
store_page_sizes_parallel(docile_final, pages_per_chunk=1000, processes=10)

Counting chunks: 0it [00:00, ?it/s]

Chunks submitted:   0%|          | 0/9 [00:00<?, ?it/s]

Page sizes computed for chunks:   0%|          | 0/9 [00:00<?, ?it/s]

In [3]:
from docile.dataset import CachingConfig, Dataset

docile_synthetic = Dataset("synthetic", "/datasets/docile221221-0", load_annotations=False, load_ocr=False, cache_images=CachingConfig.OFF)

Loading documents for docile221221-0:synthetic: 100%|██████████| 100000/100000 [00:04<00:00, 24257.27it/s]


In [4]:
store_page_sizes_parallel(docile_synthetic, pages_per_chunk=1000, processes=10)

Counting chunks: 0it [00:00, ?it/s]

Chunks submitted:   0%|          | 0/100 [00:00<?, ?it/s]

Page sizes computed for chunks:   0%|          | 0/100 [00:00<?, ?it/s]

In [5]:
from docile.dataset import CachingConfig, Dataset

docile_pretraining = Dataset("pretraining-all", "/datasets/docile_pretraining_v1_2022_12_22", load_annotations=False, load_ocr=False, cache_images=CachingConfig.OFF)

Loading documents for docile_pretraining_v1_2022_12_22:pretraining-all: 100%|██████████| 932467/932467 [00:34<00:00, 26656.33it/s]


In [6]:
store_page_sizes_parallel(docile_pretraining, pages_per_chunk=1000, processes=10)

Counting chunks: 0it [00:00, ?it/s]

Chunks submitted:   0%|          | 0/3431 [00:00<?, ?it/s]

Page sizes computed for chunks:   0%|          | 0/3431 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



# Profile dataset creation (with annotation and OCR loading turned off)

In [7]:
assert False, "do not run automatically under here"

AssertionError: do not run automatically under here

In [None]:
import cProfile

In [None]:
from docile.dataset import CachingConfig, Dataset

cProfile.run('Dataset("synthetic", "/datasets/docile221221-0", load_annotations=False, load_ocr=False, cache_images=CachingConfig.OFF)')

In [None]:
from docile.dataset import CachingConfig, Dataset

cProfile.run('Dataset("synthetic", "/datasets/docile221221-0", load_annotations=False, load_ocr=False, cache_images=CachingConfig.OFF)')

In [None]:
cProfile.run('Dataset("synthetic", "/datasets/docile221221-0", load_annotations=False, load_ocr=False, cache_images=CachingConfig.OFF)')

In [None]:
cProfile.run('Dataset("pretraining-chunk-00", "/datasets/docile_pretraining_v1_2022_12_22", load_annotations=False, load_ocr=False, cache_images=CachingConfig.OFF)')

In [None]:
chunks_num = sum(1 for _ in tqdm(docile_pretraining.chunk(max_pages_per_chunk=1000)))

In [None]:
def get_page_sizes_parallel(dataset: Dataset, pages_per_chunk: int, processes: int) -> None:
    """Unfinished stub: currently does no work and returns ``None``.

    NOTE(review): none of the parameters are used yet.
    """
    # Bare expression: looks up the notebook-global `chunks_num` and discards the
    # value (raises NameError if that global has not been defined).
    chunks_num

In [None]:
# Compute page sizes for the first chunk only. `get_page_sizes_for_chunk` is not
# defined anywhere in this notebook; `get_page_sizes` is the function that takes a
# dataset (chunks are passed to it via `store_page_sizes`).
get_page_sizes(next(docile_pretraining.chunk(max_pages_per_chunk=1000)))

In [None]:
from tqdm.notebook import tqdm

sum(1 for _ in tqdm(docile_pretraining.chunk(max_pages_per_chunk=100000)))

In [None]:
docile_final_sample = docile_final.sample(15, seed=47).load()

In [None]:
[doc._open for doc in docile_final[:5]]

In [None]:
[doc._open for doc in list(docile_final[:5])]

In [None]:
from docile.dataset.document_images import DocumentImages

# For every sampled document, check that the page images exposed through a
# DocumentImages object pointed at the "cached_images" directory have the same
# sizes as the images returned by doc.page_image(..., dpi=200).
for doc in docile_final_sample:
    img_original_images = DocumentImages(path=Path("/datasets/docile221221-0") / "cached_images" / doc.docid, pdf_path=doc.data_paths.pdf_path(doc.docid), page_count = doc.page_count, dpi=200)
    # Sizes as seen through DocumentImages.content.
    old_sizes = [img.size for img in img_original_images.content]
    # Sizes as seen through the per-page accessor on the document.
    new_sizes = []
    for page in range(doc.page_count):
        new_sizes.append(doc.page_image(page, dpi=200).size)
    assert old_sizes == new_sizes

In [None]:
# Collect page image sizes for a fixed-seed sample of 200 pretraining documents.
# Result: {docid: {page index: size of doc.page_image(page)}}.
docid_to_page_to_img_size_pretraining = {}
sample = docile_pretraining.sample(200, seed=32580)
with tqdm(total=sample.total_page_count()) as pbar:
    for doc in sample:
        docid_to_page_to_img_size_pretraining[doc.docid] = {}
        for page in range(doc.page_count):
            docid_to_page_to_img_size_pretraining[doc.docid][page] = doc.page_image(page).size
            pbar.update()
        # Disabled check: would print the per-page sizes of any document whose
        # pages do not all share the size of page 0.
        # if any(docid_to_page_to_img_size_pretraining[doc.docid][page] != docid_to_page_to_img_size_pretraining[doc.docid][0] for page in range(doc.page_count)):
        #     print(docid_to_page_to_img_size_pretraining[doc.docid])

In [None]:
from pathlib import Path

from PIL import Image
from tqdm import tqdm

# Scan the cached page images on disk and record their sizes as
# {directory stem: {image file stem: image size}}. Directories without any image
# and directories whose pages differ in size are reported via print().
cached_images = Path("/datasets/docile221221-0/cached_images")
docid_to_page_to_img_size = {}
for docid in tqdm(list(cached_images.iterdir())):
    page_to_img_size = {}
    for page in docid.iterdir():
        # Use the context manager so the file handle is released immediately;
        # otherwise one open handle per image lingers until garbage collection.
        with Image.open(page) as img:
            page_to_img_size[page.stem] = img.size
    docid_to_page_to_img_size[docid.stem] = page_to_img_size
    if not page_to_img_size:
        # Name the offending directory (printing its empty listing says nothing).
        print(f"No cached page images in {docid}")
    elif len(set(page_to_img_size.values())) > 1:
        # Pages of this document do not all share one size. Comparing distinct
        # values avoids a KeyError when no file with stem "0" is present.
        print(page_to_img_size)

In [None]:
docid_to_page_to_img_size

In [None]:
dpi / 200 * image_size_at_200

In [None]:
doc = docile_final_sample[0]

In [None]:
doc.page_image(0).size

In [None]:
from docile.dataset.document_images import DocumentImages

img_original_images = DocumentImages(path=Path("/datasets/docile221221-0") / "cached_images" / doc.docid, pdf_path=doc.data_paths.pdf_path(doc.docid), page_count = doc.page_count, size=(None, None))

In [None]:
img_200dpi = doc.page_image(0)

In [None]:
img_200dpi == img_original_images.content[0]

In [None]:
img_144dpi = doc.page_image(0, dpi=144)

In [None]:
img_144dpi.size

In [None]:
img_200dpi_1218x1616 = doc.page_image(0, dpi=200, image_size=(1218,1616))

In [None]:
img_144dpi == img_200dpi_1218x1616

In [None]:
img_1000dpi_1218x1616 = doc.page_image(0, dpi=1000, image_size=(1218,1616))

In [None]:
img_144dpi == img_1000dpi_1218x1616

In [None]:
img_100dpi_1218x1616 = doc.page_image(0, dpi=100, image_size=(1218,1616))

In [None]:
img_144dpi == img_100dpi_1218x1616

In [None]:
img_10dpi_1218x1616 = doc.page_image(0, dpi=10, image_size=(1218,1616))

In [None]:
img_144dpi == img_10dpi_1218x1616

In [None]:
doc.page_image(0, dpi=50).size

In [None]:
doc.page_image(0, dpi=50).size

In [None]:
doc.page_image(0, image_size=(423, 562)).size

In [None]:
doc.page_image(0, dpi=50) == doc.page_image(0, image_size=(423, 562))

In [None]:
doc.page_image(0, dpi=144) == doc.page_image(0, dpi=50, image_size=(1218,1616))

In [None]:
import numpy as np

(np.array(doc.page_image(0, dpi=10)) == np.array(doc.page_image(0, image_size=(85, 113)))).all(axis=2).sum()

In [None]:
85*113

In [None]:
from PIL import Image

doc.page_image(0, dpi=10).resize((85*10, 113*10), resample=Image.NEAREST)

In [None]:
from PIL import Image

doc.page_image(0, image_size=(85, 113)).resize((85*10, 113*10), resample=Image.NEAREST)

In [None]:
doc.page_image(0, image_size=(85, 113))

In [None]:
img_200 = docile_final[0].page_image(0, dpi=200, size=(1653, 2339))

In [None]:
# NOTE(review): other cells in this notebook call page_image with `image_size=`
# rather than `size=` — confirm which keyword the current API expects.
img_default = docile_final[0].page_image(0, size=(1653, 2339))

In [None]:
docile_final[0].page_image_size(0)

In [None]:
docile_final[0]

In [None]:
# Sanity-check the first page of the first 10 documents: the rendered image's
# aspect ratio must match the one stored in the annotation, and the image size is
# printed next to the output of `pdfinfo` for manual comparison.
for doc in docile_final[:10]:
    print(doc)
    sz = doc.page_image(0).size
    # width / height of the rendered image vs. the annotation's aspect ratio.
    assert abs(sz[0] / sz[1] - doc.annotation.page_aspect_ratio(0)) < 1e-6
    # NOTE(review): the label assumes page_image() renders at 200 DPI by default — confirm.
    print(f"200 DPI: {sz}")
    # Rescale by 72/200; presumably to compare with the page size in points that pdfinfo reports.
    sz_72 = (sz[0] * 72/200, sz[1] * 72/200)
    print(f"72 DPI: {sz_72}")
    print(doc.data_paths.pdf_path(doc.docid).full_path)
    # Shell escape: runs the external `pdfinfo` tool on the document's PDF.
    !pdfinfo {doc.data_paths.pdf_path(doc.docid).full_path}

In [None]:
%debug

In [None]:
634.699 * 200/72

In [None]:
842 * 200/72