In [6]:
import boto3
import json

def save_json_2s3(client, data, bucket_name, keystr):
    """Serialize ``data`` with ``json.dumps`` and upload it through ``client.put_object``.

    Args:
        client: Object exposing ``put_object`` (the notebook passes a boto3 S3 client).
        data: Payload that ``json.dumps`` can serialize.
        bucket_name: Name of the destination bucket.
        keystr: Object key to write under.
    """
    payload = json.dumps(data)
    request = {
        "Bucket": bucket_name,
        "Key": keystr,
        "Body": payload,
        "ContentType": "application/json",
    }
    client.put_object(**request)


def save_image_2s3(client, image_bytes, bucket_name, keystr):
    """Upload ``image_bytes`` unchanged through ``client.put_object`` as ``image/png``.

    Args:
        client: Object exposing ``put_object`` (the notebook passes a boto3 S3 client).
        image_bytes: Body to upload; it is forwarded as-is, with no re-encoding here.
        bucket_name: Name of the destination bucket.
        keystr: Object key to write under.
    """
    upload_args = dict(
        Bucket=bucket_name,
        Key=keystr,
        Body=image_bytes,
        ContentType="image/png",
    )
    client.put_object(**upload_args)

def save_each2s3(data, client, bucket_name, prefix, file_name, split):
    """Upload the label, image and Textract output of one processed sample.

    Objects are written, in this order, under
    ``<prefix>/dataset/<mapping_split[split]>/``:

    * ``labels/<file_name>.json``   from ``data["label"]``
    * ``images/<file_name>.png``    from ``data["image"]``
    * ``textract/<file_name>.json`` from ``data["textract"]``

    Args:
        data: Mapping with the keys ``"label"``, ``"image"`` and ``"textract"``.
        client: Client forwarded to ``save_json_2s3`` / ``save_image_2s3``.
        bucket_name: Name of the destination bucket.
        prefix: Key prefix; trailing slashes are stripped before use.
        file_name: Base name (without extension) shared by the three objects.
        split: Key into the module-level ``mapping_split`` dict.

    Raises:
        KeyError: If ``split`` is not a key of ``mapping_split`` or ``data``
            lacks one of the expected keys.
    """
    split_root = f"{prefix.rstrip('/')}/dataset/{mapping_split[split]}"
    # (uploader, key in `data`, sub-folder, file extension), in upload order.
    uploads = (
        (save_json_2s3, "label", "labels", "json"),
        (save_image_2s3, "image", "images", "png"),
        (save_json_2s3, "textract", "textract", "json"),
    )
    for uploader, field, folder, extension in uploads:
        object_key = f"{split_root}/{folder}/{file_name}.{extension}"
        uploader(client, data[field], bucket_name, object_key)






In [12]:
import boto3
import io

from datasets import load_dataset

# Integer class id -> document-type name. The position in this tuple IS the
# class id, so the order must not change.
mapping = dict(enumerate((
    "letter",
    "form",
    "email",
    "handwritten",
    "advertisement",
    "scientific report",
    "scientific publication",
    "specification",
    "file folder",
    "news article",
    "budget",
    "invoice",
    "presentation",
    "questionnaire",
    "resume",
    "memo",
)))

# Split name passed to the dataset loader -> folder name used inside S3 keys.
mapping_split = dict(train="training", test="validation")

# A single session pinned to us-west-2 backs both service clients.
session = boto3.Session(region_name="us-west-2")
textract_client, s3_client = (
    session.client(service_name) for service_name in ("textract", "s3")
)

# Scratch buffer shared across samples; process_each clears it before each write.
buffer = io.BytesIO()


def textract_fn(image_bytes, client):
    """Call ``client.detect_document_text`` on in-memory image bytes.

    Args:
        image_bytes: Image content, sent inline as ``Document={'Bytes': ...}``.
        client: Object exposing ``detect_document_text`` (the notebook passes a
            boto3 Textract client).

    Returns:
        Whatever ``client.detect_document_text`` returns, unmodified.
    """
    document = {'Bytes': image_bytes}
    response = client.detect_document_text(Document=document)
    return response


def process_each(data, client, buffer):
    """Encode one sample as PNG, resolve its label name and run Textract on it.

    Args:
        data: Record with an ``'image'`` exposing ``.save(fp, format=...)``
            (presumably a PIL image -- confirm) and a ``'label'`` convertible
            to an ``int`` key of the module-level ``mapping``.
        client: Textract client forwarded to ``textract_fn``.
        buffer: Reusable binary buffer; its previous content is discarded.

    Returns:
        dict with ``"image"`` (PNG bytes), ``"label"`` (``{"label": <name>}``)
        and ``"textract"`` (the ``textract_fn`` response).
    """
    # Rewind, then cut at position 0 so bytes from an earlier sample cannot leak in.
    buffer.seek(0)
    buffer.truncate()
    data['image'].save(buffer, format="png")
    png_bytes = buffer.getvalue()

    # Resolve the label before the remote call so a bad label fails first.
    class_name = mapping[int(data['label'])]
    ocr_response = textract_fn(png_bytes, client)

    return dict(
        image=png_bytes,
        label={"label": class_name},
        textract=ocr_response,
    )

def load_data_from_huggingface(split):
    """Load one split of ``jordyvl/rvl_cdip_100_examples_per_class`` via ``load_dataset``.

    Args:
        split: Split selector passed straight through as ``split=``.

    Returns:
        Whatever ``load_dataset`` returns for that split.
    """
    dataset_id = "jordyvl/rvl_cdip_100_examples_per_class"
    return load_dataset(dataset_id, split=split)

def metadata(client, data, bucket_name, prefix, split):
    """Write ``metadata.json`` for a split next to its uploaded samples.

    The document holds the full ``mapping`` table under ``'labels'``,
    ``len(data)`` under ``'size'`` and the fixed name ``'RVLCDIP'``. Because it
    is serialized with ``json.dumps``, the integer keys of ``mapping`` appear
    as strings in the stored file.

    Args:
        client: Client forwarded to ``save_json_2s3``.
        data: Sized collection whose length is recorded as the dataset size.
        bucket_name: Name of the destination bucket.
        prefix: Key prefix; trailing slashes are stripped before use.
        split: Key into the module-level ``mapping_split`` dict.
    """
    summary = dict(labels=mapping, size=len(data), name='RVLCDIP')
    split_folder = mapping_split[split]
    object_key = f"{prefix.rstrip('/')}/dataset/{split_folder}/metadata.json"
    save_json_2s3(client, summary, bucket_name, object_key)



In [11]:
# Smoke test: push a handful of samples from one split to S3, then write the
# matching metadata.json.
split = 'test'
data = load_data_from_huggingface(split)
bucket_name = 'udop-finetuning'
prefix = 'test-saving-data'
results = []  # not used in this cell; kept in case another cell relies on it

# Upper bound on uploaded samples; each one costs a Textract call.
max_samples = 2

# Select the subset once so the upload loop and the metadata describe the same
# records. Previously the loop stopped after two samples while metadata.json
# recorded the size of the whole split.
data_subset = data.select(range(min(max_samples, len(data))))

for idx, dt in enumerate(data_subset):
    processed = process_each(dt, textract_client, buffer)
    save_each2s3(processed, s3_client, bucket_name, prefix, str(idx), split)
metadata(s3_client, data_subset, bucket_name, prefix, split)


In [17]:
%%timeit
data[0]['image'].save(buffer, format="png")
image_bytes = buffer.getvalue()
textjson = textract_fn(image_bytes, textract_client)

2.18 s ± 280 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
