# Log in to Hugging Face (only needed once)

In [None]:
from huggingface_hub import interpreter_login
# Authenticate with the Hugging Face Hub; later cells in this notebook push a
# dataset and a model to the Hub.
interpreter_login()

# Collect Menu Image Datasets
- Use `metadata.jsonl` to store the metadata of the images.
- Generate the metadata with [Google AI Studio](https://aistudio.google.com) or [OpenAI ChatGPT](https://chatgpt.com) using this prompt.
- Or generate it with the Gemini API via function calling: start the Gradio app, then provide an image and an access token to test it.

In [None]:
from datasets import load_dataset

# Load the menu dataset from "datasets/menu-zh-TW" and upload it to the Hub
# repository that the fine-tuning cells read from.
# NOTE(review): the path is presumably a local folder holding the images and
# metadata.jsonl — confirm.
dataset = load_dataset(path="datasets/menu-zh-TW")
dataset.push_to_hub(repo_id="ryanlinjui/menu-zh-TW")

# Setup for Fine-tuning

In [None]:
from datasets import load_dataset
from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig
from menu.donut import DonutDatasets

# Hub repositories for the training data and for the base Donut checkpoint.
DATASETS_REPO_ID = "ryanlinjui/menu-zh-TW"
PRETRAINED_MODEL_REPO_ID = "naver-clova-ix/donut-base"
# Special token passed to DonutDatasets as both the task start token and the
# prompt end token, and used as the decoder start token below.
TASK_PROMPT_NAME = "<s_menu>"
# Copied into config.decoder.max_length below.
MAX_LENGTH = 768
# Assigned as-is to the encoder config and reversed for the processor below.
# NOTE(review): presumably [height, width] here and (width, height) for the
# processor — confirm against the Donut image-processor documentation.
IMAGE_SIZE = [1280, 960]

raw_datasets = load_dataset(DATASETS_REPO_ID)

# Config: start from the pretrained configuration and override the input
# resolution and the decoder length limit.
config = VisionEncoderDecoderConfig.from_pretrained(PRETRAINED_MODEL_REPO_ID)
config.encoder.image_size = IMAGE_SIZE
config.decoder.max_length = MAX_LENGTH

# Processor: resize to the reversed IMAGE_SIZE and turn off long-axis alignment.
# NOTE(review): `feature_extractor` looks like the legacy accessor; newer
# transformers releases expose `image_processor` — confirm for the installed version.
processor = DonutProcessor.from_pretrained(PRETRAINED_MODEL_REPO_ID)
processor.feature_extractor.size = IMAGE_SIZE[::-1]
processor.feature_extractor.do_align_long_axis = False

# DonutDatasets: wraps the raw dataset together with the processor; the later
# cells index it by split name (e.g. datasets["train"]).
# NOTE: this name shadows the `datasets` package for the rest of the notebook.
datasets = DonutDatasets(
    datasets=raw_datasets,
    processor=processor,
    image_column="image",
    annotation_column="menu",
    task_start_token=TASK_PROMPT_NAME,
    prompt_end_token=TASK_PROMPT_NAME,
    train_split=0.8,
    validation_split=0.1,
    test_split=0.1
)

# Model: load the pretrained weights with the overridden config, then size the
# decoder embeddings to the tokenizer's current vocabulary.
# NOTE(review): presumably DonutDatasets registers new special tokens on the
# tokenizer, which is why the resize runs after it — confirm in menu.donut.
model = VisionEncoderDecoderModel.from_pretrained(PRETRAINED_MODEL_REPO_ID, config=config)
model.decoder.resize_token_embeddings(len(processor.tokenizer))
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids([TASK_PROMPT_NAME])[0]

In [None]:
# Sanity check: show each special-token ID together with the token text it
# decodes to, so the "ID" label matches what is printed.
print("Pad token ID:", model.config.pad_token_id, "->", processor.decode([model.config.pad_token_id]))
print(
    "Decoder start token ID:",
    model.config.decoder_start_token_id,
    "->",
    processor.decode([model.config.decoder_start_token_id]),
)

# Start Fine-tuning

In [None]:
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training hyperparameters; each one is forwarded to Seq2SeqTrainingArguments below.
EPOCHS = 30
LEARNING_RATE = 3e-5
TRAIN_BATCH_SIZE = 1
EVALUATION_BATCH_SIZE = 1
# Used as the trainer's output_dir.
CHECKPOINT_PATH = "./.checkpoints"
SEED = 2022
WARMUP_STEPS = 30
# NOTE(review): -1 is presumably the "no step cap" value, leaving EPOCHS in
# control of the run length — confirm in the TrainingArguments documentation.
MAX_STEPS = -1

# Move the model to the GPU when one is available; otherwise leave it where it is.
if torch.cuda.is_available():
    print("Using GPU")
    model.to("cuda")
else:
    print("Using default device")

# push_to_hub=True uploads the run to the Hub under hub_model_id.
training_args = Seq2SeqTrainingArguments(
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVALUATION_BATCH_SIZE,
    output_dir=CHECKPOINT_PATH,
    seed=SEED,
    warmup_steps=WARMUP_STEPS,
    max_steps=MAX_STEPS,
    push_to_hub=True,
    hub_model_id="donut-base-finetuned-menu"
)
# NOTE(review): DonutDatasets was built with validation_split=0.1, yet the
# trainer is handed the "test" split for evaluation — confirm this is intended.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"]
)
trainer.train()
# The bare string below is not assigned or passed to anything. It holds
# reference notes: a sample metadata line and what looks like an earlier
# YAML-style training configuration.
'''
donut-base-finetuned-menu
{"file_name":"94.jpg","ground_truth":{"gt_parse":{"restaurant":"安田麵屋","address":"","phone":"","business_hours":"","items":[{"name":"豚骨拉麵","price":"120"},{"name":"明太子烏龍麵","price":"100"}]}}}
resume_from_checkpoint_path: null # only used for resume_from_checkpoint option in PL
result_path: "./result"
pretrained_model_name_or_path: "naver-clova-ix/donut-base" # loading a pre-trained model (from moldehub or path)
dataset_name_or_paths: ["ryanlinjui/donut-menu-zh-TW"] # loading datasets (from moldehub or path)
sort_json_key: False # cord dataset is preprocessed, and publicly available at https://huggingface.co/datasets/naver-clova-ix/cord-v2
train_batch_sizes: [4]
val_batch_sizes: [1]
input_size: [1280, 960] # when the input resolution differs from the pre-training setting, some weights will be newly initialized (but the model training would be okay)
max_length: 768
align_long_axis: False
num_nodes: 1
seed: 2022
lr: 3e-5
warmup_steps: 30 # 800/8*30/10, 10%
num_training_samples_per_epoch: 80
max_epochs: 30
max_steps: -1
num_workers: 8
val_check_interval: 1.0
check_val_every_n_epoch: 1
gradient_clip_val: 1.0
verbose: True
'''

# Test the Fine-tuned Model

In [None]:
# Test the fine-tuned model on a single menu image.
# Relies on `processor` created in the setup cell.
import re

import torch  # imported here so this cell does not depend on the training cell having run
from transformers import VisionEncoderDecoderModel
from PIL import Image

# Checkpoints are written under the trainer's output_dir ("./.checkpoints"),
# so the fine-tuned weights have to be loaded from that directory.
FINETUNED_CHECKPOINT = "./.checkpoints/checkpoint-2000"

image = Image.open("/root/menu-text-detection/examples/menu-hd.jpg").convert("RGB")

model = VisionEncoderDecoderModel.from_pretrained(FINETUNED_CHECKPOINT)
device = "cuda" if torch.cuda.is_available() else "cpu"

model.eval()
model.to(device)

# Preprocess the image into the pixel tensor the encoder consumes.
pixel_values = processor(image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)

# Seed the decoder with the task prompt token only (no BOS/EOS added).
task_prompt = "<s_menu>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)

# Greedy decoding (num_beams=1). `early_stopping` only applies to beam search,
# so it is not passed here.
outputs = model.generate(
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    max_length=model.decoder.config.max_position_embeddings,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    num_beams=1,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],  # never emit <unk>
    return_dict_in_generate=True,
)

# Decode, strip EOS/PAD and the leading task token, then convert the remaining
# tag sequence to JSON.
seq = processor.batch_decode(outputs.sequences)[0]
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
seq = processor.token2json(seq)
print(seq)
