In [1]:
import torch

# Project identifier; it is passed to setup_logging() further down.
project_name = "receiptlayoutlm"

In [2]:
from datasets import load_from_disk, load_dataset

# Fetch the receipts dataset from the Hugging Face Hub, authenticating with
# the locally stored access token.
ds_receipts = load_dataset(
    "sibrun/receipts",
    use_auth_token=True,
)
# Show the schema of the train split as loaded (before any casting).
ds_receipts["train"].features

Using custom data configuration sibrun--receipts-91ec3f378365ecec
Reusing dataset parquet (/Users/simon/.cache/huggingface/datasets/parquet/sibrun--receipts-91ec3f378365ecec/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

{'image': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='uint8', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'bbox': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [3]:
from datasets import Features, Array2D, Array3D, Sequence, Value
# Every per-token column is cast to this fixed sequence length.
max_length = 512

# Fixed-shape schema the dataset is cast to before it is batched.
features = Features({
    "image": Array3D(shape=(3, 224, 224), dtype="uint8"),
    "input_ids": Sequence(Value("int32"), length=max_length),
    "bbox": Array2D(shape=(max_length, 4), dtype="int64"),
    "labels": Sequence(Value("int64"), length=max_length),
    "attention_mask": Sequence(Value("int8"), length=max_length),
})

In [4]:
# Cast the dataset to the fixed-shape schema declared in `features`.
ds_receipts = ds_receipts.cast(features)
# Display the train split's schema after the cast.
ds_receipts['train'].features

Loading cached processed dataset at /Users/simon/.cache/huggingface/datasets/parquet/sibrun--receipts-91ec3f378365ecec/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-d55ae3daf417afbb.arrow
Loading cached processed dataset at /Users/simon/.cache/huggingface/datasets/parquet/sibrun--receipts-91ec3f378365ecec/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-ea4a0133cea9984f.arrow


{'image': Array3D(shape=(3, 224, 224), dtype='uint8', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=512, id=None),
 'bbox': Array2D(shape=(512, 4), dtype='int64', id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=512, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=512, id=None)}

In [5]:
# Entity classes to extract; 'O' is prepended below so that it gets id 0.
label_names = ["company", "date", "address", "total"]
labels = ["O", *label_names]
num_labels = len(labels)

# Lookup tables between integer class ids and label strings.
ids_to_labels = dict(enumerate(labels))
labels_to_ids = {name: idx for idx, name in ids_to_labels.items()}

In [6]:
from transformers import AutoConfig

# Load the LayoutXLM base configuration, overriding the number of labels and
# the id <-> label mappings with the receipt label set defined above.
xlm_config = AutoConfig.from_pretrained("microsoft/layoutxlm-base",
                                         num_labels=num_labels,
                                         id2label=ids_to_labels,
                                         label2id=labels_to_ids)

In [7]:
from transformers import LayoutLMv2ForTokenClassification

# Pass the customised config so the token-classification head is created with
# `num_labels` outputs and the receipt id2label/label2id mappings. Without it
# the head falls back to the checkpoint's default label count, which does not
# match the label ids stored in the dataset.
model_xlm = LayoutLMv2ForTokenClassification.from_pretrained(
    "microsoft/layoutxlm-base",
    config=xlm_config,
)

Some weights of the model checkpoint at microsoft/layoutxlm-base were not used when initializing LayoutLMv2ForTokenClassification: ['layoutlmv2.visual.backbone.bottom_up.res4.21.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res2.0.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.1.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.13.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.7.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.20.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res2.0.shortcut.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.9.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.14.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res2.1.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.2.conv2.norm.num_batches_tracked', 'la

In [8]:
# Write the freshly initialised model to a local directory.
model_xlm.save_pretrained("../models/token_class_xlm_base")

In [9]:
# Reload the model from the local copy written by the previous cell.
model_xlm = LayoutLMv2ForTokenClassification.from_pretrained("../models/token_class_xlm_base")

In [8]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Have the dataset return torch tensors on `device`, and move the model there too.
ds_receipts.set_format(type="torch", device=device)
model_xlm.to(device)

LayoutLMv2ForTokenClassification(
  (layoutlmv2): LayoutLMv2Model(
    (embeddings): LayoutLMv2Embeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (visual): LayoutLMv2VisualBackbone(
      (backbone): FPN(
        (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
        (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
 

In [9]:
from argparse import Namespace

# Training hyperparameters; `args` exposes the same values as attributes.
config = dict(
    train_batch_size=4,
    valid_batch_size=2,
    weight_decay=0.1,
    learning_rate=5e-5,
    num_train_epochs=2,
    seed=1,
    save_checkpoint_steps=100,
)

args = Namespace(**config)

In [10]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass; evaluation keeps dataset order.
train_dataloader = DataLoader(
    ds_receipts["train"],
    batch_size=args.train_batch_size,
    shuffle=True,
)
eval_dataloader = DataLoader(
    ds_receipts["test"],
    batch_size=args.valid_batch_size,
)

In [11]:
# Sanity check: pull one training batch and print the shape of each field.
batch = next(iter(train_dataloader))
for name, tensor in batch.items():
    print(name, tensor.shape)

image torch.Size([4, 3, 224, 224])
input_ids torch.Size([4, 512])
bbox torch.Size([4, 512, 4])
labels torch.Size([4, 512])
attention_mask torch.Size([4, 512])


In [13]:
# Apply the configured weight decay as well; previously `args.weight_decay`
# was defined but never reached the optimizer, so AdamW used its own default.
optimizer = torch.optim.AdamW(
    model_xlm.parameters(),
    lr=args.learning_rate,
    weight_decay=args.weight_decay,
)

In [14]:
from torch.utils.tensorboard import SummaryWriter
import logging
import wandb
import transformers
import datasets

logging_file_path = "../log/train.log"

def setup_logging(project_name):
    """Configure file + console logging and create a TensorBoard writer.

    Args:
        project_name: Name of the project. Currently unused; it was only
            consumed by the (disabled) Weights & Biases initialisation.

    Returns:
        A ``(logger, tb_writer)`` tuple: the module-level logger set to INFO
        and a ``SummaryWriter`` on which the hyperparameters in the global
        ``args`` have already been recorded.
    """
    import os

    # FileHandler raises FileNotFoundError when the target directory is
    # missing, so create it up front.
    log_dir = os.path.dirname(logging_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.FileHandler(logging_file_path),
            logging.StreamHandler(),
        ],
    )

    tb_writer = SummaryWriter()
    # add_hparams requires a metric dict; a dummy entry is supplied because no
    # metric exists yet at setup time.
    tb_writer.add_hparams(vars(args), {'0': 0})

    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_debug()
    transformers.utils.logging.set_verbosity_info()
    return logger, tb_writer

In [15]:
def log_metrics(step, metrics, logger, tb_writer):
    """Report a dict of metrics for one step.

    Args:
        step: Step index the metrics belong to.
        metrics: Mapping of metric name to scalar value.
        logger: Object with an ``info(message)`` method; receives one line
            containing the step and the whole metrics dict.
        tb_writer: Object with an ``add_scalar(tag, value, step)`` method;
            receives one call per metric.
    """
    logger.info(f"Step {step}: {metrics}")
    for tag, value in metrics.items():
        tb_writer.add_scalar(tag, value, step)

In [16]:
def evaluate(model=None, dataloader=None):
    """Compute the mean evaluation loss and its perplexity.

    Args:
        model: Model to evaluate; calling it with a batch's fields as keyword
            arguments must return an object with a ``loss`` attribute.
            Defaults to the global ``model_xlm``.
        dataloader: Iterable of batches (dicts of keyword arguments for the
            model). Defaults to the global ``eval_dataloader``.

    Returns:
        ``(loss, perplexity)`` as Python floats, where ``loss`` is the mean
        of the per-batch losses and ``perplexity`` is ``exp(loss)`` (``inf``
        if the exponential overflows).

    Raises:
        ValueError: If the dataloader yields no batches.

    Note:
        The model is left in eval mode; callers that continue training must
        switch it back with ``model.train()``.
    """
    model = model_xlm if model is None else model
    dataloader = eval_dataloader if dataloader is None else dataloader

    model.eval()
    losses = []
    with torch.no_grad():
        for batch in dataloader:
            # The loss is a 0-dim tensor; reshape to 1-D so torch.cat can join them.
            losses.append(model(**batch).loss.reshape(1))

    if not losses:
        raise ValueError("evaluate() received a dataloader with no batches")

    total_loss = torch.cat(losses).mean()
    # torch.exp saturates to inf instead of raising, so no exception handling is needed.
    perplexity = torch.exp(total_loss)
    return total_loss.item(), perplexity.item()

In [None]:
from random import seed

# Seed both Python's and PyTorch's RNGs: DataLoader shuffling and dropout draw
# from PyTorch's generator, which seeding `random` alone does not touch.
seed(args.seed)
torch.manual_seed(args.seed)

logger, tb_writer = setup_logging(project_name)

In [19]:
# Fine-tuning loop.
# - Each iteration runs forward -> backward -> optimizer step without
#   interruption, so evaluation never runs while the training graph is alive.
# - A checkpoint is evaluated, saved and pushed after every
#   `args.save_checkpoint_steps` completed updates (not before the first one).
step = 0
model_xlm.train()
for epoch in range(args.num_train_epochs):
    print("Epoch:", epoch)
    for batch in train_dataloader:
        optimizer.zero_grad()
        outputs = model_xlm(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss = loss.item()
        log_metrics(step, {'steps': step, 'loss/train': train_loss}, logger, tb_writer)
        if step % 10 == 0:
            print(f"Loss after {step} steps: {train_loss}")
        step += 1

        if step % args.save_checkpoint_steps == 0:
            logger.info('Evaluating and saving model checkpoint')
            eval_loss, perplexity = evaluate()
            log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity}, logger, tb_writer)
            model_xlm.save_pretrained("../models/receiptlayoutlm")
            model_xlm.push_to_hub("sibrun/receiptlayoutlm", commit_message=f'step {step}')
            # evaluate() leaves the model in eval mode; resume training mode.
            model_xlm.train()

logger.info('Evaluating and saving model after training')
eval_loss, perplexity = evaluate()
# logger and tb_writer are required positional arguments of log_metrics.
log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity}, logger, tb_writer)
model_xlm.save_pretrained("../models/receiptlayoutlm")
model_xlm.push_to_hub("sibrun/receiptlayoutlm", commit_message='final model')

Error in callback <function _WandbInit._resume_backend at 0x7fd9c036a9d0> (for pre_run_cell):


Exception: The wandb backend process has shutdown

Epoch: 0


NameError: name 'optimizer' is not defined

Error in callback <function _WandbInit._pause_backend at 0x7fd9c036aca0> (for post_run_cell):


Exception: The wandb backend process has shutdown