<a href="https://colab.research.google.com/github/uakarsh/docformer/blob/master/examples/docformer_pl/token_cls/Token_Classification_Part_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## Installing the dependencies (might take some time)

%%capture
!pip install pytesseract
!sudo apt install tesseract-ocr
!pip install transformers
!pip install pytorch-lightning
!pip install einops
!pip install tqdm
!pip install 'Pillow==7.1.2'
!pip install datasets

In [2]:
!git clone https://github.com/shabie/docformer.git

Cloning into 'docformer'...
remote: Enumerating objects: 1241, done.[K
remote: Counting objects: 100% (193/193), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 1241 (delta 149), reused 47 (delta 45), pack-reused 1048[K
Receiving objects: 100% (1241/1241), 4.29 MiB | 7.70 MiB/s, done.
Resolving deltas: 100% (662/662), done.


In [3]:
from datasets import load_dataset
from PIL import Image
import numpy as np

## Importing the libraries

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

import os
import numpy as np
import pandas as pd
from PIL import Image,ImageDraw

import torch
from torchvision.transforms import ToTensor
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader

import math
import torch.nn.functional as F
from torch import Tensor

## Adding the path of docformer to system path
import sys
sys.path.append('/content/docformer/src/docformer/')

## Importing the functions from the DocFormer Repo
from modeling import DocFormerEncoder,ResNetFeatureExtractor,DocFormerEmbeddings,LanguageFeatureExtractor
from transformers import BertTokenizerFast

In [4]:
# Fetch the FUNSD dataset (LayoutLMv2-style packaging) through the `datasets` library;
# the result is indexed by split name in the following cell.
dataset = load_dataset("nielsr/FUNSD_layoutlmv2")

Downloading builder script:   0%|          | 0.00/4.74k [00:00<?, ?B/s]

Downloading and preparing dataset funsd_layoutlmv2/funsd to /root/.cache/huggingface/datasets/nielsr___funsd_layoutlmv2/funsd/1.0.0/2eb102a1694e5d3cb31c28fa09330c004f9654f479c639b4603dc6f85e8d0141...


Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset funsd_layoutlmv2 downloaded and prepared to /root/.cache/huggingface/datasets/nielsr___funsd_layoutlmv2/funsd/1.0.0/2eb102a1694e5d3cb31c28fa09330c004f9654f479c639b4603dc6f85e8d0141. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# The download log above reports only a train and a test split being generated,
# so the test split is used as the validation set in this notebook.
train_ds = dataset['train']
val_ds = dataset['test']

## Preparing the dataset in the input format DocFormer expects

In [6]:
## Same as here: https://github.com/shabie/docformer/blob/master/src/docformer/dataset.py

def normalize_box(box, width, height, size=1000):
    """
    Rescale a box ``[x0, y0, x1, y1]`` onto a ``size`` x ``size`` grid.

    x coordinates are expressed as a fraction of ``width`` and y coordinates as
    a fraction of ``height``; each fraction is multiplied by ``size`` (1000 by
    default, i.e. "per mille" instead of percent) and truncated with ``int``.
    """
    # Pair every coordinate with the image extent along its own axis.
    extents = (width, height, width, height)
    return [int(size * (box[pos] / extent)) for pos, extent in enumerate(extents)]

def resize_align_bbox(bbox, orig_w, orig_h, target_w, target_h):
    """
    Map a box from the original image resolution to the resized image.

    ``bbox`` is ``(left, top, right, bottom)`` in original-image pixels. Each
    coordinate is multiplied by the per-axis scale factor (target / original),
    rounded with ``np.round`` and returned as a plain ``int``.
    """
    scale = (target_w / orig_w, target_h / orig_h)
    left, top, right, bottom = bbox
    scaled_coords = (left * scale[0], top * scale[1], right * scale[0], bottom * scale[1])
    return [int(np.round(coord)) for coord in scaled_coords]


def get_tokens_with_boxes(unnormalized_word_boxes, list_of_words, token_label, tokenizer, pad_token_id = 0, pad_token_box = [0, 0, 0, 0], max_seq_len = 512, pad_token_class = 7):

    '''
    Tokenize every word and replicate its box and label once per produced token.

    Parameters:
        unnormalized_word_boxes: one bounding box per word
        list_of_words:           the words themselves (same length as the boxes)
        token_label:             one label per word (same length as the boxes)
        tokenizer:               callable as ``tokenizer(word, add_special_tokens=False)``,
                                 returning an object with an ``input_ids`` list
        pad_token_id:            id used to pad the token sequence
        pad_token_box:           box used to pad the box sequence (copied, never stored)
        max_seq_len:             exact length of each returned list
        pad_token_class:         label used to pad the label sequence

    Returns three lists, each of length ``max_seq_len`` (padded or truncated):
    1. unnormalized_token_boxes -> the box of the word each token came from,
                                   so one box repeats when a word splits into several tokens
    2. tokenized_words          -> the token ids
    3. final_token_label        -> the label of the word each token came from

    Raises AssertionError when the three per-word inputs differ in length.
    '''

    assert len(unnormalized_word_boxes) == len(list_of_words) == len(token_label), f"Length of Bounding box: {len(unnormalized_word_boxes)}, words: {len(list_of_words)}, token: {len(token_label)}"

    unnormalized_token_boxes = []
    tokenized_words = []
    final_token_label = []

    for box, word, label in zip(unnormalized_word_boxes, list_of_words, token_label):
        current_tokens = tokenizer(word, add_special_tokens = False).input_ids
        token_count = len(current_tokens)
        unnormalized_token_boxes.extend([box] * token_count)
        tokenized_words.extend(current_tokens)
        final_token_label.extend([label] * token_count)

    # The three lists always grow together, so a single pad count serves all of them.
    pad_count = max_seq_len - len(tokenized_words)
    if pad_count > 0:
        # Give every padded position its own copy of the pad box: re-using the
        # (mutable, default) list object would let a caller's in-place edit of
        # one returned box corrupt all pad boxes, including those of later calls.
        unnormalized_token_boxes.extend(list(pad_token_box) for _ in range(pad_count))
        tokenized_words.extend([pad_token_id] * pad_count)
        final_token_label.extend([pad_token_class] * pad_count)

    return unnormalized_token_boxes[:max_seq_len], tokenized_words[:max_seq_len], final_token_label[:max_seq_len]


def get_centroid(actual_bbox):
    """
    Return the centre point ``[cx, cy]`` of every box in ``actual_bbox``.

    Each box is ``[x0, y0, x1, y1]``; the centre is the top-left corner plus
    half of the box's width / height.
    """
    return [
        [box[0] + (box[2] - box[0]) / 2, box[1] + (box[3] - box[1]) / 2]
        for box in actual_bbox
    ]


def get_pad_token_id_start_index(words, encoding, tokenizer):
    """
    Return the index of the first padding token in ``encoding["input_ids"]``.

    Parameters:
        words:     unused; kept so existing callers keep working
        encoding:  mapping with an ``"input_ids"`` sequence of token ids
        tokenizer: object exposing ``pad_token_id``

    Returns the position of the first id equal to ``tokenizer.pad_token_id``.
    When the sequence holds no padding at all (or is empty) the sequence length
    is returned, i.e. "padding starts past the end". The previous
    implementation returned the index of the last real token in that case and
    raised on an empty sequence.
    """
    #     assert len(words) < len(encoding["input_ids"])  This condition, was creating errors on some sample images
    input_ids = encoding["input_ids"]
    pad_id = tokenizer.pad_token_id
    for idx, token_id in enumerate(input_ids):
        if token_id == pad_id:
            return idx
    return len(input_ids)


def get_relative_distance(bboxes, centroids, pad_tokens_start_idx):
    """
    Build the 8-value x and y spatial feature rows for every box.

    For each box that has a successor and lies before ``pad_tokens_start_idx``
    the row holds: the box's two coordinates on that axis, its extent
    (width / height), four corner-to-corner differences to the following box,
    and the difference between the two centroids. Rows at or after
    ``pad_tokens_start_idx``, and the row of the final box (which has no
    successor), are all zeros.

    Returns ``(a_rel_x, a_rel_y)``, two lists of 8-element lists.
    """
    rel_x = []
    rel_y = []

    for pos in range(len(bboxes) - 1):
        if pos < pad_tokens_start_idx:
            here, after = bboxes[pos], bboxes[pos + 1]
            here_centre, after_centre = centroids[pos], centroids[pos + 1]

            rel_x.append([
                here[0],                            # top left x
                here[2],                            # bottom right x
                here[2] - here[0],                  # width
                after[0] - here[0],                 # diff top left x
                after[0] - here[0],                 # diff bottom left x
                after[2] - here[2],                 # diff top right x
                after[2] - here[2],                 # diff bottom right x
                after_centre[0] - here_centre[0],   # diff centroid x
            ])
            rel_y.append([
                here[1],                            # top left y
                here[3],                            # bottom right y
                here[3] - here[1],                  # height
                after[1] - here[1],                 # diff top left y
                after[3] - here[3],                 # diff bottom left y
                after[1] - here[1],                 # diff top right y
                after[3] - here[3],                 # diff bottom right y
                after_centre[1] - here_centre[1],   # diff centroid y
            ])
        else:
            # Padding position: no spatial information.
            rel_x.append([0] * 8)
            rel_y.append([0] * 8)

    # The last box has nothing following it, so its row is zero as well.
    rel_x.append([0] * 8)
    rel_y.append([0] * 8)

    return rel_x, rel_y

In [7]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
import torch
from torchvision.transforms import ToTensor
PAD_TOKEN_BOX = [0, 0, 0, 0]

def create_features(
        image,
        tokenizer,
        add_batch_dim=False,
        target_size=(512,384),  # This was the resolution used by the authors
        max_seq_length=512,
        bounding_box = None,
        words = None,
        token_labels = None
):
    """
    Turn one annotated document image into the tensors consumed by the model.

    Parameters:
        image:          PIL image of the document
        tokenizer:      tokenizer exposing ``cls_token_id`` / ``pad_token_id`` and
                        callable on a single word
        add_batch_dim:  when True every returned tensor gets a leading dimension of size 1
        target_size:    size passed to ``PIL.Image.resize`` and used to rescale the boxes
        max_seq_length: number of token positions (CLS + tokens + padding) per sample
        bounding_box:   one box per word, in pixel coordinates of ``image``
        words:          the words on the page
        token_labels:   one label per word

    Returns a dict with the keys ``resized_scaled_img``, ``x_features``,
    ``y_features``, ``input_ids``, ``resized_and_aligned_bounding_boxes`` and
    ``token_class``; every value is a ``torch`` tensor.

    Changes compared to the previous version: ``max_seq_length`` is now honoured
    (it used to be ignored in favour of a hard-coded 512) and ``add_batch_dim``
    is now implemented (it used to be silently ignored).
    """
    # Label id given to the CLS position and to padding positions.
    pad_token_class = 7

    # step 1: make sure the image is RGB
    original_image = image.convert("RGB")
    CLS_TOKEN_BOX = [0, 0, *original_image.size]    # Can be variable, but as per the paper, they have mentioned that it covers the whole image

    # step 2: resize image
    resized_image = original_image.resize(target_size)

    # step 3: tokenize words and get their bounding boxes (one word may split into multiple tokens), and accordingly their labels
    unnormalized_token_boxes, tokenized_words, final_token_label = get_tokens_with_boxes(
        bounding_box,
        words,
        token_labels,
        tokenizer,
        pad_token_id = 0,
        pad_token_box = [0, 0, 0, 0],
        max_seq_len = max_seq_length,
        pad_token_class = pad_token_class,
    )

    # step 4: prepend the CLS entry and drop the last element so the length stays max_seq_length.
    # CLS is added manually to avoid the automatic addition of SEP too (as in the paper).
    unnormalized_token_boxes = [CLS_TOKEN_BOX] + unnormalized_token_boxes[:-1]
    final_token_label = [pad_token_class] + final_token_label[:-1]
    input_ids = [tokenizer.cls_token_id] + tokenized_words[:-1]

    # step 5: rescale the boxes from the original image size to target_size
    resized_and_aligned_bboxes = [
        resize_align_bbox(tuple(bbox), *original_image.size, *target_size)
        for bbox in unnormalized_token_boxes
    ]

    # step 6: relative distances between consecutive boxes (zeroed from the first pad token on)
    bboxes_centroids = get_centroid(resized_and_aligned_bboxes)
    pad_token_start_index = get_pad_token_id_start_index(words, {"input_ids": input_ids}, tokenizer)
    a_rel_x, a_rel_y = get_relative_distance(resized_and_aligned_bboxes, bboxes_centroids, pad_token_start_index)

    # step 7: convert everything to tensors; ToTensor handles the image
    final_encoding = {
        "resized_scaled_img": ToTensor()(resized_image),
        "x_features": torch.as_tensor(a_rel_x, dtype=torch.int32),
        "y_features": torch.as_tensor(a_rel_y, dtype=torch.int32),
        "input_ids": torch.as_tensor(input_ids),
        "resized_and_aligned_bounding_boxes": torch.as_tensor(resized_and_aligned_bboxes),
        "token_class": torch.as_tensor(final_token_label),
    }

    # step 8: optionally add a leading batch dimension
    if add_batch_dim:
        final_encoding = {key: value.unsqueeze(0) for key, value in final_encoding.items()}

    return final_encoding

In [9]:
# encoding = create_features(
#     image = sample,
#     tokenizer = tokenizer,
#     bounding_box = train_ds[0]['bboxes'],
#     words = train_ds[0]['tokens'],
#     token_labels = train_ds[0]['ner_tags']
# )

In [10]:
# for key in list(encoding.keys()):
#   print_statement = '{0: <50}'.format(str(key) + " has a shape:")
#   print(print_statement, encoding[key].shape)

## Making the dataset

In [11]:
## Keys
train_ds[0].keys()

dict_keys(['id', 'tokens', 'bboxes', 'ner_tags', 'image'])

In [12]:
# sample = np.array(train_ds[0]['image'])
# sample = np.transpose(sample, (1, 2, 0)).astype(np.uint8)
# sample = Image.fromarray(sample)

## Visualizing the resized image
# target_size = (512, 384)
# resize_img = sample.resize(target_size)
# resize_img

In [13]:
## From here: https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/FUNSD/True_inference_with_LayoutLMv2ForTokenClassification_%2B_Gradio_demo.ipynb#scrollTo=eFgjNr9SNzzb

def unnormalize_box(bbox, width, height):
    """
    Convert a box expressed on a 0-1000 grid back to pixel coordinates.

    x coordinates (positions 0 and 2) are scaled by ``width`` and y coordinates
    (positions 1 and 3) by ``height``. The result is a list of four floats.
    """
    extents = (width, height, width, height)
    return [extent * (bbox[pos] / 1000) for pos, extent in enumerate(extents)]

In [14]:
class DocumentDataset(Dataset):
  """
  Map-style dataset that turns raw dataset entries into model-ready features.

  Each entry of ``ds`` must provide the keys ``image``, ``bboxes``, ``tokens``
  and ``ner_tags``. ``__getitem__`` rebuilds the image, converts the boxes to
  pixel coordinates with ``unnormalize_box`` and delegates to ``create_features``.

  Parameters:
      ds:          indexable collection of entries
      tokenizer:   tokenizer forwarded to ``create_features``
      max_len:     sequence length forwarded as ``max_seq_length``
      target_size: image size forwarded as ``target_size``
  """

  def __init__(self, ds, tokenizer, max_len = 512, target_size = (512, 384)):
    self.ds = ds
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.target_size = target_size

  def __len__(self):
    return len(self.ds)

  def __getitem__(self, idx):

    ## Loading the file
    sample_entry = self.ds[idx]

    # The stored array is transposed with (1, 2, 0) before being handed to PIL,
    # i.e. the first axis is moved to the end (presumably channel-first -> channel-last).
    sample = np.array(sample_entry['image'])
    sample = np.transpose(sample, (1, 2, 0)).astype(np.uint8)
    sample = Image.fromarray(sample)

    # Scale the boxes to the actual size of the stored image rather than a
    # hard-coded 224 x 224, so differently sized images are handled too.
    width, height = sample.size
    true_boxes = [unnormalize_box(box, width, height) for box in sample_entry['bboxes']]

    # Forward the configured sequence length and target size; they used to be
    # stored on the instance but never passed on.
    encoding = create_features(
        image = sample,
        tokenizer = self.tokenizer,
        target_size = self.target_size,
        max_seq_length = self.max_len,
        bounding_box = true_boxes,
        words = sample_entry['tokens'],
        token_labels = sample_entry['ner_tags']
    )

    return encoding

In [15]:
# Wrap both splits with the feature-building dataset, using its default
# max_len (512) and target_size ((512, 384)).
train_dataset = DocumentDataset(train_ds, tokenizer)
val_dataset = DocumentDataset(val_ds, tokenizer)

In [16]:
def collate_fn(data_bunch):
  '''
  Merge a list of per-sample dictionaries into one batch dictionary.

  data_bunch: list of dictionaries mapping a key to a tensor

  Returns a dictionary with the same keys (in order of first appearance) where
  each value is the per-sample tensors stacked along a new leading dimension.
  '''
  grouped = {}
  for sample in data_bunch:
    for name, tensor in sample.items():
      grouped.setdefault(name, []).append(tensor)

  return {name: torch.stack(tensors, dim = 0) for name, tensors in grouped.items()}

In [17]:
# train_dl = DataLoader(train_dataset, batch_size = 4, collate_fn = collate_fn)
# val_dl = DataLoader(val_dataset, batch_size = 4, collate_fn = collate_fn)

In [18]:
# encoding = next(iter(train_dl))
# for key in list(encoding.keys()):
#   print_statement = '{0: <50}'.format(str(key) + " has a shape:")
#   print(print_statement, encoding[key].shape)

## Defining the DataModule

In [19]:
import pytorch_lightning as pl

class DataModule(pl.LightningDataModule):
  """
  Lightning data module serving the training and validation datasets.

  Both loaders use ``collate_fn`` and the same ``batch_size``; only the
  training loader shuffles.
  """

  def __init__(self, train_dataset, val_dataset,  batch_size = 2):
    super().__init__()
    self.batch_size = batch_size
    self.train_dataset = train_dataset
    self.val_dataset = val_dataset

  def _build_loader(self, dataset, shuffle):
    # Shared construction for both loaders.
    return DataLoader(dataset, batch_size = self.batch_size,
                      collate_fn = collate_fn, shuffle = shuffle)

  def train_dataloader(self):
    return self._build_loader(self.train_dataset, True)

  def val_dataloader(self):
    return self._build_loader(self.val_dataset, False)

In [20]:
# Module-level data module with the default batch size of 2.
# NOTE: main() below builds its own DataModule and does not use this instance.
datamodule = DataModule(train_dataset, val_dataset)

## Modeling

In [21]:
## Setting some hyperparameters

# Pick the GPU when torch can see one, otherwise the CPU.
# NOTE(review): `device` is not referenced again in the visible cells.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameter dictionary handed to DocFormerEmbeddings, DocFormerEncoder and
# the token-classification wrapper defined below. Keys not commented here are
# consumed inside the imported docformer modules — see that repository for
# their exact meaning.
config = {
  "coordinate_size": 96,              ## (768/8), 8 for each of the 8 coordinates of x, y
  # Dropout probability used to build nn.Dropout in DocFormerForTokenCLS.
  "hidden_dropout_prob": 0.1,
  # Input width of the final linear classification layer.
  "hidden_size": 768,
  "image_feature_pool_shape": [7, 7, 256],
  "intermediate_ff_size_factor": 4,
  "max_2d_position_embeddings": 1024,
  "max_position_embeddings": 512,
  "max_relative_positions": 8,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "shape_size": 96,
  "vocab_size": 30522,
  "layer_norm_eps": 1e-12,
  # Number of real label ids; the classifier emits classes + 1 logits because
  # label id 7 is used for the CLS and padding positions during feature creation.
  "classes": 7
}

In [22]:
class DocFormerForTokenCLS(nn.Module):
    """
    DocFormer backbone followed by a per-token linear classification layer.

    The classifier has ``config['classes'] + 1`` outputs. A dropout layer is
    constructed from ``config['hidden_dropout_prob']`` but is not applied in
    ``forward``.
    """

    def __init__(self, config):
        super().__init__()

        # Sub-modules are created in the same order as before so parameter
        # registration (and any random initialisation order) is unchanged.
        self.resnet = ResNetFeatureExtractor()
        self.embeddings = DocFormerEmbeddings(config)
        self.lang_emb = LanguageFeatureExtractor()
        self.config = config
        self.dropout = nn.Dropout(config['hidden_dropout_prob'])
        num_outputs = config['classes'] + 1
        self.linear_layer = nn.Linear(in_features = config['hidden_size'], out_features = num_outputs)
        self.encoder = DocFormerEncoder(config)

    def forward(self, batch_dict):
        """
        Compute classification logits from a batch dictionary holding the keys
        ``x_features``, ``y_features``, ``input_ids`` and ``resized_scaled_img``.
        """
        x_spatial = batch_dict['x_features']
        y_spatial = batch_dict['y_features']
        token_ids = batch_dict['input_ids']
        image = batch_dict['resized_scaled_img']

        visual_spatial, text_spatial = self.embeddings(x_spatial, y_spatial)
        visual_features = self.resnet(image)
        text_features = self.lang_emb(token_ids)

        encoded = self.encoder(text_features, visual_features, text_spatial, visual_spatial)
        return self.linear_layer(encoded)


In [23]:
## Defining pytorch lightning model
from sklearn.metrics import accuracy_score


class DocFormer(pl.LightningModule):
  """
  Lightning wrapper that trains ``DocFormerForTokenCLS`` for token classification.

  Parameters:
      config: hyperparameter dictionary; ``config['classes'] + 1`` is the number
              of logits per token
      lr:     learning rate for Adam (stored through ``save_hyperparameters``)

  Logged metrics: ``train_ce_loss`` / ``train_acc`` per training step,
  ``val_ce_loss`` / ``val_acc`` per validation step, and the per-epoch mean
  losses ``training_loss`` / ``validation_loss`` written through
  ``self.logger.experiment.add_scalar``.
  """

  def __init__(self, config , lr = 1e-3):
    super(DocFormer, self).__init__()

    self.config = config
    self.save_hyperparameters()
    self.docformer = DocFormerForTokenCLS(config)
    # Per-step losses collected during the current epoch; reset at epoch end.
    self.training_losses = []
    self.validation_losses = []


  def calculate_metrics(self, prediction, labels):
    """
    Mean per-sample accuracy over a batch.

    Every position of every sequence is compared, so padding positions are
    included in the score. Returns 0.0 for an empty batch.
    """
    batch_size = len(prediction)
    if batch_size == 0:
      return 0.0

    ac_score = 0
    for (pred, gt) in zip(prediction, labels):
      ac_score += accuracy_score(pred.cpu(), gt.cpu())
    return ac_score / batch_size


  def _token_loss(self, logits, labels):
    # Flatten batch and sequence dimensions so every token is one classification example.
    ## https://discuss.huggingface.co/t/bertformaskedlm-s-loss-and-scores-how-the-loss-is-computed/607/2
    num_outputs = self.config['classes'] + 1
    return nn.CrossEntropyLoss()(logits.view(-1, num_outputs), labels.view(-1))


  def forward(self, batch_dict):
    logits = self.docformer(batch_dict)
    return logits

  def training_step(self, batch, batch_idx):
    logits = self.forward(batch)

    # Uses self.config (not the notebook-level `config` global) so the module
    # works with whatever configuration it was constructed with.
    loss = self._token_loss(logits, batch['token_class'])
    _, preds = torch.max(logits, dim = -1)

    ## Calculating the accuracy score
    train_acc = self.calculate_metrics(preds, batch['token_class'])
    train_acc = torch.tensor(train_acc)

    ## Logging
    self.log('train_ce_loss', loss,prog_bar = True)
    self.log('train_acc', train_acc, prog_bar = True)
    self.training_losses.append(loss.item())

    return loss

  def validation_step(self, batch, batch_idx):
    logits = self.forward(batch)
    loss = self._token_loss(logits, batch['token_class'])
    _, preds = torch.max(logits, dim = -1)

    ## Validation Accuracy (calculate_metrics moves the tensors to the CPU itself)
    val_acc = self.calculate_metrics(preds, batch['token_class'])
    val_acc = torch.tensor(val_acc)

    ## Logging
    self.log('val_ce_loss', loss, prog_bar = True)
    self.log('val_acc', val_acc, prog_bar = True)
    self.validation_losses.append(loss.item())

  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr = self.hparams['lr'])

  def training_epoch_end(self, training_step_outputs):
    # Skip the scalar when nothing was collected; the mean of an empty list is NaN.
    if self.training_losses:
      train_loss_mean = np.mean(self.training_losses)
      self.logger.experiment.add_scalar('training_loss', train_loss_mean, global_step=self.current_epoch)
    self.training_losses = []  # reset for next epoch

  def validation_epoch_end(self, validation_step_outputs):
    # Average the *validation* losses; this used to read self.training_losses,
    # so the 'validation_loss' curve never showed validation data.
    if self.validation_losses:
      val_loss_mean = np.mean(self.validation_losses)
      self.logger.experiment.add_scalar('validation_loss', val_loss_mean, global_step=self.current_epoch)
    self.validation_losses = []  # reset for next epoch

In [24]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

def main():
    """
    Train the DocFormer token classifier on the FUNSD datasets.

    Seeds all random number generators, builds the data module and model, and
    runs ``Trainer.fit`` with checkpointing and early stopping, both monitoring
    ``val_ce_loss``. Checkpoints go to ``./models`` and TensorBoard logs to
    ``logs/funsd_dataset/version_1``.
    """
    # Seed BEFORE the model is constructed: randomly initialised layers are
    # created inside DocFormer(config), so seeding afterwards (as was done
    # previously) left the initial weights non-reproducible.
    ## https://www.tutorialexample.com/implement-reproducibility-in-pytorch-lightning-pytorch-lightning-tutorial/
    pl.seed_everything(42, workers=True)

    datamodule = DataModule(train_dataset, val_dataset)
    docformer = DocFormer(config)

    checkpoint_callback = ModelCheckpoint(
        dirpath="./models", monitor="val_ce_loss", mode="min"
    )
    early_stopping_callback = EarlyStopping(
        monitor="val_ce_loss", patience=3, verbose=True, mode="min"
    )

    trainer = pl.Trainer(
        default_root_dir="logs",
        gpus=(1 if torch.cuda.is_available() else 0),
        max_epochs=2,
        fast_dev_run=False,
        logger=pl.loggers.TensorBoardLogger("logs/", name="funsd_dataset", version=1),
        callbacks=[checkpoint_callback, early_stopping_callback],
        deterministic=True
    )
    trainer.fit(docformer, datamodule)

In [25]:
if __name__ == "__main__":
    main()

Downloading config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/432M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_ce_loss improved. New best score: 3.576


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_ce_loss improved by 0.511 >= min_delta = 0.0. New best score: 3.065
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
