In [1]:
import json
import pandas as pd

# Load the training annotations. The encoding is passed explicitly because
# open() otherwise falls back to the locale's default encoding, which is not
# UTF-8 on every platform, while JSON files are conventionally UTF-8.
with open('C:/Users/Acer/Documents/Fusemachines/LayoutLM/dataset/train.json', encoding='utf-8') as f:
  data = json.load(f)

In [2]:
data.keys()

dict_keys(['qas'])

In [3]:
type(data)

dict

In [None]:
df = pd.DataFrame(data['qas'])
df.head()

In [None]:
from PIL import Image

# Dataset root; the document images are read from its "documents/" subfolder.
root_dir = 'C:/Users/Acer/Documents/Fusemachines/LayoutLM/dataset/'

# Pick one annotated question and display the document image it refers to.
example = data['qas'][10]
image_path = "".join([root_dir, "documents/", example['image']])
image = Image.open(image_path)
image

In [None]:
# Show every field of the selected example, one "key: value" pair per line.
for field_name in example:
  print(field_name + ":", example[field_name])

In [7]:
type(df['answers'])

pandas.core.series.Series

In [8]:
import os

ocr_root_dir = root_dir + "ocr_result/"

# The OCR result is looked up under the image's base name with a ".json"
# extension. splitext handles extensions of any length; the previous [:-3]
# slice only produced the right name for three-character extensions.
ocr_file = os.path.splitext(example['image'])[0] + ".json"
with open(ocr_root_dir + ocr_file, encoding="utf-8") as f:
  ocr = json.load(f)

In [9]:
ocr.keys()

dict_keys(['height', 'width', 'lines'])

In [None]:
# Join the text of every OCR line with a single space. Concatenating the lines
# without a separator fused the last word of one line with the first word of
# the next in the printed text.
words = ' '.join(item['text'] for item in ocr['lines'])

print(words)

In [11]:
from datasets import Dataset

dataset = Dataset.from_pandas(df.iloc[:50])

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
dataset[0]

In [13]:
from transformers import LayoutLMv2FeatureExtractor

feature_extractor = LayoutLMv2FeatureExtractor()

def get_ocr_words_and_boxes(examples):
  """Add OCR words, boxes and pixel values to a batch of examples.

  Every file name in ``examples['image']`` is opened from the ``documents/``
  folder under ``root_dir``, converted to RGB and handed to
  ``feature_extractor``. The ``image`` column is then overwritten with the
  extractor's ``pixel_values`` and ``words`` / ``boxes`` columns are added.

  Args:
    examples: batch mapping that holds an ``'image'`` list of file names.

  Returns:
    The same ``examples`` object, updated in place.
  """
  documents_dir = root_dir + "documents/"
  images = []
  for image_file in examples['image']:
    images.append(Image.open(documents_dir + image_file).convert("RGB"))

  # resize every image to 224x224 + apply tesseract to get words + normalized boxes
  encoded_inputs = feature_extractor(images)

  examples['image'] = encoded_inputs.pixel_values
  examples['words'] = encoded_inputs.words
  examples['boxes'] = encoded_inputs.boxes

  return examples



In [14]:
dataset_with_ocr = dataset.map(get_ocr_words_and_boxes, batched=True, batch_size=2)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map: 100%|██████████| 50/50 [02:04<00:00,  2.49s/ examples]


In [None]:
# Print the OCR words and boxes of rows 0 and 5, with a divider in between.
for position, row_index in enumerate((0, 5)):
  if position > 0:
    print("-----")
  row = dataset_with_ocr[row_index]
  print(row['words'])
  print(row['boxes'])

In [16]:
dataset_with_ocr[0].keys()

dict_keys(['question_id', 'question', 'answers', 'image', 'words', 'boxes'])

In [17]:
def subfinder(words_list, answer_list):
    """Locate the first contiguous occurrence of ``answer_list`` in ``words_list``.

    Args:
        words_list: list of words to search in.
        answer_list: list of words to look for, in order.

    Returns:
        ``(answer_list, start_index, end_index)`` for the first occurrence,
        where both indices are inclusive positions in ``words_list``, or
        ``(None, 0, 0)`` when there is no occurrence or ``answer_list`` is
        empty.
    """
    answer_length = len(answer_list)
    # An empty answer (e.g. the result of "".split()) cannot be located.
    # Indexing answer_list[0] on it used to raise IndexError.
    if answer_length == 0:
        return None, 0, 0

    # Only start positions that leave room for the whole answer can match.
    for start in range(len(words_list) - answer_length + 1):
        if words_list[start:start + answer_length] == answer_list:
            return answer_list, start, start + answer_length - 1
    return None, 0, 0

In [18]:
# Toy example used to walk through tokenization and answer-span lookup.
question = "where is it located?"
answer = "university of california"
words = ["this", "is", "located", "in", "the", "university", "of", "california", "in", "the", "US"]
# One dummy bounding box per word; every box is its own list object.
boxes = [[1000] * 4 for _ in words]

In [19]:
model_checkpoint = "microsoft/layoutlmv2-base-uncased"
batch_size = 16

In [20]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [21]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [22]:
encoding = tokenizer(question, words, boxes=boxes)

In [23]:
tokenizer.decode(encoding.input_ids)

'[CLS] where is it located? [SEP] this is located in the university of california in the us [SEP]'

In [24]:
print(encoding.word_ids())

[None, 0, 1, 2, 3, 3, None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]


In [25]:
print(encoding.sequence_ids())

[None, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None]


In [26]:
match, word_idx_start, word_idx_end = subfinder(words, answer.split())

In [27]:
print("Match:", match)
print("Word idx start:", word_idx_start)
print("Word idx end:", word_idx_end)

Match: ['university', 'of', 'california']
Word idx start: 5
Word idx end: 7


In [28]:
sequence_ids = encoding.sequence_ids()

# Start token index of the current span in the text.
token_start_index = 0
while sequence_ids[token_start_index] != 1:
    token_start_index += 1

# End token index of the current span in the text.
token_end_index = len(encoding.input_ids) - 1
while sequence_ids[token_end_index] != 1:
    token_end_index -= 1

print("Token start index:", token_start_index)
print("Token end index:", token_end_index)
print(tokenizer.decode(encoding.input_ids[token_start_index:token_end_index+1]))

word_ids = encoding.word_ids()[token_start_index:token_end_index+1]
print("Word ids:", word_ids)

# Walk forward to the first token that belongs to the answer's first word.
# The loop variable is called word_id (not id) so the builtin id() is not
# shadowed for the rest of the session, and the loop stops at the match, the
# same way encode_dataset does.
for word_id in word_ids:
  if word_id == word_idx_start:
    start_position = token_start_index
    break
  token_start_index += 1

# Walk backward to the last token that belongs to the answer's last word.
for word_id in reversed(word_ids):
  if word_id == word_idx_end:
    end_position = token_end_index
    break
  token_end_index -= 1

print(start_position)
print(end_position)
print("Reconstructed answer:", tokenizer.decode(encoding.input_ids[start_position:end_position+1]))

Token start index: 7
Token end index: 17
this is located in the university of california in the us
Word ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
12
14
Reconstructed answer: university of california


In [29]:
def _find_answer_word_span(words_example, candidate_answers):
  """Find the first candidate answer that occurs in the lower-cased context words.

  Args:
    words_example: lower-cased context words of one example.
    candidate_answers: answer strings to try, in order.

  Returns:
    ``(answer, word_idx_start, word_idx_end)`` for the first hit, where the
    indices are inclusive positions in ``words_example``, or ``None`` when no
    candidate can be found.
  """
  # Pass 1: case-insensitive match of the whitespace-split answer.
  for answer in candidate_answers:
    answer_words = answer.lower().split()
    if not answer_words:
      continue  # blank answer: nothing to search for
    match, word_idx_start, word_idx_end = subfinder(words_example, answer_words)
    if match:
      return answer, word_idx_start, word_idx_end

  # Pass 2 - EXPERIMENT (to account for when OCR context and answer don't
  # perfectly match): retry every answer with its ith character dropped.
  # Returning on the first hit leaves both loops at once; a plain `break` only
  # left the inner loop, so a later answer could overwrite the hit with None.
  for answer in candidate_answers:
    for i in range(len(answer)):
      answer_words = (answer[:i] + answer[i+1:]).lower().split()
      if not answer_words:
        continue  # dropping the only character leaves nothing to search for
      match, word_idx_start, word_idx_end = subfinder(words_example, answer_words)
      if match:
        return answer, word_idx_start, word_idx_end

  return None


def _word_span_to_token_span(sequence_ids, word_ids, word_idx_start, word_idx_end):
  """Translate an inclusive context word span into an inclusive token span.

  Only tokens whose sequence id is 1 (the context) are considered, because
  question tokens carry word ids of their own.

  Returns:
    ``(start_position, end_position)`` as token indices, or ``None`` when the
    first or last answer word has no token in the encoded context.
  """
  start_position = None
  end_position = None
  for token_index, sequence_id in enumerate(sequence_ids):
    if sequence_id != 1:
      continue
    word_id = word_ids[token_index]
    # First token of the first answer word.
    if start_position is None and word_id == word_idx_start:
      start_position = token_index
    # Last token of the last answer word (keeps moving right while it matches).
    if word_id == word_idx_end:
      end_position = token_index
  if start_position is None or end_position is None:
    return None
  return start_position, end_position


def encode_dataset(examples, max_length=512):
  """Tokenize a batch and attach answer start/end token positions.

  Args:
    examples: batch mapping with 'question', 'words', 'boxes', 'answers' and
      'image' columns.
    max_length: length every example is padded / truncated to.

  Returns:
    The tokenizer encoding with 'image', 'start_positions' and
    'end_positions' added. Examples whose answer cannot be located in the
    encoded context get the index of the [CLS] token for both positions.
  """
  # take a batch
  questions = examples['question']
  words = examples['words']
  boxes = examples['boxes']

  # encode it
  encoding = tokenizer(questions, words, boxes, max_length=max_length, padding="max_length", truncation=True)

  # next, add start_positions and end_positions
  start_positions = []
  end_positions = []
  answers = examples['answers']
  # for every example in the batch:
  for batch_index in range(len(answers)):
    print("Batch index:", batch_index)
    input_ids = encoding.input_ids[batch_index]
    cls_index = input_ids.index(tokenizer.cls_token_id)

    # try to find one of the answers in the context, return first match
    words_example = [word.lower() for word in words[batch_index]]
    token_span = None
    found = _find_answer_word_span(words_example, answers[batch_index])
    if found is not None:
      answer, word_idx_start, word_idx_end = found
      # The span can still be missing from the tokens (the call above uses
      # truncation=True); in that case fall through to the "not found" branch
      # so that exactly one start and one end are appended per example.
      token_span = _word_span_to_token_span(
          encoding.sequence_ids(batch_index),
          encoding.word_ids(batch_index),
          word_idx_start,
          word_idx_end,
      )

    if token_span is not None:
      start_position, end_position = token_span
      start_positions.append(start_position)
      end_positions.append(end_position)

      print("Verifying start position and end position:")
      print("True answer:", answer)
      reconstructed_answer = tokenizer.decode(input_ids[start_position:end_position+1])
      print("Reconstructed answer:", reconstructed_answer)
      print("-----------")

    else:
      print("Answer not found in context")
      print("-----------")
      start_positions.append(cls_index)
      end_positions.append(cls_index)

  encoding['image'] = examples['image']
  encoding['start_positions'] = start_positions
  encoding['end_positions'] = end_positions

  return encoding

In [30]:
type(dataset_with_ocr['answers'])

list

In [None]:
from datasets import Features, Sequence, Value, Array2D, Array3D

# The encoded columns get explicit feature types: the token-level columns are
# int64 sequences, `bbox` is a fixed (512, 4) array and `image` a fixed
# (3, 224, 224) array.
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(shape=(512, 4), dtype="int64"),
    'attention_mask': Sequence(feature=Value(dtype='int64')),
    'token_type_ids': Sequence(feature=Value(dtype='int64')),
    'image': Array3D(shape=(3, 224, 224), dtype="int64"),
    'start_positions': Value(dtype='int64'),
    'end_positions': Value(dtype='int64'),
})

# Encode two examples at a time and keep only the columns produced by
# encode_dataset.
encoded_dataset = dataset_with_ocr.map(
    encode_dataset,
    batched=True,
    batch_size=2,
    remove_columns=dataset_with_ocr.column_names,
    features=features,
)

In [32]:
encoded_dataset

Dataset({
    features: ['input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'start_positions', 'end_positions'],
    num_rows: 50
})

In [33]:
encoded_dataset

Dataset({
    features: ['input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'start_positions', 'end_positions'],
    num_rows: 50
})

In [None]:
idx = 44

tokenizer.decode(encoded_dataset['input_ids'][idx])

In [None]:
dataset['image'][idx]

In [None]:
# Build the path from root_dir instead of repeating the absolute dataset
# location, so the dataset folder is defined in one place only.
image = Image.open(root_dir + "documents/" + dataset['image'][idx])
image

In [37]:
# Decode the labelled answer span of example idx. A start position of 0 is
# reported as "not found".
start_position = encoded_dataset['start_positions'][idx]
end_position = encoded_dataset['end_positions'][idx]
if start_position == 0:
  print("Answer not found in context")
else:
  print(tokenizer.decode(encoded_dataset['input_ids'][idx][start_position: end_position+1]))

2023 - 0009718


In [38]:
len(encoded_dataset)

50

In [39]:
encoded_dataset.features

{'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'bbox': Array2D(shape=(512, 4), dtype='int64', id=None),
 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'image': Array3D(shape=(3, 224, 224), dtype='int64', id=None),
 'start_positions': Value(dtype='int64', id=None),
 'end_positions': Value(dtype='int64', id=None)}

In [40]:
import torch

# Return torch tensors from the dataset, then pull one batch of four examples
# and print the shape of every tensor in it.
encoded_dataset.set_format(type="torch")
dataloader = torch.utils.data.DataLoader(encoded_dataset, batch_size=4)
batch = next(iter(dataloader))

for name in batch:
  print(name, batch[name].shape)

input_ids torch.Size([4, 512])
bbox torch.Size([4, 512, 4])
attention_mask torch.Size([4, 512])
token_type_ids torch.Size([4, 512])
image torch.Size([4, 3, 224, 224])
start_positions torch.Size([4])
end_positions torch.Size([4])


In [None]:
idx = 2

# Decode the example selected by idx. The index used to be hard-coded a second
# time here, so changing idx alone did not change which example was shown.
tokenizer.decode(batch['input_ids'][idx])

In [42]:
start_position = batch['start_positions'][idx].item()
end_position = batch['end_positions'][idx].item()

tokenizer.decode(batch['input_ids'][idx][start_position:end_position+1])

'2023 - 0009658'

In [43]:
model_checkpoint = "microsoft/layoutlmv2-base-uncased"
batch_size = 16

In [44]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

  warn(
Some weights of LayoutLMv2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/layoutlmv2-base-uncased and are newly initialized: ['layoutlmv2.visual_segment_embedding', 'qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

model.train()
for epoch in range(20):  # loop over the dataset multiple times
    # Iterate the batches directly: the enumerate index was unused and
    # overwrote the notebook-level `idx` used by the inspection cells.
    for train_batch in dataloader:
        # Move every tensor of the batch to the model's device. Keeping them in
        # one dict avoids overwriting notebook-level names such as `image`,
        # `batch` and `start_positions` on every step.
        inputs = {name: tensor.to(device) for name, tensor in train_batch.items()}

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        token_type_ids=inputs["token_type_ids"],
                        bbox=inputs["bbox"],
                        image=inputs["image"],
                        start_positions=inputs["start_positions"],
                        end_positions=inputs["end_positions"])
        loss = outputs.loss
        print("Loss:", loss.item())
        loss.backward()
        optimizer.step()



Loss: 6.245140075683594
Loss: 6.24925422668457
Loss: 6.1637163162231445
Loss: 6.036060333251953
Loss: 6.2166547775268555
Loss: 5.975975036621094
Loss: 5.965182304382324
Loss: 5.935624122619629
Loss: 6.0863938331604
Loss: 5.713127613067627
Loss: 5.720980167388916
Loss: 5.600113391876221
Loss: 5.785351753234863
Loss: 5.413821220397949
Loss: 5.594992637634277
Loss: 5.133309364318848
Loss: 5.099974632263184
Loss: 5.306059837341309
Loss: 4.982330799102783
Loss: 4.888637542724609
Loss: 4.8870158195495605
Loss: 4.954538345336914
Loss: 4.551721572875977
Loss: 4.6879682540893555
Loss: 4.406125068664551
Loss: 4.6027421951293945
Loss: 4.233583927154541
Loss: 5.174217224121094
Loss: 4.193729400634766
Loss: 4.274009704589844
Loss: 4.46563720703125
Loss: 4.168525695800781
Loss: 4.12448787689209
Loss: 4.149589538574219
Loss: 4.568869113922119
Loss: 3.9291772842407227
Loss: 4.086633682250977
Loss: 3.9285855293273926
Loss: 4.152186393737793
Loss: 3.874512195587158
Loss: 4.472742080688477
Loss: 3.853192