## Labeling for LayoutLMv3

In [None]:
import pandas as pd
import json

# Load your train.json
with open("../data/train.json", "r") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# train.json has structure like:
# {
#   "id": ["X51005719856.jpg", ...],
#   "words": [["DOCUMENT", "NO", ":", "TD01167104"], ...],
#   "labels": [["O", "O", "O", "B-INVOICE_NO", "I-INVOICE_NO"], ...]
# }

# Flatten out into rows for easy inspection
rows = []
for doc_id, bboxes, words, labels in zip(df["file"], df["bboxes"], df["words"], df["labels"]):
    for word, bbox, label in zip(words, bboxes, labels):
        rows.append({"file": doc_id, "bboxes": bbox, "word": word, "label": label})

flat_df = pd.DataFrame(rows)

# Filter only invoice labels
invoice_df = flat_df[flat_df["label"].isin(["B-INVOICE_NO", "I-INVOICE_NO"])]


# append ground truth labels to df
with open('../data/labels.json', 'r') as f:
    labels = json.load(f)

label_df = pd.DataFrame(list(labels.items()), columns=['file', 'actual_label'])
result_df = pd.merge(flat_df, label_df, on='file', how='left')
result_df.head(20)

Unnamed: 0,file,bboxes,word,label,actual_label
0,X51006557117.jpg,"[35, 87, 590, 110]",GARDENIA BAKERIES (KL) SDN BHD (139386 X),O,7030F715
1,X51006557117.jpg,"[172, 109, 448, 133]","LOT 3, JALAN PELABUR 23/1,",O,7030F715
2,X51006557117.jpg,"[160, 132, 458, 157]","40300 SHAH ALAM, SELANGOR.",O,7030F715
3,X51006557117.jpg,"[120, 156, 305, 178]",TEL: 03- 55423228,O,7030F715
4,X51006557117.jpg,"[321, 157, 504, 179]",FAX:03- 55423213,O,7030F715
5,X51006557117.jpg,"[193, 180, 433, 202]",GST ID: 000381399040,O,7030F715
6,X51006557117.jpg,"[139, 220, 485, 256]",TAX INVOICE / ADJUSTMENT NOTE,O,7030F715
7,X51006557117.jpg,"[244, 279, 512, 312]",CASH,O,7030F715
8,X51006557117.jpg,"[244, 279, 512, 312]",INV,O,7030F715
9,X51006557117.jpg,"[244, 279, 512, 312]",NO,O,7030F715


In [11]:
# check result_df where label is not O
labeled_df = result_df[result_df['label'] != 'O']
labeled_df

Unnamed: 0,file,bboxes,word,label,actual_label
12,X51006557117.jpg,"[244, 279, 512, 312]",7030F715,B-INVOICE_ID,7030F715
103,X51005711441.jpg,"[38, 649, 322, 693]",1054650,B-INVOICE_ID,1054650
154,X51005806685.jpg,"[158, 369, 252, 392]",389772,B-INVOICE_ID,389772
211,X51008099041.jpg,"[48, 645, 331, 685]",1219461,B-INVOICE_ID,1219461
255,X51005303661.jpg,"[462, 421, 745, 456]",LCS03908,B-INVOICE_ID,LCS03908
...,...,...,...,...,...
32301,X00016469669.jpg,"[118, 242, 202, 260]",01,B-INVOICE_ID,01-143008
32302,X00016469669.jpg,"[118, 242, 202, 260]",-,I-INVOICE_ID,01-143008
32303,X00016469669.jpg,"[118, 242, 202, 260]",143008,I-INVOICE_ID,01-143008
32357,X51005433494.jpg,"[340, 631, 581, 650]",2018030610100080498,B-INVOICE_ID,2018030610100080498


## Inspecting labels


In [None]:
# check where labeled_df is duplicated file (every file should have 1 B-INVOICE_ID + 1 or more I-INVOICE_ID here)
labeled_df[labeled_df.duplicated(subset=['file'], keep=False)].head(20)

Unnamed: 0,file,bboxes,word,label,actual_label
532,X51007339151.jpg,"[24, 948, 283, 969]",CS,B-INVOICE_ID,CS-SA-0096677
533,X51007339151.jpg,"[24, 948, 283, 969]",-,I-INVOICE_ID,CS-SA-0096677
534,X51007339151.jpg,"[24, 948, 283, 969]",SA,I-INVOICE_ID,CS-SA-0096677
535,X51007339151.jpg,"[24, 948, 283, 969]",-,I-INVOICE_ID,CS-SA-0096677
536,X51007339151.jpg,"[24, 948, 283, 969]",0096677,I-INVOICE_ID,CS-SA-0096677
598,X51005685355.jpg,"[147, 468, 266, 488]",1,B-INVOICE_ID,1-161696
599,X51005685355.jpg,"[147, 468, 266, 488]",-,I-INVOICE_ID,1-161696
600,X51005685355.jpg,"[147, 468, 266, 488]",161696,I-INVOICE_ID,1-161696
691,X51007339150.jpg,"[34, 974, 294, 997]",CS,B-INVOICE_ID,CS-SA-0097366
692,X51007339150.jpg,"[34, 974, 294, 997]",-,I-INVOICE_ID,CS-SA-0097366


In [13]:
## Every file should have 1 label. These two dataframes should match in length

# check where it is not duplicated
labeled_df[~labeled_df.duplicated(subset=['file'], keep=False)]

# Check where word == actual_label
labeled_df[labeled_df['word'] == labeled_df['actual_label']]

Unnamed: 0,file,bboxes,word,label,actual_label
12,X51006557117.jpg,"[244, 279, 512, 312]",7030F715,B-INVOICE_ID,7030F715
103,X51005711441.jpg,"[38, 649, 322, 693]",1054650,B-INVOICE_ID,1054650
154,X51005806685.jpg,"[158, 369, 252, 392]",389772,B-INVOICE_ID,389772
211,X51008099041.jpg,"[48, 645, 331, 685]",1219461,B-INVOICE_ID,1219461
255,X51005303661.jpg,"[462, 421, 745, 456]",LCS03908,B-INVOICE_ID,LCS03908
...,...,...,...,...,...
32020,X51007846370.jpg,"[1351, 2229, 2186, 2308]",OR18061602170510,B-INVOICE_ID,OR18061602170510
32062,X51006913070.jpg,"[1455, 2097, 2299, 2165]",OR18052402170329,B-INVOICE_ID,OR18052402170329
32182,X51005677332.jpg,"[260, 685, 505, 723]",CS00012693,B-INVOICE_ID,CS00012693
32357,X51005433494.jpg,"[340, 631, 581, 650]",2018030610100080498,B-INVOICE_ID,2018030610100080498


## Edge Cases

In [14]:
# check whwre file = X51009453801
labeled_df[labeled_df['file'] == 'X51005719899.jpg']

# edge case
result_df[(result_df['file'] == 'X51006619545.jpg')].head(20)


Unnamed: 0,file,bboxes,word,label,actual_label
27230,X51006619545.jpg,"[47, 280, 494, 323]",COSWAY (M) SDN BHD (50118-A),O,K074-001096
27231,X51006619545.jpg,"[51, 323, 621, 365]","2ND FLOOR, WISMA COSWAY, JALAN RAJA CHULAN,",O,K074-001096
27232,X51006619545.jpg,"[47, 362, 437, 403]","50200 KUALA LUMPUR, MALAYSIA.",O,K074-001096
27233,X51006619545.jpg,"[52, 401, 325, 435]",TEL :603-2030 1000,O,K074-001096
27234,X51006619545.jpg,"[46, 438, 318, 472]",FAX :603-2142 5587,O,K074-001096
27235,X51006619545.jpg,"[47, 473, 407, 520]",EMAIL: INFO@COSWAY.COM.MY,O,K074-001096
27236,X51006619545.jpg,"[49, 542, 579, 580]",GST REGISTRATION NO : 000743903232,O,K074-001096
27237,X51006619545.jpg,"[210, 643, 611, 683]",TAX INVOICE (COSWAY),O,K074-001096
27238,X51006619545.jpg,"[94, 736, 300, 768]",CENTER : K074,O,K074-001096
27239,X51006619545.jpg,"[562, 738, 770, 771]",USER ID : K074,O,K074-001096


In [15]:
# give me all the files in results_df that only have O labels

only_o_files = result_df.groupby('file').filter(lambda x: all(label == 'O' for label in x['label']))
only_o_files['file'].unique()

# this should be 0 is all is accounted for

array([], dtype=object)

In [16]:
### CHECKING EDGE CASES mentioned in heuristics doc
r = result_df[(result_df['file'] == 'X51006619545.jpg')]
r[r['label'] != 'O' ]

Unnamed: 0,file,bboxes,word,label,actual_label
27246,X51006619545.jpg,"[89, 835, 600, 878]",K074,B-INVOICE_ID,K074-001096
27247,X51006619545.jpg,"[89, 835, 600, 878]",-,I-INVOICE_ID,K074-001096
27248,X51006619545.jpg,"[89, 835, 600, 878]",001096,I-INVOICE_ID,K074-001096


In [17]:
# check result_df for to see if there are multiple values of B-INVOICE_NO for a single file
result_df[result_df['label'] == 'B-INVOICE_NO'].groupby('file').filter(lambda x: len(x) > 1)
## each file should only have 1 B-INVOICE_NO. This should be empty

Unnamed: 0,file,bboxes,word,label,actual_label


## Checking Model Layers

In [18]:
from transformers import LayoutLMv3ForTokenClassification
base_model: str = "microsoft/layoutlmv3-base"
base = LayoutLMv3ForTokenClassification.from_pretrained(base_model)
print(base)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMv3ForTokenClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Encoder