## Labeling for LayoutLMv3

In [30]:
import pandas as pd
import json

# Load the training annotations. JSON text is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform default
# (which is not UTF-8 on every system).
with open("../data/train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Tabular view of the annotations; the flattening step below reads the
# "file", "bboxes", "words" and "labels" columns from it.
df = pd.DataFrame(data)

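# train.json is expected to provide these fields (one entry per document):
# {
#   "file": ["X51005719856.jpg", ...],
#   "bboxes": [[[x0, y0, x1, y1], ...], ...],               # one box per word
#   "words": [["DOCUMENT", "NO", ":", "TD01167104"], ...],
#   "labels": [["O", "O", "O", "B-INVOICE_ID"], ...]        # one tag per word
# }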

# Explode the per-document lists into one record per token so that every word
# sits next to its bounding box and its tag. zip() pairs items positionally
# and stops at the shortest of the lists it is given.
rows = [
    {"file": name, "bboxes": box, "word": token, "label": tag}
    for name, doc_boxes, doc_words, doc_tags in zip(
        df["file"], df["bboxes"], df["words"], df["labels"]
    )
    for token, box, tag in zip(doc_words, doc_boxes, doc_tags)
]

flat_df = pd.DataFrame(rows)

# Keep only the tokens tagged as part of an invoice id.
# The tags seen in this notebook's labelled rows are spelled
# B-INVOICE_ID / I-INVOICE_ID; filtering on B-INVOICE_NO / I-INVOICE_NO
# selects none of them and leaves invoice_df empty.
invoice_df = flat_df[flat_df["label"].isin(["B-INVOICE_ID", "I-INVOICE_ID"])]

flat_df.head(50)

# Attach the ground-truth label of each document to every token row so the
# token-level tags can be compared against it.
# labels.json is consumed as a mapping: its keys become the 'file' column and
# its values the 'actual_label' column. (json is already imported above.)
with open('../data/labels.json', 'r', encoding='utf-8') as f:
    labels = json.load(f)

# One (file, actual_label) row per entry of the mapping.
label_df = pd.DataFrame(list(labels.items()), columns=['file', 'actual_label'])

# Left join: every token row of flat_df is kept; files that do not appear in
# labels.json get NaN in 'actual_label'.
result_df = pd.merge(flat_df, label_df, on='file', how='left')
result_df.head(10)

Unnamed: 0,file,bboxes,word,label,actual_label
0,X51006557117.jpg,"[35, 87, 590, 110]",GARDENIA,O,7030F715
1,X51006557117.jpg,"[35, 87, 590, 110]",BAKERIES,O,7030F715
2,X51006557117.jpg,"[35, 87, 590, 110]",(KL),O,7030F715
3,X51006557117.jpg,"[35, 87, 590, 110]",SDN,O,7030F715
4,X51006557117.jpg,"[35, 87, 590, 110]",BHD,O,7030F715
5,X51006557117.jpg,"[35, 87, 590, 110]",(139386,O,7030F715
6,X51006557117.jpg,"[35, 87, 590, 110]",X),O,7030F715
7,X51006557117.jpg,"[172, 109, 448, 133]",LOT,O,7030F715
8,X51006557117.jpg,"[172, 109, 448, 133]",3,O,7030F715
9,X51006557117.jpg,"[172, 109, 448, 133]",JALAN,O,7030F715


In [21]:
# Keep only the tokens whose tag is something other than 'O'.
is_tagged = result_df['label'].ne('O')
labeled_df = result_df.loc[is_tagged]
labeled_df

Unnamed: 0,file,bboxes,word,label,actual_label
36,X51006557117.jpg,"[244, 279, 512, 312]",7030F715,B-INVOICE_ID,7030F715
227,X51005711441.jpg,"[38, 649, 322, 693]",1054650,B-INVOICE_ID,1054650
327,X51005806685.jpg,"[158, 369, 252, 392]",389772,B-INVOICE_ID,389772
437,X51008099041.jpg,"[48, 645, 331, 685]",1219461,B-INVOICE_ID,1219461
533,X51005303661.jpg,"[462, 421, 745, 456]",LCS03908,B-INVOICE_ID,LCS03908
...,...,...,...,...,...
72325,X51005806703.jpg,"[277, 412, 484, 437]",084227,I-INVOICE_ID,T04-18/084227
72468,X00016469669.jpg,"[118, 242, 202, 260]",01,B-INVOICE_ID,01-143008
72469,X00016469669.jpg,"[118, 242, 202, 260]",-,I-INVOICE_ID,01-143008
72470,X00016469669.jpg,"[118, 242, 202, 260]",143008,I-INVOICE_ID,01-143008


In [26]:
# Labelled tokens belonging to files that have more than one labelled token
# (keep=False flags every occurrence of a repeated file, not only the repeats).
repeated_file = labeled_df.duplicated(subset=['file'], keep=False)
labeled_df.loc[repeated_file].head(20)

Unnamed: 0,file,bboxes,word,label,actual_label
1133,X51007339151.jpg,"[24, 948, 283, 969]",CS,B-INVOICE_ID,CS-SA-0096677
1134,X51007339151.jpg,"[24, 948, 283, 969]",-,I-INVOICE_ID,CS-SA-0096677
1135,X51007339151.jpg,"[24, 948, 283, 969]",SA,I-INVOICE_ID,CS-SA-0096677
1136,X51007339151.jpg,"[24, 948, 283, 969]",-,I-INVOICE_ID,CS-SA-0096677
1137,X51007339151.jpg,"[24, 948, 283, 969]",0096677,I-INVOICE_ID,CS-SA-0096677
1347,X51005685355.jpg,"[147, 468, 266, 488]",1,B-INVOICE_ID,1-161696
1348,X51005685355.jpg,"[147, 468, 266, 488]",-,I-INVOICE_ID,1-161696
1349,X51005685355.jpg,"[147, 468, 266, 488]",161696,I-INVOICE_ID,1-161696
1534,X51007339150.jpg,"[34, 974, 294, 997]",CS,B-INVOICE_ID,CS-SA-0097366
1535,X51007339150.jpg,"[34, 974, 294, 997]",-,I-INVOICE_ID,CS-SA-0097366


In [None]:
# Selection 1: labelled tokens from files that have exactly one labelled token.
single_token_df = labeled_df[~labeled_df.duplicated(subset=['file'], keep=False)]

# Selection 2: labelled tokens whose text equals the document's actual label.
exact_match_df = labeled_df[labeled_df['word'] == labeled_df['actual_label']]

# These two selections should match. A cell only displays its last expression,
# so an unassigned first selection is silently discarded and never compared.
# Both are named here and the rows that fall in only one of them are counted.
mismatch_idx = single_token_df.index.symmetric_difference(exact_match_df.index)
print(f"{len(mismatch_idx)} labelled row(s) appear in only one of the two selections")

# Displayed result of the cell: the exact-match selection.
exact_match_df

Unnamed: 0,file,bboxes,word,label,actual_label
36,X51006557117.jpg,"[244, 279, 512, 312]",7030F715,B-INVOICE_ID,7030F715
227,X51005711441.jpg,"[38, 649, 322, 693]",1054650,B-INVOICE_ID,1054650
327,X51005806685.jpg,"[158, 369, 252, 392]",389772,B-INVOICE_ID,389772
437,X51008099041.jpg,"[48, 645, 331, 685]",1219461,B-INVOICE_ID,1219461
533,X51005303661.jpg,"[462, 421, 745, 456]",LCS03908,B-INVOICE_ID,LCS03908
...,...,...,...,...,...
71754,X51006414631.jpg,"[1609, 1819, 2420, 1878]",OR18041802170465,B-INVOICE_ID,OR18041802170465
71865,X51007846370.jpg,"[1351, 2229, 2186, 2308]",OR18061602170510,B-INVOICE_ID,OR18061602170510
71963,X51006913070.jpg,"[1455, 2097, 2299, 2165]",OR18052402170329,B-INVOICE_ID,OR18052402170329
72216,X51005677332.jpg,"[260, 685, 505, 723]",CS00012693,B-INVOICE_ID,CS00012693


In [28]:
# Inspect the labelled tokens of one file: X51005719899.jpg
labeled_df.loc[labeled_df['file'].eq('X51005719899.jpg')]


Unnamed: 0,file,bboxes,word,label,actual_label
34087,X51005719899.jpg,"[58, 190, 777, 232]",1,B-INVOICE_ID,1
