## Labeling for LayoutLMv3

In [81]:
import pandas as pd
import json

# Load your train.json
with open("../data/train.json", "r") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Assume train.json has structure like:
# {
#   "id": ["X51005719856.jpg", ...],
#   "words": [["DOCUMENT", "NO", ":", "TD01167104"], ...],
#   "labels": [["O", "O", "O", "B-INVOICE_NO", "I-INVOICE_NO"], ...]
# }

# Flatten out into rows for easy inspection
rows = []
for doc_id, bboxes, words, labels in zip(df["file"], df["bboxes"], df["words"], df["labels"]):
    for word, bbox, label in zip(words, bboxes, labels):
        rows.append({"file": doc_id, "bboxes": bbox, "word": word, "label": label})

flat_df = pd.DataFrame(rows)

# Filter only invoice labels
invoice_df = flat_df[flat_df["label"].isin(["B-INVOICE_NO", "I-INVOICE_NO"])]

# print(invoice_df.head(20))
flat_df.head(50)

# append real labels to the dataframe for comparison. The labels are in lab
import json
with open('../data/labels.json', 'r') as f:
    labels = json.load(f)

# Convert dictionary to DataFrame
label_df = pd.DataFrame(list(labels.items()), columns=['file', 'actual_label'])

result_df = pd.merge(flat_df, label_df, on='file', how='left')
result_df.head(10)

Unnamed: 0,file,bboxes,word,label,actual_label
0,X51006557117.jpg,"[35, 87, 590, 110]",GARDENIA,O,7030F715
1,X51006557117.jpg,"[35, 87, 590, 110]",BAKERIES,O,7030F715
2,X51006557117.jpg,"[35, 87, 590, 110]",(,O,7030F715
3,X51006557117.jpg,"[35, 87, 590, 110]",KL,O,7030F715
4,X51006557117.jpg,"[35, 87, 590, 110]",),O,7030F715
5,X51006557117.jpg,"[35, 87, 590, 110]",SDN,O,7030F715
6,X51006557117.jpg,"[35, 87, 590, 110]",BHD,O,7030F715
7,X51006557117.jpg,"[35, 87, 590, 110]",(,O,7030F715
8,X51006557117.jpg,"[35, 87, 590, 110]",139386,O,7030F715
9,X51006557117.jpg,"[35, 87, 590, 110]",X,O,7030F715


In [82]:
# check result_df where label is no 0
labeled_df = result_df[result_df['label'] != 'O']
labeled_df

Unnamed: 0,file,bboxes,word,label,actual_label
47,X51006557117.jpg,"[244, 279, 512, 312]",7030F715,B-INVOICE_ID,7030F715
317,X51005711441.jpg,"[38, 649, 322, 693]",1054650,B-INVOICE_ID,1054650
474,X51005806685.jpg,"[158, 369, 252, 392]",389772,B-INVOICE_ID,389772
623,X51008099041.jpg,"[48, 645, 331, 685]",1219461,B-INVOICE_ID,1219461
775,X51005303661.jpg,"[462, 421, 745, 456]",LCS03908,B-INVOICE_ID,LCS03908
...,...,...,...,...,...
101973,X00016469669.jpg,"[118, 242, 202, 260]",01,B-INVOICE_ID,01-143008
101974,X00016469669.jpg,"[118, 242, 202, 260]",-,I-INVOICE_ID,01-143008
101975,X00016469669.jpg,"[118, 242, 202, 260]",143008,I-INVOICE_ID,01-143008
102135,X51005433494.jpg,"[340, 631, 581, 650]",2018030610100080498,B-INVOICE_ID,2018030610100080498


In [83]:
# check where labeled_df is duplicated file
labeled_df[labeled_df.duplicated(subset=['file'], keep=False)].head(20)

Unnamed: 0,file,bboxes,word,label,actual_label
1625,X51007339151.jpg,"[24, 948, 283, 969]",CS,B-INVOICE_ID,CS-SA-0096677
1626,X51007339151.jpg,"[24, 948, 283, 969]",-,I-INVOICE_ID,CS-SA-0096677
1627,X51007339151.jpg,"[24, 948, 283, 969]",SA,I-INVOICE_ID,CS-SA-0096677
1628,X51007339151.jpg,"[24, 948, 283, 969]",-,I-INVOICE_ID,CS-SA-0096677
1629,X51007339151.jpg,"[24, 948, 283, 969]",0096677,I-INVOICE_ID,CS-SA-0096677
1883,X51005685355.jpg,"[147, 468, 266, 488]",1,B-INVOICE_ID,1-161696
1884,X51005685355.jpg,"[147, 468, 266, 488]",-,I-INVOICE_ID,1-161696
1885,X51005685355.jpg,"[147, 468, 266, 488]",161696,I-INVOICE_ID,1-161696
2158,X51007339150.jpg,"[34, 974, 294, 997]",CS,B-INVOICE_ID,CS-SA-0097366
2159,X51007339150.jpg,"[34, 974, 294, 997]",-,I-INVOICE_ID,CS-SA-0097366


In [84]:
# check where it is not duplicated
labeled_df[~labeled_df.duplicated(subset=['file'], keep=False)]

# Check where word == actual_label
labeled_df[labeled_df['word'] == labeled_df['actual_label']]

# these two should match

Unnamed: 0,file,bboxes,word,label,actual_label
47,X51006557117.jpg,"[244, 279, 512, 312]",7030F715,B-INVOICE_ID,7030F715
317,X51005711441.jpg,"[38, 649, 322, 693]",1054650,B-INVOICE_ID,1054650
474,X51005806685.jpg,"[158, 369, 252, 392]",389772,B-INVOICE_ID,389772
623,X51008099041.jpg,"[48, 645, 331, 685]",1219461,B-INVOICE_ID,1219461
775,X51005303661.jpg,"[462, 421, 745, 456]",LCS03908,B-INVOICE_ID,LCS03908
...,...,...,...,...,...
101075,X51007846370.jpg,"[1351, 2229, 2186, 2308]",OR18061602170510,B-INVOICE_ID,OR18061602170510
101214,X51006913070.jpg,"[1455, 2097, 2299, 2165]",OR18052402170329,B-INVOICE_ID,OR18052402170329
101607,X51005677332.jpg,"[260, 685, 505, 723]",CS00012693,B-INVOICE_ID,CS00012693
102135,X51005433494.jpg,"[340, 631, 581, 650]",2018030610100080498,B-INVOICE_ID,2018030610100080498


In [85]:
# check whwre file = X51009453801
labeled_df[labeled_df['file'] == 'X51005719899.jpg']

# edge case
result_df[(result_df['file'] == 'X51006619545.jpg')].head(60)


Unnamed: 0,file,bboxes,word,label,actual_label
86006,X51006619545.jpg,"[47, 280, 494, 323]",COSWAY,O,K074-001096
86007,X51006619545.jpg,"[47, 280, 494, 323]",(,O,K074-001096
86008,X51006619545.jpg,"[47, 280, 494, 323]",M,O,K074-001096
86009,X51006619545.jpg,"[47, 280, 494, 323]",),O,K074-001096
86010,X51006619545.jpg,"[47, 280, 494, 323]",SDN,O,K074-001096
86011,X51006619545.jpg,"[47, 280, 494, 323]",BHD,O,K074-001096
86012,X51006619545.jpg,"[47, 280, 494, 323]",(,O,K074-001096
86013,X51006619545.jpg,"[47, 280, 494, 323]",50118,O,K074-001096
86014,X51006619545.jpg,"[47, 280, 494, 323]",-,O,K074-001096
86015,X51006619545.jpg,"[47, 280, 494, 323]",A,O,K074-001096


In [None]:
# give me all the files in results_df that only have O labels

only_o_files = result_df.groupby('file').filter(lambda x: all(label == 'O' for label in x['label']))
only_o_files['file'].unique()

# this should be 0 is all is accounted for

array([], dtype=object)

In [97]:
### CHECKING EDGE CASES mentioned in heuristics doc
r = result_df[(result_df['file'] == 'X51006619545.jpg')]
r[r['label'] != 'O' ]

Unnamed: 0,file,bboxes,word,label,actual_label
86078,X51006619545.jpg,"[89, 835, 600, 878]",K074,B-INVOICE_ID,K074-001096
86079,X51006619545.jpg,"[89, 835, 600, 878]",-,I-INVOICE_ID,K074-001096
86080,X51006619545.jpg,"[89, 835, 600, 878]",001096,I-INVOICE_ID,K074-001096


In [None]:
# check result_df for to see if there are multiple values of B-INVOICE_NO for a single file
result_df[result_df['label'] == 'B-INVOICE_NO'].groupby('file').filter(lambda x: len(x) > 1)
## each file should only have 1 B-INVOICE_NO

Unnamed: 0,file,bboxes,word,label,actual_label
