# Introduction

In this notebook, we calculate the CER (Character Error Rate) of the Qwen2-VL generated data that we use for fine-tuning by comparing with the box text annotations that comes with the SROIE v2 dataset.

In [1]:
import glob
import jiwer

In [2]:
# Get the SROIE v2 box data.
sroie_box_train = glob.glob('../input/sroie_v2/SROIE2019/train/box/*.txt')
sroie_box_test = glob.glob('../input/sroie_v2/SROIE2019/test/box/*.txt')

sroie_box_train.sort()
sroie_box_test.sort()

print(len(sroie_box_train))
print(len(sroie_box_test))

626
347


In [3]:
print(sroie_box_train[0])
sample_file = open(sroie_box_train[0]).readlines()
print(sample_file)

../input/sroie_v2/SROIE2019/train/box/X00016469612.txt
['72,25,326,25,326,64,72,64,TAN WOON YANN\n', '50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND\n', '205,121,285,121,285,139,205,139,789417-W\n', '110,144,383,144,383,163,110,163,NO.53 55,57 & 59, JALAN SAGU 18,\n', '192,169,299,169,299,187,192,187,TAMAN DAYA,\n', '162,193,334,193,334,211,162,211,81100 JOHOR BAHRU,\n', '217,216,275,216,275,233,217,233,JOHOR.\n', '50,342,279,342,279,359,50,359,DOCUMENT NO : TD01167104\n', '50,372,96,372,96,390,50,390,DATE:\n', '165,372,342,372,342,389,165,389,25/12/2018 8:13:39 PM\n', '48,396,117,396,117,415,48,415,CASHIER:\n', '164,397,215,397,215,413,164,413,MANIS\n', '49,423,122,423,122,440,49,440,MEMBER:\n', '191,460,298,460,298,476,191,476,CASH BILL\n', '30,508,121,508,121,523,30,523,CODE/DESC\n', '200,507,247,507,247,521,200,521,PRICE\n', '276,506,306,506,306,522,276,522,DISC\n', '374,507,441,507,441,521,374,521,AMOUNT\n', '69,531,102,531,102,550,69,550,QTY\n', '221,531,247,531,247,5

In [4]:
# The eight comma separates the bounding boxes and the content.
final_content = ''
for line in sample_file:
    final_content += line.split(',')[8] 

print(final_content)

TAN WOON YANN
BOOK TA .K(TAMAN DAYA) SDN BND
789417-W
NO.53 55TAMAN DAYA81100 JOHOR BAHRUJOHOR.
DOCUMENT NO : TD01167104
DATE:
25/12/2018 8:13:39 PM
CASHIER:
MANIS
MEMBER:
CASH BILL
CODE/DESC
PRICE
DISC
AMOUNT
QTY
RM
RM
9556939040116
KF MODELLING CLAY KIDDY FISH
1 PC
*
9.000
0.00
9.00
TOTAL:
ROUR DING ADJUSTMENT:
0.00
ROUND D TOTAL (RM):
9.00
CASH
10.00
CHANGE
1.00
GOODS SOLD ARE NOT RETURNABLE OR
EXCHANGEABLE
***
***
THANK YOU
PLEASE COME AGAIN !
9.00



## Function to Calculate CER

In [5]:
def calculate_cer(sroie_boxes, ground_truth):
    """
    :param sroie_boxes: List containing the box/text data from the SROIE v2 dataset
        e.g. ['tan woon yann\nbook ta.k', 'are not returnable or']
    :param ground_truth: VLM generated annotations
        e.g. ['tan woon yann\nbook ta.k', 'are not returnable or']
    """

    error = jiwer.cer(sroie_boxes, ground_truth)
    print(f"CER: {error}")

## Calculate CER of Training Data

In [6]:
vlm_data = []

all_vlm_txt_train_paths = glob.glob('../input/qwen2_vl_2b_sroiev2_train_annots/*.txt')
all_vlm_txt_train_paths.sort()

for file_path in all_vlm_txt_train_paths:
    data = open(file_path).read()
    vlm_data.append(data.lower())

In [7]:
print(vlm_data[:2])

['tan woon yann\nbook ta.k (taman daya) sdn bhd\n789-417-w\nno.53 55,57 & 59, jalan sagu 18,\ntaman daya, 81100 johor bahru, johor.\ndocument no: td01167104\ndate: 25/12/2018 8:13:39 pm\ncashier: manis\nmember: 9556939040118\ncash bill\ncode/desc\nprice\nrm\namount\n9.00\n9.00\ntotal\n9.00\nrounding adjustment\n0.00\n0.00\nrounded total (rm):\n9.00\ncash\n10.00\nchange\n1.00\ngoods sold are not returnable or\nexchangeable\nplease come again!\nthank you\nplease come again!', 'tan woon yann']


In [8]:
sroie_box_data = []

for file_path in sroie_box_train:
    data = open(file_path).readlines()

    final_content = ''
    for line in data:
        final_content += line.split(',')[8]
    
    sroie_box_data.append(final_content.lower())

In [9]:
print(sroie_box_data[:2])

['tan woon yann\nbook ta .k(taman daya) sdn bnd\n789417-w\nno.53 55taman daya81100 johor bahrujohor.\ndocument no : td01167104\ndate:\n25/12/2018 8:13:39 pm\ncashier:\nmanis\nmember:\ncash bill\ncode/desc\nprice\ndisc\namount\nqty\nrm\nrm\n9556939040116\nkf modelling clay kiddy fish\n1 pc\n*\n9.000\n0.00\n9.00\ntotal:\nrour ding adjustment:\n0.00\nround d total (rm):\n9.00\ncash\n10.00\nchange\n1.00\ngoods sold are not returnable or\nexchangeable\n***\n***\nthank you\nplease come again !\n9.00\n', 'tan woon yann\nindah gift & home deco\n27taman johor jaya81100 johor bahrutel:07-3507405\nfax:07-3558160\nreceipt\n19/10/2018 20:49:59 #01\ncashier: cn\nlocation/sp: 05 /0531\nmb: mo26588\nroom no: 01\n050100035279\ndesc/item\nqty\nprice\namt(rm)\nst-privilege card/gd indah\n88888\n1\n10.00\n10.00\ngf-table lamp/stitch <i>\n62483\n1\n55.90\n55.90\n@disc\n10.00%\n-5.59\n#total qty\n2\ntotal amt................. rm\n60.31\nrounding adj............\n-0.01\nrm\n60.30\ncash.................... rm

In [10]:
calculate_cer(sroie_box_data, vlm_data)

CER: 0.5758303922942652


## Calculate CER of Test Data

In [11]:
vlm_data = []

all_vlm_txt_test_paths = glob.glob('../input/qwen2_vl_2b_sroiev2_test_annots/*.txt')
all_vlm_txt_test_paths.sort()

for file_path in all_vlm_txt_test_paths:
    data = open(file_path).read()
    vlm_data.append(data.lower())

In [13]:
print(len(vlm_data))

347


In [12]:
print(vlm_data[:2])

['***copy***\n\nojc marketing sdn bhd\n\nroc no: 538358-h\n\nno 2 & 4, jalan bayu 4, bandar seri alam, 81750 masai, johor\n\ntel:07-388 2218 fax:07-388 8218\n\nemail: ng@ojcgroup.com\n\ntax invoice\n\ninvoice no: pegiv-1030765\n\ndate: 15/01/2019 11:05:16 am\n\ncashier: ng chuan min\n\nsales person: fatin\n\nbill to: the peak quarry works\n\naddress:kings safety shoes kwd b05(35,552),(995,995)', 'tan chay yee']


In [34]:
sroie_box_data = []

for i, file_path in enumerate(sroie_box_test):
    # Try-except because there are a few emtpy files and an
    # UTF-8 encoding error in one of the files.
    try:
        data = open(file_path).readlines()
        final_content = ''
        for line in data:
            final_content += line.split(',')[8]
        
        sroie_box_data.append(final_content.lower())
    except:
        # sroie_box_data.append('\n')
        print(f"Erroneous index: {i}")

Erroneous index: 149
Erroneous index: 230


In [35]:
print(len(sroie_box_data))

345


In [36]:
# Pop the data from the same indices in the VLM generated data.
vlm_data.pop(149)
vlm_data.pop(230)

'kedai ubat & runcit hong ning sdn. bhd. (717833-p) (gst id no : 002006163456) no.8, jalan lang kuning, kepong baru, 52100 kuala lumpur. tel: 03-6273 2163 bill no : pos/268511 date : 24/12/16 2:28:22 pm cashier : admin payment : cash item qty price amount yi jin plan @2.83 g5 1 3.00 3.00 sr. tian qi @18.87 st100 1 20.00 20.00 sr. gardenia cr-choclt @0.80 955664132 1 0.85 0.85 sr. massimo due-coffee @0.80 955675553 1 0.85 0.85 sr. massimo due-coffee @0.80 955675553 1 0.85 0.85 sr. new form @ gst 6% : 75.55 cash paid : 25.55 card paid : 0.00 change : 0.00 round adj. : 0.00 gst summary amount tax sr. @ 6% 24.10 1.45 thank you please come again'

In [37]:
calculate_cer(sroie_box_data, vlm_data)

CER: 0.5829631883176148
