# Introduction

In this notebook, we calculate the CER (Character Error Rate) of the Qwen2-VL generated data that we use for fine-tuning by comparing with the box text annotations that comes with the SROIE v2 dataset.

In [1]:
import glob
import jiwer

In [2]:
# Transforms to reduce multiple new lines to single new line.
tfms_multline = jiwer.Compose(
    [
        jiwer.SubstituteWords({"\n\n": "\n"})
    ]
)

In [3]:
# Get the SROIE v2 box data.
sroie_box_train = glob.glob('../input/sroie_v2/SROIE2019/train/box/*.txt')
sroie_box_test = glob.glob('../input/sroie_v2/SROIE2019/test/box/*.txt')

sroie_box_train.sort()
sroie_box_test.sort()

print(len(sroie_box_train))
print(len(sroie_box_test))

626
347


In [4]:
print(sroie_box_train[0])
sample_file = open(sroie_box_train[0]).readlines()
print(sample_file)

../input/sroie_v2/SROIE2019/train/box/X00016469612.txt
['72,25,326,25,326,64,72,64,TAN WOON YANN\n', '50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND\n', '205,121,285,121,285,139,205,139,789417-W\n', '110,144,383,144,383,163,110,163,NO.53 55,57 & 59, JALAN SAGU 18,\n', '192,169,299,169,299,187,192,187,TAMAN DAYA,\n', '162,193,334,193,334,211,162,211,81100 JOHOR BAHRU,\n', '217,216,275,216,275,233,217,233,JOHOR.\n', '50,342,279,342,279,359,50,359,DOCUMENT NO : TD01167104\n', '50,372,96,372,96,390,50,390,DATE:\n', '165,372,342,372,342,389,165,389,25/12/2018 8:13:39 PM\n', '48,396,117,396,117,415,48,415,CASHIER:\n', '164,397,215,397,215,413,164,413,MANIS\n', '49,423,122,423,122,440,49,440,MEMBER:\n', '191,460,298,460,298,476,191,476,CASH BILL\n', '30,508,121,508,121,523,30,523,CODE/DESC\n', '200,507,247,507,247,521,200,521,PRICE\n', '276,506,306,506,306,522,276,522,DISC\n', '374,507,441,507,441,521,374,521,AMOUNT\n', '69,531,102,531,102,550,69,550,QTY\n', '221,531,247,531,247,5

In [5]:
# The eight commas separates the bounding boxes and the content.
final_content = ''
for line in sample_file:
    final_content += line.split(',')[8] 

print(final_content)

TAN WOON YANN
BOOK TA .K(TAMAN DAYA) SDN BND
789417-W
NO.53 55TAMAN DAYA81100 JOHOR BAHRUJOHOR.
DOCUMENT NO : TD01167104
DATE:
25/12/2018 8:13:39 PM
CASHIER:
MANIS
MEMBER:
CASH BILL
CODE/DESC
PRICE
DISC
AMOUNT
QTY
RM
RM
9556939040116
KF MODELLING CLAY KIDDY FISH
1 PC
*
9.000
0.00
9.00
TOTAL:
ROUR DING ADJUSTMENT:
0.00
ROUND D TOTAL (RM):
9.00
CASH
10.00
CHANGE
1.00
GOODS SOLD ARE NOT RETURNABLE OR
EXCHANGEABLE
***
***
THANK YOU
PLEASE COME AGAIN !
9.00



## Function to Calculate CER

In [6]:
def calculate_cer(sroie_boxes, ground_truth):
    """
    :param sroie_boxes: List containing the box/text data from the SROIE v2 dataset
        e.g. ['tan woon yann\nbook ta.k', 'are not returnable or']
    :param ground_truth: VLM generated annotations
        e.g. ['tan woon yann\nbook ta.k', 'are not returnable or']
    """

    error = jiwer.cer(sroie_boxes, ground_truth)
    print(f"CER: {error}")

## Calculate CER of Training Data

In [7]:
vlm_data = []

all_vlm_txt_train_paths = glob.glob('../input/qwen2_vl_7b_sroiev2_train_annots/*.txt')
all_vlm_txt_train_paths.sort()

for file_path in all_vlm_txt_train_paths:
    data = open(file_path).read()
    vlm_data.append(data.lower())

In [8]:
print(vlm_data[0])

tan woon yann

book ta _k (taman daya) sdn bhd

789417-w

no.53 55,57 & 59, jalan sagu 18,

taman daya,

81100 johor bahru,

johor.

document no : td01167104

date : 25/12/2018 8:13:39 pm

cashier : manis

member :

cash bill

code/desc price disc amount rm

95569390-0118 kf modelling clay kiddy fish 9.000 0.00 9.00

total : 9.00

rounding adjustment : 0.00

round d total (rm) : 9.00

cash change 10.00 1.00

goods sold are not returnable or exchangeable

thank you please come again !


In [9]:
vlm_data = tfms_multline(vlm_data)
print(vlm_data[0])

tan woon yann
book ta _k (taman daya) sdn bhd
789417-w
no.53 55,57 & 59, jalan sagu 18,

taman daya,

81100 johor bahru,

johor.

document no : td01167104
date : 25/12/2018 8:13:39 pm
cashier : manis
member :

cash bill
code/desc price disc amount rm
95569390-0118 kf modelling clay kiddy fish 9.000 0.00 9.00
total : 9.00
rounding adjustment : 0.00
round d total (rm) : 9.00
cash change 10.00 1.00
goods sold are not returnable or exchangeable
thank you please come again !


In [10]:
sroie_box_data = []

for file_path in sroie_box_train:
    data = open(file_path).readlines()

    final_content = ''
    for line in data:
        final_content += line.split(',')[8]
    
    sroie_box_data.append(final_content.lower())

In [11]:
sroie_box_data = tfms_multline(sroie_box_data)

In [12]:
print(sroie_box_data[0])

tan woon yann
book ta .k(taman daya) sdn bnd
789417-w
no.53 55taman daya81100 johor bahrujohor.
document no : td01167104
date:
25/12/2018 8:13:39 pm
cashier:
manis
member:
cash bill
code/desc
price
disc
amount
qty
rm
rm
9556939040116
kf modelling clay kiddy fish
1 pc
*
9.000
0.00
9.00
total:
rour ding adjustment:
0.00
round d total (rm):
9.00
cash
10.00
change
1.00
goods sold are not returnable or
exchangeable
***
***
thank you
please come again !
9.00



In [13]:
calculate_cer(sroie_box_data, vlm_data)

CER: 0.9248767311784117


## Calculate CER of Test Data

In [14]:
vlm_data = []

all_vlm_txt_test_paths = glob.glob('../input/qwen2_vl_7b_sroiev2_test_annots/*.txt')
all_vlm_txt_test_paths.sort()

for file_path in all_vlm_txt_test_paths:
    data = open(file_path).read()
    vlm_data.append(data.lower())

In [15]:
print(len(vlm_data))

347


In [16]:
print(vlm_data[0])

*** copy ***
ojc marketing sdn bhd
roc no: 538358-h
no 2 & 4, jalan bayu 4,
bandar seri alam,
81750 masai, johor
tel: 07-388 2218 fax: 07-388 8218
email: ng@ojcgroup.com

tax invoice
invoice no : pegiv-1030765
date : 15/01/2019 11:05:16 am
cashier : ng chuan min
sales persor : fatin
bill to : the peak quarry works
address : .

description qty price amount
0000000111 1 193.00 193.00 sr
kings safety shoes kwd b05
qty: 1 total exclude gst: 193.00
total gst @6%: 0.00
total inclusive gst: 193.00
round amt: 0.00
total: 193.00
visa card 193.00
xxxxxxxxxxxx4318 approval code:000
goods sold are not returnable & refundable
****thank you. please come again.****


In [17]:
vlm_data = tfms_multline(vlm_data)
print(vlm_data[0])

*** copy ***
ojc marketing sdn bhd
roc no: 538358-h
no 2 & 4, jalan bayu 4,
bandar seri alam,
81750 masai, johor
tel: 07-388 2218 fax: 07-388 8218
email: ng@ojcgroup.com
tax invoice
invoice no : pegiv-1030765
date : 15/01/2019 11:05:16 am
cashier : ng chuan min
sales persor : fatin
bill to : the peak quarry works
address : .

description qty price amount
0000000111 1 193.00 193.00 sr
kings safety shoes kwd b05
qty: 1 total exclude gst: 193.00
total gst @6%: 0.00
total inclusive gst: 193.00
round amt: 0.00
total: 193.00
visa card 193.00
xxxxxxxxxxxx4318 approval code:000
goods sold are not returnable & refundable
****thank you. please come again.****


In [18]:
sroie_box_data = []

for i, file_path in enumerate(sroie_box_test):
    # Try-except because there are a few emtpy files and an
    # UTF-8 encoding error in one of the files.
    try:
        data = open(file_path).readlines()
        final_content = ''
        for line in data:
            final_content += line.split(',')[8]
        
        sroie_box_data.append(final_content.lower())
    except:
        print(f"Erroneous index: {i}")

Erroneous index: 149
Erroneous index: 230


In [19]:
print(len(sroie_box_data))

345


In [20]:
# Pop the data from the same indices in the VLM generated data.
vlm_data.pop(149)
vlm_data.pop(230)

'kedai ubat & runcit hong ning sdn. bhd.\n\n(717833-p)\n\n(gst id no : 002006163456)\n\nno.8, jalan lang kuning,\n\nkepong baru,\n\n52100 kuala lumpur.\n\ntel: 03-6273 2163\nbill no : pos/268511\ndate : 24/12/16 2:28:22 pm\ncashier : admin\npayment : cash\nitem qty price amount\nyi jin plan @2.83\ng3 1 3.00 3.00 sr.\n\ntian qi @18.87\nst100 1 20.00 20.00 sr.\n\ngardenia cr-choclt @0.80\n955664132 1 0.83 0.83 sr.\n\nmassimo due-coffee @0.80\n955675553 1 0.83 0.83 sr.\n\nmassimo due-coffee @0.80\n955675553 1 0.83 0.83 sr.\n\ntotal amount : 24.10\nround adj. : 0.00\ngst summary amount tax\nsr. @ 6% 24.10 1.45\nthank you\nplease come again'

In [21]:
sroie_box_data = tfms_multline(sroie_box_data)

In [22]:
print(sroie_box_data[0])

tan chay yee
*** copy ***
ojc marketing sdn bhd
roc no: 538358-h
no 2 & 4bandar seri alam81750 masaitel:07-388 2218 fax:07-388 8218
email:ng@ojcgroup.com
tax invoice
invoice no
: pegiv-1030765
date
: 15/01/2019 11:05:16 am
cashier
: ng chuan min
sales person : fatin
bill to
: the peak quarry works
address
:.
description
qty
price
amount
000000111
1
193.00
193.00 sr
kings safety shoes kwd b05
qty: 1
total exclude gst:
193.00
total gst @6%:
0.00
total inclusive gst:
193.00
round amt:
0.00
total:
193.00
visa card
193.00
xxxxxxxxxxxx4318
approval code:000
goods sold are not returnable & refundable
****thank you. please come again.****



In [23]:
calculate_cer(sroie_box_data, vlm_data)

CER: 0.5980108118227985
