In [2]:
import os
import pandas as pd
import json

In [3]:
train_entities_path = 'train/entities'
test_entities_path = 'test/entities'

In [4]:
def load_entities(path):
    entities = []
    for file in os.listdir(path):
        if file.endswith('.txt'):
            with open(os.path.join(path, file), 'r') as f:
                data = json.load(f)
                data['file'] = file.replace('.txt', '.jpg')
                entities.append(data)
    return entities

In [None]:
train_entities = load_entities(train_entities_path)
test_entities = load_entities(test_entities_path)

df_train = pd.DataFrame(train_entities)
df_test = pd.DataFrame(test_entities)

In [5]:
print("Train Data:")
print(df_train.head())

Train Data:
                           company        date  \
0  DIGI TELECOMMUNICATIONS SDN BHD  13/10/2017   
1   GARDENIA BAKERIES (KL) SDN BHD  30/10/2017   
2               MR. D.I.Y. SDN BHD    24-11-17   
3               RESTORAN WAN SHENG  21-03-2018   
4                  ADVANCO COMPANY  17/01/2018   

                                             address    total  \
0  LOT LG 315, 1-UTAMA SHOPPING CENTRE, LEBUH BAN...   234.40   
1  LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SE...    62.60   
2  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...  RM 3.90   
3  NO.2, JALAN TEMENGGUNG 19/9, SEKSYEN 9, BANDAR...     6.70   
4  NO 1&3, JALAN WANGSA DELIMA 12, WANGSA LINK, W...    29.00   

               file  
0  X51006555072.jpg  
1  X51006557117.jpg  
2  X51005568884.jpg  
3  X51005711441.jpg  
4  X51005806685.jpg  


In [6]:
print("\nTest Data:")
print(df_test.head())


Test Data:
                            company        date  \
0       SYARIKAT PERNIAGAAN GIN KEE  25/01/2018   
1          HON HWA HARDWARE TRADING  05/01/2017   
2             OJC MARKETING SDN BHD  15/01/2019   
3    GARDENIA BAKERIES (KL) SDN BHD  27/10/2017   
4  GERBANG ALAF RESTAURANTS SDN BHD  24/06/2018   

                                             address   total              file  
0  NO 290, JALAN AIR PANAS, SETAPAK, 53200, KUALA...  190.80  X51005675104.jpg  
1  NO 37, JALAN MANIS 7, TAMAN SEGAR, 56100 CHERA...    5.00  X51005568890.jpg  
2  NO 2 & 4, JALAN BAYU 4, BANDAR SERI ALAM, B175...  193.00  X00016469670.jpg  
3  LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SE...   35.88  X51006557507.jpg  
4  LEVEL 6, BANGUNAN TH, DAMANSARA UPTOWN3 NO.3, ...    7.35  X51007846396.jpg  


In [7]:
def data_exploration(df):
    print(f"Total number of images: {len(df)}")
    print(f"Classes: {df.columns.tolist()}")

    for column in df.columns:
        if column != 'file':
            print(f"\nClass: {column}")
            print(df[column].value_counts().head(10))

In [8]:
print("\nTrain Data Exploration:")
data_exploration(df_train)


Train Data Exploration:
Total number of images: 626
Classes: ['company', 'date', 'address', 'total', 'file']

Class: company
GARDENIA BAKERIES (KL) SDN BHD    45
UNIHAKKA INTERNATIONAL SDN BHD    42
SANYU STATIONERY SHOP             36
99 SPEED MART S/B                 31
MR. D.I.Y. (M) SDN BHD            29
RESTORAN WAN SHENG                26
SYARIKAT PERNIAGAAN GIN KEE       22
AEON CO. (M) BHD                  15
POPULAR BOOK CO. (M) SDN BHD      12
KEDAI PAPAN YEW CHUAN             12
Name: company, dtype: int64

Class: date
20/03/2018    5
11/05/2018    5
27/03/2018    5
12/02/2018    4
04/12/2017    4
02/01/2018    4
20/04/2018    4
21/03/2018    3
14/04/2018    3
03/02/2018    3
Name: date, dtype: int64

Class: address
LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SELANGOR.                                               46
LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERINDUSTRIAN BALAKONG, 43300 SERI KEMBANGAN, SELANGOR    44
NO. 31G&33G, JALAN SETIA INDAH X ,U13/X 40170 SETIA ALAM

In [9]:
print("\nTest Data Exploration:")
data_exploration(df_test)


Test Data Exploration:
Total number of images: 347
Classes: ['company', 'date', 'address', 'total', 'file']

Class: company
GARDENIA BAKERIES (KL) SDN BHD      31
UNIHAKKA INTERNATIONAL SDN BHD      27
RESTORAN WAN SHENG                  14
99 SPEED MART S/B                   13
SANYU STATIONERY SHOP               13
MR. D.I.Y. (M) SDN BHD              12
SYARIKAT PERNIAGAAN GIN KEE         11
AEON CO. (M) BHD                     8
KEDAI PAPAN YEW CHUAN                7
GERBANG ALAF RESTAURANTS SDN BHD     7
Name: company, dtype: int64

Class: date
21/03/2018    4
01/02/2018    4
09/02/2018    3
03/06/2018    3
04/01/2018    3
21-03-2018    3
30/04/2017    3
13/01/2018    3
25/03/2018    3
14/03/2018    3
Name: date, dtype: int64

Class: address
LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SELANGOR.                                               29
12, JALAN TAMPOI 7/4,KAWASAN PERINDUSTRIAN TAMPOI,81200 JOHOR BAHRU,JOHOR                           23
LOT 1851-A & 1851-B, JALAN KPB 6, KAW

In [11]:
import os
import paddleocr
from paddleocr import PaddleOCR, draw_ocr
import matplotlib.pyplot as plt
from PIL import Image

# Initialize the PaddleOCR model
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Define paths
train_img_path = 'train/img'
test_img_path = 'test/img'

# Function to perform OCR on an image
def extract_text(image_path):
    result = ocr.ocr(image_path, cls=True)
    return result

# Function to visualize the OCR results
def visualize_ocr(image_path, ocr_result):
    image = Image.open(image_path).convert('RGB')
    boxes = [line[0] for line in ocr_result]
    txts = [line[1][0] for line in ocr_result]
    scores = [line[1][1] for line in ocr_result]
    im_show = draw_ocr(image, boxes, txts, scores, font_path='path/to/font.ttf')
    im_show = Image.fromarray(im_show)
    plt.imshow(im_show)
    plt.show()

# Perform OCR on a sample train image
sample_image_path = os.path.join(train_img_path, 'X51006555072.jpg')
ocr_result = extract_text(sample_image_path)

# Print OCR result
for line in ocr_result:
    print(line)


[2024/07/08 22:18:55] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/shreshtagundoju/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/shreshtagundoju/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batc

[2024/07/08 22:18:57] ppocr DEBUG: dt_boxes num : 34, elapsed : 0.3725309371948242
[2024/07/08 22:18:57] ppocr DEBUG: cls num  : 34, elapsed : 0.2641141414642334
[2024/07/08 22:19:04] ppocr DEBUG: rec_res num  : 34, elapsed : 7.308068037033081
[[[[112.0, 181.0], [498.0, 181.0], [498.0, 202.0], [112.0, 202.0]], ('DIGI TELECOMMUNICATIONS SDN BHD', 0.9354791045188904)], [[[240.0, 206.0], [360.0, 206.0], [360.0, 227.0], [240.0, 227.0]], ('(201283-M', 0.9654237031936646)], [[[85.0, 227.0], [533.0, 230.0], [533.0, 255.0], [85.0, 252.0]], ('LOT LG 3151-UTAMA SHOPPINGCENTRE,', 0.917939305305481)], [[[97.0, 253.0], [498.0, 256.0], [497.0, 280.0], [97.0, 277.0]], ('LEBUH BANDAR UTAMA-BANDAR UTAMA,', 0.9297536611557007)], [[[174.0, 281.0], [426.0, 281.0], [426.0, 303.0], [174.0, 303.0]], ('Petaling Jaya47800', 0.929825484752655)], [[[251.0, 305.0], [350.0, 305.0], [350.0, 326.0], [251.0, 326.0]], ('SELANGOR', 0.9849645495414734)], [[[238.0, 356.0], [373.0, 356.0], [373.0, 377.0], [238.0, 377.0]],

In [None]:
# Visualize the OCR result
visualize_ocr(sample_image_path, ocr_result)