In [1]:
import pandas as pd
import os
from tqdm import tqdm
from paddleocr import PaddleOCR
from PIL import Image
import requests
from io import BytesIO
import numpy as np

# Enable tqdm with pandas
tqdm.pandas()

# Initialize the PaddleOCR model outside the loop
ocr = PaddleOCR(
    use_angle_cls=True, 
    lang='en', 
    use_gpu=True
)

def ocr_from_image_url(image_url):
    try:
        # Download the image from the URL
        response = requests.get(image_url)
        response.raise_for_status()  # Check if the request was successful

        # Open the image using PIL
        image = Image.open(BytesIO(response.content))

        # Convert the image to a format compatible with PaddleOCR (NumPy array)
        image_np = np.array(image)

        # Perform OCR using PaddleOCR
        result = ocr.ocr(image_np)

        # Extract and concatenate text from the OCR result
        text = '\n'.join([line[1][0] for line in result[0]])
        return text

    except requests.exceptions.RequestException as e:
        return f"Error downloading image: {e}"
    except Exception as e:
        return f"Error processing image: {e}"

# Load the dataset
dataframe = pd.read_csv('../dataset/test.csv')

# Output file path
output_file = '../dataset/test_ocr.csv'

# File to store the last processed index
checkpoint_file = '../dataset/test_last_processed_index.txt'

# Read the last processed index from the checkpoint file if it exists
start_index = 0
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as f:
        start_index = int(f.read().strip())

# Initialize the output CSV file with headers if it does not exist
if not os.path.exists(output_file):
    dataframe['paddleocr_output'] = ""  # Add a placeholder column for OCR output
    dataframe.to_csv(output_file, index=False)

# Process each image link and append results to the CSV file
for index, row in tqdm(dataframe.iloc[start_index:].iterrows(), total=len(dataframe) - start_index):
    image_url = row['image_link']
    ocr_output = ocr_from_image_url(image_url)
    
    # Create a DataFrame for the single row with the OCR output
    result_df = pd.DataFrame([row.to_dict()])  # Convert row to dictionary and wrap it in a DataFrame
    result_df['paddleocr_output'] = ocr_output  # Add the OCR output column

    # Append the row to the CSV file
    result_df.to_csv(output_file, mode='a', header=False, index=False)
    
    # Save the current index to the checkpoint file
    with open(checkpoint_file, 'w') as f:
        f.write(str(index))

print("Processing completed.")


[2024/09/15 05:15:24] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/sadnyd/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/sadnyd/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mo

  0%|          | 0/131187 [00:00<?, ?it/s]

[2024/09/15 05:15:26] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.13692259788513184
[2024/09/15 05:15:26] ppocr DEBUG: cls num  : 4, elapsed : 0.012123823165893555
[2024/09/15 05:15:26] ppocr DEBUG: rec_res num  : 4, elapsed : 0.1336050033569336


  0%|          | 1/131187 [00:00<19:18:05,  1.89it/s]

[2024/09/15 05:15:26] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.09101581573486328
[2024/09/15 05:15:26] ppocr DEBUG: cls num  : 6, elapsed : 0.023567676544189453
[2024/09/15 05:15:27] ppocr DEBUG: rec_res num  : 6, elapsed : 0.4910616874694824


  0%|          | 2/131187 [00:01<24:59:34,  1.46it/s]

[2024/09/15 05:15:27] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.0206143856048584
[2024/09/15 05:15:27] ppocr DEBUG: cls num  : 6, elapsed : 0.012270689010620117
[2024/09/15 05:15:27] ppocr DEBUG: rec_res num  : 6, elapsed : 0.2031702995300293


  0%|          | 3/131187 [00:01<20:19:23,  1.79it/s]

[2024/09/15 05:15:27] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.019366741180419922
[2024/09/15 05:15:27] ppocr DEBUG: cls num  : 6, elapsed : 0.00997018814086914
[2024/09/15 05:15:27] ppocr DEBUG: rec_res num  : 6, elapsed : 0.27202272415161133


  0%|          | 4/131187 [00:02<18:48:04,  1.94it/s]

[2024/09/15 05:15:28] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.01946401596069336
[2024/09/15 05:15:28] ppocr DEBUG: cls num  : 6, elapsed : 0.009921550750732422
[2024/09/15 05:15:28] ppocr DEBUG: rec_res num  : 6, elapsed : 0.2576591968536377


  0%|          | 5/131187 [00:02<17:49:24,  2.04it/s]

[2024/09/15 05:15:28] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.020351886749267578
[2024/09/15 05:15:28] ppocr DEBUG: cls num  : 6, elapsed : 0.009547710418701172
[2024/09/15 05:15:28] ppocr DEBUG: rec_res num  : 6, elapsed : 0.20656394958496094


  0%|          | 6/131187 [00:03<16:33:36,  2.20it/s]

[2024/09/15 05:15:28] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.019449472427368164
[2024/09/15 05:15:28] ppocr DEBUG: cls num  : 6, elapsed : 0.009478330612182617
[2024/09/15 05:15:29] ppocr DEBUG: rec_res num  : 6, elapsed : 0.25672388076782227


  0%|          | 7/131187 [00:03<16:20:07,  2.23it/s]

[2024/09/15 05:15:29] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.10107135772705078
[2024/09/15 05:15:29] ppocr DEBUG: cls num  : 3, elapsed : 0.03205227851867676
[2024/09/15 05:15:29] ppocr DEBUG: rec_res num  : 3, elapsed : 0.1536710262298584


  0%|          | 8/131187 [00:03<16:20:50,  2.23it/s]

[2024/09/15 05:15:32] ppocr DEBUG: dt_boxes num : 3, elapsed : 2.463170051574707
[2024/09/15 05:15:33] ppocr DEBUG: cls num  : 3, elapsed : 0.41536855697631836
[2024/09/15 05:15:35] ppocr DEBUG: rec_res num  : 3, elapsed : 2.0007522106170654


  0%|          | 9/131187 [00:09<78:56:13,  2.17s/it]

[2024/09/15 05:15:49] ppocr DEBUG: dt_boxes num : 3, elapsed : 13.939943552017212
[2024/09/15 05:15:50] ppocr DEBUG: cls num  : 3, elapsed : 0.30678677558898926
[2024/09/15 05:15:50] ppocr DEBUG: rec_res num  : 3, elapsed : 0.4347085952758789


  0%|          | 10/131187 [00:25<226:22:39,  6.21s/it]

[2024/09/15 05:15:51] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.47978973388671875
[2024/09/15 05:15:51] ppocr DEBUG: cls num  : 6, elapsed : 0.2550065517425537
[2024/09/15 05:15:52] ppocr DEBUG: rec_res num  : 6, elapsed : 0.5189225673675537


  0%|          | 11/131187 [00:26<174:46:16,  4.80s/it]

[2024/09/15 05:15:53] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.4459662437438965
[2024/09/15 05:15:53] ppocr DEBUG: cls num  : 3, elapsed : 0.4220306873321533
[2024/09/15 05:15:53] ppocr DEBUG: rec_res num  : 3, elapsed : 0.44797706604003906


  0%|          | 12/131187 [00:28<139:45:06,  3.84s/it]

[2024/09/15 05:15:54] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.34520769119262695
[2024/09/15 05:15:55] ppocr DEBUG: cls num  : 2, elapsed : 0.43340468406677246
[2024/09/15 05:15:56] ppocr DEBUG: rec_res num  : 2, elapsed : 1.1642775535583496


  0%|          | 13/131187 [00:30<123:01:15,  3.38s/it]

[2024/09/15 05:15:56] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.3513205051422119
[2024/09/15 05:15:57] ppocr DEBUG: cls num  : 2, elapsed : 0.35610175132751465
[2024/09/15 05:15:57] ppocr DEBUG: rec_res num  : 2, elapsed : 0.4057455062866211


  0%|          | 14/131187 [00:32<102:25:15,  2.81s/it]

[2024/09/15 05:15:58] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.4261462688446045
[2024/09/15 05:15:58] ppocr DEBUG: cls num  : 2, elapsed : 0.35069799423217773
[2024/09/15 05:15:59] ppocr DEBUG: rec_res num  : 2, elapsed : 0.5340771675109863


  0%|          | 15/131187 [00:33<90:34:02,  2.49s/it] 

[2024/09/15 05:16:00] ppocr DEBUG: dt_boxes num : 0, elapsed : 0.954986572265625
[2024/09/15 05:16:00] ppocr DEBUG: cls num  : 0, elapsed : 0
[2024/09/15 05:16:00] ppocr DEBUG: rec_res num  : 0, elapsed : 3.5762786865234375e-06


  0%|          | 16/131187 [00:35<78:33:05,  2.16s/it]

[2024/09/15 05:16:03] ppocr DEBUG: dt_boxes num : 0, elapsed : 1.7394587993621826
[2024/09/15 05:16:03] ppocr DEBUG: cls num  : 0, elapsed : 0
[2024/09/15 05:16:03] ppocr DEBUG: rec_res num  : 0, elapsed : 8.344650268554688e-06


  0%|          | 17/131187 [00:37<81:17:26,  2.23s/it]

[2024/09/15 05:16:04] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.6357040405273438
[2024/09/15 05:16:05] ppocr DEBUG: cls num  : 15, elapsed : 0.9401195049285889
[2024/09/15 05:16:09] ppocr DEBUG: rec_res num  : 15, elapsed : 4.06330418586731


  0%|          | 18/131187 [00:43<124:46:34,  3.42s/it]

[2024/09/15 05:16:10] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.41402530670166016
[2024/09/15 05:16:11] ppocr DEBUG: cls num  : 15, elapsed : 1.5419538021087646
[2024/09/15 05:16:14] ppocr DEBUG: rec_res num  : 15, elapsed : 2.4605798721313477


  0%|          | 19/131187 [00:48<141:00:17,  3.87s/it]

[2024/09/15 05:16:15] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.39212727546691895
[2024/09/15 05:16:16] ppocr DEBUG: cls num  : 15, elapsed : 0.8407902717590332
[2024/09/15 05:16:18] ppocr DEBUG: rec_res num  : 15, elapsed : 2.2774758338928223


  0%|          | 20/131187 [00:52<143:18:47,  3.93s/it]

[2024/09/15 05:16:24] ppocr DEBUG: dt_boxes num : 3, elapsed : 5.296145915985107
[2024/09/15 05:16:25] ppocr DEBUG: cls num  : 3, elapsed : 0.8421471118927002
[2024/09/15 05:16:26] ppocr DEBUG: rec_res num  : 3, elapsed : 1.658698558807373


  0%|          | 21/131187 [01:01<191:52:00,  5.27s/it]

[2024/09/15 05:16:34] ppocr DEBUG: dt_boxes num : 1, elapsed : 6.787706136703491
[2024/09/15 05:16:38] ppocr DEBUG: cls num  : 1, elapsed : 4.687837600708008
[2024/09/15 05:16:42] ppocr DEBUG: rec_res num  : 1, elapsed : 3.4150640964508057


  0%|          | 22/131187 [01:16<303:26:51,  8.33s/it]

[2024/09/15 05:16:54] ppocr DEBUG: dt_boxes num : 10, elapsed : 11.524568796157837
[2024/09/15 05:17:37] ppocr DEBUG: cls num  : 10, elapsed : 42.74043798446655
[2024/09/15 05:17:57] ppocr DEBUG: rec_res num  : 10, elapsed : 20.569838047027588


  0%|          | 23/131187 [02:32<1042:52:53, 28.62s/it]

[2024/09/15 05:18:00] ppocr DEBUG: dt_boxes num : 2, elapsed : 1.4107739925384521
[2024/09/15 05:18:16] ppocr DEBUG: cls num  : 2, elapsed : 16.30612349510193
[2024/09/15 05:18:59] ppocr DEBUG: rec_res num  : 2, elapsed : 42.28260254859924


  0%|          | 24/131187 [03:34<1406:14:56, 38.60s/it]

[2024/09/15 05:19:38] ppocr DEBUG: dt_boxes num : 2, elapsed : 37.20241117477417
[2024/09/15 05:19:50] ppocr DEBUG: cls num  : 2, elapsed : 11.878809928894043
