In [None]:
from pdf2image import convert_from_path
import easyocr
import numpy as np
import os

# Convert PDF to images
pdf_path = '/content/python_handwritten-9-29.pdf'
output_folder = 'extracted_images'
os.makedirs(output_folder, exist_ok=True)

# One PIL image per PDF page; a JPEG copy of each page is also written to
# `output_folder` so the rendered pages can be inspected by hand.
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
    image_path = f"{output_folder}/page_{i+1}.jpg"
    image.save(image_path, 'JPEG')

# Handwriting OCR using EasyOCR
reader = easyocr.Reader(['en'])  # English language

# Collect one formatted chunk per page and join once at the end instead of
# growing a string inside the loop.
page_chunks = []

for i, image in enumerate(images):
    # The in-memory page is handed to EasyOCR as a NumPy array.
    image_np = np.array(image)

    # Perform OCR on the image
    result = reader.readtext(image_np)

    # Element [1] of every detection is the recognised string.
    text = "\n".join([item[1] for item in result])
    page_chunks.append(f"Page {i+1}:\n{text}\n\n")

extracted_text = "".join(page_chunks)

# Save extracted text. The encoding is explicit so that non-ASCII characters in
# the OCR output cannot fail on a platform whose default encoding is not UTF-8.
with open("extracted_text.txt", "w", encoding="utf-8") as file:
    file.write(extracted_text)

print("Text extraction completed!")


Text extraction completed!


In [None]:
from pdf2image import convert_from_path
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
import os

# Step 1: Convert PDF to Images
pdf_path = '/content/42557_BDA_1.pdf'
output_folder = 'extracted_images'
os.makedirs(output_folder, exist_ok=True)

# One PIL image per PDF page; a JPEG copy of each page is also written to
# `output_folder` so the rendered pages can be inspected by hand.
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
    image_path = f"{output_folder}/page_{i+1}.jpg"
    image.save(image_path, 'JPEG')

# Step 2: Load TrOCR Model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

# Enable GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Step 3: Perform OCR
# NOTE(review): TrOCR checkpoints are documented as single-text-line
# recognisers, so passing a whole page presumably yields only a short or
# garbled string. Consider cropping pages into text lines first -- TODO confirm.
# NOTE(review): no generation length is set on `generate` below, so the
# checkpoint's default limit applies; verify the output is not truncated.
page_chunks = []

for i, image in enumerate(images):
    image = image.convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)  # inputs must live on the model's device

    # Generate text
    output_ids = model.generate(pixel_values)
    text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]

    page_chunks.append(f"Page {i+1}:\n{text}\n\n")

# Join once at the end instead of growing a string inside the loop.
extracted_text = "".join(page_chunks)

# Step 4: Save Extracted Text. The encoding is explicit so that non-ASCII
# characters in the OCR output cannot fail on a platform whose default
# encoding is not UTF-8.
with open("trans_t.txt", "w", encoding="utf-8") as file:
    file.write(extracted_text)

print("Text extraction completed using TrOCR!")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 1024,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_

Text extraction completed using TrOCR!


In [None]:
!pip install transformers



In [None]:
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 30 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 1s (204 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126210 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.6_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.6) ...
Setting up poppler-utils (22.02.0-2ubuntu0.6) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
!pip install pdf2image easyocr

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->easyocr)
  D

In [None]:
r"""
conda env remove --name trOCR
conda env create --name trOCR --file environment.yml

cache folder
C:\Users\techexpert\.cache\huggingface\hub

nvidia-smi for GPU info

cd scripts
python trOCR.py
"""

from PIL import Image
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import os
import requests
# from my_timer import my_timer


def run_trOCR(model_name="microsoft/trocr-base-printed", images=""):
    """
    Run a TrOCR checkpoint on an image, print the recognised text and return it.

    There are 3 main models to choose from, small, base and large.
    Some other fine-tuned models: IAM Handwritten, SROIE Receipts

    Args:
        model_name: Hugging Face model id, passed to ``from_pretrained`` for
            both the processor and the model.
        images: Image input forwarded unchanged to the processor (the caller
            in this file passes an RGB PIL image). The empty-string default is
            only a placeholder and is rejected.

    Returns:
        The first decoded string of the generated batch (also printed).

    Raises:
        ValueError: if ``images`` is ``None`` or an empty string.
    """
    # Fail fast, before any model weights are loaded.
    if images is None or (isinstance(images, str) and not images):
        raise ValueError("run_trOCR() needs an image to recognise; `images` is empty")

    processor = TrOCRProcessor.from_pretrained(model_name)
    model = VisionEncoderDecoderModel.from_pretrained(model_name)

    # Check for GPU availability
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"running on {device}")
    model.to(device)  # Move model to the selected device

    # Use the argument that was passed in rather than a module-level variable,
    # so the function does not depend on leftover notebook state.
    pixel_values = processor(images, return_tensors="pt").pixel_values.to(device)
    generated_ids = model.generate(pixel_values, max_new_tokens=1000)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)
    return generated_text


# Demo driver: loads one image from disk and runs TrOCR on it.
# This `if` sits at module level, so `model_id` and `image` become module-level
# names once the cell has run; other notebook code may reference them, so keep
# the names stable.
if __name__ == "__main__":
    # Checkpoint used for the demo. The trailing note presumably records
    # earlier sample outputs -- TODO confirm.
    model_id = "microsoft/trocr-large-handwritten" # indus tre, This is a sample of text

    # The image is converted to RGB before it is handed to run_trOCR.
    image = Image.open("/content/image.png").convert("RGB")
    run_trOCR(model_id, image)

    # Alternative sample input, currently disabled:
    # image = Image.open("handwriting_oneline.jpg").convert("RGB")
    # run_trOCR(model_id, image)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 1024,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_

running on cuda
1907 08


In [None]:
! pip install -q paddlepaddle

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.8/192.8 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[0m

In [None]:
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
import os

# Step 1: Convert PDF to Images
pdf_path = '/content/42557_BDA_1.pdf'
output_folder = 'extracted_images'
os.makedirs(output_folder, exist_ok=True)

# One PIL image per PDF page, saved as JPEG because the OCR step below reads
# the pages back from disk by path.
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
    image_path = f"{output_folder}/page_{i+1}.jpg"
    image.save(image_path, 'JPEG')

# Step 2: OCR with PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Enable angle detection for rotated text

# Collect one formatted chunk per page and join once at the end instead of
# growing a string inside the loop.
page_chunks = []

for i, image in enumerate(images):
    image_path = f"{output_folder}/page_{i+1}.jpg"
    result = ocr.ocr(image_path)

    # `result` holds one entry per input image. That entry is None (not an
    # empty list) when nothing was detected, so a blank page must not crash
    # the whole run -- it simply contributes no text.
    detections = result[0] if result and result[0] else []

    # Each detection is indexed as [1][0] for its recognised text.
    page_text = "\n".join([line[1][0] for line in detections])
    page_chunks.append(f"Page {i+1}:\n{page_text}\n\n")

extracted_text = "".join(page_chunks)

# Step 3: Save Extracted Text. The encoding is explicit so that non-ASCII
# characters in the OCR output cannot fail on a platform whose default
# encoding is not UTF-8.
with open("paddleocr_extracted_text.txt", "w", encoding="utf-8") as file:
    file.write(extracted_text)

print("Text extraction completed using PaddleOCR!")




download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:16<00:00, 231.39it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:17<00:00, 568.04it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:15<00:00, 141.21it/s]

[2025/04/03 17:40:54] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_l




[2025/04/03 17:40:57] ppocr DEBUG: dt_boxes num : 31, elapsed : 0.2950916290283203
[2025/04/03 17:40:58] ppocr DEBUG: cls num  : 31, elapsed : 0.12031435966491699
[2025/04/03 17:41:02] ppocr DEBUG: rec_res num  : 31, elapsed : 4.611916542053223
[2025/04/03 17:41:02] ppocr DEBUG: dt_boxes num : 36, elapsed : 0.2313833236694336
[2025/04/03 17:41:03] ppocr DEBUG: cls num  : 36, elapsed : 0.06926894187927246
[2025/04/03 17:41:07] ppocr DEBUG: rec_res num  : 36, elapsed : 4.6275787353515625
[2025/04/03 17:41:07] ppocr DEBUG: dt_boxes num : 38, elapsed : 0.22621703147888184
[2025/04/03 17:41:08] ppocr DEBUG: cls num  : 38, elapsed : 0.16802000999450684
[2025/04/03 17:41:11] ppocr DEBUG: rec_res num  : 38, elapsed : 2.914079427719116
Text extraction completed using PaddleOCR!
