In [19]:
import cv2
import pytesseract

# Optional: point pytesseract at the Tesseract executable (mainly needed on Windows).
# Set the TESSERACT_CMD environment variable to override the default install location.
# The override is applied only when that file exists, so a machine that has
# `tesseract` on PATH keeps pytesseract's built-in default instead of being
# pointed at a Windows path that is not there.
import os

TESSERACT_CMD = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)
if os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD


def preprocess_image(img_path):
    """Load an image from disk and binarize it for OCR.

    Args:
        img_path: Path of the image file, passed to ``cv2.imread``.

    Returns:
        The inverted binary image obtained by Otsu-thresholding the
        grayscale version of the input.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load ``img_path``.
    """
    image = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here with the offending path rather than with an opaque cvtColor error.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {img_path!r}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Binarization - makes text more visible. With THRESH_OTSU the threshold is
    # computed from the image, so the `thresh` argument is ignored; 0 is the
    # conventional placeholder.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return thresh

def extract_handwritten_text(img_path):
    """Run Tesseract OCR on the binarized version of an image file.

    Args:
        img_path: Path of the image; forwarded to ``preprocess_image``.

    Returns:
        The string returned by ``pytesseract.image_to_string`` when called
        with page segmentation mode 6 (``--psm 6``).
    """
    return pytesseract.image_to_string(preprocess_image(img_path), config='--psm 6')

# Example usage: pick one sample image, run the Tesseract pipeline on it and
# print the recognized text.
# Alternative inputs (uncomment one to replace the active img_file below):
# img_file = 'data/handwritten_sample.jpeg'
# img_file = 'data/handwritten_sample1.jpg'
img_file = 'data/text_sample.png'
extracted_text = extract_handwritten_text(img_file)
print("Extracted Text:\n", extracted_text)


Extracted Text:
 It was the best of
times, it was the worst
of times, it was the age
of wisdom, it was the
age of foolishness...



In [4]:
!pip install transformers einops timm pillow

Collecting einops
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting timm
  Downloading timm-1.0.17-py3-none-any.whl.metadata (59 kB)
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
Downloading timm-1.0.17-py3-none-any.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   -------- ------------------------------- 0.5/2.5 MB 2.2 MB/s eta 0:00:01
   --------------------------------- ------ 2.1/2.5 MB 4.9 MB/s eta 0:00:01
   ---------------------------------------- 2.5/2.5 MB 4.3 MB/s eta 0:00:00
Installing collected packages: einops, timm

   ---------------------------------------- 0/2 [einops]
   -------------------- ------------------- 1/2 [timm]
   -------------------- ------------------- 1/2 [timm]
   -------------------- ------------------- 1/2 [timm]
   -------------------- ------------------- 1/2 [timm]
   -------------------- ------------------- 1/2 [timm]
  

In [5]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AutoModel
from PIL import Image
import torch
import cv2


# --- Jina CLIP v2 image embeddings ---
# NOTE(review): trust_remote_code=True executes Python files downloaded from the
# Hub; pass a `revision=` argument to pin them once a vetted commit is chosen.
# The CLIP model gets its own name so it is never confused with the TrOCR
# `model` that extract_handwritten_text_trocr relies on.
clip_model = AutoModel.from_pretrained('jinaai/jina-clip-v2', trust_remote_code=True)

# Choose a matryoshka dimension, set to None to get the full 1024-dim vectors
truncate_dim = 512

all_images = ['https://i.ibb.co/nQNGqL0/beach1.jpg', 'https://i.ibb.co/r5w8hG8/beach2.jpg']
image_embeddings = clip_model.encode_image(all_images, truncate_dim=truncate_dim)

# Only `image_embeddings` is kept; drop the reference so the CLIP weights can
# be garbage-collected before the OCR model is loaded.
del clip_model

# --- TrOCR handwriting recognition ---
# Load processor and model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

def extract_handwritten_text_trocr(image_path):
    """Recognize handwritten text in an image file with TrOCR.

    Relies on the module-level ``processor`` and ``model`` objects.

    Args:
        image_path: Path of the image, opened with ``PIL.Image.open``.

    Returns:
        The first decoded sequence produced by the model, with special
        tokens removed.
    """
    # Image.open keeps the underlying file open; the context manager closes
    # it as soon as the RGB copy has been created.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Preprocess and generate
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)

    # Decode output
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Example usage
import os

img_path = "data/handwritten_sample.jpeg"
img = cv2.imread(img_path)
if img is None:  # cv2.imread returns None rather than raising on failure
    raise FileNotFoundError(f"Could not read image: {img_path!r}")
inverted_image = cv2.bitwise_not(img)  # works on the image already in memory, not on the path

# One definition of where the inverted copy lives, so the path variable always
# names the file that is actually written.
inverted_image_path = "temp/handwritten_sample_inverted.jpeg"
os.makedirs(os.path.dirname(inverted_image_path), exist_ok=True)
# cv2.imwrite returns False on failure instead of raising.
if not cv2.imwrite(inverted_image_path, inverted_image):
    raise OSError(f"Could not write image: {inverted_image_path!r}")

# OCR runs on the original image; pass inverted_image_path instead to try the
# inverted copy.
text = extract_handwritten_text_trocr(img_path)
print("Extracted Text:\n", text)


rope_embeddings.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- rope_embeddings.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


hf_model.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- hf_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


eva_model.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- eva_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- transform.py
- rope_embeddings.py
- hf_model.py
- eva_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.73G [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


configuration_xlm_roberta.py: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py: 0.00B [00:00, ?B/s]

modeling_xlm_roberta.py: 0.00B [00:00, ?B/s]

block.py: 0.00B [00:00, ?B/s]

stochastic_depth.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mlp.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py: 0.00B [00:00, ?B/s]

rotary.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- stochastic_depth.py
- mlp.py
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- block.py
- embedding.py
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_lora.py
- modeling_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

processing_clip.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- processing_clip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted Text:
 " The Burden of the American Revolution


In [None]:
import cv2
import numpy as np
from PIL import Image
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Initialize TrOCR components
# The checkpoint name is defined once so the processor and the model cannot
# drift apart when it is changed.
TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
ocr_model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)

def upscale_image(img, scale=2):
    """Enlarge an image by the same factor along both axes.

    Args:
        img: Image passed to ``cv2.resize``.
        scale: Multiplier applied to both width and height (default 2).

    Returns:
        The image returned by ``cv2.resize`` with ``cv2.INTER_CUBIC``
        interpolation.
    """
    resize_options = {"fx": scale, "fy": scale, "interpolation": cv2.INTER_CUBIC}
    # No explicit target size is given (None); the scale factors decide it.
    return cv2.resize(img, None, **resize_options)

def invert_image(img):
    """Return the bitwise complement of an image.

    Args:
        img: Image passed to ``cv2.bitwise_not``.

    Returns:
        The result of ``cv2.bitwise_not(img)``.
    """
    inverted = cv2.bitwise_not(img)
    return inverted

def sharpen_image(img):
    """Sharpen an image by convolving it with a fixed 3x3 kernel.

    Args:
        img: Image passed to ``cv2.filter2D``.

    Returns:
        The filtered image returned by ``cv2.filter2D``.
    """
    # Centre weight 5 with -1 on the four direct neighbours (weights sum to 1).
    sharpening_kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    # Second argument -1 is forwarded unchanged as filter2D's depth parameter.
    sharpened = cv2.filter2D(img, -1, sharpening_kernel)
    return sharpened

def clean_image_pipeline(img_path, output_path="temp/cleaned_image.jpeg"):
    """Upscale, invert and sharpen an image, then save the result.

    Args:
        img_path: Path of the input image, read with ``cv2.imread``.
        output_path: Destination file for the cleaned image. Its parent
            directory is created when it does not exist yet.

    Returns:
        A ``(output_path, img)`` tuple: the path that was written and the
        cleaned image.

    Raises:
        FileNotFoundError: If the input image cannot be read.
        OSError: If the cleaned image cannot be written.
    """
    import os  # local import: not provided by this cell's import block

    # Load image
    img = cv2.imread(img_path)
    # cv2.imread returns None on failure instead of raising.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path!r}")

    # Apply cleaning steps
    img = upscale_image(img)
    img = invert_image(img)
    img = sharpen_image(img)

    # Save cleaned image
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # cv2.imwrite returns False on failure; do not hand back a path that was
    # never written.
    if not cv2.imwrite(output_path, img):
        raise OSError(f"Could not write image: {output_path!r}")
    return output_path, img

def extract_handwritten_text_trocr_from_cv2img(img):
    """Recognize handwritten text in an in-memory OpenCV image with TrOCR.

    Relies on the module-level ``processor`` and ``ocr_model`` objects.

    Args:
        img: Image converted with ``cv2.COLOR_BGR2RGB`` before being handed
            to PIL, i.e. expected in BGR channel order.

    Returns:
        The first decoded sequence produced by the model, with special
        tokens removed.
    """
    # Swap the channel order from BGR to RGB before building the PIL image.
    rgb_array = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_array)

    model_inputs = processor(images=pil_image, return_tensors="pt")
    token_ids = ocr_model.generate(model_inputs.pixel_values)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# ---- Example usage ---- #
# Clean the sample image, then run TrOCR on the cleaned array held in memory.
# The guard keeps this demo from running if the code is imported as a module.
if __name__ == "__main__":
    # cleaned_path names the file saved by the pipeline; only the in-memory
    # array is used for recognition below.
    cleaned_path, cleaned_img = clean_image_pipeline("data/handwritten_sample.jpeg")
    text = extract_handwritten_text_trocr_from_cv2img(cleaned_img)
    print("Extracted Text:\n", text)


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted Text:
 1961 62
