**Installing required libraries and keras_ocr**

In [None]:
# Pin TensorFlow to 2.15.1; --force-reinstall replaces whatever version is already present.
!pip install --force-reinstall -v "tensorflow==2.15.1"
from IPython.display import clear_output
# Wipe the verbose (-v) pip log from the cell output once the install has finished.
clear_output()

In [None]:
# Install keras-ocr (no version pin) and then clear the pip log.
# clear_output is the function imported in the TensorFlow install cell.
!pip install keras-ocr
clear_output()

In [None]:
import keras_ocr
import matplotlib.pyplot as plt

Pipeline for OCR

In [None]:
# Build the keras-ocr pipeline with its default arguments. As the log below shows,
# the first run downloads craft_mlt_25k.h5 and crnn_kurapan.h5 into /root/.keras-ocr.
pipeline = keras_ocr.pipeline.Pipeline()

Looking for /root/.keras-ocr/craft_mlt_25k.h5
Downloading /root/.keras-ocr/craft_mlt_25k.h5


Instructions for updating:
Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.


Looking for /root/.keras-ocr/crnn_kurapan.h5
Downloading /root/.keras-ocr/crnn_kurapan.h5


Mounting drive for accessing dataset

In [None]:
from google.colab import drive
import os
import keras_ocr
from PIL import Image
import numpy as np


# drive.mount('/content/drive')

Mounted at /content/drive


Dataset collection

In [None]:
german_image = '/content/german_images'
english_image = '/content/english_images'

# Extensions carry their leading dot so that a name such as "notes_png"
# (which has no real extension) is not mistaken for an image.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def list_image_paths(folder, extensions=IMAGE_EXTENSIONS):
    """Return the full paths of the image files located directly inside `folder`.

    Args:
        folder: directory to scan (not recursive).
        extensions: tuple of lower-case file extensions, each including the dot.

    Returns:
        A list of paths built with os.path.join(folder, name), in the order
        produced by os.listdir. That order is arbitrary, so any cell that
        indexes into the resulting lists depends on it.

    Raises:
        FileNotFoundError: if `folder` does not exist.
    """
    paths = []
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        # Compare case-insensitively and skip sub-directories that merely
        # happen to be named like an image.
        if name.lower().endswith(extensions) and os.path.isfile(path):
            paths.append(path)
    return paths


german_image_paths = list_image_paths(german_image)
english_image_paths = list_image_paths(english_image)

In [None]:
# Display the collected German image paths as a quick check of the directory listing.
german_image_paths

['/content/german_images/images (3).jpeg',
 '/content/german_images/sddefault.jpg',
 '/content/german_images/18.jpg',
 '/content/german_images/a6ceca7de0b109cbde662a7aebe1e8ba.jpg',
 '/content/german_images/images (5).jpeg',
 '/content/german_images/download (3).png',
 '/content/german_images/istockphoto-842944206-612x612.jpg',
 '/content/german_images/1711551786phpLzvlR0.jpeg',
 '/content/german_images/images (7).jpeg',
 '/content/german_images/images (11).jpeg',
 '/content/german_images/images (9).jpeg',
 '/content/german_images/download (4).png',
 '/content/german_images/download.png',
 '/content/german_images/sddefault (1).jpg',
 '/content/german_images/maxresdefault.jpg',
 '/content/german_images/6syghe68isg01.jpg',
 '/content/german_images/images (6).jpeg',
 '/content/german_images/download (1).png',
 '/content/german_images/259-large.jpg',
 '/content/german_images/cbse-class-10-sample-paper-2023-24-german-img.jpg',
 '/content/german_images/image.png',
 '/content/german_images/im

Optional: if you want to load the images from URLs instead of local files

In [None]:
# urls =  [
#         'https://fiverr-res.cloudinary.com/images/q_auto,f_auto/gigs/207141041/original/a6894a4cd1db996ad97ba1b8347d986975a12a32/proof-read-your-german-text.png',
#         'https://i.pinimg.com/originals/9b/35/ff/9b35ffaeb4de11f88fbf6b2fe860fb12.jpg',
#         'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTfA3pKDpplSaJwgdx6g35VIj7poJCJgJT_nzQuMo3fSKcKmXeUiH9Ri485szayTjI6c0E&usqp=CAU'
#         ]

# import requests
# indx = 0
# for url in urls:
#     indx=indx+1
#     try:
#         response = requests.get(url)
#         response.raise_for_status()
#         print(f"URL number : {indx} is accessible.")
#     except requests.exceptions.RequestException as e:
#         print(f"Error accessing URL number: {indx}")

In [None]:
# Run the OCR pipeline on every German image path. The result is consumed further
# down as one list per image of (text, box) pairs.
prediction_groups = pipeline.recognize(german_image_paths)

In [None]:
# Run the OCR pipeline on every English image path. The result is consumed further
# down as one list per image of (text, box) pairs.
prediction_groups_eng = pipeline.recognize(english_image_paths)

In [None]:
# images = [
#     keras_ocr.tools.read(url) for url in urls
# ]
# predicted_groups = pipeline.recognize(images)

In [None]:
def load_rgb_array(image_path):
    """Read one image file and return its pixels as an RGB numpy array.

    Args:
        image_path: path of the image file to open with PIL.

    Returns:
        A numpy array built from the image after conversion to RGB mode.
    """
    # The context manager releases the file handle as soon as the pixels are copied.
    with Image.open(image_path) as img:
        # Palette, grayscale and RGBA files would otherwise produce arrays with
        # differing channel layouts; converting gives every image three channels.
        return np.array(img.convert("RGB"))


# Arrays are kept in the same order as the path lists so that they line up
# index-by-index with the OCR predictions when both are zipped for plotting.
german_image_arrays = [load_rgb_array(image_path) for image_path in german_image_paths]
english_image_arrays = [load_rgb_array(image_path) for image_path in english_image_paths]

Visualizing the text extracted from the German images

In [None]:
# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without it a
# single image yields a bare Axes object and the zip() below raises a TypeError.
fig, axs = plt.subplots(nrows=len(german_image_arrays), figsize=(10, 8), squeeze=False)
# There is only one column, so flattening gives one Axes per image, top to bottom,
# and keeps `axs` one-dimensional as it was before for the multi-image case.
axs = axs.ravel()
for ax, image, predictions in zip(axs, german_image_arrays, prediction_groups):
    # Draw the recognised words and their boxes for this image onto its own Axes.
    keras_ocr.tools.drawAnnotations(image=image, predictions=predictions, ax=ax)

Visualizing the text extracted from the English images

In [None]:
# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without it a
# single image yields a bare Axes object and the zip() below raises a TypeError.
fig, axs = plt.subplots(nrows=len(english_image_arrays), figsize=(10, 8), squeeze=False)
# There is only one column, so flattening gives one Axes per image, top to bottom,
# and keeps `axs` one-dimensional as it was before for the multi-image case.
axs = axs.ravel()
for ax, image, predictions in zip(axs, english_image_arrays, prediction_groups_eng):
    # Draw the recognised words and their boxes for this image onto its own Axes.
    keras_ocr.tools.drawAnnotations(image=image, predictions=predictions, ax=ax)

Sorting the recognised words by their bounding-box coordinates (top edge first, then left edge) to get a more natural reading order

In [None]:
def extract_paragraphs(predicted_groups):
    """Join the recognised words of each image into one paragraph string.

    Args:
        predicted_groups: iterable with one entry per image. Each entry is an
            iterable of (text, box) pairs, where box is a sequence of points
            indexed as point[0] (x) and point[1] (y).

    Returns:
        A list with one string per image: the words ordered by the smallest y
        of their box and then by the smallest x, joined by single spaces and
        stripped of leading/trailing whitespace. An image without any
        prediction yields an empty string.

    Note:
        Ordering uses the exact top coordinate, so words on the same visual
        line whose boxes start at slightly different heights are ordered by
        that height difference rather than strictly left to right.
    """
    def reading_order(prediction):
        # Sort key: (top edge, left edge) of the word's bounding box.
        _, box = prediction
        top = min(point[1] for point in box)
        left = min(point[0] for point in box)
        return top, left

    paragraphs = []
    for predicted_image in predicted_groups:
        # Build the paragraph from the sorted words. Iterating over the raw
        # predictions instead would discard the ordering computed here.
        ordered_predictions = sorted(predicted_image, key=reading_order)
        paragraph = " ".join(text for text, _ in ordered_predictions)
        paragraphs.append(paragraph.strip())

    return paragraphs


# Build one paragraph string per image for each language from the OCR predictions.
german_paragraphs = extract_paragraphs(prediction_groups)
english_paragraphs = extract_paragraphs(prediction_groups_eng)


In [None]:
# Sanity check: extract_paragraphs appends one paragraph per image, so these counts
# are the number of images processed per language.
print(len(german_paragraphs))
print(len(english_paragraphs))

In [None]:
# Print every extracted German paragraph, one per line, for manual inspection.
for paragraph in german_paragraphs:
  print(paragraph)

In [None]:
# Print every extracted English paragraph, one per line, for manual inspection.
for paragraph in english_paragraphs:
  print(paragraph)

In [None]:
# transformers provides the Marian translation classes and the Seq2SeqTrainer used below.
# No version is pinned, so whichever release is current gets installed.
!pip install transformers

In [None]:
# NOTE(review): sacremoses is presumably wanted by the Marian tokenizer loaded below — confirm.
!pip install sacremoses

Helsinki-NLP/opus-mt-de-en model for German-to-English translation

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load the pretrained German-to-English checkpoint and its matching tokenizer by name.
model_name = "Helsinki-NLP/opus-mt-de-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


Testing the model with test_sentence

In [None]:
# Smoke test of the translation model on a hand-written German sentence.
test_sentence = "Berlin besitzt neben ausgedehnten Waldgebieten im Westen und Südosten des Stadtgebietes viele große Parkanlagen."
# Encode to PyTorch tensors ("pt"); truncation=True lets the tokenizer cut over-long input.
inputs = tokenizer.encode(test_sentence, return_tensors="pt", truncation=True)
outputs = model.generate(inputs)
# Decode the first generated sequence and drop special tokens from the text.
english_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(english_translation)

Testing on the extracted German text

In [None]:
# Translate the first extracted German paragraph (index 0 raises IndexError if no image was processed).
# NOTE(review): truncation=True presumably cuts a long paragraph at the model's maximum
# input length, in which case only its beginning is translated — confirm.
inputs = tokenizer.encode(german_paragraphs[0], return_tensors="pt", truncation=True)
outputs = model.generate(inputs)
# Decode the first generated sequence and drop special tokens from the text.
english_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(english_translation)

Corresponding English text extracted using keras_ocr

In [None]:
# NOTE(review): index 8 is hard-coded. It is presumably the English image that matches
# german_paragraphs[0], but list positions follow the arbitrary order of os.listdir — confirm.
print(english_paragraphs[8])

In [None]:
import pandas as pd
import numpy as np

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq


In [None]:
# Step 1: Load the Dataset
# "your_dataset.json" is a placeholder: point it at a JSON file whose records have the
# "source" and "target" fields read by the tokenization step.
dataset = load_dataset("json", data_files="your_dataset.json")

# Splitting dataset into train and validation sets (80% / 20%).
# A fixed seed keeps the split identical across kernel restarts, so validation
# metrics from different runs are computed on the same examples.
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]


In [None]:
# Step 2: Load Pre-Trained Tokenizer
# Using the Helsinki-NLP pre-trained German-English translation model (for example)
# Note: this rebinds `model_name` and `tokenizer`, replacing the objects created in
# the inference cells earlier in the notebook.
model_name = "Helsinki-NLP/opus-mt-de-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the Dataset
def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: batch mapping that provides a "source" column (model input)
            and a "target" column (passed to the tokenizer as text_target).

    Returns:
        Whatever the module-level `tokenizer` returns for that batch, with
        truncation enabled.
    """
    return tokenizer(examples["source"], text_target=examples["target"], truncation=True)

# batched=True hands preprocess_function several examples per call instead of one.
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)



In [None]:
# Step 3: Load the Pre-Trained Model
# Note: this rebinds `model`, replacing the MarianMTModel instance created earlier.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
# Step 4: Define the Data Collator
# NOTE(review): presumably pads inputs and labels per batch at training time — confirm
# against the Transformers documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [None]:
# Step 5: Define Training Arguments
import inspect

import torch

# Transformers renamed `evaluation_strategy` to `eval_strategy`, and the package is
# installed without a version pin. Use whichever keyword the installed release
# accepts so the cell runs on both older and newer versions.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./translation_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    num_train_epochs=3,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device. Enabling it unconditionally makes the
    # cell fail on a CPU-only runtime, so switch it on only when a GPU is present.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    # Evaluate on the validation set at the end of every epoch.
    **{_eval_strategy_kwarg: "epoch"},
)


In [None]:
# Step 6: Initialize the Trainer
# Wires together the model, the training arguments, both tokenized splits and the collator.
# NOTE(review): newer Transformers releases reportedly deprecate the `tokenizer=` keyword
# in favour of `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [None]:
# Step 7: Train the Model
# Runs training with the trainer configured above: tokenized_train_dataset for
# training, tokenized_val_dataset for evaluation, settings from training_args.
trainer.train()


In [None]:
# Step 8: Save the Fine-Tuned Model
# Model and tokenizer are written to the same directory so both can be found together.
trainer.save_model("./fine_tuned_translation_model")
tokenizer.save_pretrained("./fine_tuned_translation_model")


In [None]:
print("Model fine-tuning complete and saved at './fine_tuned_translation_model'")