<H4># GPU Config for local workstation<H4>

In [2]:
import tensorflow as tf### models
import numpy as np### math computations
print('Tensorflow => ',tf.__version__)
print('Keras      => ',tf.keras.__version__)
print('Numpy      => ',np.__version__)
gpus = tf.config.list_physical_devices('GPU')
gpus


Tensorflow =>  2.10.0
Keras      =>  2.10.0
Numpy      =>  1.25.2


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
tf_device='/gpu:1'

<H4># Yolo Pre-processing to detect bounding boxes for handwritten texts from dataset<H4>

In [3]:
!pip install ultralytics




[notice] A new release of pip is available: 24.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import os
from ultralytics import YOLO
import cv2
from tqdm.notebook import tqdm as notebook_tqdm

In [5]:
# --- CONFIGURATION ---
MODEL_PATH = 'best_yolov8s.pt' # Use a trained handwritten region detection model if available or use the default one given by Challenge organizers
IMAGE_DIR = 'D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750'
RESULT_DIR = 'results6'
LABEL_DIR = os.path.join(RESULT_DIR, 'labels')
CONFIDENCE_THRESHOLD = 0.25

In [6]:
# --- PREPARE DIRECTORIES ---
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(LABEL_DIR, exist_ok=True)

In [7]:
# --- LOAD MODEL ---
model = YOLO(MODEL_PATH)

In [93]:
# --- PROCESS IMAGES ---
for filename in notebook_tqdm(os.listdir(IMAGE_DIR), desc="Processing images"):
    if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        image_path = os.path.join(IMAGE_DIR, filename)
        print(f"Processing: {image_path}")
        # Run inference
        results = model(image_path, conf=CONFIDENCE_THRESHOLD)
        for result in results:
            # Sort bounding boxes: top-to-bottom, then left-to-right
            # Each box: box.xyxy[0] = [x1, y1, x2, y2]
            boxes_with_idx = [
                (idx, box, int(box.xyxy[0][0]), int(box.xyxy[0][1]))
                for idx, box in enumerate(result.boxes)
            ]
            # Sort by y1 (top), then x1 (left)
            boxes_with_idx_sorted = sorted(boxes_with_idx, key=lambda x: (x[3], x[2]))
            # Save annotated image
            annotated_img = result.plot()
            output_img_path = os.path.join(RESULT_DIR, filename)
            cv2.imwrite(output_img_path, annotated_img)
            # Save labels with bbox index (sorted)
            label_path = os.path.join(LABEL_DIR, filename.rsplit('.', 1)[0] + '.txt')
            with open(label_path, 'w') as f:
                for new_idx, (orig_idx, box, _, _) in enumerate(boxes_with_idx_sorted):
                    cls_id = int(box.cls[0])
                    conf = float(box.conf[0])
                    x1, y1, x2, y2 = map(int, box.xyxy[0]) # Bounding box
                    # YOLO format: cls_id conf x1 y1 x2 y2
                    # Add index and Write custom YOLO format: index cls_id conf x1 y1 x2 y2
                    f.write(f'{new_idx+1} {cls_id} {conf:.2f} {x1} {y1} {x2} {y2}\n')
            print(f"Saved image to: {output_img_path}")
            print(f"Saved labels to: {label_path}")

Processing images:   0%|          | 0/1500 [00:00<?, ?it/s]

Processing: D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750\MIT_1.jpg

image 1/1 D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750\MIT_1.jpg: 640x480 21 handwrittens, 154.1ms
Speed: 2.0ms preprocess, 154.1ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 480)
Saved image to: results6\MIT_1.jpg
Saved labels to: results6\labels\MIT_1.txt
Processing: D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750\MIT_10.jpg

image 1/1 D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750\MIT_10.jpg: 640x480 24 handwrittens, 100.5ms
Speed: 2.2ms preprocess, 100.5ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 480)
Saved image to: results6\MIT_10.jpg
Saved labels to: results6\labels\MIT_10.txt
Processing: D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750\MIT_100.jpg

image 1/1 D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750\MIT_100.jpg: 640x480 20 handwrittens, 101.3ms
Speed: 2.9ms preprocess, 

<H4># DATA PREPROCESSING<H4>


In [None]:
# uncomment if running in Colab
'''
LABELS_DIR = '/content/drive/MyDrive/DeHaDo-AI/results/labels'
IMAGES_DIR = '/content/drive/MyDrive/DeHaDo-AI/DEHADO-AI_TRAINING_DATASET/IMAGES_750'
OCR_TXT_DIR = '/content/drive/MyDrive/DeHaDo-AI/DEHADO-AI_TRAINING_DATASET/LABELS_750'
OUTPUT_DIR = '/content/drive/MyDrive/DeHaDo-AI/results/output'
'''

In [8]:
# Uncomment if running in a local environment
LABELS_DIR = os.path.join(RESULT_DIR, 'labels')
OUTPUT_DIR = os.path.join(RESULT_DIR, 'output')
SKIPPED_FILES_LIST = os.path.join(RESULT_DIR, 'output\skipped_files.txt')
IMAGES_DIR = r'D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750'
OCR_TXT_DIR = r'D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\LABELS_750'

print('LABELS_DIR => ',LABELS_DIR)
print('OUTPUT_DIR => ',OUTPUT_DIR)
print('SKIPPED_FILES_LIST => ',SKIPPED_FILES_LIST)

LABELS_DIR =>  results6\labels
OUTPUT_DIR =>  results6\output
SKIPPED_FILES_LIST =>  results6\output\skipped_files.txt


In [9]:

os.makedirs(LABELS_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
if not os.path.exists(SKIPPED_FILES_LIST):
    with open(SKIPPED_FILES_LIST, 'w'   ) as f:
        f.write('skipped_files.txt\n')

In [10]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import cv2
import json
from tqdm.notebook import tqdm as notebook_tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
def preprocess_img(img_path, bbox, img_height, img_width):
    """
    Crop image to bbox, resize and normalize for OCR model.
    """
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # Ensure bbox coordinates are within image bounds
    h, w = img.shape[:2]
    x1, y1, x2, y2 = map(int, bbox)
    x1, y1, x2, y2 = max(0, x1), max(0, y1), min(w, x2), min(h, y2)
    # Handle invalid bounding boxes
    if x1 >= x2 or y1 >= y2:
        return None   # return None to indicate a skipped crop

    cropped = img[y1:y2, x1:x2]
    resized = cv2.resize(cropped, (img_width, img_height))
    normed = resized.astype(np.float32) / 255.0
    normed = np.expand_dims(normed, axis=-1)
    return normed

def read_label_file(label_path):
    """
    Read YOLO label file and return list of bounding boxes.
    """
    bboxes = []
    if not os.path.exists(label_path):
        return bboxes # Return empty list if file doesn't exist
    with open(label_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            # custom YOLO format: index cls_id conf x1 y1 x2 y2
            if len(parts) >= 7:
                try:
                    # parts[0] is the index 
                    # parts[1] is the class id 
                    # parts[2] is the confidence score                    
                    # parts 3-6 are the int bounding box coordinates
                    idx = int(parts[0])                    
                    x1, y1, x2, y2 = map(int, parts[3:7])
                    bboxes.append((idx, (x1, y1, x2, y2)))
                except ValueError:
                    continue # Skip this line if conversion fails
    return bboxes

def read_gt_text(txt_path):
    """
    Read ground truth JSON file and return list of text values.
    """
    result = []
    if not os.path.exists(txt_path):
        return result # Return empty list if file doesn't exist
    try:
        with open(txt_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            # Assuming `data` is a list where each element is an object containing 'Field value'            
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict) and 'Field value' in item:
                        value = item['Field value']
                        if isinstance(value, str):
                            result.append(value.strip())
                        elif isinstance(value, list) and value: # Handle lists of strings
                            result.append(' '.join(map(str, value)).strip()) # Join elements if it's a list
                        else:
                            result.append('') # Append empty string for unexpected formats
                    else:
                        print(f"Warning: Skipping unexpected item format in JSON file: {item}")                        
                        pass
    except json.JSONDecodeError:
        print(f"Error decoding JSON from {txt_path}")
    except Exception as e:
        print(f"Error reading ground truth text from {txt_path}: {e}")

    return result # Return a list of strings


In [12]:
# BUILD ALPHABET FROM DATASET LABELS
all_chars = set()
for img_file in os.listdir(IMAGES_DIR):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    base_filename = os.path.splitext(img_file)[0]
    gt_txt_path = os.path.join(OCR_TXT_DIR, base_filename + '.json')
    gt_texts = read_gt_text(gt_txt_path)
    for text in gt_texts:
        all_chars.update(text)
alphabet = ''.join(sorted(list(all_chars)))
num_chars = len(alphabet) + 1  # +1 for CTC blank

print(f"Alphabet: {alphabet}")
print(f"Number of characters: {num_chars}")

Alphabet:  &'()+,-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz–’
Number of characters: 76


<H4># TRAINING CONFIG<H4>

In [13]:
# MODEL_TRAIN_FROM_SCRATCH = True # Uncomment if you want to train the model
MODEL_TRAIN_FROM_SCRATCH = False # Uncomment if you have a trained model
TRAINED_MODEL_FILE_NAME = 'ocr_model_trained3.h5'
PRED_MODEL_FILE_NAME = 'ocr_model_pred3.h5'

In [14]:
BATCH_SIZE = 32
EPOCH_SIZE = 1000
IMG_HEIGHT = 128
IMG_WIDTH = 512

<H4># 1. Load dataset<H4>

In [15]:
# READ LABELS AND IMAGES
def load_data(images_dir, labels_dir, ocr_txt_dir, IMG_HEIGHT, IMG_WIDTH):
    """
    Load images and labels from the specified directories.
    """
    X = []
    y = []
    print(f"Loading data from:")
    print(f"  Images: {images_dir}")
    print(f"  Labels: {labels_dir}")
    print(f"  OCR Text: {ocr_txt_dir}")

    processed_count = 0
    skipped_count = 0
    skipped_files = []
    for img_file in os.listdir(images_dir):
        if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        img_path = os.path.join(images_dir, img_file)
        base_filename = os.path.splitext(img_file)[0]
        label_path = os.path.join(labels_dir, base_filename + '.txt')
        gt_txt_path = os.path.join(ocr_txt_dir, base_filename + '.json')

        if not os.path.exists(label_path):
            print(f"  Skipping {img_file}: Label file not found at {label_path}")
            skipped_files.append(img_file)
            skipped_count += 1
            continue
        if not os.path.exists(gt_txt_path):
            print(f"  Skipping {img_file}: Ground truth text file not found at {gt_txt_path}")
            skipped_files.append(img_file)
            skipped_count += 1
            continue

        bboxes = read_label_file(label_path)
        gt_texts = read_gt_text(gt_txt_path)

        if len(bboxes) != len(gt_texts):
            print(f"  Skipping {img_file}: Mismatch between number of bounding boxes ({len(bboxes)}) and ground truth texts ({len(gt_texts)})")
            skipped_files.append(img_file)
            skipped_count += 1
            continue
        if not bboxes:
            print(f"  Skipping {img_file}: No bounding boxes found in label file")
            skipped_files.append(img_file)
            skipped_count += 1
            continue

        for (idx, bbox), text in zip(bboxes, gt_texts):
            try:
                crop = preprocess_img(img_path, bbox, IMG_HEIGHT, IMG_WIDTH)
                if crop is not None: # Only append if crop was successful
                    X.append((idx, crop))  # Store as (idx, crop)
                    y.append(text)
                    processed_count += 1
                else:
                    print(f"  Skipping crop for {img_file} with bbox {bbox}: Invalid bbox")
                    skipped_files.append(img_file)
                    skipped_count += 1
            except Exception as e:
                print(f"  Error processing {img_file} with bbox {bbox}: {e}")
                skipped_files.append(img_file)
                skipped_count += 1
                continue
    
    print(f"Finished loading data.")
    print(f"Total files processed: {processed_count + skipped_count}")
    print(f"Successfully loaded {processed_count} samples.")    
    if skipped_count > 0:
        print(f"Skipped {skipped_count} files/samples.")
        #print(f"Skipped files: {skipped_files}")
        print(f"Skipped files have been logged to {SKIPPED_FILES_LIST}")
        # Save skipped files to a text file
        with open(SKIPPED_FILES_LIST, 'w') as f:
            for file in skipped_files:
                f.write(f"{file}\n")

    return np.array(X, dtype=object), np.array(y)

In [16]:
# Load data (images and labels)
X, y = load_data(IMAGES_DIR, LABELS_DIR, OCR_TXT_DIR, IMG_HEIGHT, IMG_WIDTH)

Loading data from:
  Images: D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750
  Labels: results6\labels
  OCR Text: D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\LABELS_750
  Skipping MIT_1.jpg: Mismatch between number of bounding boxes (21) and ground truth texts (20)
  Skipping MIT_10.jpg: Mismatch between number of bounding boxes (24) and ground truth texts (20)
  Skipping MIT_102.jpg: Mismatch between number of bounding boxes (28) and ground truth texts (20)
  Skipping MIT_103.jpg: Mismatch between number of bounding boxes (22) and ground truth texts (20)
  Skipping MIT_104.jpg: Mismatch between number of bounding boxes (23) and ground truth texts (20)
  Skipping MIT_105.jpg: Mismatch between number of bounding boxes (25) and ground truth texts (20)
  Skipping MIT_106.jpg: Mismatch between number of bounding boxes (22) and ground truth texts (20)
  Skipping MIT_108.jpg: Mismatch between number of bounding boxes (21) and ground truth texts (20)
  Skipping MIT_1

<H4># 2. Build alphabet and encode labels<H4>


In [17]:
# BUILD ALPHABET FROM OCR LABELS excluding skipped images
if len(y) > 0:
    all_chars = set()
    for text in y:
        all_chars.update(text)
    alphabet = ''.join(sorted(list(all_chars)))
    num_chars = len(alphabet) + 1  # +1 for CTC blank
else:
    print("No data loaded, cannot build alphabet or proceed with training.")
    alphabet = ''
    num_chars = 1

print(f"Alphabet: {alphabet}")
print(f"Number of characters: {num_chars}")

Alphabet:  +,-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz’
Number of characters: 71


<H4># 3. Split train/test<H4>

In [18]:
TEST_SPLIT=0.3
RANDOM_STATE=45

In [19]:
if len(X) == 0:
    print("Error: No data loaded. Cannot perform train_test_split.")
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_STATE, shuffle=True)

    # Encode labels
    def encode_labels(texts, alphabet):
        char_to_idx = {c: i for i, c in enumerate(alphabet)}
        blank_index = len(alphabet)
        encoded = []
        for t in texts:
            encoded.append([char_to_idx.get(c, blank_index) for c in t])
        return pad_sequences(encoded, padding='post', value=blank_index, dtype='float32')
    y_train_encoded = encode_labels(y_train, alphabet)
    y_test_encoded = encode_labels(y_test, alphabet)

    # Prepare input/label lengths for CTC
    output_timesteps = IMG_HEIGHT // 4
    train_label_length = np.array([[len(t)] for t in y_train])
    test_label_length = np.array([[len(t)] for t in y_test])
    train_input_length = np.ones((len(X_train), 1)) * output_timesteps
    test_input_length = np.ones((len(X_test), 1)) * output_timesteps

    # Filter samples where label length > output_timesteps or label is empty
    def filter_by_label_length(X, y, y_encoded, label_length, max_len):
        indices = [i for i, l in enumerate(label_length) if 0 < l[0] <= max_len]
        crops = []
        for i in indices:
            crop = X[i][1]
            crop = np.asarray(crop)
            if crop.ndim == 2:
                crop = np.expand_dims(crop, axis=-1)
            crops.append(crop)
        X_f = np.stack(crops, axis=0)
        y_f = y[indices]
        y_encoded_f = y_encoded[indices]
        label_length_f = label_length[indices]
        return X_f, y_f, y_encoded_f, label_length_f

    X_train_filtered, y_train_filtered, y_train_encoded_filtered, train_label_length_filtered = filter_by_label_length(
        X_train, y_train, y_train_encoded, train_label_length, output_timesteps
    )
    X_test_filtered, y_test_filtered, y_test_encoded_filtered, test_label_length_filtered = filter_by_label_length(
        X_test, y_test, y_test_encoded, test_label_length, output_timesteps
    )

    X_train_filtered = X_train_filtered.astype(np.float32)
    X_test_filtered = X_test_filtered.astype(np.float32)
    train_input_length_filtered = np.ones((len(X_train_filtered), 1)) * output_timesteps
    test_input_length_filtered = np.ones((len(X_test_filtered), 1)) * output_timesteps

<H4># 4. Create model<H4>

In [20]:

if len(X) == 0:
    print("Error: No data loaded. Cannot perform train_test_split.")
else:
    def create_ocr_model_with_ctc(IMG_HEIGHT, IMG_WIDTH, num_chars):
        input_img = layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 1), name='image')
        x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
        x = layers.MaxPooling2D((2, 2))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
        x = layers.MaxPooling2D((2, 2))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
        x = layers.MaxPooling2D((2, 1), strides=(2, 1))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Reshape(target_shape=(IMG_HEIGHT // 4, -1))(x)
        x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
        x = layers.BatchNormalization()(x)
        output = layers.Dense(num_chars, activation='softmax')(x)

        input_label = layers.Input(name='the_labels', shape=[None], dtype='float32')
        input_len = layers.Input(name='input_length', shape=[None], dtype='int64')
        label_len = layers.Input(name='label_length', shape=[None], dtype='int64')

        class CTCLayer(layers.Layer):
            def __init__(self, name=None):
                super().__init__(name=name)
                self.loss_fn = tf.keras.backend.ctc_batch_cost
            def call(self, y_true, y_pred, input_length, label_length):
                y_true_int = tf.cast(y_true, dtype=tf.int32)
                input_length_int = tf.cast(input_length, dtype=tf.int32)
                label_length_int = tf.cast(label_length, dtype=tf.int32)
                loss = self.loss_fn(y_true=y_true_int, y_pred=y_pred, input_length=input_length_int, label_length=label_length_int)
                self.add_loss(tf.reduce_mean(loss))
                return y_pred

        ctc_layer = CTCLayer(name='ctc_loss')(input_label, output, input_len, label_len)
        model = models.Model(inputs=[input_img, input_label, input_len, label_len], outputs=ctc_layer)
        model.compile(optimizer='adam', loss=None)
        prediction_model = models.Model(inputs=input_img, outputs=output)
        return model, prediction_model

    ocr_model_train, ocr_model_predict = create_ocr_model_with_ctc(IMG_HEIGHT, IMG_WIDTH, num_chars)
    ocr_model_train.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 image (InputLayer)             [(None, 128, 512, 1  0           []                               
                                )]                                                                
                                                                                                  
 conv2d (Conv2D)                (None, 128, 512, 32  320         ['image[0][0]']                  
                                )                                                                 
                                                                                                  
 max_pooling2d (MaxPooling2D)   (None, 64, 256, 32)  0           ['conv2d[0][0]']                 
                                                                                              

<H4># 5. Train and save model<H4>

In [114]:
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "OCR_Model-{epoch:04d}.h5",
    save_freq=100 * (len(X_train_filtered) // BATCH_SIZE),  # Save every 100 epochs
    save_weights_only=False,
    verbose=1
)

if len(X) == 0:
    print("Error: No data loaded. Cannot perform train_test_split.")
else:
    if len(X_train_filtered) == 0:
        print("No training data left after filtering. Skipping training.")
    else:
        if(MODEL_TRAIN_FROM_SCRATCH==True):
            history = ocr_model_train.fit(
                x=[X_train_filtered, y_train_encoded_filtered, train_input_length_filtered, train_label_length_filtered],
                batch_size=BATCH_SIZE,
                epochs=EPOCH_SIZE,
                validation_data=([X_test_filtered, y_test_encoded_filtered, test_input_length_filtered, test_label_length_filtered], None),
                verbose=1,
                callbacks=[checkpoint_cb],
                shuffle=True
            )
            # Save the model            
            ocr_model_train.save(TRAINED_MODEL_FILE_NAME)
            ocr_model_train.save_weights(TRAINED_MODEL_FILE_NAME)  # Save the model weights
            print("Model saved to :" + TRAINED_MODEL_FILE_NAME)
            # Load the prediction model separately            
            ocr_model_predict.load_weights(TRAINED_MODEL_FILE_NAME)
            print("Model loaded from :" + TRAINED_MODEL_FILE_NAME) 
        else:
            ocr_model_predict.load_weights(PRED_MODEL_FILE_NAME) 
            print("Model loaded from :" + PRED_MODEL_FILE_NAME)  


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [None]:
ocr_model_train.save(TRAINED_MODEL_FILE_NAME)
print("Model saved to :" + TRAINED_MODEL_FILE_NAME)           

In [21]:
ocr_model_train.load_weights(TRAINED_MODEL_FILE_NAME)
print("Model loaded from :" + TRAINED_MODEL_FILE_NAME)  

Model loaded from :ocr_model_trained3.h5


In [22]:
ocr_model_predict.load_weights(TRAINED_MODEL_FILE_NAME) 
print("Model loaded from :" + TRAINED_MODEL_FILE_NAME)  

Model loaded from :ocr_model_trained3.h5


In [None]:
ocr_model_predict.load_weights(PRED_MODEL_FILE_NAME) 
print("Model loaded from :" + PRED_MODEL_FILE_NAME)  

Model loaded from :ocr_model_pred2.h5


<H4># Inference 1 for full dataset (excluding skipped images)<H4>

In [23]:

# --- CTC DECODE ---
def decode_prediction(pred, alphabet):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = tf.keras.backend.ctc_decode(pred, input_length=tf.cast(input_len, dtype=tf.int32), greedy=True)[0][0].numpy()
    texts = []
    for res in results:
        text = ''.join([alphabet[i] for i in res if i >= 0 and i < len(alphabet)])
        texts.append(text)
    return texts

# --- OCR INFERENCE AND OUTPUT ---
os.makedirs(OUTPUT_DIR, exist_ok=True)
ocr_results = []
gt_results = []

for img_file in notebook_tqdm(os.listdir(IMAGES_DIR), desc="OCR on images"):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    img_path = os.path.join(IMAGES_DIR, img_file)
    base_filename = os.path.splitext(img_file)[0]
    label_path = os.path.join(LABELS_DIR, base_filename + '.txt')
    gt_txt_path = os.path.join(OCR_TXT_DIR, base_filename + '.json')
    output_txt_path = os.path.join(OUTPUT_DIR, base_filename + '.txt')

    bboxes = read_label_file(label_path)
    gt_texts = read_gt_text(gt_txt_path)
    if len(bboxes) != len(gt_texts) or not bboxes:
        continue

    img_crops = []
    valid_gt_texts = []
    for (_, bbox), gt_text in zip(bboxes, gt_texts):
        crop = preprocess_img(img_path, bbox, IMG_HEIGHT, IMG_WIDTH)
        if crop is not None:
            img_crops.append(crop)
            valid_gt_texts.append(gt_text)
    if not img_crops:
        continue

    img_crops = np.array(img_crops)
    preds = ocr_model_predict.predict(img_crops)
    pred_texts = decode_prediction(preds, alphabet)

    with open(output_txt_path, 'w', encoding='utf-8') as f:
        for text in pred_texts:
            f.write(text + '\n')

    ocr_results.extend(pred_texts)
    gt_results.extend(valid_gt_texts)

OCR on images:   0%|          | 0/1500 [00:00<?, ?it/s]



<H4># Metrics 1<H4>

In [24]:
# --- METRICS ---

min_len = min(len(ocr_results), len(gt_results))
ocr_results = ocr_results[:min_len]
gt_results = gt_results[:min_len]

if ocr_results and gt_results:
    line_acc = accuracy_score(gt_results, ocr_results)
    def char_accuracy(y_true, y_pred):
        correct_chars = 0
        total_chars = 0
        for t, p in zip(y_true, y_pred):
            correct_chars += sum(a == b for a, b in zip(t, p))
            total_chars += max(len(t), len(p))
        return correct_chars / total_chars if total_chars > 0 else 1.0
    char_acc = char_accuracy(gt_results, ocr_results)
    print(f"OCR Line Accuracy: {line_acc:.4f}")
    print(f"OCR Character Accuracy: {char_acc:.4f}")
else:
    print("No OCR results to compare.")

OCR Line Accuracy: 0.6273
OCR Character Accuracy: 0.5574


<H4># Inference 2 for full dataset (excluding skipped images)<H4>

In [25]:
import time
import psutil
import numpy as np

# --- CTC DECODE ---
def decode_prediction(pred, alphabet):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = tf.keras.backend.ctc_decode(pred, input_length=tf.cast(input_len, dtype=tf.int32), greedy=True)[0][0].numpy()
    texts = []
    for res in results:
        text = ''.join([alphabet[i] for i in res if i >= 0 and i < len(alphabet)])
        texts.append(text)
    return texts

# --- OCR INFERENCE AND OUTPUT ---
os.makedirs(OUTPUT_DIR, exist_ok=True)
ocr_results = []
gt_results = []

# Efficiency tracking variables
process = psutil.Process()
mem_usage_samples = []
start_time = None

for idx, img_file in enumerate(notebook_tqdm(os.listdir(IMAGES_DIR), desc="OCR on images")):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    img_path = os.path.join(IMAGES_DIR, img_file)
    base_filename = os.path.splitext(img_file)[0]
    label_path = os.path.join(LABELS_DIR, base_filename + '.txt')
    gt_txt_path = os.path.join(OCR_TXT_DIR, base_filename + '.json')
    output_txt_path = os.path.join(OUTPUT_DIR, base_filename + '.txt')

    bboxes = read_label_file(label_path)
    gt_texts = read_gt_text(gt_txt_path)
    if len(bboxes) != len(gt_texts) or not bboxes:
        continue

    # Start timing and memory tracking before prediction for each image
    start_time = time.time()
    mem_usage_samples = []

    img_crops = []
    valid_gt_texts = []
    for (_, bbox), gt_text in zip(bboxes, gt_texts):
        crop = preprocess_img(img_path, bbox, IMG_HEIGHT, IMG_WIDTH)
        if crop is not None:
            img_crops.append(crop)
            valid_gt_texts.append(gt_text)
    if not img_crops:
        continue

    img_crops = np.array(img_crops)
    preds = ocr_model_predict.predict(img_crops)
    pred_texts = decode_prediction(preds, alphabet)

    with open(output_txt_path, 'w', encoding='utf-8') as f:
        for text in pred_texts:
            f.write(text + '\n')

    ocr_results.extend(pred_texts)
    gt_results.extend(valid_gt_texts)

    # Update memory usage after each prediction
    mem_usage_samples.append(process.memory_info().rss / (1024 * 1024))  # MB

    # Efficiency metrics after each image
    elapsed_time = time.time() - start_time if start_time else 0.0

num_preds = len(ocr_results)
avg_time_per_doc = (elapsed_time / num_preds) if num_preds > 0 else 1.0
avg_mem_mb = np.mean(mem_usage_samples) if mem_usage_samples else 1.0

# Compute efficiency: 1 / ((avg processing time per doc in sec) * (avg memory usage in MB))
compute_efficiency = 1 / (avg_time_per_doc * avg_mem_mb) if avg_mem_mb > 0 else 0.0

print(f"[{idx+1}] Elapsed: {elapsed_time:.2f}s | Avg mem: {avg_mem_mb:.2f}MB | Avg time/doc: {avg_time_per_doc:.4f}s | Efficiency: {compute_efficiency:.6f}")

OCR on images:   0%|          | 0/1500 [00:00<?, ?it/s]

[1500] Elapsed: 0.31s | Avg mem: 7314.00MB | Avg time/doc: 0.0001s | Efficiency: 2.124173


<H4># Metrics 2<H4>

In [26]:
import time
import psutil
import numpy as np

# --- METRICS & EFFICIENCY ---
def levenshtein_distance(s1, s2):
    if isinstance(s1, str):
        s1 = list(s1)
    if isinstance(s2, str):
        s2 = list(s2)
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

def word_error_rate(y_true, y_pred):
    total_words = 0
    total_errors = 0
    for t, p in zip(y_true, y_pred):
        t_words = t.split()
        p_words = p.split()
        total_words += len(t_words)
        total_errors += levenshtein_distance(t_words, p_words)
    return total_errors / total_words if total_words > 0 else 0.0

def char_error_rate(y_true, y_pred):
    total_chars = 0
    total_errors = 0
    for t, p in zip(y_true, y_pred):
        total_chars += len(t)
        total_errors += levenshtein_distance(t, p)
    return total_errors / total_chars if total_chars > 0 else 0.0

def field_accuracy(y_true, y_pred):
    from sklearn.metrics import accuracy_score
    return accuracy_score(y_true, y_pred)

def document_level_accuracy(y_true, y_pred, doc_ids):
    from collections import defaultdict
    doc_true = defaultdict(list)
    doc_pred = defaultdict(list)
    for doc_id, t, p in zip(doc_ids, y_true, y_pred):
        doc_true[doc_id].append(t)
        doc_pred[doc_id].append(p)
    correct_docs = 0
    for doc_id in doc_true:
        if doc_true[doc_id] == doc_pred[doc_id]:
            correct_docs += 1
    return correct_docs / len(doc_true) if doc_true else 0.0

min_len = min(len(ocr_results), len(gt_results))
ocr_results = ocr_results[:min_len]
gt_results = gt_results[:min_len]

# Build doc_ids as the image file name for each field
# You must collect doc_ids in the same order as ocr_results/gt_results
doc_ids = []
for img_file in os.listdir(IMAGES_DIR):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    base_filename = os.path.splitext(img_file)[0]
    label_path = os.path.join(LABELS_DIR, base_filename + '.txt')
    gt_txt_path = os.path.join(OCR_TXT_DIR, base_filename + '.json')
    if not os.path.exists(label_path) or not os.path.exists(gt_txt_path):
        continue
    bboxes = read_label_file(label_path)
    gt_texts_file = read_gt_text(gt_txt_path)
    if len(bboxes) != len(gt_texts_file) or not bboxes:
        continue
    for _ in range(len(bboxes)):
        doc_ids.append(base_filename)

# Truncate doc_ids to match ocr_results length
doc_ids = doc_ids[:min_len]

if not ocr_results or not gt_results:
    print("\nNo samples available to compute metrics.")
else:
    # Metrics
    wer = word_error_rate(gt_results, ocr_results)
    cer = char_error_rate(gt_results, ocr_results)
    field_acc = field_accuracy(gt_results, ocr_results)
    doc_acc = document_level_accuracy(gt_results, ocr_results, doc_ids)    
    
    # Final Score
    final_score = (
        0.35 * (100 - wer * 100) +
        0.35 * (100 - cer * 100) +
        0.15 * field_acc +
        0.15 * doc_acc
    )

    print(f"\nOCR Model Metrics:")
    print(f"Word Error Rate (WER): {wer:.4f}")
    print(f"Character Error Rate (CER): {cer:.4f}")
    print(f"Field Accuracy: {field_acc:.4f}")
    print(f"Document Level Accuracy: {doc_acc:.4f}")
    print(f"Final Score: {final_score:.4f}")
    print(f"Compute Efficiency: {compute_efficiency:.6f} (1/(sec*MB))")


OCR Model Metrics:
Word Error Rate (WER): 0.4804
Character Error Rate (CER): 0.3662
Field Accuracy: 0.6273
Document Level Accuracy: 0.0000
Final Score: 40.4633
Compute Efficiency: 2.124173 (1/(sec*MB))


<H4># Inference 3 for full dataset (including skipped images)<H4>

In [None]:
import time
import psutil
import numpy as np

# --- CTC DECODE ---
def decode_prediction(pred, alphabet):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    results = tf.keras.backend.ctc_decode(pred, input_length=tf.cast(input_len, dtype=tf.int32), greedy=True)[0][0].numpy()
    texts = []
    for res in results:
        text = ''.join([alphabet[i] for i in res if i >= 0 and i < len(alphabet)])
        texts.append(text)
    return texts

# --- OCR INFERENCE AND OUTPUT ---
os.makedirs(OUTPUT_DIR, exist_ok=True)
ocr_results = []
gt_results = []

# Efficiency tracking variables
process = psutil.Process()
mem_usage_samples = []
start_time = None

for idx, img_file in enumerate(notebook_tqdm(os.listdir(IMAGES_DIR), desc="OCR on images")):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    img_path = os.path.join(IMAGES_DIR, img_file)
    base_filename = os.path.splitext(img_file)[0]
    label_path = os.path.join(LABELS_DIR, base_filename + '.txt')
    gt_txt_path = os.path.join(OCR_TXT_DIR, base_filename + '.json')
    output_txt_path = os.path.join(OUTPUT_DIR, base_filename + '.txt')

    bboxes = read_label_file(label_path)
    gt_texts = read_gt_text(gt_txt_path)
    if len(bboxes) != len(gt_texts) or not bboxes:
        print(f"Processing {img_file}: Mismatch between number of bounding boxes and ground truth texts")        

    # Start timing and memory tracking before prediction for each image
    start_time = time.time()
    mem_usage_samples = []

    img_crops = []
    valid_gt_texts = []
    for (_, bbox), gt_text in zip(bboxes, gt_texts):
        crop = preprocess_img(img_path, bbox, IMG_HEIGHT, IMG_WIDTH)
        if crop is not None:
            img_crops.append(crop)
            valid_gt_texts.append(gt_text)
    if not img_crops:
        continue

    img_crops = np.array(img_crops)
    preds = ocr_model_predict.predict(img_crops)
    pred_texts = decode_prediction(preds, alphabet)

    with open(output_txt_path, 'w', encoding='utf-8') as f:
        for text in pred_texts:
            f.write(text + '\n')

    ocr_results.extend(pred_texts)
    gt_results.extend(valid_gt_texts)

    # Update memory usage after each prediction
    mem_usage_samples.append(process.memory_info().rss / (1024 * 1024))  # MB

    # Efficiency metrics after each image
    elapsed_time = time.time() - start_time if start_time else 0.0

num_preds = len(ocr_results)
avg_time_per_doc = (elapsed_time / num_preds) if num_preds > 0 else 1.0
avg_mem_mb = np.mean(mem_usage_samples) if mem_usage_samples else 1.0

# Compute efficiency: 1 / ((avg processing time per doc in sec) * (avg memory usage in MB))
compute_efficiency = 1 / (avg_time_per_doc * avg_mem_mb) if avg_mem_mb > 0 else 0.0

print(f"[{idx+1}] Elapsed: {elapsed_time:.2f}s | Avg mem: {avg_mem_mb:.2f}MB | Avg time/doc: {avg_time_per_doc:.4f}s | Efficiency: {compute_efficiency:.6f}")

OCR on images:   0%|          | 0/1500 [00:00<?, ?it/s]

Processing MIT_1.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_10.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_102.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_103.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_104.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_105.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_106.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_108.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_109.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_11.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_110.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_111.jpg: M

<H4># Metrics 3<H4>

In [116]:
import time
import psutil
import numpy as np

# --- METRICS & EFFICIENCY ---
def levenshtein_distance(s1, s2):
    if isinstance(s1, str):
        s1 = list(s1)
    if isinstance(s2, str):
        s2 = list(s2)
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

def word_error_rate(y_true, y_pred):
    total_words = 0
    total_errors = 0
    for t, p in zip(y_true, y_pred):
        t_words = t.split()
        p_words = p.split()
        total_words += len(t_words)
        total_errors += levenshtein_distance(t_words, p_words)
    return total_errors / total_words if total_words > 0 else 0.0

def char_error_rate(y_true, y_pred):
    total_chars = 0
    total_errors = 0
    for t, p in zip(y_true, y_pred):
        total_chars += len(t)
        total_errors += levenshtein_distance(t, p)
    return total_errors / total_chars if total_chars > 0 else 0.0

def field_accuracy(y_true, y_pred):
    from sklearn.metrics import accuracy_score
    return accuracy_score(y_true, y_pred)

def document_level_accuracy(y_true, y_pred, doc_ids):
    from collections import defaultdict
    doc_true = defaultdict(list)
    doc_pred = defaultdict(list)
    for doc_id, t, p in zip(doc_ids, y_true, y_pred):
        doc_true[doc_id].append(t)
        doc_pred[doc_id].append(p)
    correct_docs = 0
    for doc_id in doc_true:
        if doc_true[doc_id] == doc_pred[doc_id]:
            correct_docs += 1
    return correct_docs / len(doc_true) if doc_true else 0.0

min_len = min(len(ocr_results), len(gt_results))
ocr_results = ocr_results[:min_len]
gt_results = gt_results[:min_len]

# Build doc_ids as the image file name for each field
# You must collect doc_ids in the same order as ocr_results/gt_results
doc_ids = []
for img_file in os.listdir(IMAGES_DIR):
    if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    base_filename = os.path.splitext(img_file)[0]
    label_path = os.path.join(LABELS_DIR, base_filename + '.txt')
    gt_txt_path = os.path.join(OCR_TXT_DIR, base_filename + '.json')
    if not os.path.exists(label_path) or not os.path.exists(gt_txt_path):
        continue
    bboxes = read_label_file(label_path)
    gt_texts_file = read_gt_text(gt_txt_path)
    if len(bboxes) != len(gt_texts_file) or not bboxes:
        continue
    for _ in range(len(bboxes)):
        doc_ids.append(base_filename)

# Truncate doc_ids to match ocr_results length
doc_ids = doc_ids[:min_len]

if not ocr_results or not gt_results:
    print("\nNo samples available to compute metrics.")
else:
    # Metrics
    wer = word_error_rate(gt_results, ocr_results)
    cer = char_error_rate(gt_results, ocr_results)
    field_acc = field_accuracy(gt_results, ocr_results)
    doc_acc = document_level_accuracy(gt_results, ocr_results, doc_ids)    
    
    # Final Score
    final_score = (
        0.35 * (100 - wer * 100) +
        0.35 * (100 - cer * 100) +
        0.15 * field_acc +
        0.15 * doc_acc
    )

    print(f"\nOCR Model Metrics:")
    print(f"Word Error Rate (WER): {wer:.4f}")
    print(f"Character Error Rate (CER): {cer:.4f}")
    print(f"Field Accuracy: {field_acc:.4f}")
    print(f"Document Level Accuracy: {doc_acc:.4f}")
    print(f"Final Score: {final_score:.4f}")
    print(f"Compute Efficiency: {compute_efficiency:.6f} (1/(sec*MB))")


OCR Model Metrics:
Word Error Rate (WER): 1.0680
Character Error Rate (CER): 0.8546
Field Accuracy: 0.1281
Document Level Accuracy: 0.0000
Final Score: 2.7273
Compute Efficiency: 28.513292 (1/(sec*MB))


<H4># Evaluation for single image using trained model<H4>

In [None]:
# --- OCR PROCESSING FOR LABELS ---
from PIL import Image
import pytesseract
from tqdm.notebook import tqdm
import os
import cv2
import numpy as np


def test_single_img (img_name, label_path, gt_txt_path, output_txt_path, ocr_model_predict, alphabet):
    """
    Test a single image with the OCR model.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    ocr_results = []
    gt_results = []

    if not img_name.lower().endswith(('.jpg', '.jpeg', '.png')):
        print(f"Skipping {img_name}: Not a valid image file")
    img_path = os.path.join(IMAGES_DIR, img_name)
    base_filename = os.path.splitext(img_name)[0]
    label_path = os.path.join(LABELS_DIR, base_filename + '.txt')
    gt_txt_path = os.path.join(OCR_TXT_DIR, base_filename + '.json')
    output_txt_path = os.path.join(OUTPUT_DIR, base_filename + '.txt')

    bboxes = read_label_file(label_path)
    gt_texts = read_gt_text(gt_txt_path)
    if len(bboxes) != len(gt_texts) or not bboxes:
        print(f"Processing {img_name}: Mismatch between number of bounding boxes and ground truth texts")

    img_crops = []
    valid_gt_texts = []
    for (_, bbox), gt_text in zip(bboxes, gt_texts):
        crop = preprocess_img(img_path, bbox, IMG_HEIGHT, IMG_WIDTH)
        if crop is not None:
            img_crops.append(crop)
            valid_gt_texts.append(gt_text)
    if not img_crops:
        print(f"Processing {img_name}: No valid crops found")
    else:
        print(f"Processing {img_name}: Found {len(img_crops)} valid crops")
        img_crops = np.array(img_crops)
        preds = ocr_model_predict.predict(img_crops)
        pred_texts = decode_prediction(preds, alphabet)

        with open(output_txt_path, 'w', encoding='utf-8') as f:
            for text in pred_texts:
                f.write(text + '\n')

    ocr_results.extend(pred_texts)
    gt_results.extend(valid_gt_texts)

    print("Ground Truth:")
    for txt in gt_results:
        print(txt)
    print("\nOCR Results:")
    for txt in ocr_results:
        print(txt)

    # --- METRICS ---

    min_len = min(len(ocr_results), len(gt_results))
    ocr_results = ocr_results[:min_len]
    gt_results = gt_results[:min_len]

    if ocr_results and gt_results:
        line_acc = accuracy_score(gt_results, ocr_results)
        def char_accuracy(y_true, y_pred):
            correct_chars = 0
            total_chars = 0
            for t, p in zip(y_true, y_pred):
                correct_chars += sum(a == b for a, b in zip(t, p))
                total_chars += max(len(t), len(p))
            return correct_chars / total_chars if total_chars > 0 else 1.0
        char_acc = char_accuracy(gt_results, ocr_results)
        print(f"OCR Line Accuracy: {line_acc:.4f}")
        print(f"OCR Character Accuracy: {char_acc:.4f}")
    else:
        print("No OCR results to compare.")

In [None]:
TEST_IMAGE = 'MIT_1.jpg'
LABELS_DIR = r'd:\AI_Challenge\DeHaDo-AI\results6\labels'
IMAGES_DIR = r'D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750'
OUTPUT_DIR = r'd:\AI_Challenge\DeHaDo-AI\results6\eval_output'
IMG_HEIGHT = 128
IMG_WIDTH = 512
test_single_img(TEST_IMAGE, LABELS_DIR, IMAGES_DIR, OUTPUT_DIR, ocr_model_predict, alphabet)

Processing MIT_1.jpg: Mismatch between number of bounding boxes and ground truth texts
Processing MIT_1.jpg: Found 20 valid crops
Ground Truth:
Dayita Bakshi
Laksh Bakshi
12/27/1975
Post-Graduate
Married
Female
Indian
A+
9 years at Dewan Ltd
10 years at Chahal PLC
H.No. 133, Bala Circle, Bongaigaon-379402
H.No. 23, Chaudry Path, Nadiad-559407
9787612803
9351045691
Marathi, English, Telugu, Hindi
Kritika Brar - 7099406444
Janaki Handa - 8634823848
358000000000.0
Dehradun
6/17/2023

OCR Results:
Fed S4mas
8/13/ydd05
Id3
Demian
42t Tass
91403470
BhitTaa-044844
Mas
Mali
Zayi ashl
Gradian
82662033
O+
Kjaria at  8/7462156
DriraMawal
Kali, Matta 708a4
9D030a04
9alr a615
Iediale
1/21/202025
OCR Line Accuracy: 0.0000
OCR Character Accuracy: 0.0319


<H4># Evaluation for full dataset (excluding skipped images) using pytesseract<H4>

<H6>
<br>Tesseract Installation For Windows
<br>
<br>1 - You need to have Tesseract OCR installed on your computer.
<br>    get it from here. https://github.com/UB-Mannheim/tesseract/wiki
<br>    Download the suitable version.
<br>2 - Add Tesseract path to your System Environment. i.e. Edit system variables.
<br>3 - Run pip install pytesseract and pip install tesseract
<br>4 - Add this line to your python script every time
<br>   pytesseract.pytesseract.tesseract_cmd = 'C:/OCR/Tesseract-OCR/tesseract.exe'  # your path may be different
<br>5 - Run the code.
<H6>

In [30]:
!pip install pytesseract
!pip install tesseract
!pip install pillow




[notice] A new release of pip is available: 24.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# --- OCR PROCESSING FOR LABELS ---
from PIL import Image
import pytesseract
from tqdm.notebook import tqdm
import os
import cv2
import numpy as np

LABELS_DIR = r'd:\AI_Challenge\DeHaDo-AI\results4\labels'
IMAGES_DIR = r'D:\AI_Challenge\DeHaDo-AI\DEHADO-AI_TRAINING_DATASET\IMAGES_750'
OUTPUT_DIR = r'd:\AI_Challenge\DeHaDo-AI\results4\tesseract_output'
pytesseract.pytesseract.tesseract_cmd = r'D:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this path if needed

os.makedirs(OUTPUT_DIR, exist_ok=True)

for label_file in tqdm(os.listdir(LABELS_DIR), desc="Processing label files"):
    if not label_file.endswith('.txt'):
        continue
    label_path = os.path.join(LABELS_DIR, label_file)
    image_name = os.path.splitext(label_file)[0] + '.jpg'
    image_path = os.path.join(IMAGES_DIR, image_name)
    if not os.path.exists(image_path):
        print(f"Image not found for {label_file}")
        continue

    image = Image.open(image_path)
    ocr_results = []
    with open(label_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 6:
                continue
            x1, y1, x2, y2 = map(int, parts[3:7])
            cropped = image.crop((x1, y1, x2, y2))

            # Define scaling factors (e.g., 2 for doubling the size)
            scale_x = .5
            scale_y = .5
            # Resize the image using cv2.resize()
            # You can specify interpolation methods for better quality
            # INTER_LINEAR is good for enlarging, INTER_CUBIC is slower but higher quality
            cropped = np.array(cropped)
            scaled_up_image = cv2.resize(cropped, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            # Convert the scaled image to a format suitable for OCR
            scaled_up_image = cv2.cvtColor(scaled_up_image, cv2.COLOR_BGR2RGB)
            
            # Perform OCR on the scaled image
            text = pytesseract.image_to_string(scaled_up_image, lang='eng')
            ocr_results.append(text.strip())

    output_file = os.path.join(OUTPUT_DIR, label_file)
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for text in ocr_results:
            out_f.write(text + '\n')

Processing label files:   0%|          | 0/1500 [00:00<?, ?it/s]

KeyboardInterrupt: 