In [None]:
# Data Preprocessing for OCR

# This notebook preprocesses the raw images and saves them in the processed format.

## Import Libraries


import os
import cv2
import numpy as np

# Directories (relative paths, resolved against the current working directory)
raw_data_dir = '../data/raw/'
processed_data_dir = '../data/processed/'

# Ensure the processed data directory exists. exist_ok=True keeps this cell
# safe to re-run and avoids the check-then-create race of os.path.exists().
os.makedirs(processed_data_dir, exist_ok=True)


In [None]:
def preprocess_image(image_path, target_size=(128, 32)):
    """Load an image as grayscale, resize it, and scale its pixels by 1/255.

    Args:
        image_path: Path of the image file; passed straight to ``cv2.imread``.
        target_size: ``(width, height)`` handed to ``cv2.resize`` as the
            output size. Defaults to ``(128, 32)``, the model input size.

    Returns:
        The resized grayscale image divided by 255.0 (a floating-point
        array; values fall in [0, 1] for 8-bit input).

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``image_path``
            (missing, unreadable, or unsupported file).
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise ValueError(f"Failed to load image: {image_path}")
    # cv2.resize takes (width, height); the resulting array is (height, width).
    img = cv2.resize(img, tuple(target_size))
    img = img / 255.0  # Normalize
    return img

def process_and_save_images():
    """Preprocess every JPEG/PNG in ``raw_data_dir`` and save it as ``.npy``.

    Each matching file is passed through ``preprocess_image`` and written to
    ``processed_data_dir`` under the same base name with a ``.npy``
    extension. Files that fail to load are reported with ``print`` and
    skipped, so one bad image does not abort the whole run.

    Returns:
        None. Progress and load failures are printed.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    # os.listdir() order is arbitrary; sort for a reproducible run log.
    for filename in sorted(os.listdir(raw_data_dir)):
        # Compare case-insensitively so "SCAN.JPG" is not silently skipped.
        if not filename.lower().endswith(image_extensions):
            continue
        image_path = os.path.join(raw_data_dir, filename)
        try:
            img = preprocess_image(image_path)
        except ValueError as e:
            # Best effort: report the unreadable file and continue.
            print(e)
            continue
        # NOTE: the output name drops the extension, so "a.jpg" and "a.png"
        # both map to "a.npy" and the later one overwrites the earlier.
        output_path = os.path.join(processed_data_dir, os.path.splitext(filename)[0] + '.npy')
        np.save(output_path, img)
        print(f'Processed and saved: {filename}')

# Process and save images: converts the matching files in raw_data_dir to .npy
# arrays in processed_data_dir, printing a status line for each file handled.
process_and_save_images()


In [None]:
# Load and verify a processed image
# Only consider .npy files (the directory may contain other entries such as
# hidden files) and sort them so the same sample is picked on every run.
processed_files = sorted(
    name for name in os.listdir(processed_data_dir) if name.endswith('.npy')
)
if not processed_files:
    # Fail with a clear message instead of an IndexError on an empty listing.
    raise FileNotFoundError(f'No .npy files found in {processed_data_dir}')
sample_image = np.load(os.path.join(processed_data_dir, processed_files[0]))
print(f'Sample image shape: {sample_image.shape}')
print(f'Sample image data:\n{sample_image}')
