In [1]:
import shutil
from pathlib import Path

# Source and target paths
SOURCE_ROOT = Path("G:/Sajal_Data/Obj_4_Code/Teacher_model_training/main_Data")  # Each subfolder like case1, case2...
TARGET_FOLDER = Path("G:/Sajal_Data/Obj_4_Code/Teacher_model_training/data/images")
TARGET_FOLDER.mkdir(parents=True, exist_ok=True)

# Supported image extensions (compared against the lower-cased file suffix)
image_exts = [".jpg", ".jpeg", ".png"]

# Flatten every case folder into TARGET_FOLDER, prefixing each file name with
# the normalized case id. An existing file with the same destination name is
# overwritten by shutil.copy.
copied_files = []
for case_folder in sorted(SOURCE_ROOT.glob("case*")):
    if not case_folder.is_dir():
        continue  # only directories can contain case images

    case_id = case_folder.name.replace(" ", "_").lower()  # Normalize "case 1" to "case_1"

    # Sort the listing so the copy order (and the preview below) does not
    # depend on the order in which the filesystem enumerates entries.
    for img_file in sorted(case_folder.glob("*")):
        # Skip sub-directories (even ones named like "x.jpg") and non-image files.
        if not img_file.is_file() or img_file.suffix.lower() not in image_exts:
            continue

        new_name = f"{case_id}_{img_file.name}"
        shutil.copy(img_file, TARGET_FOLDER / new_name)
        copied_files.append(new_name)

copied_files[:5]  # Show a sample of copied filenames


['case_1_IMG_20240227_2_1.jpg',
 'case_1_IMG_20240227_2_10.jpg',
 'case_1_IMG_20240227_2_11.jpg',
 'case_1_IMG_20240227_2_12.jpg',
 'case_1_IMG_20240227_2_13.jpg']

In [3]:
import cv2
import pytesseract
import shutil
from pathlib import Path
import numpy as np

# Measurement labels searched for in the OCR text
keywords = ["HC", "AC", "FL", "BPD"]

# File suffixes accepted as images
image_exts = [".jpg", ".jpeg", ".png"]

# Folder that is scanned, and folder that receives the matching images
INPUT_DIR = Path("G:/Sajal_Data/Obj_4_Code/Teacher_model_training/Filtered_Data")
OUTPUT_DIR = Path("G:/Sajal_Data/Obj_4_Code/Teacher_model_training/data/images")

# Create the output folder (and any missing parents) before anything is copied
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Function to extract bottom-right region for OCR
def extract_bottom_right(image, fraction=0.25):
    """Return the bottom-right corner of ``image``.

    Args:
        image: Array-like exposing ``.shape`` whose first two axes are
            rows and columns; any further axes are kept untouched.
        fraction: Share of the height and of the width that the corner
            covers, measured from the bottom and right edges.

    Returns:
        The slice of ``image`` starting at row ``int(h * (1 - fraction))``
        and column ``int(w * (1 - fraction))``.
    """
    height, width = image.shape[:2]
    kept_share = 1 - fraction
    first_row = int(height * kept_share)
    first_col = int(width * kept_share)
    return image[first_row:, first_col:]

# Function to enhance image for better OCR
def preprocess_for_ocr(image):
    """Binarize a BGR image so it can be handed to the OCR engine.

    The image is converted to grayscale, inverted, and then passed through
    a mean adaptive threshold (11-pixel block, constant 4 subtracted).

    Args:
        image: Image in OpenCV BGR channel order.

    Returns:
        The thresholded single-channel image.
    """
    block_size = 11   # side of the neighbourhood used for the local mean
    mean_offset = 4   # constant subtracted from that local mean

    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    negative = cv2.bitwise_not(grayscale)
    return cv2.adaptiveThreshold(
        negative,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        block_size,
        mean_offset,
    )

copied = []
skipped = []  # names of files OpenCV could not decode

# Process all images. The listing is sorted so the processing order (and the
# preview below) does not depend on filesystem enumeration order.
for image_path in sorted(INPUT_DIR.glob("*")):
    # Ignore sub-directories and anything that is not a supported image file.
    if not image_path.is_file() or image_path.suffix.lower() not in image_exts:
        continue

    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread reports an unreadable/corrupt file by returning None
        # instead of raising; without this guard one bad file would abort
        # the whole batch when its .shape is accessed.
        skipped.append(image_path.name)
        continue

    roi = extract_bottom_right(img)
    ocr_ready = preprocess_for_ocr(roi)
    # Upper-case and strip spaces so the keyword check is case-insensitive
    # and tolerant of spacing in the OCR output.
    text = pytesseract.image_to_string(ocr_ready).upper().replace(" ", "")

    # Check if any keyword is present.
    # NOTE(review): this is a plain substring match, so short keywords such as
    # "AC" or "FL" also match inside longer words - confirm that is acceptable.
    if any(k in text for k in keywords):
        shutil.copy(image_path, OUTPUT_DIR / image_path.name)
        copied.append(image_path.name)

if skipped:
    print(f"Skipped {len(skipped)} unreadable image(s), e.g. {skipped[:5]}")

copied[:5]  # Show some sample matches

['case_10_IMG_20240412_1_2.jpg',
 'case_10_IMG_20240412_1_3.jpg',
 'case_10_IMG_20240412_1_4.jpg',
 'case_10_IMG_20240412_1_5.jpg',
 'case_11_IMG_20240412_3_13.jpg']