In [None]:
#install necessary libraries
!sudo apt install tesseract-ocr
!pip install pytesseract
!pip install --upgrade datasets

In [None]:
import cv2
import pytesseract
import numpy as np
import pandas as pd
import string
from google.colab.patches import cv2_imshow
from PIL import Image
from datasets import load_dataset
from difflib import SequenceMatcher

In [None]:
# Load the OCR image dataset (train split) and the ground-truth transcriptions.
from datasets import load_dataset
dataset = load_dataset("shreyanithin/hmi-gui-ocr",split="train")
print(type(dataset))
# NOTE(review): absolute Kaggle input path; adjust when running in another environment.
csv_file_path = "/kaggle/input/ground-truth/ground truth.csv"
# Later cells read the "filename" and "ground_truth" columns of this frame.
# latin-1 is requested explicitly; presumably the CSV is not UTF-8 encoded (confirm).
df = pd.read_csv(csv_file_path, encoding="latin-1")

In [None]:
# Install the jiwer library, used below to compute character error rate (CER)
# and word error rate (WER).
!pip install jiwer

In [None]:
# Helper functions for computing the different OCR accuracy metrics
import jiwer
from jiwer import cer,wer

def normalize(text):
    """Lower-case ``text`` and collapse every run of whitespace to one space.

    Leading and trailing whitespace is removed. Line breaks, tabs and repeated
    spaces each become a single space instead of being deleted, so the last
    word of one line is never fused with the first word of the next (which
    would distort the word-level metrics computed from the result).

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; an empty string if ``text`` is empty or
        contains only whitespace.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so re-joining with " " yields single-space separation.
    return " ".join(text.lower().split())

def char_accuracy(gt, ocr):
    """Return the similarity of ``gt`` and ``ocr`` as a percentage.

    The score is the ``difflib.SequenceMatcher`` ratio of the two sequences
    (no junk function), scaled by 100.
    """
    matcher = SequenceMatcher(None, gt, ocr)
    similarity = matcher.ratio()
    return similarity * 100

def fuzzy_match(w1, w2, threshold=0.8):
    """Tell whether ``w1`` and ``w2`` are similar enough to count as a match.

    Args:
        w1: First sequence to compare.
        w2: Second sequence to compare.
        threshold: Minimum ``SequenceMatcher`` ratio (inclusive) for a match.

    Returns:
        True when the similarity ratio is at least ``threshold``.
    """
    similarity = SequenceMatcher(None, w1, w2).ratio()
    return similarity >= threshold

def word_accuracy(gt, ocr):
    """Percentage of ground-truth words that have a fuzzy match in the OCR text.

    Both inputs are passed through ``normalize`` and split on whitespace. A
    ground-truth word counts as correct when ``fuzzy_match`` accepts it against
    at least one OCR word; OCR words are not consumed, so one OCR word may
    satisfy several ground-truth words.

    Returns:
        The matched share of ground-truth words scaled to 0-100, or 0 when the
        ground truth contains no words.
    """
    reference_words = normalize(gt).split()
    candidate_words = normalize(ocr).split()
    if not reference_words:
        return 0

    matched = 0
    for ref_word in reference_words:
        for cand_word in candidate_words:
            if fuzzy_match(ref_word, cand_word):
                matched += 1
                break  # one hit is enough for this ground-truth word
    return (matched / len(reference_words)) * 100

def cer(gt, pred):
    """Character error rate of ``pred`` against ``gt``, as returned by ``jiwer.cer``.

    Both strings are stripped of surrounding whitespace and lower-cased before
    being scored. This definition replaces the ``cer`` name imported from
    jiwer earlier in the cell.
    """
    reference = gt.strip().lower()
    hypothesis = pred.strip().lower()
    return jiwer.cer(reference, hypothesis)

def wer(gt, pred):
    """Word error rate of ``pred`` against ``gt``, as returned by ``jiwer.wer``.

    Both strings are stripped of surrounding whitespace and lower-cased before
    being scored. This definition replaces the ``wer`` name imported from
    jiwer earlier in the cell.
    """
    reference = gt.strip().lower()
    hypothesis = pred.strip().lower()
    return jiwer.wer(reference, hypothesis)


In [None]:
# Tesseract configuration and shared state for the OCR cells below.
import time

# Per-image evaluation records; the evaluation cell appends to this list.
results = []

# Tesseract option strings tried for full-image OCR.
configs = [
    "--psm 6",
    "--psm 11",
]

In [None]:
# Run the OCR pipeline on every dataset image and export the raw extracted
# text (plus per-image inference time) to tesseract_op.csv.
output_data = []

# Loop-invariant objects, built once instead of once per image.
clahe_filter = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(3, 3))
morph_kernel = np.ones((2, 2), np.uint8)
printable_chars = set(string.printable)

for sample in dataset:
    img_pil = sample["image"]
    filename = sample["filename"]
    img_cv2 = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    start_time = time.time()

    # === Preprocessing ===
    gray = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2GRAY)
    clahe = clahe_filter.apply(gray)
    bilateral = cv2.bilateralFilter(clahe, 9, 60, 75)
    morph = cv2.morphologyEx(bilateral, cv2.MORPH_OPEN, morph_kernel)
    thresh = cv2.adaptiveThreshold(morph, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 9, 2)

    # === Contour detection ===
    # Keep contours whose area and aspect ratio fall inside the accepted
    # ranges, and grow each bounding box by a small padding clamped to the image.
    contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    text_regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = cv2.contourArea(contour)
        ar = w / h if h != 0 else 0
        if 150 < area < 10000 and 0.5 < ar < 15:
            pad = 5
            x_p, y_p = max(0, x - pad), max(0, y - pad)
            w_p = min(img_cv2.shape[1] - x_p, w + 2 * pad)
            h_p = min(img_cv2.shape[0] - y_p, h + 2 * pad)
            text_regions.append((x_p, y_p, w_p, h_p))

    # Order regions top-to-bottom, then left-to-right.
    text_regions.sort(key=lambda r: (r[1], r[0]))

    # === OCR from ROIs ===
    roi_words = []
    for (x, y, w, h) in text_regions:
        roi = img_cv2[y:y + h, x:x + w]
        roi = cv2.resize(roi, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)
        data = pytesseract.image_to_data(roi, output_type=pytesseract.Output.DICT, config="--psm 6")
        for word, conf in zip(data['text'], data['conf']):
            # Depending on the pytesseract/Tesseract versions, conf may be an
            # int, a float, or a numeric string such as "96.43"; int() alone
            # raises ValueError on the decimal-string form.
            if int(float(conf)) > 60 and word.strip():
                roi_words.append(word)
    # Every kept word is followed by a single space.
    roi_text = "".join(word + " " for word in roi_words)

    # === Full image OCR with multiple configs ===
    # Keep the longest output among the configured option strings.
    best_full_text = ""
    gray_full = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)
    for config in configs:
        text = pytesseract.image_to_string(gray_full, config=config)
        if len(text) > len(best_full_text):
            best_full_text = text

    # === Combine and clean ===
    combined = roi_text + " " + best_full_text
    combined = ''.join(ch for ch in combined if ch in printable_chars).strip()

    elapsed = time.time() - start_time

    # === Save required fields ===
    output_data.append({
        "filename": filename,
        "extracted_text": combined,
        "inference_time_sec": round(elapsed, 2)
    })

# === Export to CSV ===
df_out = pd.DataFrame(output_data)
df_out.to_csv("tesseract_op.csv", index=False)

print("Results saved to tesseract_op.csv")

In [None]:
# Run OCR and compute accuracy metrics for the images that have ground truth.
# Start from an empty list so re-running this cell does not append duplicate
# rows to the records written out by the export cell.
results = []

# Loop-invariant objects, built once instead of once per image.
clahe_filter = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(3, 3))
morph_kernel = np.ones((2, 2), np.uint8)
printable_chars = set(string.printable)

for sample in dataset:
    filename = sample["filename"]

    # Look up the ground truth first: images without one are reported and
    # skipped before any (expensive) OCR work is done. A missing/NaN cell is
    # treated like a missing row instead of crashing in normalize().
    gt_row = df[df["filename"] == filename]
    if gt_row.empty or pd.isna(gt_row["ground_truth"].iloc[0]):
        print(f"{filename}: Ground truth not found.")
        continue
    ground_truth = str(gt_row["ground_truth"].iloc[0])

    img_pil = sample["image"]
    img_cv2 = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    start_time = time.time()

    # === Preprocessing ===
    gray = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2GRAY)
    clahe = clahe_filter.apply(gray)
    bilateral = cv2.bilateralFilter(clahe, 9, 60, 75)
    morph = cv2.morphologyEx(bilateral, cv2.MORPH_OPEN, morph_kernel)
    thresh = cv2.adaptiveThreshold(morph, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 9, 2)

    # === Contour detection ===
    # Keep contours whose area and aspect ratio fall inside the accepted
    # ranges, and grow each bounding box by a small padding clamped to the image.
    contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    text_regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = cv2.contourArea(contour)
        ar = w / h if h != 0 else 0
        if 150 < area < 10000 and 0.5 < ar < 15:
            pad = 5
            x_p, y_p = max(0, x - pad), max(0, y - pad)
            w_p = min(img_cv2.shape[1] - x_p, w + 2 * pad)
            h_p = min(img_cv2.shape[0] - y_p, h + 2 * pad)
            text_regions.append((x_p, y_p, w_p, h_p))

    # Order regions top-to-bottom, then left-to-right.
    text_regions.sort(key=lambda r: (r[1], r[0]))

    # === OCR from ROIs ===
    roi_words = []
    for (x, y, w, h) in text_regions:
        roi = img_cv2[y:y + h, x:x + w]
        roi = cv2.resize(roi, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)
        data = pytesseract.image_to_data(roi, output_type=pytesseract.Output.DICT, config="--psm 6")
        for word, conf in zip(data['text'], data['conf']):
            # Depending on the pytesseract/Tesseract versions, conf may be an
            # int, a float, or a numeric string such as "96.43"; int() alone
            # raises ValueError on the decimal-string form.
            if int(float(conf)) > 60 and word.strip():
                roi_words.append(word)
    # Every kept word is followed by a single space.
    roi_text = "".join(word + " " for word in roi_words)

    # === Full image OCR with multiple configs ===
    # Keep the longest output among the configured option strings.
    best_full_text = ""
    gray_full = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)
    for config in configs:
        text = pytesseract.image_to_string(gray_full, config=config)
        if len(text) > len(best_full_text):
            best_full_text = text

    # === Combine and clean ===
    combined = roi_text + " " + best_full_text
    combined = ''.join(ch for ch in combined if ch in printable_chars).strip()

    # Timing covers preprocessing and OCR only, not the metric computation.
    elapsed = time.time() - start_time

    # === Evaluation ===
    gt_norm = normalize(ground_truth)
    ocr_norm = normalize(combined)
    char_acc = char_accuracy(gt_norm, ocr_norm)
    word_acc = word_accuracy(gt_norm, ocr_norm)
    cer_val = cer(gt_norm, ocr_norm)
    wer_val = wer(gt_norm, ocr_norm)

    results.append({
        "filename": filename,
        "extracted_text": combined,
        "ground_truth": ground_truth,
        "character_accuracy": char_acc,
        "word_accuracy": word_acc,
        "CER": round(cer_val * 100, 2),  # % form
        "WER": round(wer_val * 100, 2),  # % form
        "inference_time_sec": round(elapsed, 2)
    })
    print(f"{filename}: Char Acc = {char_acc:.2f}%, Word Acc = {word_acc:.2f}%")

In [None]:
# Write the collected evaluation records to a CSV file.
results_frame = pd.DataFrame(results)
results_frame.to_csv("ocr_tesseract.csv", index=False)
print("\n✅ OCR results saved to ocr_tesseract.csv")