In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install qwen-vl-utils
!pip install Pillow
!pip install --upgrade datasets

In [None]:
!pip install -U bitsandbytes

In [None]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device before any model
# is loaded.
print(torch.cuda.is_available())

In [None]:
import os
# Set PyTorch's CUDA allocator configuration variable, enabling the
# "expandable_segments" option.
# NOTE(review): `torch` has already been imported (and queried) in the previous
# cell; confirm the setting is still honoured when exported this late. Moving
# this assignment above the first `import torch` would remove the doubt.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
import torch
import pandas as pd
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from PIL import Image
from IPython.display import display
from google.colab.patches import cv2_imshow
from difflib import SequenceMatcher

In [None]:
from datasets import load_dataset
# Load the "train" split of the `shreyanithin/hmi-gui-ocr` dataset. The
# evaluation cells below read the "image" and "filename" fields of each sample.
dataset = load_dataset("shreyanithin/hmi-gui-ocr",split="train")
print(type(dataset))
# Ground-truth table, decoded as Latin-1. The evaluation cell looks rows up by
# the "filename" column and reads the "ground_truth" column.
# NOTE(review): this is an absolute Kaggle input path, so the cell only runs
# where that dataset is mounted.
csv_file_path = "/kaggle/input/ground-truth/ground truth.csv"
df = pd.read_csv(csv_file_path, encoding="latin-1")

In [None]:
!pip install jiwer

In [None]:
import jiwer
from jiwer import cer,wer

def normalize(text):
    """Canonicalise text before it is fed to the accuracy metrics.

    The input is lower-cased and every run of whitespace (spaces, tabs,
    newlines, non-breaking spaces, ...) is collapsed into one ordinary space,
    with leading/trailing whitespace removed.

    Line breaks are turned into a space rather than deleted: deleting them
    fuses the last word of one line with the first word of the next
    ("start\\nstop" -> "startstop"), which corrupts every word-level metric.

    Args:
        text: Value to normalise. Normally a ``str``; ``None`` and float NaN
            (what pandas yields for an empty CSV cell) are treated as empty
            text, and any other non-string value (e.g. a numeric cell) is
            converted with ``str()``.

    Returns:
        The normalised string (possibly empty).
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # str.split() with no argument splits on any Unicode whitespace and drops
    # empty pieces, so join() both collapses runs and strips the ends.
    return " ".join(text.lower().split())

def char_accuracy(gt, ocr):
    """Character-level similarity between two strings, as a percentage.

    Uses ``difflib.SequenceMatcher.ratio()`` (2 * matches / total length)
    scaled to 0-100. Two empty strings score 100.

    The matcher's ``autojunk`` heuristic is disabled. When the second
    sequence has 200 or more items, the heuristic treats every character that
    makes up more than ~1% of it as junk and stops using it to seed matches.
    On ordinary OCR text (where spaces and common letters easily exceed that
    share) this can drive the ratio far below the real similarity.

    Args:
        gt: Reference string.
        ocr: Recognised string.

    Returns:
        float in the range [0, 100].
    """
    matcher = SequenceMatcher(None, gt, ocr, autojunk=False)
    return matcher.ratio() * 100

def fuzzy_match(w1, w2, threshold=0.8):
    """Tell whether two words are similar enough to count as the same word.

    Args:
        w1: First word.
        w2: Second word.
        threshold: Minimum ``SequenceMatcher`` ratio (0-1) that counts as a
            match; the bound is inclusive.

    Returns:
        True when the similarity ratio reaches ``threshold``, else False.
    """
    similarity = SequenceMatcher(None, w1, w2).ratio()
    return similarity >= threshold

def word_accuracy(gt, ocr):
    """Percentage of ground-truth words that have a fuzzy match in the OCR text.

    Both inputs are passed through ``normalize`` and split on whitespace. A
    ground-truth word counts as found when ``fuzzy_match`` accepts it against
    at least one OCR word. Position is ignored, and the same OCR word may
    satisfy several ground-truth words.

    Args:
        gt: Reference text.
        ocr: Recognised text.

    Returns:
        Percentage in [0, 100]; ``0`` when the reference contains no words.
    """
    reference_words = normalize(gt).split()
    recognised_words = normalize(ocr).split()

    if not reference_words:
        return 0

    found = 0
    for ref_word in reference_words:
        for candidate in recognised_words:
            if fuzzy_match(ref_word, candidate):
                found += 1
                break  # one hit is enough for this reference word
    return (found / len(reference_words)) * 100

def cer(gt, pred):
    """Character error rate of ``pred`` against the reference ``gt``.

    Both strings are stripped and lower-cased, then scored with
    ``jiwer.cer``. This definition replaces the ``cer`` name imported from
    jiwer at the top of the cell, so the library function is reached through
    the ``jiwer`` module.

    An empty reference is handled here instead of being forwarded: the rate
    divides by the reference length, so it is undefined, and jiwer versions
    that validate references raise ``ValueError`` for it. The convention used
    is 0.0 when the prediction is empty too (nothing to get wrong) and 1.0
    otherwise (everything predicted is an error).

    Args:
        gt: Reference text.
        pred: Predicted / recognised text.

    Returns:
        The error rate as a fraction (0.0 is perfect; may exceed 1.0).
    """
    gt = gt.strip().lower()
    pred = pred.strip().lower()
    if not gt:
        return 0.0 if not pred else 1.0
    return jiwer.cer(gt, pred)

def wer(gt, pred):
    """Word error rate of ``pred`` against the reference ``gt``.

    Both strings are stripped and lower-cased, then scored with
    ``jiwer.wer``. This definition replaces the ``wer`` name imported from
    jiwer at the top of the cell, so the library function is reached through
    the ``jiwer`` module.

    An empty reference is handled here instead of being forwarded: the rate
    divides by the number of reference words, so it is undefined, and jiwer
    versions that validate references raise ``ValueError`` for it. The
    convention used is 0.0 when the prediction is empty too and 1.0 otherwise.

    Args:
        gt: Reference text.
        pred: Predicted / recognised text.

    Returns:
        The error rate as a fraction (0.0 is perfect; may exceed 1.0).
    """
    gt = gt.strip().lower()
    pred = pred.strip().lower()
    if not gt:
        return 0.0 if not pred else 1.0
    return jiwer.wer(gt, pred)


In [None]:
# Force a garbage-collection pass before the model is loaded in the next cell.
import gc; gc.collect()

In [None]:
from transformers import BitsAndBytesConfig

checkpoint = "Qwen/Qwen2.5-VL-7B-Instruct"

# Request 4-bit loading through an explicit BitsAndBytesConfig. Passing
# `load_in_4bit=True` directly to `from_pretrained` is the deprecated spelling,
# and this notebook installs transformers from the development branch, where
# deprecated arguments are the first to disappear. All other quantisation
# settings are left at their defaults, as before.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)
# The processor bundles the tokenizer, chat template and image preprocessing
# used by the inference helper below.
processor = AutoProcessor.from_pretrained(checkpoint)

In [None]:
# Ask PyTorch to release cached CUDA memory that is no longer in use.
torch.cuda.empty_cache()

In [None]:
def interference(image, prompt, sys_prompt="You are a helpful assistant.", max_new_tokens=256, return_input=False):
    """Run one image plus a text prompt through the globally loaded model.

    Relies on the module-level ``model`` and ``processor`` objects created in
    the model-loading cell.

    Args:
        image: Image handed both to the chat message and to the processor
            (the callers in this notebook pass a PIL image).
        prompt: User instruction sent alongside the image.
        sys_prompt: System message placed at the start of the conversation.
        max_new_tokens: Upper bound on the number of generated tokens.
        return_input: When True, also return the processed model inputs.

    Returns:
        The decoded generation for the single prompt, or a tuple
        ``(decoded_text, inputs)`` when ``return_input`` is True.
    """
    messages = [
        {"role": "system", "content": sys_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                # Tag the entry explicitly instead of relying on the chat
                # template recognising a bare "image" key.
                {"type": "image", "image": image},
            ],
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print("text:", text)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
    # Follow the model's own placement rather than hard-coding 'cuda': the
    # model is loaded with device_map="auto".
    inputs = inputs.to(model.device)

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + completion; drop the prompt tokens of each row.
    generated_ids = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
    ]
    # Keep the decoded text verbatim. The tokenizer's space clean-up rewrites
    # spacing around punctuation and contractions, which would alter the very
    # characters an OCR evaluation is trying to measure.
    output_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    if return_input:
        return output_text[0], inputs
    return output_text[0]

In [None]:
import time

# One record per image that made it through inference.
results = []

for idx, sample in enumerate(dataset):
    img_pil, filename = sample["image"], sample["filename"]
    # Every image is forced to 800x800 before being handed to the model.
    img_pil = img_pil.resize((800, 800))

    start_time = time.time()
    try:
        output_text = interference(
            img_pil,
            prompt="You are a precise OCR engine. Extract only the exact printed text from the image. Do not guess or paraphrase. Preserve original spelling, punctuation, and formatting. Output only what is visible. Do not explain.",
        )
    except Exception as e:
        # A failing image is reported and skipped so the run can continue.
        print(f"Error processing {filename}: {e}")
        continue
    elapsed = time.time() - start_time

    results.append(
        dict(
            filename=filename,
            ocr_output=output_text,
            inference_time_sec=round(elapsed, 2),
        )
    )
    print(f"{filename}: OCR done in {elapsed:.2f} sec.")

# Persist the raw transcriptions.
ocr_output_df = pd.DataFrame(results)
ocr_output_df.to_csv("ocr_outputs.csv", index=False)
print("Saved OCR results to 'ocr_outputs.csv'")


In [None]:
import time

# Instruction sent with every image, defined once so it cannot drift.
OCR_PROMPT = "You are a precise OCR engine. Extract only the exact printed text from the image. Do not guess or paraphrase. Preserve original spelling, punctuation, and formatting. Output only what is visible. Do not explain."

# Index the ground truth once instead of filtering the whole DataFrame for
# every sample. setdefault keeps the first row per filename, which is the row
# the previous `.iloc[0]` lookup selected.
gt_by_filename = {}
for gt_name, gt_text in zip(df["filename"], df["ground_truth"]):
    gt_by_filename.setdefault(gt_name, gt_text)

results = []

for idx, sample in enumerate(dataset):
    filename = sample["filename"]

    if filename not in gt_by_filename:
        print(f"{filename}: Ground truth not found.")
        continue

    ground_truth = gt_by_filename[filename]
    # An empty CSV cell arrives as NaN and a numeric-looking cell as a number;
    # neither has string methods, so sort that out before any text handling.
    if pd.isna(ground_truth):
        print(f"{filename}: Ground truth is empty; skipping.")
        continue
    ground_truth = str(ground_truth)

    # Normalize for comparison. An empty reference has no defined error rate,
    # so skip it before spending GPU time on the image.
    gt_norm = normalize(ground_truth)
    if not gt_norm:
        print(f"{filename}: Ground truth is empty; skipping.")
        continue

    # Every image is forced to 800x800 before being handed to the model.
    img_pil = sample["image"].resize((800, 800))

    start_time = time.time()
    try:
        output_text = interference(img_pil, prompt=OCR_PROMPT)
    except Exception as e:
        # A failing image is reported and skipped so the run can continue.
        print(f"Error processing {filename}: {e}")
        continue
    elapsed = time.time() - start_time

    ocr_norm = normalize(output_text)

    # Accuracy metrics
    char_acc = char_accuracy(gt_norm, ocr_norm)
    word_acc = word_accuracy(gt_norm, ocr_norm)
    cer_val = cer(gt_norm, ocr_norm)
    wer_val = wer(gt_norm, ocr_norm)

    results.append({
        "filename": filename,
        "ground_truth": ground_truth,
        "ocr_output": output_text,
        "character_accuracy": round(char_acc, 2),
        "word_accuracy": round(word_acc, 2),
        "CER": round(cer_val * 100, 2),  # % form
        "WER": round(wer_val * 100, 2),  # % form
        "inference_time_sec": round(elapsed, 2)
    })

    print(f"{filename}: OCR done in {elapsed:.2f}s | CharAcc: {char_acc:.2f}% | CER: {cer_val*100:.2f}%")

# Save results
ocr_output_df = pd.DataFrame(results)
ocr_output_df.to_csv("ocr_outputs_with_accuracy.csv", index=False)
print("✅ Saved detailed OCR results to 'ocr_outputs_with_accuracy.csv'")
