# 🚀 Blood Report OCR + LayoutLMv2 Processing Pipeline
This notebook extracts text from blood test reports using OCR, cleans and processes the text, applies LayoutLMv2 for structured entity extraction, and saves the results to a CSV file.

In [None]:

!apt-get install -y tesseract-ocr
!pip install pytesseract pdf2image
!pip install --upgrade torch torchvision torchaudio transformers datasets seqeval pillow


In [None]:

import os
import re
import random
import numpy as np
import pytesseract
import cv2
import torch
import shutil
import csv
from PIL import Image, ImageDraw
from pdf2image import convert_from_path
from datasets import Dataset
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification, TrainingArguments, Trainer
from google.colab import files
from faker import Faker

# Shared Faker instance for generating random names.
# NOTE(review): `fake` is not referenced by any code visible in this notebook;
# confirm it is still needed before keeping the Faker dependency.
fake = Faker()

# Tesseract binary location: on Windows, uncomment the line below and point it
# at the local installation (update the path to match where it was installed).
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


In [None]:

def preprocess_image(image_path):
    """Load a report image and prepare it for OCR.

    The image is read as grayscale, resized to 1000x1000, blurred to
    suppress noise, and binarized with Otsu's method.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The binarized image as returned by ``cv2.threshold``.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque cv2.error inside resize().
    if image is None:
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path!r}")
        raise ValueError(f"Could not decode image: {image_path!r}")

    # NOTE(review): a fixed square size ignores the page's aspect ratio and
    # may distort glyphs on portrait scans; kept for backward compatibility.
    image = cv2.resize(image, (1000, 1000))
    image = cv2.GaussianBlur(image, (5, 5), 0)  # Reduce noise before thresholding
    # With THRESH_OTSU the threshold is computed from the histogram; the 128
    # passed here is ignored by OpenCV.
    _, image = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return image


In [None]:

def extract_text_from_image(image_path):
    """Return the raw OCR text of a blood report image.

    The image is first run through ``preprocess_image`` and then handed to
    Tesseract with page-segmentation mode 6 (``--psm 6``).

    Args:
        image_path: Path to the report image.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(
        preprocess_image(image_path),
        config="--psm 6",
    )


In [None]:

# Matches every supported spelling of the name label in one pass. The optional
# prefix is consumed together with "Name", so a label that is already
# "Patient Name" is rewritten to itself instead of gaining a second "Patient".
_NAME_LABEL_RE = re.compile(
    r"\b(?:(?:Patient['’]s|Patient|Full|Pt\.)[ \t]*)?Name\b"
)


def normalize_patient_name(text):
    """Rewrite name-label variants in OCR text to the canonical "Patient Name".

    Recognised labels are "Patient Name", "Name", "Full Name", "Pt. Name" and
    "Patient's Name" (straight or curly apostrophe). Matching is
    case-sensitive and limited to whole words, so words such as "Names" are
    left untouched.

    Args:
        text: OCR text to normalize.

    Returns:
        The text with every recognised label replaced by "Patient Name".
    """
    # A single regex substitution replaces the earlier chain of str.replace
    # calls, which re-matched the "Name" inside its own replacement text and
    # produced "Patient Patient Name".
    return _NAME_LABEL_RE.sub("Patient Name", text)


In [None]:

# Lab parameters looked up in the OCR text, in output order.
_LAB_PARAMETER_NAMES = (
    "Hemoglobin",
    "RBC",
    "PCV",
    "MCV",
    "MCH",
    "MCHC",
    "RDW",
    "WBC",
    "Neutrophils",
    "Lymphocytes",
    "Eosinophils",
    "Monocytes",
    "Basophils",
    "Platelets",
)

# Counts that are commonly printed with thousands separators (e.g. "7,500").
_THOUSANDS_PARAMS = frozenset({"WBC", "Platelets"})

# Letters Tesseract commonly emits in place of digits.
_OCR_DIGIT_FIXES = str.maketrans({"O": "0", "l": "1"})


def _compile_lab_pattern(label, allow_thousands):
    """Build the regex that finds ``label`` followed by its numeric value."""
    # "O" and "l" are accepted inside the number so they can be repaired later.
    digit = "[0-9Ol]"
    whole = f"{digit}+(?:,{digit}+)*" if allow_thousands else f"{digit}+"
    number = rf"(?:{whole}(?:\.{digit}+)?|\.{digit}+)"
    return re.compile(
        # Case-insensitive label that is not part of a longer word
        # (so "MCH" does not match inside "MCHC").
        rf"(?<![A-Za-z])(?i:{re.escape(label)})(?![A-Za-z])"
        # Separator: colons/whitespace, optionally a run of dot leaders.
        r"[:\s]*(?:\.{2,}[:\s]*)?"
        rf"(?P<value>{number})"
    )


_LAB_PATTERNS = {
    name: _compile_lab_pattern(name, name in _THOUSANDS_PARAMS)
    for name in _LAB_PARAMETER_NAMES
}


def _parse_lab_number(raw):
    """Convert a captured token to int/float, or None if it holds no digit."""
    # A token made only of "O"/"l" is ordinary text, not a misread number.
    if not any(ch.isdigit() for ch in raw):
        return None
    cleaned = raw.replace(",", "").translate(_OCR_DIGIT_FIXES)
    return float(cleaned) if "." in cleaned else int(cleaned)


def extract_lab_parameters(text):
    """Extract lab parameter values from OCR text.

    For each known parameter the first occurrence of its label followed by a
    number is used. Common OCR misreadings inside the number are repaired
    ("O" -> "0", "l" -> "1"), thousands separators are dropped for WBC and
    Platelets, and dot leaders between label and value are skipped.

    Args:
        text: OCR text of the report.

    Returns:
        A dict mapping parameter name to its value: ``float`` when the value
        has a decimal point, ``int`` otherwise. Parameters that are not found
        are omitted.
    """
    extracted_values = {}
    for param, pattern in _LAB_PATTERNS.items():
        # Walk the candidates so a junk token after an early label does not
        # hide a valid value further down the page.
        for match in pattern.finditer(text):
            value = _parse_lab_number(match.group("value"))
            if value is not None:
                extracted_values[param] = value
                break

    return extracted_values


In [None]:

def process_medical_report(image_path, model, processor):
    """Run the OCR + LayoutLMv2 pipeline on one report image.

    The image is OCR'd, name labels are normalized, the patient name and lab
    values are pulled out with regular expressions, and the page is passed
    through the LayoutLMv2 model.

    Args:
        image_path: Path to the report image.
        model: LayoutLMv2 token-classification model to run on the page.
        processor: LayoutLMv2 processor used to build the model inputs.
            Assumed to be created with its default ``apply_ocr=True`` so it
            can produce words and bounding boxes itself; verify against the
            caller.

    Returns:
        A dict with "Patient Name" (or "Unknown" when no name is found) plus
        one entry per lab parameter found in the OCR text.
    """
    extracted_text = normalize_patient_name(extract_text_from_image(image_path))

    # The name is limited to letters, spaces and tabs so it stops at the end
    # of its line instead of swallowing the label that follows (e.g. "Age").
    name_match = re.search(
        r"Patient Name[:\s]*([A-Za-z][A-Za-z \t]*)", extracted_text
    )
    patient_name = name_match.group(1).strip() if name_match else "Unknown"

    lab_values = extract_lab_parameters(extracted_text)

    # convert() returns an independent in-memory copy, so the file handle can
    # be released as soon as the conversion is done.
    with Image.open(image_path) as page:
        rgb_page = page.convert("RGB")

    # Only the image is passed: with apply_ocr=True the processor derives
    # words and boxes itself, and a `text` argument would be interpreted as a
    # question rather than as the page's words.
    encoding = processor(
        images=[rgb_page],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )

    model.eval()
    with torch.no_grad():
        # NOTE(review): the model output is not yet merged into the result;
        # decode the logits here once a label scheme is defined.
        model(**encoding)

    return {"Patient Name": patient_name, **lab_values}


In [None]:

def save_to_csv(data, output_filepath):
    """Write extracted report data to a CSV file and trigger a Colab download.

    The file has a header row ("Patient Name", "Parameter", "Value") followed
    by one row per parameter in ``data``.

    Args:
        data: Mapping of field name to value. The "Patient Name" entry
            (default "Unknown") is repeated on every row; all other entries
            become parameter rows. The mapping is not modified.
        output_filepath: Destination path of the CSV file.
    """
    # Read the name without popping it so the caller's dict stays intact.
    patient_name = data.get("Patient Name", "Unknown")
    rows = [
        (patient_name, param, value)
        for param, value in data.items()
        if param != "Patient Name"
    ]

    with open(output_filepath, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Patient Name", "Parameter", "Value"])
        writer.writerows(rows)

    print(f"✅ Saved results to: {output_filepath}")
    files.download(output_filepath)  # Auto-download in Google Colab


In [None]:

test_image_path = "./sample_report.png"
output_csv = "extracted_blood_report.csv"

# Load the processor and model before use; they are not defined anywhere else
# in this notebook.
# NOTE(review): this is the public base checkpoint, whose token-classification
# head is untrained; replace it with the fine-tuned checkpoint when available.
layoutlmv2_checkpoint = "microsoft/layoutlmv2-base-uncased"
processor = LayoutLMv2Processor.from_pretrained(layoutlmv2_checkpoint)
model = LayoutLMv2ForTokenClassification.from_pretrained(layoutlmv2_checkpoint)

ocr_result = process_medical_report(test_image_path, model, processor)
save_to_csv(ocr_result, output_csv)
