In [2]:
# Full integrated pipeline: PaddleOCR + TrOCR + Google Vision + BLIP + gTTS
# Paste & run in Google Colab.

# ---------- Install dependencies (uncomment + run once if needed) ----------
!pip install --quiet paddleocr transformers google-cloud-vision torch torchvision torchaudio pillow opencv-python matplotlib gtts

# ---------- Imports ----------
import os, io, time, base64
from PIL import Image
import cv2
import numpy as np
import torch
from IPython.display import Audio, display, HTML
from google.colab import files, output
from gtts import gTTS
from base64 import b64decode

# Transformer models (TrOCR + BLIP)
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, BlipProcessor, BlipForConditionalGeneration

# PaddleOCR: guarded import (may fail in some Colab environments; this is optional)
try:
    from paddleocr import PaddleOCR
    PADDLE_AVAILABLE = True
except Exception as e:
    print("PaddleOCR import failed or not installed; continuing without it. Error:", e)
    PADDLE_AVAILABLE = False

# ---------- Camera JS callback (for phone capture in Colab) ----------
# Single-slot mailbox shared between the browser callback and the Python side:
# the callback writes the most recent data URL under "data".
_image_data_holder = {"data": None}


def _set_image_data(data_url):
    """Store the data URL received from the browser and acknowledge with "OK"."""
    _image_data_holder.update(data=data_url)
    return "OK"


# Expose the setter to front-end JavaScript under the name 'notebook.set_image'.
output.register_callback('notebook.set_image', _set_image_data)

# JavaScript source for the in-browser capture helper.
# captureAndSend() attaches a hidden <video> element, requests a camera stream
# with facingMode 'environment', draws one frame onto a canvas sized to the
# video, stops the stream and removes the element, encodes the canvas as a JPEG
# data URL, passes that URL to the 'notebook.set_image' kernel callback, and
# also returns it to the JavaScript caller.
camera_js = """
async function captureAndSend() {
  const video = document.createElement('video');
  video.style.display = 'none';
  document.body.appendChild(video);
  const stream = await navigator.mediaDevices.getUserMedia({ video: {facingMode: 'environment'} });
  video.srcObject = stream;
  await video.play();

  const canvas = document.createElement('canvas');
  canvas.width = video.videoWidth;
  canvas.height = video.videoHeight;
  const ctx = canvas.getContext('2d');
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);

  stream.getTracks().forEach(t => t.stop());
  document.body.removeChild(video);
  const dataUrl = canvas.toDataURL('image/jpeg');
  google.colab.kernel.invokeFunction('notebook.set_image', [dataUrl], {});
  return dataUrl;
}
"""

# ---------- Google Vision setup helper ----------
def upload_and_init_google_clients():
    """
    Initialize a Google Vision client, prompting for a key upload if needed.

    The credentials already named by GOOGLE_APPLICATION_CREDENTIALS are tried
    first. If that variable is unset, or a client cannot be built from it, the
    user is asked to upload a JSON key file; the variable is then pointed at
    the uploaded file (as an absolute path, so a later change of working
    directory does not invalidate it) and the client is created again.

    If the uploaded key cannot be used, GOOGLE_APPLICATION_CREDENTIALS is
    restored to whatever it held before the upload.

    Returns: vision_client or None
    """
    env_var = "GOOGLE_APPLICATION_CREDENTIALS"

    # If GOOGLE_APPLICATION_CREDENTIALS already set and valid, try to init client directly
    if env_var in os.environ:
        try:
            from google.cloud import vision
            vision_client = vision.ImageAnnotatorClient()
            print("Google Vision client initialized using existing GOOGLE_APPLICATION_CREDENTIALS.")
            return vision_client
        except Exception as e:
            print("Existing GOOGLE_APPLICATION_CREDENTIALS did not work:", e)

    # Else prompt upload
    print("Please upload your Google service account JSON key file (needed for Vision API).")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return None

    # Use the first uploaded file; resolve it now so the path stays valid
    # even if the notebook later changes its working directory.
    keyfile = os.path.abspath(next(iter(uploaded)))
    previous_value = os.environ.get(env_var)
    try:
        import json
        # Sanity check: the key must at least parse as a JSON object.
        with open(keyfile, 'r', encoding="utf-8") as fh:
            key_data = json.load(fh)
        if not isinstance(key_data, dict):
            raise ValueError("uploaded file does not contain a JSON object")

        os.environ[env_var] = keyfile
        print("Set GOOGLE_APPLICATION_CREDENTIALS =", keyfile)

        from google.cloud import vision
        vision_client = vision.ImageAnnotatorClient()
        print("Google Vision client initialized ✅")
        return vision_client
    except Exception as e:
        # Do not leave the environment pointing at a key that did not work.
        if previous_value is None:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = previous_value
        print("Failed to initialize Google Vision client. Error:", e)
        print("Make sure the uploaded file is a Google service account JSON and that the Vision API is enabled for your project.")
        return None

# ---------- OCR / caption functions ----------
# Initialize models (TrOCR + BLIP). These download weights on first run.
# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)
print("Loading TrOCR + BLIP models (may take a minute)...")

# TrOCR: processor plus encoder-decoder model, moved to the chosen device.
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
trocr_model = trocr_model.to(device)

# BLIP: processor plus captioning model, moved to the chosen device.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = blip_model.to(device)
print("TrOCR + BLIP loaded.")

# PaddleOCR instantiate if available
# Build the PaddleOCR engine only when the import succeeded; any failure while
# constructing it switches PaddleOCR off for the rest of the session.
paddle_ocr = None
if PADDLE_AVAILABLE:
    try:
        # use_textline_orientation argument is newer; use default language 'en'
        paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='en')
    except Exception as init_error:
        print("PaddleOCR initialization failed at runtime, disabling it. Error:", init_error)
        paddle_ocr = None
        PADDLE_AVAILABLE = False
    else:
        print("PaddleOCR ready.")

def _paddle_result_texts(res):
    """Yield the recognized strings from a PaddleOCR result in either known layout."""
    for page in res or ():
        if not page:
            continue
        if isinstance(page, dict):
            # Newer layout: one dict-like result per image, texts under "rec_texts".
            for txt in page.get("rec_texts") or ():
                if txt:
                    yield str(txt)
            continue
        # Legacy layout: each entry is [box, (text, score)].
        for entry in page:
            if not entry or len(entry) < 2:
                continue
            rec = entry[1]
            if isinstance(rec, (list, tuple)):
                txt = rec[0] if rec else ""
            else:
                txt = rec
            if txt:
                yield str(txt)


def run_paddle_ocr(image_path):
    """
    Run PaddleOCR on the image at image_path.

    The legacy call ``ocr(path, det=..., rec=..., cls=...)`` is tried first.
    If the installed PaddleOCR rejects those keywords with a TypeError and the
    engine exposes ``predict``, that method is used instead. Both the legacy
    nested-list result and the dict-style result (``rec_texts``) are parsed.

    Returns the recognized texts joined by single spaces, or '' when PaddleOCR
    is unavailable, fails, or finds nothing.
    """
    if not PADDLE_AVAILABLE or paddle_ocr is None:
        return ""
    try:
        try:
            res = paddle_ocr.ocr(image_path, det=True, rec=True, cls=True)
        except TypeError:
            # Newer releases forward ocr() to predict(), which does not accept
            # det/rec/cls; retry through predict() when it exists.
            if not hasattr(paddle_ocr, "predict"):
                raise
            res = paddle_ocr.predict(image_path)
        # Parsing stays inside the try so an unexpected result shape is
        # reported the same way as a runtime failure instead of crashing.
        extracted = list(_paddle_result_texts(res))
    except Exception as e:
        print("PaddleOCR runtime error:", e)
        return ""
    return " ".join(extracted).strip()

def run_trocr(image_rgb):
    """Run TrOCR on image_rgb (converted via PIL's Image.fromarray) and return the stripped text."""
    pil_image = Image.fromarray(image_rgb)
    encoded = trocr_processor(images=pil_image, return_tensors="pt")
    encoded = encoded.to(device)
    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        token_ids = trocr_model.generate(**encoded, max_length=256)
    decoded = trocr_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()

def run_blip_caption(image_rgb):
    """Generate a BLIP caption for image_rgb (converted via PIL's Image.fromarray) and return it stripped."""
    pil_image = Image.fromarray(image_rgb)
    encoded = blip_processor(images=pil_image, return_tensors="pt")
    encoded = encoded.to(device)
    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated = blip_model.generate(**encoded, max_length=64)
    first_sequence = generated[0]
    return blip_processor.decode(first_sequence, skip_special_tokens=True).strip()

def run_google_vision_ocr(vision_client, image_path):
    """
    Send the image file to Google Vision text detection.

    vision_client: initialized google.cloud.vision.ImageAnnotatorClient, or None
    image_path: path of the image file to read and submit

    Returns the description of the first text annotation, stripped, or ''
    when there is no client, the API reports an error, nothing is detected,
    or any exception occurs (the exception is printed, not raised).
    """
    if vision_client is None:
        return ""
    try:
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        from google.cloud import vision
        request_image = vision.Image(content=raw_bytes)
        response = vision_client.text_detection(image=request_image)

        api_error = getattr(response, "error", None)
        if api_error and getattr(response.error, "message", None):
            print("Google Vision API error:", response.error.message)
            return ""

        annotations = response.text_annotations
        if annotations:
            # The first annotation's description is the full text block.
            return annotations[0].description.strip()
        return ""
    except Exception as err:
        print("Google Vision OCR error:", err)
        return ""

# ---------- TTS helper (gTTS used here) ----------
def text_to_speech_gtts(text, lang="en", out_path=None):
    """
    Synthesize text with gTTS and save it as an MP3.

    When out_path is None a millisecond-timestamped file name under /content
    is used. Returns the path written, or None if gTTS raised (the error is
    printed).
    """
    target = out_path
    if target is None:
        target = f"/content/tts_{int(time.time()*1000)}.mp3"
    try:
        gTTS(text=text, lang=lang).save(target)
    except Exception as err:
        print("gTTS error:", err)
        return None
    return target

# ---------- I/O and pipeline ----------
def save_dataurl_to_file(data_url, out_path="captured_image.jpg"):
    """Decode the base64 payload after the first comma of data_url, write it to out_path, and return out_path."""
    _prefix, payload = data_url.split(",", 1)
    with open(out_path, "wb") as sink:
        sink.write(base64.b64decode(payload))
    return out_path

def read_image_rgb(path):
    """Load the image at path with OpenCV and return it converted from BGR to RGB.

    Raises FileNotFoundError(path) when cv2.imread returns None.
    """
    bgr = cv2.imread(path)
    if bgr is not None:
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    raise FileNotFoundError(path)

# High-level pipeline with Google Vision integration.
# Set prefer_google_vision=True to try Vision first; otherwise Vision is used as fallback.
def process_image_with_vision_support(image_path, vision_client=None, prefer_google_vision=False, tts_lang="en"):
    """
    Extract text (or, failing that, a caption) from an image and speak it.

    image_path: path to image
    vision_client: google vision client or None
    prefer_google_vision: If True try Google Vision first; else use it as fallback after Paddle/TrOCR
    tts_lang: language code for gTTS (e.g., 'en', 'hi')

    Stages are tried in order until one yields non-empty text:
    Google Vision (only when preferred), PaddleOCR, TrOCR, Google Vision as a
    fallback (skipped when it was already queried for this image), and
    finally a BLIP caption.

    Returns a dict with keys "text", "source" and "audio_path". "source" is
    one of "google_vision", "paddle_ocr", "trocr", "blip_caption", or "none"
    when nothing was produced. "audio_path" is None when there is no text or
    speech synthesis failed.

    Raises FileNotFoundError (from read_image_rgb) when the image cannot be read.
    """
    image_rgb = read_image_rgb(image_path)
    final_text = ""
    source = None
    # Remember whether Vision was already asked, so an empty answer is not
    # requested (and billed) a second time for the same image.
    vision_tried = False

    # Option A: Try Vision first if requested
    if prefer_google_vision and vision_client:
        vision_tried = True
        gv = run_google_vision_ocr(vision_client, image_path)
        if gv and gv.strip():
            final_text = gv
            source = "google_vision"
            print("Google Vision OCR result (snippet):", final_text[:300])

    # Option B: Try PaddleOCR (if available)
    if not final_text:
        paddle_text = run_paddle_ocr(image_path)
        if paddle_text and paddle_text.strip():
            final_text = paddle_text
            source = "paddle_ocr"
            print("PaddleOCR result (snippet):", final_text[:300])

    # Option C: TrOCR (handwriting) if still empty
    if not final_text:
        try:
            tro = run_trocr(image_rgb)
            if tro and tro.strip():
                final_text = tro
                source = "trocr"
                print("TrOCR result (snippet):", final_text[:300])
        except Exception as e:
            print("TrOCR error:", e)

    # Option D: Google Vision as fallback (only if it was not already queried)
    if not final_text and vision_client and not vision_tried:
        gv = run_google_vision_ocr(vision_client, image_path)
        if gv and gv.strip():
            final_text = gv
            source = "google_vision"
            print("Google Vision OCR result (snippet):", final_text[:300])

    # Option E: BLIP caption fallback
    if not final_text:
        try:
            caption = run_blip_caption(image_rgb)
        except Exception as e:
            print("BLIP error:", e)
            caption = ""
        if caption:
            final_text = caption
            source = "blip_caption"
            print("BLIP caption (fallback):", caption)
        else:
            # Nothing usable from any stage; do not credit BLIP for an empty caption.
            source = "none"

    if not final_text:
        return {"text": "", "source": source, "audio_path": None}

    # Use gTTS to produce audio (you can replace with Cloud TTS later)
    audio_path = text_to_speech_gtts(final_text, lang=tts_lang)
    return {"text": final_text, "source": source, "audio_path": audio_path}

# ---------- UI helpers (camera / upload) ----------
def choose_input_method():
    """Print the input-source menu and return the user's answer with surrounding whitespace removed."""
    menu_lines = (
        "\n📸 Choose an option:",
        "1) Use Phone Camera (in-browser)",
        "2) Upload Image Manually",
    )
    for menu_line in menu_lines:
        print(menu_line)
    return input("Enter 1 or 2: ").strip()

def capture_image_from_phone_js(timeout=60):
    """
    Show a capture button, wait for the browser to deliver a frame, and save it.

    timeout: seconds to wait for the capture (int or float).

    Returns the path written by save_dataurl_to_file.
    Raises TimeoutError when no image arrives within `timeout` seconds.
    """
    # Discard any data URL left over from an earlier click (for example one
    # that arrived after a previous call had already timed out) so that only
    # an image captured for this call is returned.
    _image_data_holder["data"] = None

    display(HTML("<script>{}</script>".format(camera_js)))
    display(HTML("<button onclick='captureAndSend()'>Capture from Camera</button>"))
    print("Click the 'Capture from Camera' button; allow camera permission. Waiting up to {}s...".format(timeout))

    # NOTE(review): this relies on the 'notebook.set_image' callback being
    # delivered while this cell is still polling — confirm that holds in the
    # target Colab runtime.
    deadline = time.monotonic() + timeout
    while True:
        data_url = _image_data_holder["data"]
        if data_url:
            _image_data_holder["data"] = None
            out = save_dataurl_to_file(data_url)
            print("Saved to:", out)
            return out
        if time.monotonic() >= deadline:
            break
        time.sleep(0.5)
    raise TimeoutError("Timed out waiting for camera capture.")

def upload_image_manually():
    """Open the Colab upload widget and return the first uploaded file name, or None if nothing was uploaded."""
    uploaded = files.upload()
    first_name = next(iter(uploaded.keys()), None)
    if first_name is not None:
        print("Uploaded:", first_name)
    return first_name

# ---------- Run loop example ----------
# 1) Initialize Google Vision client (upload key when prompted). If you don't want Vision, set vision_client=None.
vision_client = None
enable_vision = input("Do you want to use Google Vision API? (y/N): ").strip().lower() == 'y'
if enable_vision:
    vision_client = upload_and_init_google_clients()
    if vision_client is None:
        print("Google Vision client not available; continuing without it.")

# 2) Processing loop: one image per iteration until the user declines to continue.
while True:
    choice = choose_input_method()
    if choice == '1':
        try:
            image_path = capture_image_from_phone_js()
        except Exception as e:
            print("Camera capture failed:", e)
            image_path = None
    elif choice == '2':
        image_path = upload_image_manually()
    else:
        print("Invalid choice")
        continue

    if not image_path:
        print("No image provided; try again.")
        continue

    # The ordering question only matters when a Vision client exists.
    pref = False
    if vision_client is not None:
        pref = input("Prefer Google Vision first when available? (y/N): ").strip().lower() == 'y'
    tts_lang = input("Enter TTS language code for gTTS (e.g., 'en' or 'hi') or press Enter for 'en': ").strip() or "en"
    print("Processing image...")
    try:
        out = process_image_with_vision_support(image_path, vision_client=vision_client, prefer_google_vision=pref, tts_lang=tts_lang)
    except Exception as e:
        # An unreadable upload (e.g. not an image) should not end the session.
        print("Processing failed:", e)
        out = None

    if out is not None:
        print("\n== Result ==")
        print("Source:", out["source"])
        print("Text (first 500 chars):", out["text"][:500] if out["text"] else "<no text>")
        if out["audio_path"]:
            print("Playing audio...")
            display(Audio(out["audio_path"], autoplay=True))
        else:
            print("No audio generated.")

    again = input("Process another image? (y/N): ").strip().lower() == 'y'
    if not again:
        break

print("Done.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.9/527.9 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.7/68.7 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

TrOCR + BLIP loaded.
PaddleOCR initialization failed at runtime, disabling it. Error: No module named 'paddle'
Do you want to use Google Vision API? (y/N): y
Please upload your Google service account JSON key file (needed for Vision API).


Saving capable-reserve-455210-r4-18a8195a9e1c.json to capable-reserve-455210-r4-18a8195a9e1c.json
Set GOOGLE_APPLICATION_CREDENTIALS = capable-reserve-455210-r4-18a8195a9e1c.json
Google Vision client initialized ✅

📸 Choose an option:
1) Use Phone Camera (in-browser)
2) Upload Image Manually
Enter 1 or 2: 2


Saving handwritten text.jpg to handwritten text.jpg
Uploaded: handwritten text.jpg
Prefer Google Vision first when available? (y/N): y
Enter TTS language code for gTTS (e.g., 'en' or 'hi') or press Enter for 'en': en
Processing image...
Google Vision OCR result (snippet): We Start With Good
Because all businesses should
be
doing something good.

== Result ==
Source: google_vision
Text (first 500 chars): We Start With Good
Because all businesses should
be
doing something good.
Playing audio...


Process another image? (y/N): y

📸 Choose an option:
1) Use Phone Camera (in-browser)
2) Upload Image Manually
Enter 1 or 2: 2


Saving sample handwritten text2.jpg to sample handwritten text2.jpg
Uploaded: sample handwritten text2.jpg
Prefer Google Vision first when available? (y/N): y
Enter TTS language code for gTTS (e.g., 'en' or 'hi') or press Enter for 'en': en
Processing image...
Google Vision OCR result (snippet): Hello,
Simply Noted has developed
incredible proprietary robotic
technology to write your message
and envelopes with
germine
It is completely
indistinguishable from a humans
real pen
handwriting
Try us
Simply Noted
today!

== Result ==
Source: google_vision
Text (first 500 chars): Hello,
Simply Noted has developed
incredible proprietary robotic
technology to write your message
and envelopes with
germine
It is completely
indistinguishable from a humans
real pen
handwriting
Try us
Simply Noted
today!
Playing audio...


Process another image? (y/N): y

📸 Choose an option:
1) Use Phone Camera (in-browser)
2) Upload Image Manually
Enter 1 or 2: 2


Saving sample handwritten3.jpg to sample handwritten3.jpg
Uploaded: sample handwritten3.jpg
Prefer Google Vision first when available? (y/N): y
Enter TTS language code for gTTS (e.g., 'en' or 'hi') or press Enter for 'en': hi
Processing image...
Google Vision OCR result (snippet): कक्षाकार्य)
रा
क्रिया
कर्म के अनुसार क्रिया के भेद -
अकर्मक :- वह क्रिया, जिसमें क्रिया के काम का प्रभाव
पड़ता है।
कर्त्ता पर ही
जैसे- खिलाड़ी दौड़ रहे हैं।
मोहन सोया है।
सकर्मक:- वह क्रिया,
जिसमें क्रिया के काम का प्रभाव
कर्त्ता पर पड़कर कर्म पर पड़े, वह सकर्मक क्रिया होती
है। इसके प्रयोग में कर्म की आ

== Result ==
Source: google_vision
Text (first 500 chars): कक्षाकार्य)
रा
क्रिया
कर्म के अनुसार क्रिया के भेद -
अकर्मक :- वह क्रिया, जिसमें क्रिया के काम का प्रभाव
पड़ता है।
कर्त्ता पर ही
जैसे- खिलाड़ी दौड़ रहे हैं।
मोहन सोया है।
सकर्मक:- वह क्रिया,
जिसमें क्रिया के काम का प्रभाव
कर्त्ता पर पड़कर कर्म पर पड़े, वह सकर्मक क्रिया होती
है। इसके प्रयोग में कर्म की आवश्यकता होती है। यह
कर्म के बिना अपना भाव पुरी तरह प्रकट 

Process another image? (y/N): n
Done.
