In [None]:
# Step 1: Install Dependencies (if not installed)
!pip install paddlepaddle paddleocr transformers google-cloud-vision torch torchaudio librosa numpy opencv-python matplotlib gtts

# Step 2: Import Required Libraries
import cv2
import numpy as np
import torch
import librosa
import torchaudio
from paddleocr import PaddleOCR
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, BlipProcessor, BlipForConditionalGeneration
from google.cloud import vision
from gtts import gTTS
from IPython.display import Audio, display, HTML
import matplotlib.pyplot as plt
from google.colab import files
from base64 import b64decode

# Step 3: Allow User to Choose Input Method
def choose_input_method():
    """Print the input-source menu and return the user's raw answer.

    Returns:
        str: whatever the user typed at the prompt, unvalidated; the caller
        decides what counts as a valid choice.
    """
    menu_lines = (
        "\n📸 Choose an option:",
        "1️⃣ Use Phone Camera",
        "2️⃣ Upload Image Manually",
    )
    for menu_line in menu_lines:
        print(menu_line)
    return input("Enter your choice (1 or 2): ")

# Capture image from phone camera
def capture_image_from_phone():
    """Grab one frame from the browser camera and return it as a data URL.

    A JavaScript helper named ``capture`` is registered in the cell output
    and then awaited from Python through ``eval_js``.

    Returns:
        str: the result of ``canvas.toDataURL('image/jpeg')`` produced by the
        helper; the caller splits it on ``','`` and base64-decodes the part
        after the comma.
    """
    # Imported here because the file's top-level imports do not bring these
    # names into scope; without eval_js the final call raises NameError.
    from google.colab.output import eval_js
    from IPython.display import Javascript

    # NOTE(review): nothing in this function renders a QR code; the message is
    # kept as-is because it is user-facing text.
    print("📷 Scan the QR code below to connect your phone camera.")

    # Only *define* the helper here. It is invoked exactly once, by eval_js
    # below, so the camera is opened a single time and the captured frame is
    # actually delivered back to Python.
    display(Javascript('''
    async function capture() {
        const video = document.createElement('video');
        video.style.display = 'none';
        document.body.appendChild(video);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        video.srcObject = stream;
        await video.play();
        const canvas = document.createElement('canvas');
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
        canvas.getContext('2d').drawImage(video, 0, 0);
        stream.getVideoTracks()[0].stop();
        document.body.removeChild(video);
        return canvas.toDataURL('image/jpeg');
    }
    '''))
    return eval_js('capture()')

# Upload image manually
def upload_image_manually():
    """Open the Colab upload dialog and return the first uploaded file name.

    Returns:
        The first key of the mapping returned by ``files.upload()``, or
        ``None`` when that mapping is empty.
    """
    uploaded = files.upload()
    # Only the first entry is used, even if several files were selected.
    return next(iter(uploaded.keys()), None)

# Main function to get image
def get_image():
    """Ask for an input method until a valid one is given, then fetch an image.

    Returns:
        ``'captured_image.jpg'`` after a camera capture has been written to
        disk, or whatever ``upload_image_manually()`` returns for an upload.
    """
    selection = choose_input_method()
    while selection not in ('1', '2'):
        print("❌ Invalid choice. Please enter 1 or 2.")
        selection = choose_input_method()

    if selection == '2':
        return upload_image_manually()

    # Camera path: the capture comes back as "<header>,<base64 payload>".
    data_url = capture_image_from_phone()
    jpeg_bytes = b64decode(data_url.split(',')[1])
    with open('captured_image.jpg', 'wb') as out_file:
        out_file.write(jpeg_bytes)
    return 'captured_image.jpg'

# Step 4: Load OCR and AI Models
# OCR engine used by extract_text(). use_angle_cls=True loads the text-angle
# classifier so that the cls=True argument passed to paddle_ocr.ocr() has a
# classifier to run instead of being skipped.
# NOTE(review): no lang is given, so the library default models are used;
# consider lang="en" if only English input is expected — confirm before changing.
paddle_ocr = PaddleOCR(use_angle_cls=True)

# Handwriting recognition model from the Hugging Face hub.
# NOTE(review): trocr_processor / trocr_model are not referenced by any other
# code visible in this file; they are kept in case later cells rely on them.
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Image-captioning model used by generate_description() when OCR finds no text.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Extract text using PaddleOCR
def extract_text(image_path):
    """Run PaddleOCR on an image and return the recognised text as one string.

    Args:
        image_path: path of the image file handed to ``paddle_ocr.ocr``.

    Returns:
        str: the recognised text fragments joined by single spaces, with
        leading/trailing whitespace removed; ``""`` when nothing usable was
        recognised.
    """
    result = paddle_ocr.ocr(image_path, cls=True)
    if not isinstance(result, (list, tuple)):
        return ""

    fragments = []
    for page in result:
        # A page entry may be None or otherwise malformed. Skip just that
        # page rather than throwing away the text found on every other page.
        if not isinstance(page, (list, tuple)):
            continue
        for word_info in page:
            # Each usable entry carries its recognition result at index 1,
            # with the text as that result's first element.
            if not isinstance(word_info, (list, tuple)) or len(word_info) < 2:
                continue
            recognition = word_info[1]
            if (
                isinstance(recognition, (list, tuple))
                and recognition
                and isinstance(recognition[0], str)
            ):
                fragments.append(recognition[0])

    return " ".join(fragments).strip()

# Generate AI-based image description using BLIP
def generate_description(image_path):
    """Produce a caption for an image with the BLIP captioning model.

    Args:
        image_path: path of the image file to describe.

    Returns:
        str: the decoded caption with special tokens stripped.

    Raises:
        ValueError: if OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; check
    # it here so the error names the file rather than failing inside cvtColor.
    if image is None:
        raise ValueError(f"Could not read image file: {image_path!r}")

    # OpenCV loads pixels in BGR order; convert to RGB before captioning.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = blip_processor(image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    return blip_processor.decode(out[0], skip_special_tokens=True)

# Convert text to speech
def text_to_speech(text, output_path="output.mp3"):
    """Synthesise English speech for ``text`` with gTTS and save it.

    Args:
        text: the text to speak.
        output_path: where the audio file is written.

    Returns:
        The ``output_path`` that was passed in.
    """
    gTTS(text=text, lang="en").save(output_path)
    return output_path

# Process image (OCR + AI Description)
def process_image(image_path):
    """Turn an image into speech: OCR text if any, otherwise an AI caption.

    Args:
        image_path: path of the image to process.

    Returns:
        The audio file path returned by ``text_to_speech``.
    """
    spoken_text = extract_text(image_path)

    if spoken_text:
        print(f"✅ Text detected: {spoken_text}")
    else:
        # No OCR text: fall back to describing the picture itself.
        print("⚠️ No text detected! Switching to AI image description...")
        spoken_text = generate_description(image_path)
        print(f"🖼️ Image Description: {spoken_text}")

    return text_to_speech(spoken_text)

# Step 5: Run the Pipeline (Loop Until User Exits)
# Interactive driver: fetch an image, speak its content, and repeat until the
# user answers anything other than "yes"/"y".
while True:
    image_path = get_image()

    if not image_path:
        print("❌ No image uploaded! Please try again.")
    else:
        print("\n🔄 Processing your image...")
        audio_file = process_image(image_path)

        if not audio_file:
            print("❌ No valid output generated.")
        else:
            # Play the generated speech inline in the notebook output.
            display(Audio(audio_file, autoplay=True))

    # Ask the user whether to go around again.
    another = input("\n📸 Do you want to upload another image? (yes/no): ").strip().lower()

    if another in ("yes", "y"):
        continue

    print("👋 Exiting. Have a great day!")
    break


[2025/04/21 06:10:21] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_l

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod


📸 Choose an option:
1️⃣ Use Phone Camera
2️⃣ Upload Image Manually
Enter your choice (1 or 2): 2


Saving sample 1.png to sample 1.png

🔄 Processing your image...
[2025/04/21 06:10:46] ppocr DEBUG: dt_boxes num : 5, elapsed : 0.09505653381347656
[2025/04/21 06:10:47] ppocr DEBUG: rec_res num  : 5, elapsed : 0.6207537651062012
✅ Text detected: Itwasthebestof times,it was the worst of times,it was the age ofwisdom,itwasthe age offoolishness...



📸 Do you want to upload another image? (yes/no): yes

📸 Choose an option:
1️⃣ Use Phone Camera
2️⃣ Upload Image Manually
Enter your choice (1 or 2): 2


Saving description.png to description.png

🔄 Processing your image...
[2025/04/21 06:12:11] ppocr DEBUG: dt_boxes num : 0, elapsed : 0.06845331192016602
[2025/04/21 06:12:11] ppocr DEBUG: rec_res num  : 0, elapsed : 1.9073486328125e-06
⚠️ No text detected! Switching to AI image description...
🖼️ Image Description: a boy riding a bike



📸 Do you want to upload another image? (yes/no): yes

📸 Choose an option:
1️⃣ Use Phone Camera
2️⃣ Upload Image Manually
Enter your choice (1 or 2): 2


Saving handwritten sample 2.jpg to handwritten sample 2 (1).jpg

🔄 Processing your image...
[2025/04/21 06:12:53] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.0736842155456543
[2025/04/21 06:12:53] ppocr DEBUG: rec_res num  : 3, elapsed : 0.2933847904205322
✅ Text detected: Thisisahandwritten example Write as good asyou can.



📸 Do you want to upload another image? (yes/no): yes

📸 Choose an option:
1️⃣ Use Phone Camera
2️⃣ Upload Image Manually
Enter your choice (1 or 2): 2


Saving traffic signal.jpg to traffic signal.jpg

🔄 Processing your image...
[2025/04/21 06:18:16] ppocr DEBUG: dt_boxes num : 0, elapsed : 0.0863957405090332
[2025/04/21 06:18:16] ppocr DEBUG: rec_res num  : 0, elapsed : 2.1457672119140625e-06
⚠️ No text detected! Switching to AI image description...
🖼️ Image Description: a traffic light



📸 Do you want to upload another image? (yes/no): no
👋 Exiting. Have a great day!
