In [None]:
!pip install tensorflow opencv-python



In [None]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
import cv2
import numpy as np
import base64

def capture_frame():
    """
    Captures a single frame from the webcam using JavaScript.

    The JavaScript snippet requests a video stream with ``getUserMedia``,
    plays it in a detached ``<video>`` element, draws the current frame onto
    a canvas sized to the video, and serialises the canvas as a JPEG
    (quality 0.8) data URL.

    Returns:
        Whatever ``eval_js`` yields for the snippet, i.e. the result of
        ``canvas.toDataURL('image/jpeg', 0.8)``.
    """
    js = """
        async function captureFrame() {
            const stream = await navigator.mediaDevices.getUserMedia({ video: true });
            try {
                const video = document.createElement('video');
                video.srcObject = stream;
                await video.play();

                // Capture the frame
                const canvas = document.createElement('canvas');
                canvas.width = video.videoWidth;
                canvas.height = video.videoHeight;
                canvas.getContext('2d').drawImage(video, 0, 0);
                return canvas.toDataURL('image/jpeg', 0.8);
            } finally {
                // Always release the camera, even when playback or drawing fails.
                stream.getTracks().forEach(track => track.stop());
            }
        }
        captureFrame();
    """
    return eval_js(js)


In [None]:
def decode_frame(frame_data):
    """
    Decodes a Base64-encoded frame into an OpenCV image.

    Args:
        frame_data: Either a data URL (``<header>,<base64 payload>``) or the
            bare Base64 payload without any header.

    Returns:
        The image returned by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Raises:
        ValueError: If the payload is empty or cannot be decoded as an image.
    """
    # Everything after the first comma is the payload; a string without a
    # comma is treated as the payload itself instead of raising IndexError.
    header, separator, payload = frame_data.partition(',')
    if not separator:
        payload = header

    frame_bytes = base64.b64decode(payload)
    if not frame_bytes:
        # e.g. the capture produced an empty canvas; fail here with a clear
        # message instead of handing an empty buffer to OpenCV.
        raise ValueError("frame_data contains no image bytes")

    np_frame = np.frombuffer(frame_bytes, dtype=np.uint8)
    frame = cv2.imdecode(np_frame, cv2.IMREAD_COLOR)
    if frame is None:
        # imdecode signals failure by returning None; surface it immediately
        # rather than letting a later cv2 call crash on a None image.
        raise ValueError("frame_data could not be decoded as an image")
    return frame


In [None]:
def preprocess_image(frame, feature_extractor_model):
    """
    Preprocesses a single frame and extracts features using the feature extractor model.

    Args:
        frame: Image array accepted by ``cv2.resize``.
        feature_extractor_model: Object exposing a Keras-style
            ``predict(batch, verbose=...)`` method.

    Returns:
        The result of ``feature_extractor_model.predict`` on a batch holding
        the single resized, scaled frame.
    """
    # NOTE(review): frames decoded by OpenCV are presumably in BGR channel
    # order; confirm that this, and the plain /255 scaling, match how images
    # were prepared when the caption model was trained.
    frame_resized = cv2.resize(frame, (224, 224)) / 255.0  # Resize and scale to [0, 1]
    frame_expanded = np.expand_dims(frame_resized, axis=0)  # Add batch dimension
    # verbose=0 keeps the per-frame progress bar out of the cell output,
    # consistent with the predict call in predict_caption.
    return feature_extractor_model.predict(frame_expanded, verbose=0)

def generate_caption_from_frame(frame, feature_extractor, caption_model, tokenizer, max_length):
    """
    Produce a caption for one frame.

    The frame is first turned into image features by ``preprocess_image``
    using ``feature_extractor``; those features are then passed, together
    with ``caption_model``, ``tokenizer`` and ``max_length``, to
    ``predict_caption``, whose result is returned unchanged.
    """
    return predict_caption(
        caption_model,
        preprocess_image(frame, feature_extractor),
        tokenizer,
        max_length,
    )


In [None]:
import json
# Load tokenizer from JSON
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Read the file as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open('tokenizer.json', 'r', encoding='utf-8') as f:
    tokenizer_json = json.load(f)
# NOTE(review): the parsed value is handed to tokenizer_from_json as-is; this
# presumably relies on the file storing the tokenizer config as a JSON-encoded
# string (so json.load returns a str) -- confirm against how it was saved.
loaded_tokenizer = tokenizer_from_json(tokenizer_json)

In [None]:
def predict_caption(model, image_features, tokenizer, max_length):
    """Generate caption for a given image using the trained model.

    Decoding is greedy: the model is called once per step with the image
    features and the tokens chosen so far, and the highest-scoring token at
    the current position is taken as the next word.

    Args:
        model: Object with ``predict([image_features, input_seq], verbose=0)``
            whose result is indexed as ``[0, position]`` to obtain the scores
            for that position.
        image_features: Passed through to ``model.predict`` unchanged.
        tokenizer: Object with ``texts_to_sequences`` and an ``index_word``
            mapping from token id to word.
        max_length: Length of the token buffer fed to the model and the
            maximum number of words in the caption.

    Returns:
        The generated words joined by single spaces. Decoding stops at
        ``'<end>'``, at an id missing from ``index_word`` once at least one
        word has been produced, or after ``max_length`` words.
    """
    start_ids = tokenizer.texts_to_sequences(['<start>'])[0]

    # Zero-padded (1, max_length) int32 buffer with the start token(s) at the
    # front. Built directly with numpy (post-padding, keeping the last
    # max_length ids if the start sequence were longer) so the function does
    # not depend on pad_sequences, which the notebook never imports.
    input_seq = np.zeros((1, max_length), dtype=np.int32)
    kept = min(len(start_ids), max_length)
    if kept:
        input_seq[0, :kept] = start_ids[len(start_ids) - kept:]

    result_caption = []
    for _ in range(max_length):
        position = len(result_caption)
        predictions = model.predict([image_features, input_seq], verbose=0)
        predicted_id = int(np.argmax(predictions[0, position]))
        word = tokenizer.index_word.get(predicted_id, "<unk>")

        # Stop if <end> token is generated
        # NOTE(review): this only fires if index_word contains the literal
        # '<end>'; confirm the tokenizer was fit without stripping '<' / '>'.
        if word == '<end>':
            break

        # Stop on an unknown id once some words exist; an unknown first word
        # is kept and decoding continues.
        if word == "<unk>" and result_caption:
            break

        result_caption.append(word)

        # Feed the chosen token back as input for the next step. The last
        # iteration has no next slot: writing there used to raise IndexError
        # whenever the model never produced <end>.
        next_position = position + 1
        if next_position < max_length:
            input_seq[0, next_position] = predicted_id

    # Return the caption, omitting the <start> token
    return ' '.join(result_caption)

In [None]:
from tensorflow.keras.applications import ResNet50
# load_model is used below; without this import the cell fails on a fresh kernel.
from tensorflow.keras.models import load_model

# Initialize the feature extractor model (e.g., ResNet50)
feature_extractor = ResNet50(weights="imagenet", include_top=False, pooling="avg")

# Load your trained image captioning model
caption_model = load_model('/content/caption_model.keras')

# Initialize the tokenizer and max_length (as used during training)
tokenizer = loaded_tokenizer  # Tokenizer loaded from tokenizer.json in an earlier cell
max_length = 40  # Set this to the max sequence length used during training

# Loop to capture and process frames until the cell is interrupted
while True:
    try:
        # Capture a frame
        frame_data = capture_frame()
        frame = decode_frame(frame_data)

        # Generate a caption
        caption = generate_caption_from_frame(frame, feature_extractor, caption_model, tokenizer, max_length)

        # Print the caption (the frame itself is not displayed)
        print(f"Caption: {caption}")
    except KeyboardInterrupt:
        # Interrupting the cell is the intended way to leave the loop.
        print("Stopping frame capture.")
        break


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Caption: people in a red shirt and black and white and white
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step
Caption: people in a red shirt and black and white and white
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 346ms/step
Caption: people in a red shirt and black and white and white
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step
Caption: people in a red shirt and black and white and white
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
Caption: people in a red shirt and black and white and white
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step
Caption: people in a red shirt and black and white and white
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203ms/step
Caption: people in a red shirt and black and white and white
Stopping frame capture.
