In [None]:
!pip install tensorflow keras numpy matplotlib pillow opencv-python
!pip install torch torchvision torchaudio
!pip install nltk spacy
!pip install pycocotools




In [None]:
import os
import urllib.request
import zipfile

# Create the dataset folder; exist_ok makes this safe to re-run and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs("coco_dataset", exist_ok=True)

# MS COCO 2017 validation data. Note this is the complete val2017 image
# archive (not a reduced subset) plus the train/val annotation bundle, which
# is where annotations/captions_val2017.json comes from.
url_images = "http://images.cocodataset.org/zips/val2017.zip"
url_annotations = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"

def download_and_extract(url, extract_to):
    """Download the zip archive at ``url`` (unless already present) and unpack it.

    The archive is stored in the current working directory under the last
    path component of ``url``. If a file with that name already exists it is
    reused instead of being downloaded again.

    Args:
        url: Location of a zip archive; anything ``urllib.request`` can open.
        extract_to: Directory the archive members are extracted into.

    Raises:
        urllib.error.URLError: If the download fails.
        zipfile.BadZipFile: If the local file is not a valid zip archive.
    """
    file_name = url.split("/")[-1]
    if not os.path.exists(file_name):
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a complete archive.
        partial_name = file_name + ".part"
        try:
            urllib.request.urlretrieve(url, partial_name)
            os.replace(partial_name, file_name)
        finally:
            if os.path.exists(partial_name):
                os.remove(partial_name)
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Fetch and unpack both archives (images first, then annotations)
for archive_url in (url_images, url_annotations):
    download_and_extract(archive_url, "coco_dataset")

print("‚úÖ Dataset downloaded & extracted!")


‚úÖ Dataset downloaded & extracted!


In [None]:
import json
import numpy as np
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image

# --- Load COCO annotations ---
ann_file = "coco_dataset/annotations/captions_val2017.json"
# Read with an explicit UTF-8 encoding so the result does not depend on the
# platform's default text encoding.
with open(ann_file, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

# Map each image id to the list of its lower-cased captions
image_captions = {}
for ann in annotations['annotations']:
    image_captions.setdefault(ann['image_id'], []).append(ann['caption'].lower())

print(f"‚úÖ Total images with captions: {len(image_captions)}")

# --- Tokenizer for captions ---
# Flatten the per-image caption lists into a single corpus (order preserved).
all_captions = [cap for caps in image_captions.values() for cap in caps]

tokenizer = Tokenizer(num_words=5000, oov_token="<unk>")
tokenizer.fit_on_texts(all_captions)

# NOTE(review): word_index appears to contain every word seen during fitting
# (the recorded run reports 7268 here despite num_words=5000), so this is the
# size of the full index, not of the capped vocabulary — confirm which one
# downstream layers are expected to use.
vocab_size = len(tokenizer.word_index) + 1
print(f"‚úÖ Vocab size: {vocab_size}")

# Longest caption measured in whitespace-separated words; default=0 keeps an
# empty corpus from raising ValueError.
# NOTE(review): this is a whitespace split, which may not match the
# tokenizer's own splitting/filtering — verify before using it for padding.
max_length = max((len(c.split()) for c in all_captions), default=0)
print(f"‚úÖ Max caption length: {max_length}")


‚úÖ Total images with captions: 5000
‚úÖ Vocab size: 7268
‚úÖ Max caption length: 50


In [None]:
# Ensure eager execution is enabled before any model is built.
# NOTE(review): TensorFlow 2.x is understood to run eagerly by default, in
# which case this call is redundant — confirm against the installed version.
tf.compat.v1.enable_eager_execution()

# Re-define the ResNet50 feature extractor.
# (These three imports repeat ones already made in the previous cell.)
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

# ImageNet-weighted ResNet50; the recorded run shows the weights being
# downloaded when this line executes.
base_model = ResNet50(weights='imagenet')
# Expose the output of the second-to-last layer instead of the final layer,
# so the model yields a feature vector (the smoke test reports (1, 2048)).
resnet_model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

def extract_features(img_path):
    """Return the ResNet50 feature array for the image stored at ``img_path``.

    The image is loaded at 224x224, turned into a one-element batch, passed
    through ``preprocess_input`` and then through ``resnet_model`` in
    inference mode. The result is converted to a NumPy array; for the sample
    image in this notebook its shape is (1, 2048).
    """
    pil_img = image.load_img(img_path, target_size=(224, 224))
    batch = preprocess_input(np.expand_dims(image.img_to_array(pil_img), axis=0))
    # Calling the model directly returns an eager tensor; .numpy() unwraps it.
    return resnet_model(batch, training=False).numpy()

# Smoke test: run the extractor on one validation image and print the shape
# of the returned feature array (the recorded run printed (1, 2048)).
sample_img_path = "coco_dataset/val2017/000000397133.jpg"
feature_test = extract_features(sample_img_path)
print("‚úÖ Feature shape:", feature_test.shape)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5
[1m102967424/102967424[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 0us/step
‚úÖ Feature shape: (1, 2048)


In [None]:
!pip install tensorflow keras numpy matplotlib pillow opencv-python
!pip install nltk
!pip install pycocotools

import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import matplotlib.pyplot as plt
from PIL import Image
import urllib.request, zipfile, os, json




In [None]:
# Create the dataset folder; exist_ok makes the call idempotent and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs("coco_dataset", exist_ok=True)

# COCO 2017 validation images and the train/val annotation bundle.
url_images = "http://images.cocodataset.org/zips/val2017.zip"
url_annotations = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"

def download_and_extract(url, extract_to):
    """Download the zip archive at ``url`` (unless already present) and unpack it.

    The archive is stored in the current working directory under the last
    path component of ``url``. If a file with that name already exists it is
    reused instead of being downloaded again.

    Args:
        url: Location of a zip archive; anything ``urllib.request`` can open.
        extract_to: Directory the archive members are extracted into.

    Raises:
        urllib.error.URLError: If the download fails.
        zipfile.BadZipFile: If the local file is not a valid zip archive.
    """
    file_name = url.split("/")[-1]
    if not os.path.exists(file_name):
        # Download under a temporary name and rename only on success. Without
        # this, an interrupted transfer leaves a truncated file that the
        # existence check above would treat as a finished download forever.
        partial_name = file_name + ".part"
        try:
            urllib.request.urlretrieve(url, partial_name)
            os.replace(partial_name, file_name)
        finally:
            if os.path.exists(partial_name):
                os.remove(partial_name)
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Fetch (or reuse) and unpack both archives: images first, then annotations
for archive_url in (url_images, url_annotations):
    download_and_extract(archive_url, "coco_dataset")

print("‚úÖ Dataset ready!")


‚úÖ Dataset ready!


In [None]:
# Image encoder: ImageNet-weighted ResNet50 cut at its second-to-last layer,
# so the model outputs that layer's activations rather than the final layer's.
base_model = ResNet50(weights='imagenet')
_feature_layer = base_model.layers[-2]
resnet_model = Model(inputs=base_model.input, outputs=_feature_layer.output)

def extract_features(img_path):
    """Return ``resnet_model``'s prediction for the image stored at ``img_path``.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, run through ``preprocess_input`` and finally through
    ``resnet_model.predict`` with progress output silenced.
    """
    pixels = image.img_to_array(image.load_img(img_path, target_size=(224, 224)))
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    return resnet_model.predict(batch, verbose=0)


In [None]:
!pip install gdown

# Download pre-trained weights & tokenizer
!gdown 1t97qXl4UYZn1pNnOe5i3VVZJ7S3mA7T0   # model_caption.h5
!gdown 1UdJvl0S5OT8Z4m_EHQV3NR1A5-VbAGDT   # tokenizer_caption.pkl

caption_model = load_model("model_caption.h5")
with open("tokenizer_caption.pkl", "rb") as f:
    tokenizer = pickle.load(f)

max_length = 34  # fixed during training
print("‚úÖ Pre-trained captioning model loaded!")


Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1t97qXl4UYZn1pNnOe5i3VVZJ7S3mA7T0

but Gdown can't. Please check connections and permissions.
Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1UdJvl0S5OT8Z4m_EHQV3NR1A5-VbAGDT

but Gdown can't. Please check connections and permissions.


OSError: Unable to synchronously open file (file signature not found)

In [None]:
!pip install transformers sentencepiece





In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
import torch

# Load the BLIP processor and captioning model from the same pre-trained checkpoint
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

print("‚úÖ BLIP Captioning model loaded!")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

‚úÖ BLIP Captioning model loaded!


In [None]:
def generate_caption_blip(image_path):
    """Generate a caption for the image at ``image_path`` with the BLIP model.

    Uses the notebook-level ``processor`` and ``model`` objects.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption string, with special tokens removed.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    # Open inside a context manager so the file handle is released promptly;
    # convert() returns an independent in-memory RGB image that stays valid
    # after the file is closed.
    with Image.open(image_path) as img_file:
        raw_image = img_file.convert("RGB")
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs)
    # generate() returns a batch of token-id sequences; decode the only one.
    return processor.decode(out[0], skip_special_tokens=True)

# Test with a COCO image
import matplotlib.pyplot as plt

test_img = "coco_dataset/val2017/000000397133.jpg"

# Render the image inline with matplotlib. PIL's Image.show() hands the image
# to an external viewer program, which does not display anything inside a
# notebook; the context manager also closes the file handle afterwards.
with Image.open(test_img) as preview:
    plt.imshow(preview)
    plt.axis('off')
    plt.show()

print("üìù Caption:", generate_caption_blip(test_img))


FileNotFoundError: [Errno 2] No such file or directory: 'coco_dataset/val2017/000000397133.jpg'

In [None]:
!pip install torch torchvision torchaudio




In [None]:
import torchvision
import torchvision.transforms as T

# Load pre-trained Mask R-CNN (COCO trained) and put it straight into
# evaluation mode; eval() returns the module itself, so the chained call
# binds the same object.
seg_model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True).eval()

# COCO class labels, indexed by the label id the detection model emits
# (91 entries: background + ids 1..90). COCO leaves some ids unused; those
# slots are filled with 'N/A' so that COCO_INSTANCE_CATEGORY_NAMES[label]
# lines up with the model's label ids. Dropping the placeholders would shift
# every name after 'fire hydrant' onto the wrong id.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]




Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 170M/170M [00:02<00:00, 84.8MB/s]


In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

# Converts a PIL image into the tensor form passed to the segmentation model.
transform = T.Compose([T.ToTensor()])

def get_segmented_image(img_path, threshold=0.5):
    """Run ``seg_model`` on an image and display it with coloured mask overlays.

    Every detection whose score exceeds ``threshold`` has its mask blended
    50/50 with a randomly chosen colour on top of the original pixels. The
    result is shown in an 8x8 matplotlib figure with the axes hidden.

    Args:
        img_path: Path of the image file to segment.
        threshold: Minimum detection score for a mask to be drawn.

    Returns:
        The raw output of ``seg_model`` for this image (a one-element list).
    """
    pil_img = Image.open(img_path).convert("RGB")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        prediction = seg_model([transform(pil_img)])

    detections = prediction[0]
    canvas = np.array(pil_img)

    for mask_tensor, score in zip(detections['masks'], detections['scores']):
        if not score > threshold:
            continue
        # Scale the mask to 0..255 bytes, then keep the pixels above 128.
        mask = mask_tensor[0].mul(255).byte().cpu().numpy()
        selected = mask > 128
        color = np.random.randint(0, 255, (3,), dtype=np.uint8)
        canvas[selected] = canvas[selected] * 0.5 + color * 0.5

    plt.figure(figsize=(8,8))
    plt.imshow(canvas)
    plt.axis('off')
    plt.show()

    return prediction


In [None]:
def caption_and_segment(img_path):
    """Print a BLIP caption for ``img_path``, then display its segmentation overlay.

    Returns None; the prediction returned by ``get_segmented_image`` is not
    passed on to the caller.
    """
    # Caption
    print(f"üìù Caption: {generate_caption_blip(img_path)}")

    # Segmentation (called for its plot; the return value is unused)
    get_segmented_image(img_path)

# Test it
caption_and_segment(test_img)


FileNotFoundError: [Errno 2] No such file or directory: 'coco_dataset/val2017/000000397133.jpg'