In [1]:
# Install required libraries
!pip install transformers torchvision torchaudio imageio[ffmpeg] --quiet

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import imageio
import torch
import os

# Load the BLIP captioning checkpoint once: the processor prepares images and
# decodes generated token ids, the model generates the caption tokens.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
model.eval()  # switch to evaluation mode; the model is only used for inference

# Captioning function
def extract_and_caption(video_path, frame_num=5):
    """Caption a single frame of a video with the BLIP model.

    Args:
        video_path: Path of the video file; it is opened with imageio's
            ffmpeg plugin.
        frame_num: Index of the frame to caption. If the reader raises
            ``IndexError`` for this index, the first frame is used instead.

    Returns:
        The caption decoded from the model output, with special tokens
        removed.
    """
    # The context manager closes the reader even when reading a frame fails,
    # so repeated calls do not accumulate open readers.
    with imageio.get_reader(video_path, 'ffmpeg') as reader:
        try:
            frame = reader.get_data(frame_num)
        except IndexError:
            frame = reader.get_data(0)  # fallback to first frame
    image = Image.fromarray(frame).convert('RGB')

    inputs = processor(image, return_tensors="pt")
    with torch.no_grad():  # generation only; no gradients are needed
        out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)

# Upload videos through the Colab file picker; the keys of the returned
# mapping are used below as the paths of the uploaded files.
from google.colab import files
uploaded = files.upload()

# Caption every uploaded video, echoing progress and collecting
# "<filename> - <caption>" entries for the report file.
captions = []
for filename in uploaded:
    print(f"Processing {filename}...")
    cap = extract_and_caption(filename)
    print(f">> {cap}")
    entry = " - ".join((filename, cap))
    captions.append(entry)

# Save to file, one "<filename> - <caption>" entry per line.
# UTF-8 is requested explicitly so non-ASCII filenames or captions are written
# the same way regardless of the platform's default encoding.
with open("generated_captions.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in captions)

print("\nAll captions saved to generated_captions.txt")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m107.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Saving SoccerPenalty.avi to SoccerPenalty.avi
Saving ApplyEyeMakeup.avi to ApplyEyeMakeup.avi
Saving CuttingInKitchen.avi to CuttingInKitchen.avi
Saving HandStandPushups.avi to HandStandPushups.avi
Saving PizzaTossing.avi to PizzaTossing.avi
Saving Typing.avi to Typing.avi
Saving WalkingWithDog.avi to WalkingWithDog.avi
Saving WritingOnBoard.avi to WritingOnBoard.avi
Saving YoYo.avi to YoYo.avi
Saving UnevenBars (1).avi to UnevenBars (1).avi
Processing SoccerPenalty.avi...
>> a tv screen showing a soccer game
Processing ApplyEyeMakeup.avi...
>> a woman with long black hair
Processing CuttingInKitchen.avi...
>> a person cutting a piece of paper with a knife
Processing HandStandPushups.avi...
>> a group of people standing around a red carpet
Processing PizzaTossing.avi...
>> a man in a kitchen
Processing Typing.avi...
>> a person is playing a video game on a computer
Processing WalkingWithDog.avi...
>> a dog walking down a dirt road in the woods
Processing WritingOnBoard.avi...
>> a man 