In [1]:
import cv2
import os
import numpy as np
import glob
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
from PIL import Image
import torch
import re

In [2]:
from IPython.display import display
from ipywidgets import FileUpload

upload_widget = FileUpload()
display(upload_widget)

# Once a file is uploaded, you can access it via:
uploaded_file = upload_widget.value
print(uploaded_file)


FileUpload(value=(), description='Upload')

()


In [9]:
video_path = "C:\\Users\\gunda\\OneDrive\\Desktop\\Videosummarizer\\myenv\\video2.mp4" 
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    print("Error: Could not open video file. Check the file path!")
else:
    print("Video loaded successfully!")
cap.release()

Video loaded successfully!


In [10]:
# Extract Frames
os.makedirs("frames", exist_ok=True)
cap = cv2.VideoCapture(video_path)
frame_count = 0
frame_skip = 10  # Extract every 10th frame
while True:
    ret, frame = cap.read()
    if not ret:
        break
    if frame_count % frame_skip == 0:
        frame_filename = f"frames/frame_{frame_count}.jpg"
        cv2.imwrite(frame_filename, frame)
    frame_count += 1
cap.release()
print(f"Extracted {frame_count//frame_skip} frames and saved in 'frames' folder.")



Extracted 453 frames and saved in 'frames' folder.


In [11]:
# Extract Keyframes based on Scene Change
os.makedirs("keyframes", exist_ok=True)
cap = cv2.VideoCapture(video_path)
ret, prev_frame = cap.read()
if not ret:
    print("Error: Could not read first frame.")
    cap.release()
    exit()
prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
frame_count = 0
keyframe_count = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_count += 1
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    diff = cv2.absdiff(prev_gray, gray)
    diff_score = np.sum(diff) / (diff.shape[0] * diff.shape[1])
    if diff_score > 10:
        keyframe_count += 1
        keyframe_filename = f"keyframes/keyframe_{keyframe_count}.jpg"
        cv2.imwrite(keyframe_filename, frame)
        prev_gray = gray
cap.release()
print(f"Total keyframes saved: {keyframe_count}")



Total keyframes saved: 334


In [12]:
# Clustering Keyframes
keyframes_dir = "keyframes"
keyframes = []
keyframe_paths = sorted([os.path.join(keyframes_dir, f) for f in os.listdir(keyframes_dir)])
for path in keyframe_paths:
    img = cv2.imread(path)
    if img is not None:
        keyframes.append(img)
features = []
for img in keyframes:
    hist = cv2.calcHist([img], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    features.append(hist.flatten())
features = np.array(features)
from sklearn.cluster import KMeans
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(features)
selected_keyframes = []
for cluster in range(num_clusters):
    indices = np.where(labels == cluster)[0]
    representative_idx = indices[0]
    selected_keyframes.append(keyframes[representative_idx])
output_dir = "selected_keyframes"
os.makedirs(output_dir, exist_ok=True)
for i, img in enumerate(selected_keyframes):
    cv2.imwrite(f"{output_dir}/keyframe_{i}.jpg", img)
print(f"Selected {num_clusters} keyframes and saved in '{output_dir}'")



[WinError 2] The system cannot find the file specified
  File "C:\Users\gunda\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Selected 10 keyframes and saved in 'selected_keyframes'


In [13]:
# Generate Captions
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
captions = {}
keyframe_paths = sorted([os.path.join(output_dir, f) for f in os.listdir(output_dir)])
for path in keyframe_paths:
    image = Image.open(path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    captions[path] = caption
    print(f"Caption for {path}: {caption}")
with open("captions.txt", "w") as f:
    for key, value in captions.items():
        f.write(f"{key}: {value}\n")



preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Caption for selected_keyframes\keyframe_0.jpg: a screen shot of a map with a red marker
Caption for selected_keyframes\keyframe_1.jpg: a person holding a phone with a map on it
Caption for selected_keyframes\keyframe_2.jpg: a screen shot of a man and woman riding bikes
Caption for selected_keyframes\keyframe_3.jpg: a website with a picture of people riding bikes
Caption for selected_keyframes\keyframe_4.jpg: a screen shot of a person riding a bike
Caption for selected_keyframes\keyframe_5.jpg: a person holding a phone with a map on it
Caption for selected_keyframes\keyframe_6.jpg: a screen shot of a website with a map and a person on a bike
Caption for selected_keyframes\keyframe_7.jpg: a website with a picture of a person riding a bike
Caption for selected_keyframes\keyframe_8.jpg: a screenshote website with a map and location
Caption for selected_keyframes\keyframe_9.jpg: cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle cycle

In [15]:
import re
from transformers import pipeline

# Load summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Read captions from file
with open("captions.txt", "r") as f:
    lines = f.readlines()

# Function to clean and structure captions
def clean_caption(caption):
    caption = re.sub(r'\b(\w+)( \1\b)+', r'\1', caption)  # Remove repeated words
    caption = caption.capitalize().strip()
    return caption

# Convert captions into a structured paragraph
story_text = "The video captures a sequence of events showcasing urban cycling and navigation. "  # Introduction

for line in lines:
    parts = line.split(":")
    if len(parts) > 1:
        caption = clean_caption(parts[1].strip())
        story_text += f" {caption}. "

# Generate a better summary
summary = summarizer(story_text, max_length=80, min_length=30, do_sample=False)[0]["summary_text"]

# Save summary to file
with open("video_summary.txt", "w") as f:
    f.write(summary)

print("✅ Video Summary Generated:")
print(summary)


Device set to use cpu


✅ Video Summary Generated:
Video captures a sequence of events showcasing urban cycling and navigation. A screen shot of a map with a red marker. A person holding a phone with a map on it. A website with a picture of people riding bikes.
