In [None]:
# model_training.py
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer  # CLIP wrapper
from glob import glob
from sklearn.metrics import mean_squared_error

In [None]:
#! pip install -U sentence-transformers
! pip install -U transformers==4.44.2
! pip install -U sentence-transformers


In [None]:

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# 1) Dataset wrapper (expects pre-extracted frames per video and a label file)
class TVSumFramesDataset(Dataset):
    def __init__(self, list_of_videos, embedding_model):
        # list_of_videos: list of dicts: {"frames": [paths...], "scores": [float...]}
        self.data = list_of_videos
        self.embedder = embedding_model

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        frames = entry["frames"]  # ordered list of image paths
        scores = np.array(entry["scores"], dtype=np.float32)  # same length
        # extract embeddings for frames (batch encode)
        imgs = [Image.open(p).convert('RGB') for p in frames]
        embs = self.embedder.encode(imgs, convert_to_tensor=True).cpu().numpy()  # (T, D)
        return torch.tensor(embs, dtype=torch.float32), torch.tensor(scores, dtype=torch.float32)


In [None]:
# 2) Temporal regressor (BiLSTM + MLP)
class TemporalRegressor(nn.Module):
    def __init__(self, input_dim=512, hidden=256, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden*2, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
            nn.Sigmoid()  # output normalized 0..1
        )

    def forward(self, x):  # x: (B, T, D)
        out, _ = self.lstm(x)  # (B, T, 2*hidden)
        scores = self.fc(out).squeeze(-1)  # (B, T)
        return scores


In [None]:
# 3) Simple training loop
def collate_fn(batch):
    # Pads sequences to max length in batch
    embs, scores = zip(*batch)
    lengths = [e.shape[0] for e in embs]
    maxlen = max(lengths)
    D = embs[0].shape[1]
    emb_pad = torch.zeros(len(embs), maxlen, D, dtype=torch.float32)
    score_pad = torch.zeros(len(embs), maxlen, dtype=torch.float32)
    mask = torch.zeros(len(embs), maxlen, dtype=torch.bool)
    for i,(e,s) in enumerate(zip(embs,scores)):
        L = e.shape[0]
        emb_pad[i,:L] = e
        score_pad[i,:L] = s
        mask[i,:L] = 1
    return emb_pad.to(DEVICE), score_pad.to(DEVICE), mask.to(DEVICE)


def train(dataset, epochs=10, batch_size=4, lr=1e-4):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    model = TemporalRegressor(input_dim=dataset[0][0].shape[1]).to(DEVICE)
    opt = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss(reduction='none')
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for embs, scores, mask in loader:
            pred = model(embs)  # (B,T)
            loss_map = loss_fn(pred, scores) * mask.float()
            loss = loss_map.sum() / mask.float().sum()
            opt.zero_grad(); loss.backward(); opt.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} loss: {epoch_loss/len(loader):.4f}")
    return model


In [None]:

# 4) Inference: get scores, smooth, pick peaks
import scipy.signal as signal
def select_keyframes(embs, model, top_k=5, smooth_win=5):
    # embs: numpy (T, D) or tensor
    model.eval()
    with torch.no_grad():
        x = torch.tensor(embs, dtype=torch.float32).unsqueeze(0).to(DEVICE)
        scores = model(x).cpu().numpy().squeeze(0)  # (T,)
    # smooth
    scores_s = np.convolve(scores, np.ones(smooth_win)/smooth_win, mode='same')
    # find peaks (local maxima)
    peaks, _ = signal.find_peaks(scores_s, distance= max(1,int(len(scores_s)/ (top_k*2))))
    # if not enough peaks, take top-k by score
    if len(peaks) < top_k:
        idxs = np.argsort(scores_s)[-top_k:]
    else:
        idxs = peaks[np.argsort(scores_s[peaks])][-top_k:]
    idxs = np.sort(idxs)
    return idxs, scores_s

# -------------------------
# Example usage (high level):
# 1) Build `list_of_videos` for TVSum: each entry has frame paths and TVSum human scores aligned to sampled frames.
# 2) embedder = SentenceTransformer('clip-ViT-B-32')
# 3) dataset = TVSumFramesDataset(list_of_videos, embedder)
# 4) model = train(dataset)
# 5) For new video: sample frames -> embed with embedder -> select_keyframes(embs, model)


new pipeline working 


In [None]:
# just try build in and atlest generate some thing 

In [None]:
import cv2, os

video_path = "Lion vs. Wildebeest_ How Lions Hunt as a Pride (1).mp4"
output_dir = "frames_all"
os.makedirs(output_dir, exist_ok=True)

cap = cv2.VideoCapture(video_path)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Total frames: {frame_count}")

count = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break
    path = os.path.join(output_dir, f"frame_{count:05d}.jpg")
    cv2.imwrite(path, frame)
    count += 1

cap.release()
print(f"‚úÖ Extracted {count} frames to {output_dir}/")


Total frames: 5638
‚úÖ Extracted 5638 frames to frames_all/


In [None]:
import torch, open_clip, os
from PIL import Image

model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

img_path = "frames/frame_001.jpg"  # now this exists
img = Image.open(img_path).convert("RGB")
img_tensor = preprocess(img).unsqueeze(0).to(device)

with torch.no_grad():
    emb = model.encode_image(img_tensor)
    emb = emb / emb.norm(dim=-1, keepdim=True)
    emb = emb.cpu().numpy().squeeze(0)

print("‚úÖ Frame embedded successfully! Shape:", emb.shape)




‚úÖ Frame embedded successfully! Shape: (512,)


In [8]:
import numpy as np
import os
from tqdm import tqdm
import torch, open_clip, os
from PIL import Image

frames_dir = "frames_all"
frame_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])

embeddings = []
for f in tqdm(frame_files, desc="Embedding frames"):
    img = Image.open(f).convert("RGB")
    img_tensor = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        emb = model.encode_image(img_tensor)
        emb = emb / emb.norm(dim=-1, keepdim=True)
    embeddings.append(emb.cpu().numpy().squeeze(0))

embeddings = np.vstack(embeddings)
print("‚úÖ All frame embeddings ready:", embeddings.shape)


Embedding frames: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5638/5638 [09:55<00:00,  9.46it/s]

‚úÖ All frame embeddings ready: (5638, 512)





In [9]:
from sklearn.metrics.pairwise import cosine_similarity

threshold = 0.85   # try 0.8 if too few frames remain, 0.9 if too many
keyframes = [0]    # keep the first frame by default

for i in range(1, len(embeddings)):
    sim = cosine_similarity([embeddings[i]], [embeddings[keyframes[-1]]])[0][0]
    if sim < threshold:
        keyframes.append(i)

print(f"‚úÖ Selected {len(keyframes)} keyframes out of {len(embeddings)}")


‚úÖ Selected 86 keyframes out of 5638


In [10]:
output_dir = "keyframes2"
os.makedirs(output_dir, exist_ok=True)

for idx in keyframes:
    img_path = frame_files[idx]
    img = Image.open(img_path)
    img.save(os.path.join(output_dir, os.path.basename(img_path)))

print(f"‚úÖ Saved {len(keyframes)} keyframes to {output_dir}/")


‚úÖ Saved 86 keyframes to keyframes2/


In [20]:
#! pip install transformers==4.30.2 timm==0.6.13 pillow


In [None]:
üé• Video
‚Üì
üß© Extract frames (scene-change based)
‚Üì
üñºÔ∏è Caption frames (BLIP)
‚Üì
üî¢ Compute embeddings ‚Üí weights
‚Üì
üéûÔ∏è Cluster similar frames (scenes)
‚Üì
‚úçÔ∏è Weighted + diverse caption aggregation
‚Üì
üß† Summarization (BART / T5 / LLaMA)
‚Üì
üìú Final summary (context-rich & highlight-aware)


In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import os

# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda" if torch.cuda.is_available() else "cpu")

# Folder containing keyframes
frames_folder = "keyframes"   # path where your 43 keyframes are saved
output_captions = {}

# Generate captions
for frame in sorted(os.listdir(frames_folder)):
    if frame.endswith((".jpg", ".png")):
        img_path = os.path.join(frames_folder, frame)
        raw_image = Image.open(img_path).convert("RGB")

        inputs = processor(raw_image, return_tensors="pt").to(model.device)
        out = model.generate(**inputs, max_new_tokens=30)

        caption = processor.decode(out[0], skip_special_tokens=True)
        output_captions[frame] = caption
        print(f"{frame}: {caption}")

# Optional: Save to file
import json
with open("captions.json", "w") as f:
    json.dump(output_captions, f, indent=4)

print("‚úÖ Captions generated and saved to captions.json!")

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import json

# Load captions
with open("captions.json") as f:
    captions = json.load(f)

# Step 1: Create embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
caption_texts = list(captions.values())
embeddings = model.encode(caption_texts, convert_to_tensor=True)

# Step 2: Compute pairwise similarity matrix
cosine_sim = util.cos_sim(embeddings, embeddings).cpu().numpy()

# Step 3: Compute weights (e.g., uniqueness)
# Lower average similarity = more unique frame = higher importance
weights = 1 - cosine_sim.mean(axis=1)

# Step 4: Sort frames by weight
frame_names = list(captions.keys())
ranked = sorted(zip(frame_names, weights), key=lambda x: x[1], reverse=True)

# Top keyframes
top_frames = [f for f, w in ranked[:10]]
print("üéØ Top keyframes:", top_frames)


üßæ Video Summary:
a chephaus chephaus chephaus in the grass two wildes are walking through the tall grass a wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde a lion is walking through the tall grass a cow is standing in a field with a bird a wilde cow is seen in this und - news video a zebra is walking through the tall grass a zebra standing in a field of grass a bird is sitting on a branch in the middle of a field a wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde a bird is standing in the grass a lion and a lioness in the wild a group of buffalos running through a field a bird is flying in the air over a field lion attacks a lion in the wild a wilde and a wilde running in the wild a lion chasing a wilde in the wild lion attacks a lion in the wild a large elephant is running through the grass a wilde running through the grass in the wild a larg

In [1]:
pip uninstall -y transformers

Note: you may need to restart the kernel to use updated packages.




In [None]:
a chephaus chephaus chephaus in the grass two wildes are walking through the tall grass a wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde a lion is walking through the tall grass a cow is standing in a field with a bird a wilde cow is seen in this und - news video a zebra is walking through the tall grass a zebra standing in a field of grass a bird is sitting on a branch in the middle of a field a wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde a bird is standing in the grass a lion and a lioness in the wild a group of buffalos running through a field a bird is flying in the air over a field lion attacks a lion in the wild a wilde and a wilde running in the wild a lion chasing a wilde in the wild lion attacks a lion in the wild a large elephant is running through the grass a wilde running through the grass in the wild a large animal standing in a field a herd of wildes in the wild a bird is standing in the middle of a field a zebra running through the grass in the wild a zebra running through the brush in the wild a wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde wilde a large ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant a couple of animals fighting in a field a large bird is standing in the grass a man riding a horse through a field a lion is running through the grass a close up of a horse ' s face a herd of cattle grazing in a dry field a lion is seen in this und - toned video a group of lions walking through a field a lion walking through tall grass in the wild lion cubs play with a dead zebra a large herd of cattle a field with a tree in the middle

In [1]:
import moviepy.editor  

step 1 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import clip
import cv2, numpy as np
from moviepy.editor import VideoFileClip, concatenate_videoclips
import whisper

# =============================
# 1. Load pretrained models
# =============================
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
whisper_model = whisper.load_model("large-v3")

# Lightweight temporal attention network
class TemporalAttention(nn.Module):
    def __init__(self, dim, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=dim, num_heads=heads, batch_first=True)
        self.fc = nn.Linear(dim, 1)

    def forward(self, x):
        # x: [T, D]
        attn_out, _ = self.attn(x, x, x)
        scores = self.fc(attn_out).squeeze(-1)  # [T]
        weights = torch.softmax(scores, dim=0)
        return weights.detach().cpu().numpy()

temporal_model = TemporalAttention(dim=512).to(device).eval()

# =============================
# 2. Frame extraction
# =============================
def extract_frames(video_path, stride=10):
    cap = cv2.VideoCapture(video_path)
    frames, times = [], []
    idx = 0
    while True:
        ret, frame = cap.read()
        if not ret: break
        if idx % stride == 0:
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            times.append(cap.get(cv2.CAP_PROP_POS_MSEC))
        idx += 1
    cap.release()
    return frames, times

# =============================
# 3. CLIP embeddings
# =============================
def get_clip_embeddings(frames):
    embs = []
    with torch.no_grad():
        for f in frames:
            img = clip_preprocess(Image.fromarray(f)).unsqueeze(0).to(device)
            feat = clip_model.encode_image(img)
            feat = F.normalize(feat, dim=-1)
            embs.append(feat)
    embs = torch.cat(embs, dim=0)  # [T,512]
    return embs

# =============================
# 4. Compute attention weights
# =============================
def compute_importance(embs):
    with torch.no_grad():
        weights = temporal_model(embs.unsqueeze(0)).flatten()
    return weights / weights.max()

# =============================
# 5. Select key frames
# =============================
def select_keyframes(frames, weights, top_k=10):
    idx = np.argsort(weights)[-top_k:]
    return [frames[i] for i in idx], weights[idx]

# =============================
# 6. Whisper transcription
# =============================
def transcribe_audio(video_path):
    result = whisper_model.transcribe(video_path)
    return result["text"]

# =============================
# 7. Main pipeline
# =============================
from PIL import Image

video_path = "Lion vs. Wildebeest_ How Lions Hunt as a Pride (1).mp4"
frames, times = extract_frames(video_path, stride=15)
clip_embs = get_clip_embeddings(frames)
importance_weights = compute_importance(clip_embs)
key_frames, key_weights = select_keyframes(frames, importance_weights, top_k=12)
transcript = transcribe_audio(video_path)

print("Transcript snippet:\n", transcript[:250], "...")
print("\nKey-frame importance weights:\n", key_weights)


ModuleNotFoundError: No module named 'clip'

In [4]:
from datasets import load_dataset

# Try loading a real available video-caption dataset
dataset = load_dataset("facebook/hd-vila", split="validation[:5]")

for i in range(3):
    print(f"üé• Video Path: {dataset[i]['video']}")
    print(f"üìù Caption: {dataset[i]['caption']}\n")


DatasetNotFoundError: Dataset 'facebook/hd-vila' doesn't exist on the Hub or cannot be accessed.

In [None]:
import sys
print(sys.executable)
