In [2]:
import numpy as np
# Ten evenly spaced floats from 0 through 99, both endpoints included.
ans = np.linspace(0, 99, 10)
ans

array([ 0., 11., 22., 33., 44., 55., 66., 77., 88., 99.])

In [None]:
"""
提取视频均值特征 (CLIP ViT-B/32)
python src/extract_clip.py --video_dir data/samples --out feats.npy
"""
import argparse, glob, os, json, torch, clip, numpy as np, cv2, tqdm

def sample_frames(video_path, num_frames=4):
    """Decode evenly spaced RGB frames from a video file.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Number of frame positions to sample between the first and
            the last frame (both included).

    Returns:
        list[np.ndarray]: Frames converted from BGR to RGB, in sampling order.
        The list is shorter than ``num_frames`` when individual reads fail and
        empty when the video cannot be opened.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return frames
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Some containers report a frame count of 0 (or less); clamp so the
        # sampled positions never become negative.
        last = max(length - 1, 0)
        # .tolist() yields plain Python ints for the OpenCV property setter.
        for i in np.linspace(0, last, num_frames, dtype=int).tolist():
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ok, frame = cap.read()
            if ok:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even if seeking or decoding raises.
        cap.release()
    return frames

def main(args):
    """Extract one mean-pooled CLIP ViT-B/32 image feature per ``*.mp4`` file.

    Args:
        args: Namespace with ``video_dir`` (directory searched for ``*.mp4``)
            and ``out`` (path of the ``.npy`` feature matrix to write).

    Side effects:
        Saves the stacked per-video features to ``args.out`` and the matching
        list of video base names (same row order) to a ``.json`` file next to
        it.

    Raises:
        ValueError: If no video produced any feature.
    """
    # `Image` was referenced without ever being imported; CLIP's preprocess
    # pipeline works on PIL images, so bring it into scope here.
    from PIL import Image

    # Prefer Apple MPS (the previously hard-coded target), then CUDA, then CPU,
    # so the script also runs on machines without an MPS backend.
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model, preprocess = clip.load("ViT-B/32", device=device)
    all_feats, names = [], []
    for vp in tqdm.tqdm(sorted(glob.glob(os.path.join(args.video_dir, "*.mp4")))):
        imgs = sample_frames(vp, 4)
        if not imgs:
            # torch.stack([]) raises; skip unreadable videos instead of aborting.
            tqdm.tqdm.write(f"skip {vp}: no frames decoded")
            continue
        with torch.no_grad():
            batch = torch.stack([preprocess(Image.fromarray(im)) for im in imgs]).to(device)
            feats = model.encode_image(batch).float()          # one row per frame
        all_feats.append(feats.mean(0).cpu().numpy())          # mean-pool over frames
        names.append(os.path.basename(vp))

    if not all_feats:
        raise ValueError(f"no features extracted from {args.video_dir!r}")

    np.save(args.out, np.stack(all_feats))
    # Swap only the final extension; str.replace would also rewrite ".npy"
    # occurring elsewhere in the path and do nothing when the suffix is absent.
    names_path = os.path.splitext(args.out)[0] + ".json"
    with open(names_path, "w") as fh:
        json.dump(names, fh)
if __name__ == "__main__":
    # Command-line entry point: --video_dir is mandatory, --out is optional.
    parser = argparse.ArgumentParser()
    parser.add_argument("--video_dir", required=True)
    parser.add_argument("--out", default="feats.npy")
    cli_args = parser.parse_args()
    main(cli_args)

In [None]:
"""
FAISS 余弦索引 (InnerProduct + L2-norm)
"""
import numpy as np, faiss, json, argparse

# Command-line options: input feature matrix, the names file written next to
# it, and the output path of the FAISS index.
p = argparse.ArgumentParser()
p.add_argument("--feats", default="feats.npy")
p.add_argument("--names", default="feats.json")
p.add_argument("--out",   default="clip.index")
args = p.parse_args()

# FAISS expects a C-contiguous float32 matrix with one vector per row.
xb = np.ascontiguousarray(np.load(args.feats), dtype="float32")
if xb.ndim != 2:
    raise ValueError(f"{args.feats}: expected a 2-D feature matrix, got shape {xb.shape}")

# --names used to be accepted but ignored. Use it to catch a names file that
# is out of sync with the feature rows; a missing file is tolerated.
try:
    with open(args.names) as fh:
        names = json.load(fh)
except FileNotFoundError:
    names = None
if names is not None and len(names) != xb.shape[0]:
    raise ValueError(
        f"{args.names} lists {len(names)} names but {args.feats} has {xb.shape[0]} rows"
    )

# Inner product over L2-normalised vectors equals cosine similarity.
faiss.normalize_L2(xb)
index = faiss.IndexFlatIP(xb.shape[1])
index.add(xb)
faiss.write_index(index, args.out)
print("Indexed", index.ntotal, "videos")

In [None]:
"""
python src/search_clip.py --query "a cat on sofa" --topk 5
"""
import argparse, clip, torch, faiss, numpy as np, json

# Command-line options: index file, names file, text query, result count.
p = argparse.ArgumentParser()
p.add_argument("--index", default="clip.index")
p.add_argument("--names", default="feats.json")
p.add_argument("--query", required=True)
p.add_argument("--topk",  type=int, default=5)
args = p.parse_args()

# Prefer Apple MPS (the previously hard-coded target), then CUDA, then CPU.
_mps = getattr(torch.backends, "mps", None)
if _mps is not None and _mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model, preprocess = clip.load("ViT-B/32", device=device)
with torch.no_grad():
    q = model.encode_text(clip.tokenize([args.query]).to(device)).float()
# Unit-normalise the query so inner-product scores are cosine similarities.
q = q / q.norm(dim=-1, keepdim=True)

index = faiss.read_index(args.index)
with open(args.names) as fh:
    names = json.load(fh)

# Never ask for more neighbours than the index holds: FAISS pads missing
# results with label -1, which names[-1] would silently map to the last video.
k = min(args.topk, index.ntotal)
if k > 0:
    D, I = index.search(q.cpu().numpy(), k)
    for score, idx in zip(D[0], I[0]):
        if idx < 0:
            continue  # padding entry, not a real hit
        print(f"{score:.3f}\t{names[idx]}")

In [None]:
"""
python src/extract_subs.py --video_dir data/samples/video
→ 在 data/samples/subtitles/ 生成 *.srt
"""
import os, subprocess, argparse, pathlib, tqdm

def dump_sub(video_path, out_dir):
    """Extract the first embedded subtitle stream of a video to ``<out_dir>/<stem>.srt``.

    An ``.srt`` that already exists is left untouched. When ffmpeg exits with a
    non-zero status, an empty ``.srt`` is written as a placeholder, so the
    video is skipped on later runs.

    Args:
        video_path: Path of the input video.
        out_dir: Existing directory that receives the ``.srt`` file.

    Returns:
        None.
    """
    vid = pathlib.Path(video_path).stem
    srt_out = os.path.join(out_dir, f"{vid}.srt")
    if os.path.exists(srt_out):         # already extracted, skip
        return
    # Step 1: try the first embedded subtitle stream via ffmpeg.
    # -nostdin together with stdin=DEVNULL keeps ffmpeg from consuming the
    # caller's terminal or pipe input while running in a batch loop.
    cmd = ["ffmpeg", "-nostdin", "-loglevel", "quiet", "-y",
           "-i", video_path, "-map", "0:s:0", srt_out]
    if subprocess.call(cmd, stdin=subprocess.DEVNULL) != 0:
        # Step 2: on failure, leave an empty .srt as a placeholder.
        with open(srt_out, "w"):
            pass

def main(args):
    """Run ``dump_sub`` on every ``*.mp4`` in ``args.video_dir``, in sorted order.

    ``args.out_dir`` is created first if it does not exist yet.
    """
    os.makedirs(args.out_dir, exist_ok=True)
    videos = sorted(pathlib.Path(args.video_dir).glob("*.mp4"))
    for video in tqdm.tqdm(videos):
        dump_sub(str(video), args.out_dir)

if __name__ == "__main__":
    # Command-line entry point: --video_dir is mandatory, --out_dir is optional.
    parser = argparse.ArgumentParser()
    parser.add_argument("--video_dir", required=True)
    parser.add_argument("--out_dir", default="data/samples/subtitles")
    cli_args = parser.parse_args()
    main(cli_args)