In [None]:
import soundfile as sf

from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

MODEL_PATH = "/storage/scratch/saichandc/Qwen3-Omni-30B-A3B-Thinking"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    dtype="auto",
    device_map="auto",
    #attn_implementation="flash_attention_2", -- commenting this as not using GPU currently
)

processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"},
            {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"},
            {"type": "text", "text": "What can you see and hear? Answer in one short sentence."}
        ],
    },
]

# Set whether to use audio in video
USE_AUDIO_IN_VIDEO = True

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = processor(text=text, 
                   audio=audios, 
                   images=images, 
                   videos=videos, 
                   return_tensors="pt", 
                   padding=True, 
                   use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = inputs.to(model.device).to(model.dtype)

# Inference: Generation of the output text and audio
text_ids, audio = model.generate(**inputs, 
                                 speaker="Ethan", 
                                 thinker_return_dict_in_generate=True,
                                 use_audio_in_video=USE_AUDIO_IN_VIDEO)

text = processor.batch_decode(text_ids.sequences[:, inputs["input_ids"].shape[1] :],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
print(text)
if audio is not None:
    sf.write(
        "output.wav",
        audio.reshape(-1).detach().cpu().numpy(),
        samplerate=24000,
    )


In [None]:
print(__import__('datetime').datetime.now().strftime("%H:%M:%S"))

In [None]:
# import os
# import torch
# import numpy as np
# from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
# from qwen_omni_utils import process_mm_info
# from tqdm import tqdm
# import subprocess

# MODEL_PATH = "/storage/scratch/saichandc/Qwen3-Omni-30B-A3B-Thinking"
# VIDEO_PATH = "/storage/home/saichandc/video/90secvideo.mp4"
# USE_AUDIO_IN_VIDEO = True
# FIXED_TEXT = "Commission on Presidential"

# -*- coding: utf-8 -*-
"""
Working code - tested on 90 sec video. 
Mini-run (2 seconds) → per-second multimodal embeddings with Qwen3-Omni (MoviePy v2)

Key fixes:
- Use absolute paths for segments
- Convert segment paths to proper file URIs: Path(...).resolve().as_uri()
- Remove 'verbose'/'logger' args from write_videofile (MoviePy v2)
- Talker disabled + USE_AUDIO_IN_VIDEO=True
- Error handling + tqdm progress bars

Refs:
- MoviePy v2 import & examples: https://pypi.org/project/moviepy/            # top-level import, v2 API
- qwen-omni-utils file path/URI usage: https://pypi.org/project/qwen-omni-utils/  # 'file:///path/to/...'
"""

import os
import math
import logging
from typing import List, Dict
from pathlib import Path
import time

import numpy as np
import torch
from tqdm import tqdm
from moviepy import VideoFileClip  # MoviePy v2 import (not moviepy.editor)

from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

# ----------------------------
# Configuration
# ----------------------------
MODEL_PATH = "/storage/scratch/saichandc/Qwen3-Omni-30B-A3B-Thinking"   # or "Qwen/Qwen3-Omni-30B-A3B-Thinking"
VIDEO_PATH = "/storage/home/saichandc/video/90secvideo.mp4"                                  # <-- change to your video path
OUT_DIR = "./tmp_slices_v2"
SAVE_EMBEDDINGS_NPY = "./second_level_embeddings.npy"

PLACEHOLDER_TEXT = "Process this 1-second slice."
USE_AUDIO_IN_VIDEO = True
BATCH_SIZE = 2
MAX_SECS = None     # only first 2 seconds now; set None for full length later

LOG_LEVEL = logging.INFO
os.makedirs(OUT_DIR, exist_ok=True)
OUT_DIR = str(Path(OUT_DIR).resolve())  # <-- absolutize output dir

logging.basicConfig(level=LOG_LEVEL, format="[%(levelname)s] %(message)s")
logger = logging.getLogger("qwen_embeddings")

# ----------------------------
# Helper: masked mean pooling
# ----------------------------
def pooled_last_hidden_state(last_hidden: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    mask = attention_mask.unsqueeze(-1).float()  # [B, L, 1]
    summed = (last_hidden * mask).sum(dim=1)     # [B, H]
    denom = mask.sum(dim=1).clamp(min=1e-6)      # [B, 1]
    return summed / denom

# ----------------------------
# Load Qwen model & processor
# ----------------------------
def load_qwen():
    try:
        model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
            MODEL_PATH, dtype="auto", device_map="auto", trust_remote_code=True,
        )
        processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True,)
    except Exception as e:
        logger.exception("Failed to load Qwen model/processor.")
        raise SystemExit(e)

    # Save ~2GB if you don't need audio outputs
    try:
        model.disable_talker()
        logger.info("Talker disabled (no audio generation).")
    except Exception as e:
        logger.warning("Could not disable Talker explicitly; continuing without audio generation. Details: %s", e)

    return model, processor

# ----------------------------
# Slice video → 1-second MP4 segments (MoviePy v2)
# ----------------------------
def slice_video_v2(video_path: str, out_dir: str, max_secs: int | None) -> List[Dict]:
    """
    Export 1-second segments with MoviePy v2 `.subclipped`.
    Return [{"sec": t, "video_uri": file:///..., "video_path": /abs/path.mp4}, ...]
    """
    vp = Path(video_path).resolve()
    if not vp.exists():
        raise FileNotFoundError(f"Video not found: {vp}")

    try:
        clip = VideoFileClip(str(vp))
    except Exception as e:
        logger.exception("Failed to open video with MoviePy v2.")
        raise

    duration = float(clip.duration or 0.0)
    total_secs = math.ceil(duration)

    if max_secs is None:
        n_secs = total_secs
        logger.info("Processing FULL length: %d second(s).", n_secs)
    else:
        n_secs = min(max_secs, total_secs)
        logger.info("Processing FIRST %d second(s) (of %d).", n_secs, total_secs)

    items: List[Dict] = []
    logger.info("Slicing %s → %d 1-second segment(s)...", vp.name, n_secs)

    fps = getattr(clip, "fps", None)

    for t in tqdm(range(n_secs), desc="Slicing seconds", unit="s"):
        start, end = t, min(t + 1, duration)
        if end <= start:
            continue
        try:
            sub = clip.subclipped(start, end)  # MoviePy v2 API
            out_file = Path(out_dir) / f"seg_{t:06d}.mp4"

            # IMPORTANT: write without 'verbose'/'logger' (v2); keep audio
            if fps is not None:
                sub.write_videofile(str(out_file), fps=fps, audio=True, codec="libx264", audio_codec="aac")
            else:
                sub.write_videofile(str(out_file), audio=True, codec="libx264", audio_codec="aac")

            sub.close()

            # Build proper absolute path + file URI
            abs_path = out_file.resolve()
            items.append({
                "sec": t,
                "video_path": str(abs_path),
                "video_uri": abs_path.as_uri(),  # e.g., file:///storage/home/...
            })
        except Exception as se:
            logger.error("Failed to write segment %d: %s", t, se)
            continue

    clip.close()
    if not items:
        raise RuntimeError("No segments produced. Check the video and codecs.")
    logger.info("Created %d segment(s) in %s", len(items), out_dir)
    return items

# ----------------------------
# Build conversations & extract embeddings
# ----------------------------
def extract_second_level_embeddings(model, processor, sec_items: List[Dict]) -> np.ndarray:
    failed_seconds = []
    embeds_all = []

    logger.info("Building per-second conversations and extracting embeddings...")
    for i in tqdm(range(0, len(sec_items), BATCH_SIZE), desc="Embedding batches", unit="batch"):
        batch = sec_items[i:i + BATCH_SIZE]

        conversations = []
        for it in batch:
            # You can use either 'video_uri' (file:///...) or 'video_path' (plain absolute path).
            # Both are accepted by qwen-omni-utils (docs show 'file:///...' explicitly).  # see PyPI page
            conversations.append([
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "video": it["video_uri"]},
                        {"type": "text",  "text": PLACEHOLDER_TEXT},
                    ],
                }
            ])

        try:
            text = processor.apply_chat_template(
                conversations, add_generation_prompt=False, tokenize=False
            )
            audios, images, videos = process_mm_info(
                conversations, use_audio_in_video=USE_AUDIO_IN_VIDEO
            )
            inputs = processor(
                text=text,
                audio=audios,
                images=images,
                videos=videos,
                return_tensors="pt",
                padding=True,
                use_audio_in_video=USE_AUDIO_IN_VIDEO,
            ).to(model.device).to(model.dtype)
        except Exception as e:
            logger.error("Processor/prep failed for batch starting at sec %d: %s", batch[0]["sec"], e)
            # Helpful debug: print the exact URI/path we passed
            for dbg in batch:
                logger.error("   video_uri=%s | exists=%s",
                             dbg["video_uri"], Path(dbg["video_path"]).exists())
            failed_seconds.extend([it["sec"] for it in batch])
            continue

        try:
            with torch.no_grad():
                # forward pass using the processor to match multimodal input spec
                outputs = model.thinker(**inputs,output_hidden_states=True,return_dict=True)
                last_hidden = outputs.hidden_states[-1]
                pooled = pooled_last_hidden_state(last_hidden, inputs["attention_mask"])

                embeds_all.append(pooled.detach().cpu().numpy())
        except Exception as e:
            logger.error("Model forward failed for batch starting at sec %d: %s", batch[0]["sec"], e)
            failed_seconds.extend([it["sec"] for it in batch])
            continue

    if not embeds_all:
        raise RuntimeError("No embeddings produced; all batches failed.")

    embeds = np.concatenate(embeds_all, axis=0)
    if failed_seconds:
        logger.warning("Failed seconds (skipped): %s", failed_seconds)
    else:
        logger.info("All seconds processed successfully.")
    return embeds

# ----------------------------
# Entry point
# ----------------------------
def main():
    start_total = time.time()

    # ---- Load model ----
    t0 = time.time()
    model, processor = load_qwen()
    logger.info("Model + processor loaded in %.2f sec", time.time() - t0)

    # ---- Slice video ----
    t1 = time.time()
    try:
        sec_items = slice_video_v2(VIDEO_PATH, OUT_DIR, max_secs=MAX_SECS)
    except Exception as e:
        logger.exception("Slicing failed.")
        raise SystemExit(e)
    logger.info("Video slicing completed in %.2f sec", time.time() - t1)

    # ---- Extract embeddings ----
    t2 = time.time()
    try:
        embeds = extract_second_level_embeddings(model, processor, sec_items)
    except Exception as e:
        logger.exception("Embedding extraction failed.")
        raise SystemExit(e)
    logger.info("Embedding extraction completed in %.2f sec", time.time() - t2)

    # ---- Save embeddings ----
    np.save(SAVE_EMBEDDINGS_NPY, embeds)
    logger.info("Saved embeddings: shape=%s -> %s", embeds.shape, SAVE_EMBEDDINGS_NPY)

    # ---- Total time ----
    logger.info("Total runtime: %.2f sec (%.2f min)", time.time() - start_total, (time.time() - start_total)/60)

if __name__ == "__main__":
    main()


In [None]:
# -*- coding: utf-8 -*-
"""
Working code - tested on 90 sec video.
Per-second multimodal embeddings with Qwen3-Omni.
Now slices via FFmpeg CLI for reliability (audio intact), with timing.

Key fixes:
- Use FFmpeg CLI to slice 1-second segments (keeps audio reliably).
- Absolute paths + proper file URIs (Path(...).resolve().as_uri()).
- Talker disabled + USE_AUDIO_IN_VIDEO=True.
- Error handling + tqdm progress bars.
- Step timing logs added.

Refs:
- MoviePy v2 import & examples: https://pypi.org/project/moviepy/
- qwen-omni-utils file path/URI usage: https://pypi.org/project/qwen-omni-utils/
"""

import os
import math
import logging
from typing import List, Dict
from pathlib import Path
import time
import subprocess
import shlex

import numpy as np
import torch
from tqdm import tqdm
# MoviePy is only used as a fallback to read duration if ffprobe is unavailable.
from moviepy import VideoFileClip

from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

# ----------------------------
# Configuration
# ----------------------------
MODEL_PATH = "/storage/scratch/saichandc/Qwen3-Omni-30B-A3B-Thinking"   # or "Qwen/Qwen3-Omni-30B-A3B-Thinking"
VIDEO_PATH = "/storage/home/saichandc/video/90secvideo.mp4"             # <-- change to your video path
OUT_DIR = "./tmp_slices_v2"
SAVE_EMBEDDINGS_NPY = "./second_level_embeddings.npy"

PLACEHOLDER_TEXT = "Process this 1-second slice."
USE_AUDIO_IN_VIDEO = True
BATCH_SIZE = 2
MAX_SECS = None   # None for full length; set an int for a subset
PREFER_FFMPEG_SLICER = True  # set False to use MoviePy slicer (not recommended for audio)

LOG_LEVEL = logging.INFO
os.makedirs(OUT_DIR, exist_ok=True)
OUT_DIR = str(Path(OUT_DIR).resolve())  # absolutize output dir

logging.basicConfig(level=LOG_LEVEL, format="[%(levelname)s] %(message)s")
logger = logging.getLogger("qwen_embeddings")

# ----------------------------
# Helper: masked mean pooling
# ----------------------------
def pooled_last_hidden_state(last_hidden: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    mask = attention_mask.unsqueeze(-1).float()  # [B, L, 1]
    summed = (last_hidden * mask).sum(dim=1)     # [B, H]
    denom = mask.sum(dim=1).clamp(min=1e-6)      # [B, 1]
    return summed / denom

# ----------------------------
# Load Qwen model & processor
# ----------------------------
def load_qwen():
    try:
        model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
            MODEL_PATH, dtype="auto", device_map="auto", trust_remote_code=True,
        )
        processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True,)
    except Exception as e:
        logger.exception("Failed to load Qwen model/processor.")
        raise SystemExit(e)

    # Save ~2GB if you don't need audio outputs
    try:
        model.disable_talker()
        logger.info("Talker disabled (no audio generation).")
    except Exception as e:
        logger.warning("Could not disable Talker explicitly; continuing without audio generation. Details: %s", e)

    return model, processor

# ----------------------------
# FFmpeg helpers
# ----------------------------
def ffmpeg_available() -> bool:
    """Return True if ffmpeg is available."""
    try:
        subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        return True
    except Exception:
        return False

def probe_duration_ffprobe(vp: Path) -> float:
    """Probe video duration using ffprobe; fallback to MoviePy if unavailable."""
    cmd = f'ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 "{vp}"'
    try:
        res = subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
        return float(res.stdout.strip())
    except Exception:
        clip = VideoFileClip(str(vp))
        duration = float(clip.duration or 0.0)
        clip.close()
        return duration

# ----------------------------
# Slice video → 1-second MP4 segments (FFmpeg CLI; robust for audio)
# ----------------------------
def slice_video_ffmpeg(video_path: str, out_dir: str, max_secs: int | None) -> List[Dict]:
    """
    Export 1-second segments using FFmpeg CLI (robust + keeps audio).
    Return [{"sec": t, "video_uri": file:///..., "video_path": /abs/path.mp4, "slice_time_sec": float}, ...]
    """
    vp = Path(video_path).resolve()
    if not vp.exists():
        raise FileNotFoundError(f"Video not found: {vp}")

    duration = probe_duration_ffprobe(vp)
    total_secs = math.ceil(duration)
    n_secs = total_secs if max_secs is None else min(max_secs, total_secs)
    logger.info("FFmpeg slicing %s → %d second(s) (of %d).", vp.name, n_secs, total_secs)

    items: List[Dict] = []
    for t in tqdm(range(n_secs), desc="Slicing seconds", unit="s"):
        seg_start = time.time()
        start, end = t, min(t + 1, duration)
        if end <= start:
            continue

        out_file = Path(out_dir) / f"seg_{t:06d}.mp4"

        # Resume-friendly: skip if already exists and non-empty
        if out_file.exists() and out_file.stat().st_size > 0:
            abs_path = out_file.resolve()
            items.append({
                "sec": t,
                "video_path": str(abs_path),
                "video_uri": abs_path.as_uri(),
                "slice_time_sec": 0.0,
            })
            continue

        # Use AAC audio + H.264 video; map first audio stream optionally (if present)
        cmd = (
            f'ffmpeg -hide_banner -nostdin -y -loglevel error '
            f'-ss {start:.3f} -to {end:.3f} -i "{vp}" '
            f'-map 0:v:0 -map 0:a:0? '
            f'-c:v libx264 -preset veryfast -crf 23 '
            f'-c:a aac -b:a 128k '
            f'"{out_file}"'
        )
        try:
            subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError as e:
            err = e.stderr if isinstance(e.stderr, str) else e.stderr.decode(errors="ignore")
            snippet = "\n".join(err.splitlines()[-10:])
            logger.error("FFmpeg failed at segment %d (rc=%s). Last lines:\n%s", t, e.returncode, snippet)
            continue

        abs_path = out_file.resolve()
        items.append({
            "sec": t,
            "video_path": str(abs_path),
            "video_uri": abs_path.as_uri(),  # e.g., file:///storage/home/...
            "slice_time_sec": round(time.time() - seg_start, 3),
        })
        logger.info("FFmpeg segment %06d written in %.2f sec", t, time.time() - seg_start)

    if not items:
        raise RuntimeError("No segments produced by FFmpeg. Check installation/codecs.")
    logger.info("FFmpeg created %d segment(s) in %s", len(items), out_dir)
    return items

# ----------------------------
# (Original) Slice video with MoviePy v2 (kept for fallback; not recommended for audio)
# ----------------------------
def slice_video_v2(video_path: str, out_dir: str, max_secs: int | None) -> List[Dict]:
    """
    Export 1-second segments with MoviePy v2 `.subclipped`.
    Return [{"sec": t, "video_uri": file:///..., "video_path": /abs/path.mp4}, ...]
    """
    vp = Path(video_path).resolve()
    if not vp.exists():
        raise FileNotFoundError(f"Video not found: {vp}")

    try:
        clip = VideoFileClip(str(vp))
    except Exception as e:
        logger.exception("Failed to open video with MoviePy v2.")
        raise

    duration = float(clip.duration or 0.0)
    total_secs = math.ceil(duration)
    n_secs = total_secs if max_secs is None else min(max_secs, total_secs)
    logger.info("Processing %s → %d 1-second segment(s) (of %d).", vp.name, n_secs, total_secs)

    items: List[Dict] = []
    fps = getattr(clip, "fps", None)

    for t in tqdm(range(n_secs), desc="Slicing seconds", unit="s"):
        start, end = t, min(t + 1, duration)
        if end <= start:
            continue
        try:
            sub = clip.subclipped(start, end)  # MoviePy v2 API
            out_file = Path(out_dir) / f"seg_{t:06d}.mp4"

            # IMPORTANT: write without 'verbose'/'logger' (v2); keep audio
            if fps is not None:
                sub.write_videofile(str(out_file), fps=fps, audio=True, codec="libx264", audio_codec="aac")
            else:
                sub.write_videofile(str(out_file), audio=True, codec="libx264", audio_codec="aac")

            sub.close()

            abs_path = out_file.resolve()
            items.append({
                "sec": t,
                "video_path": str(abs_path),
                "video_uri": abs_path.as_uri(),
            })
        except Exception as se:
            logger.error("Failed to write segment %d: %s", t, se)
            continue

    clip.close()
    if not items:
        raise RuntimeError("No segments produced. Check the video and codecs.")
    logger.info("Created %d segment(s) in %s", len(items), out_dir)
    return items

# ----------------------------
# Build conversations & extract embeddings
# ----------------------------
def extract_second_level_embeddings(model, processor, sec_items: List[Dict]) -> np.ndarray:
    failed_seconds = []
    embeds_all = []

    logger.info("Building per-second conversations and extracting embeddings...")
    for i in tqdm(range(0, len(sec_items), BATCH_SIZE), desc="Embedding batches", unit="batch"):
        batch = sec_items[i:i + BATCH_SIZE]

        conversations = []
        for it in batch:
            conversations.append([
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "video": it["video_uri"]},
                        {"type": "text",  "text": PLACEHOLDER_TEXT},
                    ],
                }
            ])

        try:
            text = processor.apply_chat_template(
                conversations, add_generation_prompt=False, tokenize=False
            )
            audios, images, videos = process_mm_info(
                conversations, use_audio_in_video=USE_AUDIO_IN_VIDEO
            )
            inputs = processor(
                text=text,
                audio=audios,
                images=images,
                videos=videos,
                return_tensors="pt",
                padding=True,
                use_audio_in_video=USE_AUDIO_IN_VIDEO,
            ).to(model.device).to(model.dtype)
        except Exception as e:
            logger.error("Processor/prep failed for batch starting at sec %d: %s", batch[0]["sec"], e)
            for dbg in batch:
                logger.error("   video_uri=%s | exists=%s",
                             dbg["video_uri"], Path(dbg["video_path"]).exists())
            failed_seconds.extend([it["sec"] for it in batch])
            continue

        try:
            with torch.no_grad():
                outputs = model.thinker(**inputs, output_hidden_states=True, return_dict=True)
                last_hidden = outputs.hidden_states[-1]
                pooled = pooled_last_hidden_state(last_hidden, inputs["attention_mask"])

                embeds_all.append(pooled.detach().cpu().numpy())
        except Exception as e:
            logger.error("Model forward failed for batch starting at sec %d: %s", batch[0]["sec"], e)
            failed_seconds.extend([it["sec"] for it in batch])
            continue

    if not embeds_all:
        raise RuntimeError("No embeddings produced; all batches failed.")

    embeds = np.concatenate(embeds_all, axis=0)
    if failed_seconds:
        logger.warning("Failed seconds (skipped): %s", failed_seconds)
    else:
        logger.info("All seconds processed successfully.")
    return embeds

# ----------------------------
# Entry point
# ----------------------------
def main():
    start_total = time.time()

    # ---- Load model ----
    t0 = time.time()
    model, processor = load_qwen()
    logger.info("Model + processor loaded in %.2f sec", time.time() - t0)

    # ---- Slice video ----
    t1 = time.time()
    try:
        if PREFER_FFMPEG_SLICER and ffmpeg_available():
            sec_items = slice_video_ffmpeg(VIDEO_PATH, OUT_DIR, max_secs=MAX_SECS)
        else:
            logger.warning("Using MoviePy slicer (FFmpeg unavailable or preference disabled).")
            sec_items = slice_video_v2(VIDEO_PATH, OUT_DIR, max_secs=MAX_SECS)
    except Exception as e:
        logger.exception("Slicing failed.")
        raise SystemExit(e)
    logger.info("Video slicing completed in %.2f sec", time.time() - t1)

    # ---- Extract embeddings ----
    t2 = time.time()
    try:
        embeds = extract_second_level_embeddings(model, processor, sec_items)
    except Exception as e:
        logger.exception("Embedding extraction failed.")
        raise SystemExit(e)
    logger.info("Embedding extraction completed in %.2f sec", time.time() - t2)

    # ---- Save embeddings ----
    np.save(SAVE_EMBEDDINGS_NPY, embeds)
    logger.info("Saved embeddings: shape=%s -> %s", embeds.shape, SAVE_EMBEDDINGS_NPY)

    # ---- Total time ----
    total = time.time() - start_total
    logger.info("Total runtime: %.2f sec (%.2f min)", total, total / 60)

if __name__ == "__main__":
    main()

In [None]:

import numpy as np

# Load the .npy file
data = np.load('./second_level_embeddings.npy')

# Print the first line (first element) of the array
print(data[0])


In [None]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the embeddings from the .npy file
embeddings = np.load('./second_level_embeddings.npy')

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)

# Display the shape of the similarity matrix
print("Similarity matrix shape:", similarity_matrix.shape)

# Optionally, print a portion of the matrix
print("Sample similarity values:\n", similarity_matrix[:20, :20])


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the embeddings
embeddings = np.load('./second_level_embeddings.npy')

# Compute cosine similarity between each consecutive pair
similarities = []
for i in range(len(embeddings) - 1):
    sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
    similarities.append(sim)

# Print the results
for idx, value in enumerate(similarities):
    print(f"Similarity between vector {idx} and {idx+1}: {value}")


In [None]:
##########code for audio transcript###########

from vosk import Model, KaldiRecognizer, SetLogLevel
import wave
import json
from moviepy import VideoFileClip

# Optional: reduce Vosk logging
SetLogLevel(-1)

# 1. Extract audio from video
video = VideoFileClip("/storage/home/saichandc/video/90secvideo.mp4")
video.audio.write_audiofile("audio.wav", codec='pcm_s16le', fps=16000)

# 2. Load Vosk model with word timestamps enabled
model = Model("/storage/home/saichandc/qwen/vosk")
wf = wave.open("audio.wav", "rb")

# Important: Set words=True for word-level timestamps
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetWords(True)  # Enable word-level timestamps

# 3. Process audio and collect word timestamps
word_timestamps = []

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    
    if rec.AcceptWaveform(data):
        result = json.loads(rec.Result())
        if 'result' in result:
            word_timestamps.extend(result['result'])

# Don't forget the final result
final_result = json.loads(rec.FinalResult())
if 'result' in final_result:
    word_timestamps.extend(final_result['result'])

# 4. Display word-by-word with timestamps
for word_info in word_timestamps:
    word = word_info['word']
    start = word_info['start']  # in seconds
    end = word_info['end']      # in seconds
    print(f"{start:.2f}s - {end:.2f}s: {word}")

# 5. Optional: Create second-by-second transcript
def get_transcript_by_second(word_timestamps, video_duration):
    """Group words by second"""
    transcript_by_second = {}
    
    for word_info in word_timestamps:
        word = word_info['word']
        start = int(word_info['start'])
        
        if start not in transcript_by_second:
            transcript_by_second[start] = []
        transcript_by_second[start].append(word)
    
    # Format output
    for second in range(int(video_duration) + 1):
        words = transcript_by_second.get(second, [])
        text = " ".join(words) if words else "[silence]"
        print(f"Second {second}: {text}")
    
    return transcript_by_second

# Get video duration
video_duration = video.duration
transcript = get_transcript_by_second(word_timestamps, video_duration)

In [4]:
import whisper

def get_transcript_by_second(video_path):
    # Load model
    model = whisper.load_model("large")
    
    # Transcribe with word timestamps
    result = model.transcribe(video_path, word_timestamps=True)
    
    # Organize by second
    transcript_by_second = {}
    
    for segment in result["segments"]:
        if "words" in segment:
            for word_info in segment["words"]:
                second = int(word_info["start"])
                word = word_info["word"].strip()
                
                if second not in transcript_by_second:
                    transcript_by_second[second] = []
                transcript_by_second[second].append(word)
    
    # Print second-by-second
    max_second = max(transcript_by_second.keys()) if transcript_by_second else 0
    for second in range(max_second + 1):
        words = transcript_by_second.get(second, [])
        text = " ".join(words) if words else "[silence]"
        print(f"Second {second}: {text}")
    
    return transcript_by_second

# Use it
transcript = get_transcript_by_second("/storage/home/saichandc/video/90secvideo.mp4")

100%|██████████████████████████████████████| 2.88G/2.88G [00:27<00:00, 113MiB/s]


Second 0: Until February. All right. There
Second 1: aren't 100 million people
Second 2: with preexisting conditions.
Second 3: As
Second 4: far as the say
Second 5: is concerned, the people
Second 6: already had their say.
Second 7: Okay.
Second 8: [silence]
Second 9: Justice Ginsburg
Second 10: said very
Second 11: powerfully, very strongly,
Second 12: at
Second 13: some point, 10
Second 14: years ago
Second 15: or so, she
Second 16: said a
Second 17: president and the Senate
Second 18: is elected
Second 19: for a period
Second 20: of time. But a president's
Second 21: elected for four
Second 22: years. We're not elected
Second 23: for three years. I'm
Second 24: not elected for three years.
Second 25: So we
Second 26: have the Senate. We
Second 27: have a president. He's elected
Second 28: to the next election. During
Second 29: that period of time,
Second 30: during that
Second 31: period of time, we
Second 32: have an
Second 33: opening. I'm
Second 34: not elected for three
Second

In [5]:
print(__import__('datetime').datetime.now().strftime("%H:%M:%S"))

15:25:58


In [6]:
########script for PBS ########

#!/bin/bash
#PBS -N pythonjob
#PBS -e error.log
#PBS -l select=2:ncpus=16
#PBS -q GPU
#PBS -l walltime=24:00:00
module load python/3.11.4

cd /storage/home/saichandc/qwen
source qwen_env_311/bin/activate

python whispertranscript.py

SyntaxError: invalid syntax (2289724161.py, line 9)