In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file under the Kaggle input directory and print its full path.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Cell 1: Install dependencies
!pip install gradio torchvision torch


Collecting gradio
  Downloading gradio-5.34.0-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.10.3 (from gradio)
  Downloading gradio_client-1.10.3-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Co

In [2]:
!pip install gradio torch torchvision timm einops torchcodec \
  git+https://github.com/huggingface/transformers@v4.52.4-VJEPA-2-preview


Collecting git+https://github.com/huggingface/transformers@v4.52.4-VJEPA-2-preview
  Cloning https://github.com/huggingface/transformers (to revision v4.52.4-VJEPA-2-preview) to /tmp/pip-req-build-s62tf7jf
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-s62tf7jf
  Running command git checkout -q 8c6a8ebd9d8159805df3be45e6ef6be3d827af18
  Resolved https://github.com/huggingface/transformers to commit 8c6a8ebd9d8159805df3be45e6ef6be3d827af18
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torchcodec
  Downloading torchcodec-0.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (10 kB)
Downloading torchcodec-0.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25

In [5]:
import gradio as gr
import torch
import numpy as np
import cv2
from transformers import AutoVideoProcessor, AutoModel
from sklearn.linear_model import LogisticRegression
import pickle

# Load V-JEPA2: the video processor and the model both come from the same Hub repo id.
hf_repo = "facebook/vjepa2-vitl-fpc64-256"
processor = AutoVideoProcessor.from_pretrained(hf_repo)
model = AutoModel.from_pretrained(
    hf_repo,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map="auto",          # leave device placement to the loader
)

# Optional: try loading a previously saved classifier ("probe") from the working directory.
# SECURITY: pickle.load can execute arbitrary code -- only load a probe.pkl you created yourself.
probe = None       # always defined, so later code can reference it safely
has_probe = False
try:
    # The context manager closes the file handle even if unpickling fails.
    with open("probe.pkl", "rb") as probe_file:
        probe = pickle.load(probe_file)
    has_probe = True
except Exception:
    # Best-effort load: a missing or unreadable probe only disables the prediction step.
    # Catching `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
    probe = None
    has_probe = False

def read_video_opencv(video_path, num_frames=64):
    """Decode up to ``num_frames`` evenly spaced frames from a video, converted BGR -> RGB.

    Frame indices are chosen with ``np.linspace`` over ``[0, total_frames - 1]`` and
    truncated to integers. When the video has fewer frames than ``num_frames`` the
    indices collapse onto each other, so fewer (unique) frames are returned.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Number of sample positions requested.

    Returns:
        A list of frames in playback order. The list is empty when the reported
        frame count is not positive, and is cut short if a read fails.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            return []

        # A set gives O(1) membership tests; `i in ndarray` scans the array for every frame.
        wanted = set(np.linspace(0, total_frames - 1, num_frames, dtype=int).tolist())
        if not wanted:
            return []
        last_wanted = max(wanted)

        frames = []
        # Frames are read sequentially; stop as soon as the last sampled index is decoded.
        for i in range(last_wanted + 1):
            ret, frame = cap.read()
            if not ret:
                break
            if i in wanted:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        return frames
    finally:
        # Release the capture even when decoding or colour conversion raises.
        cap.release()

def predict(video_path):
    """Extract a pooled V-JEPA2 embedding for a video and, if a probe is loaded, classify it.

    Args:
        video_path: Path of the uploaded video. May be ``None`` or empty when the
            form is submitted without a video.

    Returns:
        A human-readable status string: an error message when there is no video or
        too few frames, otherwise the embedding shape plus (when ``has_probe`` is
        true) the label predicted by ``probe``.
    """
    # Guard first: the Gradio video input can hand over None when nothing was uploaded,
    # and opening a capture on None would raise instead of returning a message.
    if not video_path:
        return "❌ No video provided."

    frames = read_video_opencv(video_path)

    if len(frames) < 10:
        return "❌ Not enough frames in video."

    # Preprocess the frames and move the tensors to the device the model lives on.
    inputs = processor(frames, return_tensors="pt").to(model.device)
    with torch.no_grad():
        features = model.get_vision_features(**inputs)
        # Average over dim 1 (assumed to be the token axis -- TODO confirm) to get
        # one vector per video, then convert to NumPy for the probe.
        embedding = features.mean(dim=1).cpu().numpy()

    result = f"✅ Embedding shape: {embedding.shape}"

    if has_probe:
        label = probe.predict(embedding)[0]
        result += f"\n🎯 Predicted action: {label}"

    return result


video_preprocessor_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/785 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

In [7]:
# Minimal Gradio UI: one video upload as input, the string returned by `predict` as output.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Video(label="Upload a short video"),  # note: no source="upload" argument is passed
    outputs="text",
    title="🎥 V-JEPA2 Video Understanding App",
    description="This app extracts features from videos using Meta's V-JEPA2 model."
)
# Start serving the interface.
demo.launch()


* Running on local URL:  http://127.0.0.1:7860
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://58b65e46843556e36c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [8]:
!pip install gradio transformers decord


Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: decord
Successfully installed decord-0.6.0


In [15]:
import gradio as gr
import torch
from transformers import AutoVideoProcessor, AutoModel
import decord
from decord import VideoReader, cpu
import numpy as np

# Set up model and processor from a single Hub repo id.
# Note: this assigns the notebook-level names `processor` and `model`.
model_id = "facebook/vjepa2-vith-fpc64-256"
processor = AutoVideoProcessor.from_pretrained(model_id)
# Put the model in inference mode as part of loading; .eval() returns the module itself.
model = AutoModel.from_pretrained(model_id).eval()

# Sample labels
LABELS = ["Walking", "Running", "Cooking", "Working"]


class DummyProbe(torch.nn.Module):
    """Placeholder linear classification head applied to a pooled video embedding.

    The layer is created with its default random initialisation and nothing in this
    notebook trains it or loads weights into it, so its outputs are placeholders
    rather than meaningful predictions.

    Args:
        in_dim: Size of each input embedding vector. Defaults to 1280, the value
            this notebook uses for its current checkpoint.
        num_classes: Number of output classes. Defaults to ``len(LABELS)``.
    """

    def __init__(self, in_dim=1280, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = len(LABELS)
        # Parameterised so switching checkpoints does not require editing the class.
        self.linear = torch.nn.Linear(in_dim, num_classes)

    def forward(self, x):
        """Return class probabilities (softmax over dim 1) for a 2-D batch ``x``."""
        return torch.softmax(self.linear(x), dim=1)


probe = DummyProbe()

# Core function
def analyze_video(video_path):
    """Sample frames from a video, embed them with the model and score them with ``probe``.

    Args:
        video_path: Path of the uploaded video. May be ``None`` or empty when the
            form is submitted without a video.

    Returns:
        A message string: either an explanation of why the video could not be
        analysed, or the predicted label from ``LABELS`` with its probability.
    """
    # Guard first: the Gradio video input can hand over None when nothing was uploaded.
    if not video_path:
        return "No video provided. Please upload a video."

    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    if total_frames == 0:
        # With zero frames the linspace below would produce negative indices.
        return "Could not read any frames from the video."

    # Sample 16 frame indices uniformly across the clip (repeats occur for short clips).
    indices = np.linspace(0, total_frames - 1, 16).astype(int)
    video_frames = vr.get_batch(indices).asnumpy()

    # Preprocess frames
    inputs = processor(videos=list(video_frames), return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
        # Average over dim 1; expected to give shape (1, 1280) -- TODO confirm.
        embedding = outputs.last_hidden_state.mean(dim=1)
        probs = probe(embedding)
        pred_label = LABELS[probs.argmax().item()]
        confidence = probs.max().item()

    return f"Predicted Action: {pred_label} (Confidence: {confidence:.2f})"

# Gradio app
# Build the UI for `analyze_video`: one mp4 video input, one text output.
# NOTE(review): this rebinds `demo`, the name also used for the `predict` interface;
# that earlier app is not closed here -- confirm whether it should be shut down first.
demo = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload a short video", format="mp4"),
    outputs="text",
    title="🎥 V-JEPA2 Action Predictor",
    description="Upload a short video and see what action V-JEPA2 thinks it contains!"
)

# Start serving the interface.
demo.launch()


* Running on local URL:  http://127.0.0.1:7867
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://462cc70b600711584c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


