In [None]:
!pip install --upgrade "pyarrow>=21.0.0"
!pip install -q "transformers>=4.57.0"
!pip install -q datasets av
!pip install -q bitsandbytes accelerate
!pip install "pydantic<2.12" --no-deps

In [None]:
import pickle
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
class EmbeddingGenerationPipeline():
    """Text-embedding helper built on a 4-bit quantized SentenceTransformer.

    The model is loaded once in the constructor and reused by every call to
    :meth:`encode`.
    """

    def __init__(self, model_name="Qwen/Qwen3-Embedding-4B", device="cuda:0"):
        """Load the embedding model.

        Args:
            model_name: Model identifier or path handed to ``SentenceTransformer``.
                Defaults to the Qwen3 4B embedding checkpoint.
            device: Value forwarded as ``device_map`` to the underlying model
                loader. Defaults to the first CUDA device.
        """
        self.model = SentenceTransformer(
            model_name,
            model_kwargs={
                "device_map": device,
                # 4-bit weights with bfloat16 as the requested dtype.
                # NOTE(review): 4-bit loading presumably relies on the
                # bitsandbytes package being installed -- confirm.
                "load_in_4bit": True,
                "dtype": torch.bfloat16,
            },
            # Pad on the left so the final position of every sequence is a real token.
            # NOTE(review): presumably required by the model's pooling -- confirm
            # against the model card.
            tokenizer_kwargs={"padding_side": "left"},
        )

    def encode(self, captions, batch_size=64, show_progress_bar=True):
        """Embed ``captions`` with the loaded model.

        Args:
            captions: A single string, or an iterable of strings. Any
                non-string iterable (tuple, pandas Series, generator, ...) is
                materialized into a list before being handed to the model.
            batch_size: Number of captions encoded per forward pass.
            show_progress_bar: Whether the model should display a progress bar.

        Returns:
            Whatever ``self.model.encode`` returns when called with
            ``convert_to_numpy=True``.
        """
        if not isinstance(captions, str):
            # Normalize to a plain list so the model always receives a sized,
            # positionally indexable sequence.
            captions = list(captions)
        return self.model.encode(
            captions,
            batch_size=batch_size,
            show_progress_bar=show_progress_bar,
            convert_to_numpy=True,
        )

In [None]:
# Read the pickled captions table.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only run this against a file from a trusted source.
with open("msrvtt_captions.pkl", "rb") as captions_file:
    df = pickle.load(captions_file)

pipeline = EmbeddingGenerationPipeline()

# Embed every generated caption, then store one embedding per row.
captions = df["generated_caption"].tolist()
embeddings = pipeline.encode(captions)
df["embedding"] = list(embeddings)

# Persist the table with the new column. The output name has no file
# extension; it is kept as-is so existing consumers of this path still work.
with open("msrvtt_embeddings", "wb") as embeddings_file:
    pickle.dump(df, embeddings_file)