# Create Embeddings for titles and thumbnails

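This notebook generates embeddings for video titles (Qwen3-Embedding-0.6B via `sentence-transformers`) and for video thumbnails (CLIP ViT-B/32), and saves each set to an `.npz` file.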

## Text Embeddings

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# No explicit device: sentence-transformers then uses a GPU when one is
# available and falls back to the CPU otherwise, so this cell no longer fails
# on hosts without CUDA (a hard-coded device='cuda' would).
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

# Video metadata table; the `title` and `display_id` columns are read below.
df = pd.read_parquet("smda/project/data/random_20000_scores.parquet")

In [3]:
# Encode all video titles in a single call. The ids collected from the same
# DataFrame are later matched to these embeddings by position.
titles = df["title"].to_list()
embeddings = model.encode(titles)

In [4]:
ids = df['display_id'].tolist()

# Ids and embeddings are matched by position, so they must have equal length;
# fail loudly instead of silently dropping or mis-pairing entries.
if len(ids) != len(embeddings):
    raise ValueError(
        f"Expected one embedding per id, got {len(ids)} ids and "
        f"{len(embeddings)} embeddings"
    )

# Map each video id to its title embedding.
title_embeddings = dict(zip(ids, embeddings))

TEXT_OUTPUT_FILE = "smda/project/data/title_embeddings.npz"

# Unpack the mapping so every embedding is stored as its own array named after
# its video id (the same layout the image embeddings use). Passing the dict
# positionally would store it as one pickled object array under "arr_0",
# which np.load only reads when allow_pickle=True is given.
np.savez(TEXT_OUTPUT_FILE, **title_embeddings)

## Image Embeddings

In [5]:
import os
import torch
import clip
from PIL import Image
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm
import pandas as pd

In [6]:
# Load the scores table for the image section. This is the same parquet file
# that the text-embedding section reads, and it rebinds `df`.
df = pd.read_parquet("smda/project/data/random_20000_scores.parquet")

In [7]:
# Preview the first rows as a quick check of the loaded columns.
df.head()

Unnamed: 0,display_id,categories,channel_id,crawl_date,description,dislike_count,duration,like_count,tags,title,...,log_subs,log_view_count,predicted_view_count,performance_score,view_count_difference,like_dislike_ratio,dislike_views_ratio,like_views_ratio,clickbait_score,clickbait
0,xTDVWRSmwZM,Gaming,UCYVinkwSX7szARULgYpvhLw,2019-11-11,KINO DER UNTOTEN REMASTERED GAMEPLAY! (BO3 Zom...,1878.0,2265,28276.0,"black ops 3,black ops 3 zombies nacht der unto...",🔫 Pistol + Knife *ONLY* CHALLENGE! 🔪 - Nacht D...,...,15.994786,14.402278,12.225688,1.178034,-2.17659,0.93772,0.001044,0.015725,3.960614,False
1,m4upkcabhog,Travel & Events,UCm23p6UpTcByr980IuiXf7g,2019-11-02,🌟⭐️ WATCH LATEST DISNEY VIDEO: http://vid.io/x...,6.0,594,225.0,"lightcycle,shanghai disney resort,tron,shangha...",[HD] FULL POV TRON Lightcycle Power Run | Tour...,...,10.205678,9.700575,10.909698,0.88917,1.209123,0.974026,0.000367,0.013781,1.884791,False
2,ezYjJlL0IgU,News & Politics,UC9k-yiEpRHMNVOnOi_aQK8w,2019-11-02,More from Inside Edition: https://www.youtube....,210.0,67,3053.0,"memphis nail salon,denies,rose nails memphis,p...",Nail Salon Owner Denies He Charges Overweight ...,...,13.607274,12.653115,11.401222,1.109803,-1.251892,0.935642,0.000671,0.009762,6.592548,False
3,RM-i7-elgJQ,Entertainment,UCv61IAZLDELwk2Kk5hfX0mg,2019-11-01,Join me on this adventure as I am hiking in th...,60.0,371,2716.0,"adventure,nature,searching,collecting,collect,...",I Found A New Cave!,...,13.500285,11.253675,11.654417,0.965615,0.400742,0.978386,0.000778,0.035198,0.61406,False
4,gRggH5N2dPY,Gaming,UCOA1yrrvuBsl0ifIioiTNXQ,2019-11-20,Let's Play Prophesy of Pendor 3.8.4 - The Moun...,5.0,997,458.0,"TAGS:,Prophesy,of,Pendor,Prophecy,Mount,And,Bl...",Let's Play Prophesy of Pendor 3.8.4 Gameplay -...,...,11.94755,9.577065,11.267419,0.849979,1.690354,0.989201,0.000347,0.031739,0.340244,False


In [8]:
# Folder in which each thumbnail is looked up as "<display_id>.jpg".
IMAGE_FOLDER = "smda/project/data/thumbnails"
# Archive that receives the image embeddings, one array per image id.
OUTPUT_FILE = "smda/project/data/image_embeddings.npz"
# Number of images per DataLoader batch.
BATCH_SIZE = 32
# Model identifier passed to clip.load.
MODEL_NAME = "ViT-B/32"
# When True, image tensors are converted with .half(); the model itself is
# only converted with .half() when it also runs on CUDA.
USE_FP16 = False

# Ids of the thumbnails to embed, in DataFrame row order.
image_ids = df["display_id"].tolist()

# Pad to a square first, since the embedding model requires a square image.
def square_pad(img):
    """Return ``img`` padded with zero-valued borders so width equals height.

    The image stays centred: the missing pixels are split between the two
    opposite sides, and when the difference is odd the extra pixel goes to
    the right or bottom edge.
    """
    width, height = img.size
    side = max(width, height)
    slack_w = side - width
    slack_h = side - height
    left, top = slack_w // 2, slack_h // 2
    # Border sizes in (left, top, right, bottom) order.
    borders = (left, top, slack_w - left, slack_h - top)
    return transforms.functional.pad(img, borders, fill=0, padding_mode='constant')

# Preprocessing applied to every thumbnail: square padding, bicubic resize to
# 224 px, centre crop, tensor conversion and per-channel normalization.
# NOTE(review): the mean/std values look like CLIP's own normalization
# statistics — confirm against the preprocess returned by clip.load.
_norm_mean = [0.48145466, 0.4578275, 0.40821073]
_norm_std = [0.26862954, 0.26130258, 0.27577711]
_bicubic = transforms.InterpolationMode.BICUBIC

custom_preprocess = transforms.Compose([
    transforms.Lambda(square_pad),
    transforms.Resize(224, interpolation=_bicubic),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_norm_mean, std=_norm_std),
])

# Prefer the GPU when one is present; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The second value returned by clip.load is discarded; the dataset below is
# built with custom_preprocess instead.
model, _ = clip.load(MODEL_NAME, device=device)

# Half precision is applied only on the GPU, and only when requested.
if device == "cuda" and USE_FP16:
    model = model.half()

class ImageDataset(Dataset):
    """Dataset that yields ``(image_tensor, image_id)`` pairs for thumbnails.

    Each id is looked up as ``<image_folder>/<image_id>.jpg``. A file that
    cannot be opened or decoded is reported with a printed warning and
    replaced by a black 224x224 image, so the number of items always equals
    the number of ids.
    """

    def __init__(self, image_folder, image_ids, transform):
        """Store the lookup folder, the ids to serve and the transform.

        Args:
            image_folder: Directory that contains the ``.jpg`` files.
            image_ids: Sequence of ids; item ``idx`` serves ``image_ids[idx]``.
            transform: Callable applied to the loaded RGB image.
        """
        self.image_folder = image_folder
        self.image_ids = image_ids
        self.transform = transform

    def __len__(self):
        """Return the number of image ids."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Load, transform and return the image for ``image_ids[idx]``.

        Returns:
            A ``(image_tensor, image_id)`` tuple. The tensor is converted with
            ``.half()`` when the module-level ``USE_FP16`` flag is set.
        """
        image_id = self.image_ids[idx]
        path = os.path.join(self.image_folder, f"{image_id}.jpg")

        try:
            # The context manager closes the file deterministically, also when
            # decoding fails part-way; convert() returns an image that remains
            # usable after the file is closed.
            with Image.open(path) as raw_image:
                image = raw_image.convert("RGB")
        except Exception as e:
            # Best-effort: keep the batch aligned with the ids instead of
            # aborting the whole run on one unreadable thumbnail.
            print(f"⚠️ Warning: Could not load image {path} — {e}")
            image = Image.new("RGB", (224, 224), color=(0, 0, 0))  # fallback blank

        image_tensor = self.transform(image)
        if USE_FP16:
            image_tensor = image_tensor.half()
        return image_tensor, image_id

# Serve the thumbnails in their original order, loading in the main process.
dataset = ImageDataset(
    image_folder=IMAGE_FOLDER,
    image_ids=image_ids,
    transform=custom_preprocess,
)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# Put the model in evaluation mode and start a fresh id -> embedding mapping.
model.eval()
embeddings = {}

with torch.no_grad():
    progress = tqdm(dataloader, desc="Embedding images")
    for images, batch_ids in progress:
        images = images.to(device)
        features = model.encode_image(images)
        # Scale every embedding to unit length.
        features = features / features.norm(dim=-1, keepdim=True)

        # Move the batch to the CPU once, then file each row under its id.
        cpu_features = features.cpu()
        embeddings.update(
            (img_id, emb.numpy()) for img_id, emb in zip(batch_ids, cpu_features)
        )

# ---- SAVE TO NPZ ----
# Each embedding is written as its own array, named after its image id.
np.savez(OUTPUT_FILE, **embeddings)
print(f"\n✅ Saved {len(embeddings)} embeddings to '{OUTPUT_FILE}'")

# ---- OPTIONAL: PRINT SAMPLE ----
# Guarded so a run that produced no embeddings does not fail with IndexError.
# next(iter(...)) reads the first key without copying every key into a list.
if embeddings:
    sample_key = next(iter(embeddings))
    print(f"Sample: '{sample_key}' => shape {embeddings[sample_key].shape}")

Embedding images: 100%|██████████| 1240/1240 [41:12<00:00,  1.99s/it]



✅ Saved 39676 embeddings to 'smda/project/data/image_embeddings.npz'
Sample: 'xTDVWRSmwZM' => shape (512,)


In [41]:
# NOTE(review): this displays the entire `embeddings` dict (one array per image
# id). Prefer a bounded preview such as `len(embeddings)` or a single entry to
# keep the saved notebook small.
embeddings

{'Os__doddX24': array([ 3.3779e-03, -9.8648e-03,  1.2589e-03, -5.8228e-02,  7.8354e-03,
         4.1618e-03, -1.8597e-03,  1.0138e-01, -9.4681e-03,  3.9864e-03,
        -3.6373e-03,  2.2324e-02,  8.0261e-02, -1.3481e-02,  3.2410e-02,
        -2.8580e-02,  3.9368e-02, -3.1776e-03,  1.3512e-02, -4.4922e-02,
        -5.2124e-02,  1.2642e-02,  8.5907e-03, -4.6478e-02,  1.8021e-02,
        -1.1032e-02,  1.3199e-02, -7.6752e-03, -1.3420e-02,  1.2894e-02,
        -7.0333e-04, -1.4244e-02,  6.6719e-03,  6.0425e-02, -1.9272e-02,
        -1.4477e-03, -3.6469e-02,  2.4597e-02,  5.1422e-02, -1.2964e-01,
        -2.3697e-02, -4.1595e-02,  1.4732e-02,  1.2581e-02,  1.9485e-02,
         8.2886e-02, -3.2562e-02,  6.4735e-03,  2.5482e-02,  4.3793e-03,
         1.7395e-02, -3.5004e-02,  5.4962e-02, -3.7292e-02, -1.3533e-03,
         4.7607e-03,  6.2317e-02,  4.3915e-02,  3.4454e-02, -4.1504e-02,
        -3.2623e-02,  5.7716e-03,  2.7893e-02,  3.2288e-02, -3.5496e-03,
        -5.9509e-03, -2.0477e-02,  1