In [2]:

from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
import re
import pandas as pd

In [3]:
# 2️⃣ Optional: Function to clean captions
def clean_caption(text):
    """
    Normalize a caption string before it is embedded.

    Steps, in order:
    - Converts to lowercase
    - Removes URLs (``http://``, ``https://`` and ``www.`` links)
    - Removes every character that is not an ASCII letter, digit or whitespace
    - Collapses runs of whitespace into a single space and trims both ends

    Args:
        text (str): Raw caption text.

    Returns:
        str: The cleaned caption. May be empty if nothing survives cleaning.
    """
    text = text.lower()
    # Anchor on a real scheme / "www." so plain words such as "https" or
    # "httpbin" are kept; only actual links are dropped.
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)
    # Text is already lowercase, so a-z is sufficient here.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Deleting a URL or a standalone symbol leaves gaps behind; split/join
    # collapses any whitespace run (including newlines/tabs) to one space
    # and also strips leading/trailing whitespace.
    return ' '.join(text.split())

In [4]:
# 3️⃣ Sample reel captions (raw text, one entry per reel)
reel_captions = [
    "Funny cat fails compilation",
    "Cat funny video",
    "Cooking tutorial: pasta carbonara",
    "Football skills and goals",
    "Python programming tips for beginners",
    "Morning yoga and meditation routine",
    "Travel vlog: exploring Paris streets",
    "Workout motivation for gym lovers",
    "ASMR satisfying sounds",
    "Street food review in Delhi",
]

# Optional preprocessing: run every caption through clean_caption,
# keeping the original order so indices stay aligned with the embeddings.
reel_captions = list(map(clean_caption, reel_captions))

In [5]:
# 4️⃣ Load the pre-trained Sentence Transformer checkpoint
model = SentenceTransformer('all-MiniLM-L6-v2')

# 5️⃣ Encode every caption; keep the result as a tensor so it can be fed
#    straight into util.cos_sim
embeddings = model.encode(reel_captions, convert_to_tensor=True)

# 6️⃣ Pairwise cosine similarity (captions x captions), then move the result
#    to the CPU and convert to a NumPy array for use with pandas
similarity_tensor = util.cos_sim(embeddings, embeddings)
similarity_matrix = similarity_tensor.cpu().numpy()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# 🔹 Show the similarity matrix with numeric row/column labels
print("\nCosine Similarity Matrix (Numbered)")
caption_ids = [str(position) for position in range(len(reel_captions))]
matrix_df = pd.DataFrame(
    similarity_matrix,
    index=caption_ids,
    columns=caption_ids,
)
print(matrix_df.round(2))

# Legend: map each numeric label back to the caption it stands for
print("\nCaption Index Reference:")
for position in range(len(reel_captions)):
    print(f"{position}: {reel_captions[position]}")


===== Cosine Similarity Matrix (Numbered) =====
      0     1     2     3     4     5     6     7     8     9
0  1.00  0.64  0.13 -0.05  0.09 -0.04  0.05 -0.06  0.16  0.01
1  0.64  1.00  0.12  0.08  0.06  0.00  0.14  0.00  0.13  0.03
2  0.13  0.12  1.00  0.06  0.21  0.06  0.06  0.04  0.15  0.28
3 -0.05  0.08  0.06  1.00  0.11  0.05  0.02  0.20 -0.02  0.01
4  0.09  0.06  0.21  0.11  1.00  0.03  0.10  0.04  0.06 -0.01
5 -0.04  0.00  0.06  0.05  0.03  1.00 -0.01  0.22  0.05  0.03
6  0.05  0.14  0.06  0.02  0.10 -0.01  1.00  0.06  0.03  0.20
7 -0.06  0.00  0.04  0.20  0.04  0.22  0.06  1.00  0.01 -0.05
8  0.16  0.13  0.15 -0.02  0.06  0.05  0.03  0.01  1.00  0.08
9  0.01  0.03  0.28  0.01 -0.01  0.03  0.20 -0.05  0.08  1.00

Caption Index Reference:
0: funny cat fails compilation
1: cat funny video
2: cooking tutorial pasta carbonara
3: football skills and goals
4: python programming tips for beginners
5: morning yoga and meditation routine
6: travel vlog exploring paris streets
7: workou

In [9]:
# For every caption, recommend the most similar *other* caption.
if len(reel_captions) < 2:
    # With zero or one caption there is no other item to recommend.
    print("Need at least two captions to make a recommendation.")
else:
    for i in range(len(reel_captions)):
        watched_embedding = embeddings[i]
        # cos_sim returns a 1 x N matrix here; take its only row.
        similarities = util.cos_sim(watched_embedding, embeddings)[0]
        # Exclude the caption itself. -inf is used instead of -1 because -1 is
        # a legitimate cosine value and could tie with (or beat) a real match.
        similarities[i] = float('-inf')

        # Get the index of the most similar caption
        top_index = torch.argmax(similarities).item()

        print(f"\nBest Match for: '{reel_captions[i]}'")
        print(f"→ Recommendation: '{reel_captions[top_index]}'")
        print(f"→ Similarity Score: {similarities[top_index].item():.2f}")



Best Match for: 'funny cat fails compilation'
→ Recommendation: 'cat funny video'
→ Similarity Score: 0.64

Best Match for: 'cat funny video'
→ Recommendation: 'funny cat fails compilation'
→ Similarity Score: 0.64

Best Match for: 'cooking tutorial pasta carbonara'
→ Recommendation: 'street food review in delhi'
→ Similarity Score: 0.28

Best Match for: 'football skills and goals'
→ Recommendation: 'workout motivation for gym lovers'
→ Similarity Score: 0.20

Best Match for: 'python programming tips for beginners'
→ Recommendation: 'cooking tutorial pasta carbonara'
→ Similarity Score: 0.21

Best Match for: 'morning yoga and meditation routine'
→ Recommendation: 'workout motivation for gym lovers'
→ Similarity Score: 0.22

Best Match for: 'travel vlog exploring paris streets'
→ Recommendation: 'street food review in delhi'
→ Similarity Score: 0.20

Best Match for: 'workout motivation for gym lovers'
→ Recommendation: 'morning yoga and meditation routine'
→ Similarity Score: 0.22

Bes