In [3]:
# Install third-party dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may pull different releases than
# the ones this notebook was last run with — consider pinning.
# NOTE(review): the `ffmpeg` module imported in the next cell is not installed here;
# presumably it comes from the `ffmpeg-python` package — confirm and add it if needed.
%pip install pinecone
%pip install sentence_transformers


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import io
import os
import tempfile

import ffmpeg
import numpy as np
from PIL import Image
from sentence_transformers import models, SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Initialize the CLIP model, wrapped in a SentenceTransformer.
# This one model is used for both the video frames (images) and the text query later on.
# NOTE(review): `models.CLIPModel()` is called without arguments, so whichever checkpoint the
# library defaults to is loaded — confirm which one and consider naming it explicitly.
img_model = SentenceTransformer(modules=[models.CLIPModel()])

In [4]:
def get_scaled_size(width, height, target_width=224):
    """Return the (width, height) to scale a frame to, preserving aspect ratio.

    The output width is fixed to ``target_width`` and the height is scaled by
    the same factor, truncated to an integer.

    Args:
        width: Original frame width in pixels (must be > 0).
        height: Original frame height in pixels (must be > 0).
        target_width: Desired output width in pixels. Defaults to 224
            (presumably chosen to match the CLIP input size — confirm).

    Returns:
        Tuple ``(target_width, scaled_height)`` where ``scaled_height >= 1``.

    Raises:
        ValueError: If any of the dimensions is not strictly positive.
    """
    if width <= 0 or height <= 0 or target_width <= 0:
        raise ValueError(
            f"width, height and target_width must be positive, "
            f"got width={width}, height={height}, target_width={target_width}"
        )

    w_percent = target_width / float(width)
    h_size = int(float(height) * w_percent)

    # Very wide inputs would otherwise truncate to a zero-pixel height, which
    # is not a valid frame size for scaling or for reshaping decoded frames.
    return target_width, max(1, h_size)

In [5]:
def to_byte_array(image):
    """Encode a numpy image array as PNG and return the encoded bytes."""
    buffer = io.BytesIO()
    try:
        # Build a PIL image from the array and serialize it into the in-memory buffer.
        Image.fromarray(image).save(buffer, format="PNG")
        return buffer.getvalue()
    finally:
        buffer.close()

In [6]:
def get_frames(video_path, num_frames=10):
    """Decode a video with ffmpeg and return randomly sampled frames as PNG bytes.

    The file is probed for its first video stream, every frame is decoded to
    RGB at the size given by ``get_scaled_size``, and ``num_frames`` frames
    are then picked at random using NumPy's global random state.

    Args:
        video_path: Path to the video file on disk.
        num_frames: Number of frames to return. Defaults to 10.

    Returns:
        A list of ``num_frames`` PNG-encoded frames (``bytes``), ordered by
        their position in the video. Frames are distinct whenever the video
        has at least ``num_frames`` frames; shorter videos are sampled with
        replacement so the list length is still ``num_frames``.

    Raises:
        FileNotFoundError: If ``video_path`` is not an existing file.
        ValueError: If the file has no video stream or no frames were decoded.
        ffmpeg.Error: Propagated from ffmpeg-python when ffprobe/ffmpeg fails.
    """
    # ffmpeg reads straight from the path; there is no need to load the whole
    # file into memory and copy it to a temporary file first (an open
    # NamedTemporaryFile also cannot be reopened by another process on Windows).
    source = os.fspath(video_path)
    if not os.path.isfile(source):
        raise FileNotFoundError(f"Video file not found: {source}")

    probe = ffmpeg.probe(source, threads=1)
    video_info = next(
        (s for s in probe['streams'] if s['codec_type'] == 'video'),
        None,
    )
    if video_info is None:
        raise ValueError(f"No video stream found in {source}")
    width, height = get_scaled_size(int(video_info['width']), int(video_info['height']))

    # NOTE: the whole (down-scaled) video is decoded into memory before
    # sampling, so memory use grows with the video length.
    out, _ = (
        ffmpeg
        .input(source, threads=1)
        .filter('scale', width, height)
        .output('pipe:', format='rawvideo', pix_fmt='rgb24')
        .run(capture_stdout=True, capture_stderr=True)
    )
    frames = (
        np
        .frombuffer(out, np.uint8)
        .reshape([-1, height, width, 3])
    )

    total_frames = frames.shape[0]
    if total_frames == 0:
        raise ValueError(f"No frames could be decoded from {source}")

    # Sample without replacement when possible so the same frame is not
    # returned twice; fall back to replacement only for very short videos.
    indexes = np.sort(
        np.random.choice(
            total_frames,
            size=num_frames,
            replace=total_frames < num_frames,
        )
    )
    return [to_byte_array(frame) for frame in frames[indexes]]

In [7]:
def get_embeddings(frames):
    """Embed encoded image frames with the notebook-level ``img_model``.

    Args:
        frames: Iterable of encoded image bytes (e.g. the PNG bytes produced
            by ``get_frames``).

    Returns:
        Whatever ``img_model.encode`` returns for the decoded images
        (one embedding per frame).
    """
    decoded_images = []
    for raw_frame in frames:
        # Wrap the raw bytes in a file-like object so PIL can decode them.
        decoded_images.append(Image.open(io.BytesIO(raw_frame)))
    return img_model.encode(decoded_images)

In [8]:
# Path to your video file
# NOTE(review): hard-coded and relative to the notebook's working directory.
video_path = "video1881698513.mp4"

# Get frames from the video
# (frame selection inside get_frames is random, so the frames — and therefore the
# embeddings — differ between runs unless NumPy's global RNG is seeded first)
frames = get_frames(video_path)

# Get embeddings for the frames
embeddings = get_embeddings(frames)

# Print the dimensions of the embeddings
# (length of the first embedding vector; the vector index must be created with this dimension)
embedding_dimension = embeddings[0].shape[0]
print(f"The dimension of the embeddings is: {embedding_dimension}")

The dimension of the embeddings is: 512


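# Create the Pinecone index — don't rerun the cell below unnecessarily. As written it only creates the index when it is missing; deleting or recreating the index would reset the database.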

In [9]:
from pinecone import Pinecone, ServerlessSpec

# Read the API key from the environment instead of embedding it in the notebook.
# SECURITY: a literal key was previously hard-coded in this cell; treat that key as
# leaked and revoke/rotate it. A missing variable fails fast with a KeyError.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "tiktok-hackathon"
dimension = 512  # Dimension of CLIP embeddings

# Create the serverless index only when it does not exist yet, so re-running this
# cell leaves an existing index (and the vectors stored in it) untouched.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [10]:
# Build one upsert record per frame embedding: the id is the frame's position in
# `embeddings` (as a string) and the values are the embedding as a plain list.
# NOTE(review): despite the variable name, no "metadata" field is attached here.
vectors_with_metadata = [
    dict(id=str(position), values=vector.tolist())
    for position, vector in enumerate(embeddings)
]

In [11]:
# Open a handle to the index and upload the frame vectors into the
# "video_embeddings" namespace. The call's response is displayed as the cell output.
# NOTE(review): ids are just "0", "1", ... so upserting frames from another video into
# this namespace would presumably overwrite these entries — confirm before reuse.
index = pc.Index(index_name)
index.upsert(vectors=vectors_with_metadata, namespace="video_embeddings")

{'upserted_count': 10}

# Searching Pinecone

In [12]:
# Embed a free-text query with the same model that embedded the frames, so the
# resulting vector can be compared against the stored frame vectors.
test_vect = img_model.encode('Triangles are very cool objects and a presentation on them is interesting.')

In [13]:
# Look up the single closest stored vector (top_k=1) to the text query in the
# "video_embeddings" namespace.
# NOTE(review): include_values=True returns the full stored vector for each match, which
# makes the printed result very long; the records built in this notebook carry no
# metadata, so include_metadata=True presumably adds nothing — confirm both are needed.
query_results = index.query(
    namespace="video_embeddings",
    vector=test_vect.tolist(),
    top_k=1,
    include_values=True,
    include_metadata=True
)

# Print the query results
print(query_results)

{'matches': [{'id': '7',
              'score': 0.279680938,
              'values': [0.147639453,
                         -0.103531122,
                         -0.122731723,
                         -0.0790869892,
                         -0.658142209,
                         -0.24769868,
                         -0.0634254515,
                         -0.350750536,
                         0.508789659,
                         -0.155952871,
                         0.068514213,
                         0.322130144,
                         0.398772657,
                         0.193546489,
                         -0.118484773,
                         0.0567304343,
                         -0.0542633832,
                         0.115617201,
                         0.0644490123,
                         0.181323558,
                         0.465192318,
                         0.026368618,
                         0.00809099525,
                         0.224028051,
           