# Installing Packages

In [2]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/zx/46y7zv8x5gd6xfp0s16kr0w00000gp/T/pip-req-build-jo9hl00j
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /private/var/folders/zx/46y7zv8x5gd6xfp0s16kr0w00000gp/T/pip-req-build-jo9hl00j
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25ldone


In [1]:
from pkg_resources import packaging
from collections import OrderedDict
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import IPython.display
from os import system
from PIL import Image, ImageTk
import urllib.request
import tkinter as tk
import pandas as pd
import numpy as np
import skimage
import pickle
import torch
import time
import math
import clip
import cv2
import os
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Record which torch build and which CLIP checkpoints are available in this kernel.
print(f"Torch version: {torch.__version__}")
print(f"CLIP Models: {clip.available_models()}")

def pickle_read(file):
    """Return the object stored in the pickle file at path ``file``.

    Unpickling can execute arbitrary code, so only call this on trusted files.
    """
    with open(file, 'rb') as source:
        return pickle.load(source)

def pickle_write(a, b):
    """Pickle an object to a file, accepting the two arguments in either order.

    If ``a`` has length >= 4 and ends with ".pkl" it is used as the destination
    path and ``b`` is the payload; otherwise ``b`` is the destination path and
    ``a`` is the payload.
    """
    if len(a) >= 4 and a[-4:] == ".pkl":
        destination = a
    else:
        destination = b

    if destination == a:
        payload = b
    else:
        payload = a

    with open(destination, 'wb') as sink:
        pickle.dump(payload, sink)

Torch version: 2.4.1
CLIP Models: ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']


# Loading the model

`clip.available_models()` will list the names of available CLIP models.

In [40]:
# Load the ViT-B/32 CLIP checkpoint; `preprocess` is the callable applied to each
# PIL frame before it is handed to the model.
model, preprocess = clip.load("ViT-B/32")
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar weights across all parameter tensors.
parameter_count = np.sum([int(np.prod(p.shape)) for p in model.parameters()])

print("Model parameters:", f"{parameter_count:,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


## Building features

We normalize the images, tokenize each text input, and run the forward pass of the model to get the image and text features.

In [3]:
def loadImage(url, rows=5, cols=5):
  """Download a GIF, decode all of its frames and preview the first few.

  The file is saved as "img.gif" in the working directory, every frame
  (including frame 0) is converted to RGB, and up to ``rows * cols`` frames
  are drawn in a ``rows`` x ``cols`` grid.

  Args:
    url: Address of the GIF to download.
    rows: Number of rows in the preview grid.
    cols: Number of columns in the preview grid.

  Returns:
    ``(original_images, processed_images)``: the RGB PIL frames and the result
    of applying the global ``preprocess`` to each of them, in frame order.
  """
  urllib.request.urlretrieve(url, "img.gif")

  original_images = []
  # The context manager closes the file once all frames are copied out;
  # convert() returns independent images, so they stay valid afterwards.
  with Image.open("img.gif") as im:
    try:
      while True:
        # Convert *before* seeking so that frame 0 is not skipped.
        original_images.append(im.convert("RGB"))
        im.seek(im.tell() + 1)
    except EOFError:
      # Raised by seek() once the last frame has been passed.
      pass

  print(len(original_images))

  processed_images = [preprocess(image) for image in original_images]

  plt.figure(figsize=(20, 10))
  # Never index past the last decoded frame when the grid is larger than the GIF.
  for i in range(min(rows * cols, len(original_images))):
    plt.subplot(rows, cols, i + 1)
    plt.imshow(original_images[i])
    plt.axis('off')
  plt.tight_layout()
  plt.subplots_adjust(wspace=0, hspace=0, left=0, right=1, bottom=0, top=1)

  return original_images, processed_images

```Python
url = 'https://media1.giphy.com/media/lqdJsUDvJnHBgM82HB/giphy.gif'
texts = ['a whale jumping out of water']
orig_imgs, proc_imgs = loadImage(url,7,10)
findMatch(orig_imgs, proc_imgs, texts)
```

In [2]:
from iv2_utils.iv2 import *
from IPython.display import clear_output
from PIL import Image, ImageSequence
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import numpy as np
import shutil
import cv2
import os

### Augmenting Files
```Python
if 'aug1' in os.listdir('.'):
    shutil.rmtree('aug1')
if 'aug2' in os.listdir('.'):
    shutil.rmtree('aug2')

def get_dim(file_path):
    vid = cv2.VideoCapture(file_path)
    height = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
    width = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
    return height, width

backflip_files = list(filter(lambda x: x != 'backflip/.DS_Store',[os.path.join('backflip', x) for x in os.listdir('backflip')]))

os.mkdir('aug1')
os.mkdir('aug2')
for backflip in tqdm(backflip_files):
    height, width = get_dim(backflip)
    add_noise(backflip, os.path.join('aug1', backflip.split('/')[1]), '../../../Storage/cruise.png', int(min(height, width) / 3))

for aug1 in tqdm([os.path.join('aug1',x) for x in os.listdir('aug1')]):
    height, width = get_dim(aug1)
    add_noise(aug1, os.path.join('aug2', aug1.split('/')[1]), '../../../Storage/cruise.png', int(min(height, width) / 3))

shutil.rmtree('aug1')
```

In [37]:
# Maps a local path to the (original_images, processed_images) pair already decoded for it.
cached_images = {}

def loadGifLocal(local_path, rows=5, cols=5):
    """Decode every frame of a local GIF, caching the result per path.

    Args:
        local_path: Path of the image file to open with PIL.
        rows: Unused; kept for signature compatibility with ``loadImage``.
        cols: Unused; kept for signature compatibility with ``loadImage``.

    Returns:
        ``(original_images, processed_images)``: the RGB PIL frames and the
        result of applying the global ``preprocess`` to each of them. Repeated
        calls with the same path return the cached pair.
    """
    if local_path in cached_images:
        return cached_images[local_path]

    original_images = []
    # Close the file as soon as the frames are copied out; convert() returns
    # independent images, so they remain usable after the file is closed.
    with Image.open(local_path) as im:
        try:
            while True:
                original_images.append(im.convert("RGB"))
                im.seek(im.tell() + 1)
        except EOFError:
            # Raised by seek() once the last frame has been passed.
            pass

    processed_images = [preprocess(image) for image in original_images]

    cached_images[local_path] = (original_images, processed_images)
    return original_images, processed_images

def loadMP4Local(local_path, rows=5, cols=5):
    """Decode every frame of a local video into RGB PIL images.

    Args:
        local_path: Path handed to ``cv2.VideoCapture``.
        rows: Unused; kept for signature compatibility with ``loadImage``.
        cols: Unused; kept for signature compatibility with ``loadImage``.

    Returns:
        ``(original_images, processed_images)``: the RGB PIL frames and the
        result of applying the global ``preprocess`` to each of them. Both
        lists are empty when no frame could be read.
    """
    video = cv2.VideoCapture(local_path)
    original_images = []
    try:
        success, frame = video.read()
        while success:
            # OpenCV yields BGR arrays; PIL expects RGB channel order.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            original_images.append(Image.fromarray(rgb_frame))
            success, frame = video.read()
    finally:
        # Release the capture even if decoding raises part-way through.
        video.release()

    processed_images = [preprocess(image) for image in original_images]

    return original_images, processed_images

def findMatch(original_images, processed_images, texts):
  """Score every frame against every text prompt with the global CLIP ``model``.

  Image and text embeddings are L2-normalised, so each score is the cosine
  similarity between one prompt and one frame.

  Args:
    original_images: Unused; kept so existing callers keep working.
    processed_images: Non-empty sequence of preprocessed frames, as returned
      by the loader functions in this notebook.
    texts: Sequence of prompt strings.

  Returns:
    A list with one NumPy array per prompt; element ``j`` of an array is the
    similarity between that prompt and frame ``j``.

  Raises:
    ValueError: If ``processed_images`` is empty.
  """
  if len(processed_images) == 0:
    raise ValueError("findMatch needs at least one processed image")

  texts = list(texts)
  # Run on whatever device the model weights live on, so the inputs and the
  # model agree when CLIP was loaded onto an accelerator.
  device = next(model.parameters()).device
  image_input = torch.tensor(np.stack(processed_images)).to(device)
  text_tokens = clip.tokenize(texts).to(device)

  with torch.no_grad():
    image_features = model.encode_image(image_input).float()
    text_features = model.encode_text(text_tokens).float()
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

  # Rows index prompts, columns index frames.
  similarity = text_features.cpu().numpy() @ image_features.cpu().numpy().T

  return [similarity[i] for i in range(len(texts))]

# Displaying frames

In [4]:
def showFrames(path, highlight=False):
    """Open a Tk window for stepping through a video and marking frames.

    Every frame of the video at ``path`` is decoded up front. Key bindings:

    * Right / Left: step one frame forward / backward (wrapping around).
    * Up / Down: step two frames forward / backward.
    * Command-f: toggle the current frame in the annotation list; frames are
      stored as ``index + 1``.
    * Command-s: append ``(path, phrase, frames)`` to the global
      ``anno_stock100``, where ``phrase`` is the text in the entry box.
    * Command-r: if the last saved entry belongs to this video, remove it and
      clear the annotation list.

    Args:
        path: Location of a video file readable by ``cv2.VideoCapture``.
        highlight: If true, the label of un-annotated frames is drawn in red
            instead of black.

    Returns:
        None. Prints a message and returns early when no frame can be read;
        otherwise the call stays inside ``root.mainloop()`` until it exits.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        success, frame = cap.read()
        while success:
            # OpenCV yields BGR arrays; PIL expects RGB channel order.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
            success, frame = cap.read()
    finally:
        cap.release()

    if not frames:
        print("No frames found in the video.")
        return

    root = tk.Tk()
    root.title(path)

    # State shared by the handlers below. It lives in this call's scope (via
    # nonlocal) so that separate showFrames calls cannot interfere.
    currIdx = 0
    frame_anno = []
    idle_colour = "red" if highlight else "black"

    def display_frame(index):
        nonlocal currIdx
        annotated = index + 1 in frame_anno
        frame_label.config(text=f"Frame {index}", fg='green' if annotated else idle_colour)
        img = ImageTk.PhotoImage(frames[index])
        frame_canvas.create_image(0, 0, anchor=tk.NW, image=img)
        # Keep a reference on the widget so Tk's image is not garbage collected.
        frame_canvas.image = img
        currIdx = index

    frame_label = tk.Label(root, text="Frame 0", font=('Hack', 14), fg=idle_colour)
    frame_label.pack()

    frame_canvas = tk.Canvas(root, width=frames[0].width, height=frames[0].height)
    frame_canvas.pack()

    display_frame(0)

    filename_entry = tk.Entry(root, font=('Hack', 12))
    filename_entry.pack(pady=10)

    def next_frame(event):
        display_frame((currIdx + 1) % len(frames))

    def doubleSkip(event):
        next_frame(event)
        next_frame(event)

    def prev_frame(event):
        display_frame((currIdx - 1) % len(frames))

    def doublePrev(event):
        prev_frame(event)
        prev_frame(event)

    def restart(event):
        # Guard against an empty list before looking at the last saved entry.
        if anno_stock100 and anno_stock100[-1][0] == path:
            anno_stock100.pop(-1)
            # Clear in place: rebinding the name here would only create a new
            # local and leave the shared annotation list untouched.
            frame_anno.clear()
            display_frame(currIdx)
            print("Removed previous one and reset frame_anno")
            print(anno_stock100)
        else:
            print("Not same path, ignoring")

    def save_frame(event):
        phrase = filename_entry.get()  # Get the text from the entry box
        # Store a snapshot so later toggles do not alter what was saved.
        anno_stock100.append((path, phrase, list(frame_anno)))
        print("Saved!!")
        print(anno_stock100)

    def add_frame(event):
        frame_number = currIdx + 1
        if frame_number in frame_anno:
            frame_anno.remove(frame_number)
            print("Removed", frame_number, "as a correct frame.")
        else:
            frame_anno.append(frame_number)
            print("Added", frame_number, "as a correct frame.")
        # Redraw so the label colour reflects the new annotation state.
        display_frame(currIdx)

    root.bind('<Right>', next_frame)
    root.bind('<Left>', prev_frame)
    root.bind('<Up>', doubleSkip)
    root.bind('<Down>', doublePrev)
    root.bind('<Command-s>', save_frame)
    root.bind('<Command-f>', add_frame)
    root.bind('<Command-r>', restart)

    root.mainloop()


In [5]:
# Annotation accumulator: showFrames' save handler appends one
# (path, phrase, frame_anno) tuple to this list per save.
anno_stock100 = []

In [6]:
from IPython.display import clear_output
# Walk through GIF100/1.mp4 ... GIF100/100.mp4, opening the annotation window for
# each clip and clearing the cell output between clips.
for clip_number in range(1, 101):
    showFrames("GIF100/" + str(clip_number) + ".mp4")
    clear_output()

In [62]:
anno_stock100_new

[('GIF100/1.mp4', [19, 20, 21, 22, 23, 24, 25, 26]),
 ('GIF100/2.mp4', [3, 4, 5, 6, 7]),
 ('GIF100/3.mp4',
  [100,
   101,
   102,
   103,
   104,
   105,
   106,
   107,
   108,
   109,
   110,
   111,
   112,
   113,
   114,
   115,
   116,
   117,
   118,
   119,
   120,
   121,
   122,
   123,
   124,
   125,
   126,
   127,
   128,
   129,
   130,
   131,
   132,
   133,
   134,
   135,
   136,
   137,
   139,
   140,
   141,
   142,
   143,
   144,
   145,
   138,
   146,
   147,
   148,
   149,
   150,
   151,
   152,
   153,
   154,
   155,
   156,
   157,
   158,
   159,
   160,
   161,
   162,
   163]),
 ('GIF100/4.mp4',
  [93,
   94,
   95,
   96,
   97,
   98,
   99,
   100,
   101,
   102,
   103,
   104,
   105,
   106,
   107,
   108,
   109,
   110,
   111,
   112,
   113,
   114,
   115,
   116,
   117,
   118,
   119,
   120,
   121,
   122,
   123,
   124,
   125,
   126,
   127,
   128,
   129,
   130,
   131,
   132,
   133,
   134,
   135,
   136,
   137]),
 ('GIF

In [61]:
from moviepy.editor import VideoFileClip

def trim_video(input_path, output_path, start_frame, end_frame):
    """Write the span between two frame numbers of a video to a new file.

    Frame numbers are converted to seconds using the source clip's ``fps``
    and the sub-clip is encoded with the ``libx264`` codec.

    Args:
        input_path: Path of the source video.
        output_path: Path the trimmed video is written to.
        start_frame: Frame number where the trimmed clip starts.
        end_frame: Frame number where the trimmed clip ends.
    """
    video = VideoFileClip(input_path)
    try:
        fps = video.fps
        trimmed_clip = video.subclip(start_frame / fps, end_frame / fps)
        try:
            trimmed_clip.write_videofile(output_path, codec='libx264')
        finally:
            trimmed_clip.close()
    finally:
        # Always close the source clip, even when encoding fails.
        video.close()

import cv2

def get_frame_count(video_path):
    """Return the frame count OpenCV reports for the video at ``video_path``.

    Raises:
        ValueError: If the file cannot be opened by ``cv2.VideoCapture``.
    """
    capture = cv2.VideoCapture(video_path)

    if capture.isOpened():
        total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        capture.release()
        return total

    raise ValueError(f"Unable to open video file: {video_path}")


# Start from a fresh list so this cell works on a clean kernel and re-running it
# does not append a second copy of every entry.
anno_stock100_new = []

for video, phrase, frames in tqdm(anno_stock100):
    # Keep up to 100 frames of lead-in before the first annotated frame; a clip
    # with no annotated frames keeps an offset of 0 and an empty frame list.
    start_frame = max(0, min(frames) - 100) if frames else 0
    # Re-express the annotated frame numbers relative to that new start.
    anno_stock100_new.append((video, [x - start_frame for x in frames]))

  0%|          | 0/100 [00:00<?, ?it/s]

$$\Large \textbf{Evaluation on GIF87}$$

-----------

In [73]:
# Load the GIF87 annotations and show the first five entries. Per the recorded
# output each entry is (video_path, (number, phrase, frame_number)).
# NOTE(review): presumably frame_number is the annotated target frame; confirm.
data = pickle_read('rustyjar/GIF87-anno.pkl')
print(data[:5])

[('GIF87/1.mp4', (1, 'woman falls down', 19)), ('GIF87/2.mp4', (2, 'woman falls down', 9)), ('GIF87/3.mp4', (3, 'guy falls down onto couch', 28)), ('GIF87/4.mp4', (4, 'person falls down', 14)), ('GIF87/5.mp4', (5, 'guy falls down', 10))]


In [37]:
# Print the path of every clip whose phrase is exactly "A person jumping over fence".
for clip_path, annotation in data:
    phrase_matches = annotation[1] == "A person jumping over fence"
    if phrase_matches:
        print(clip_path)

GIF87/50.mp4


In [35]:
# Score every clip in `data` against its own phrase and pickle the per-frame scores.
# `data` must already be loaded; the recorded run of this cell ended in a NameError
# because it was not.
clip_pred = []
pbar = tqdm(data)
for path, video_data in pbar:
    prompt = video_data[1]
    pbar.set_description(f'{path} | {prompt}')

    orig_imgs, proc_imgs = loadMP4Local(path)
    # One prompt per call, so the scores are the first (only) row.
    result = findMatch(orig_imgs, proc_imgs, [prompt])[0]
    clip_pred.append(result)

pickle_write('jar/CLIP-g100.pkl', clip_pred)
print("Done")

NameError: name 'data' is not defined

--------

In [None]:
# Spot check on one clip: per-frame similarity to a single backflip prompt.
backflip_prompt = 'A person performs a backflip.'
orig_imgs, proc_imgs = loadMP4Local('augment/37.mp4')
result = findMatch(orig_imgs, proc_imgs, [backflip_prompt])[0]
print(result)

---------

$$\Large \color{blue} \textbf{Evaluation on GIF100}$$

---------

In [56]:
# Each entry of the pickle is unpacked as (video_path, phrase, frames).
video = pickle_read('rustyjar/STOCK100.pkl')

preds  = []
logits = []
pbar = tqdm(video)
for video_path, phrase, frames in pbar:
    pbar.set_description(video_path)

    orig_imgs, proc_imgs = loadMP4Local(video_path)
    result = findMatch(orig_imgs, proc_imgs, [phrase])[0]

    # Predicted frame = position of the highest score, shifted to count from 1.
    prediction = np.argmax(result) + 1
    # Annotated frame closest to the prediction (the first one wins ties);
    # -1 when the clip has no annotated frames.
    min_val = min(frames, key=lambda k: abs(k - prediction), default=-1)

    # Pair every score with its 1-based frame number.
    logits.append(list(zip(result, range(1, len(result) + 1))))
    preds.append(prediction)
    print(video_path,"|",prediction, "vs", min_val)

pickle_write(preds, 'jar/CLIP-s.pkl')
pickle_write(logits, 'rustyjar/CLIP-logits-s.pkl')

  0%|          | 0/100 [00:00<?, ?it/s]

GIF100/1.mp4 | 27 vs 26
GIF100/2.mp4 | 27 vs 7
GIF100/3.mp4 | 16 vs 100
GIF100/4.mp4 | 166 vs 137
GIF100/5.mp4 | 71 vs 26
GIF100/6.mp4 | 18 vs 21
GIF100/7.mp4 | 118 vs 118
GIF100/8.mp4 | 103 vs 103
GIF100/9.mp4 | 122 vs 122
GIF100/10.mp4 | 64 vs 77
GIF100/11.mp4 | 129 vs 129
GIF100/12.mp4 | 57 vs 56
GIF100/13.mp4 | 87 vs 87
GIF100/14.mp4 | 101 vs 101
GIF100/15.mp4 | 6 vs 100
GIF100/16.mp4 | 107 vs 97
GIF100/17.mp4 | 5 vs 68
GIF100/18.mp4 | 36 vs 23
GIF100/19.mp4 | 1 vs 100
GIF100/20.mp4 | 91 vs 91
GIF100/21.mp4 | 97 vs 97
GIF100/22.mp4 | 57 vs 57
GIF100/23.mp4 | 164 vs 114
GIF100/24.mp4 | 27 vs 100
GIF100/25.mp4 | 94 vs 100
GIF100/26.mp4 | 97 vs 97
GIF100/27.mp4 | 25 vs 100
GIF100/28.mp4 | 58 vs 60
GIF100/29.mp4 | 106 vs 106
GIF100/30.mp4 | 101 vs 101
GIF100/31.mp4 | 125 vs 125
GIF100/32.mp4 | 168 vs 92
GIF100/33.mp4 | 1 vs 24
GIF100/34.mp4 | 40 vs 40
GIF100/35.mp4 | 116 vs 116
GIF100/36.mp4 | 82 vs 82
GIF100/37.mp4 | 125 vs 84
GIF100/38.mp4 | 15 vs 100
GIF100/39.mp4 | 12 vs 12
GIF100/

In [53]:
result

array([0.31079316, 0.3119359 , 0.30822718, 0.30780143, 0.31798947,
       0.30971524, 0.31051314, 0.30868042, 0.30982378, 0.29493713,
       0.30657247, 0.29243797, 0.30772418, 0.2893455 , 0.30935946,
       0.30904633, 0.29334462, 0.31045914, 0.32010165, 0.27743813,
       0.3118668 , 0.30295557, 0.33003002, 0.33661008, 0.32735896,
       0.3261555 , 0.34839472, 0.343634  , 0.31699622, 0.3124323 ,
       0.3226238 , 0.3209635 , 0.32018015, 0.305503  , 0.31034577,
       0.3072469 , 0.30885175, 0.30804783, 0.30163264, 0.3080399 ,
       0.29930627, 0.32747525, 0.31097034, 0.32093716, 0.32138923,
       0.2962514 , 0.3084373 , 0.3217888 , 0.31838557, 0.31221905,
       0.32261097, 0.31490296, 0.3051491 , 0.30581164, 0.30037546,
       0.29266268, 0.29183882, 0.30280113, 0.30550587, 0.29816192,
       0.29504663, 0.28125608, 0.30376306, 0.30223098, 0.2947669 ,
       0.29666013, 0.29640347, 0.29348466, 0.3064711 , 0.28229964,
       0.29354176, 0.30135766, 0.3084244 , 0.29784214, 0.29556

In [47]:
clip_pred

[array([0.31079322, 0.3119359 , 0.30822718, 0.30780143, 0.31798953,
        0.30971527, 0.3105131 , 0.30868047], dtype=float32),
 array([0.3119359 , 0.30822718, 0.30780143, 0.31798953, 0.30971527,
        0.3105131 , 0.30868047, 0.30982378], dtype=float32),
 array([0.30822718, 0.30780143, 0.31798953, 0.30971527, 0.3105131 ,
        0.30868047, 0.30982378, 0.29493716], dtype=float32),
 array([0.30780143, 0.31798953, 0.30971527, 0.3105131 , 0.30868047,
        0.30982378, 0.29493716, 0.30657244], dtype=float32),
 array([0.31798953, 0.30971527, 0.3105131 , 0.30868047, 0.30982378,
        0.29493716, 0.30657244, 0.29243797], dtype=float32),
 array([0.30971527, 0.3105131 , 0.30868047, 0.30982378, 0.29493716,
        0.30657244, 0.29243797, 0.3077242 ], dtype=float32),
 array([0.3105131 , 0.30868047, 0.30982378, 0.29493716, 0.30657244,
        0.29243797, 0.3077242 , 0.28934547], dtype=float32),
 array([0.30868047, 0.30982378, 0.29493716, 0.30657244, 0.29243797,
        0.3077242 , 0.2893454

$$\Large \textbf{Evaluation on Augmented Data}$$

--------

```Python
clip_pred = []

backflip_files = os.listdir('augment')
backflip_files.sort(key = lambda x: int(x.split('.')[0]))
backflip_files = [os.path.join('augment', x) for x in backflip_files]

pbar = tqdm(backflip_files)
for video in pbar:
    pbar.set_description(video.split('/')[1])
    orig_imgs, proc_imgs = loadMP4Local(video)
    result = findMatch(orig_imgs, proc_imgs, ['A person performs a backflip.'])[0]
    
    clip_pred.append(result)
    print(str(result), end = " | ")

with open('predictions (updated)/CLIP_pred_aug.pkl', 'wb') as file:
    pickle.dump([x.item() for x in clip_pred], file)

print("Done!")
```

# Loading Data from `photo_data.csv`

In [27]:
data = pickle_read("rustyjar/GIF87.pkl")
# Show the first five entries with a plain loop, so that no throwaway list of
# None values is bound to the global `result` used by the evaluation cells.
for x in range(5):
    print(data[x])

('GIF87/1.mp4', (1, 'woman falls down', 19))
('GIF87/2.mp4', (2, 'woman falls down', 9))
('GIF87/3.mp4', (3, 'guy falls down onto couch', 28))
('GIF87/4.mp4', (4, 'person falls down', 14))
('GIF87/5.mp4', (5, 'guy falls down', 10))


```Python
X_lengths = []

X_frames = []

for gif_path, phrase, frame in tqdm(labeled_data):
    orig_imgs, proc_imgs = loadImageLocal(gif_path)
    X_lengths.append(len(orig_imgs))
    curr_frames = []
    for f_index, frame_pic in enumerate(orig_imgs):
        append = ""
        if f_index < frame:
            append = " before" * min(50, frame - f_index)
        elif f_index > frame:
            append = " after" * min(50, f_index - frame)
        else:
            append = ""
        after = ""
        if f_index == frame:
            after = ' now'
        curr_frames.append((frame_pic, append + phrase + after))
    X_frames.append(curr_frames)

X_lengths = np.array(X_lengths)

y_error = np.divide(np.abs(y_pred - photo_data_csv['Correct Frame'][0:272]), X_lengths) * 100
plt.boxplot(y_error)
plt.yticks(np.arange(0, 100, 10), np.char.add(np.arange(0, 100, 10).astype("str"), '%'))
plt.title("Error in % off")
plt.show()
```

In [84]:
from concurrent.futures import ThreadPoolExecutor
from time import sleep

def compute(x):
    """Stand-in for slow work: sleep for one second, then hand ``x`` back."""
    sleep(1)
    return x

# Run 30 one-second tasks on a thread pool and time the whole batch.
inputs = tqdm(list(range(30)))
t1 = time.perf_counter()
with ThreadPoolExecutor() as executor:
    results = [value for value in executor.map(compute, inputs)]
t2 = time.perf_counter()

print(results)
print(t2 - t1)

  0%|          | 0/30 [00:00<?, ?it/s]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
3.0147402089787647


In [85]:
import torch
from transformers import VideoMAEFeatureExtractor, VideoCLIPModel, CLIPProcessor

import cv2
import numpy as np

# Load VideoCLIP model and processor
# NOTE(review): the recorded output of this cell is an ImportError: `VideoCLIPModel`
# could not be imported from the installed `transformers`, so these lines did not run.
# NOTE(review): this rebinds the global `model` that findMatch reads; re-run the CLIP
# loading cell before calling findMatch again.
model = VideoCLIPModel.from_pretrained("microsoft/videoclip-base-finetuned")
processor = CLIPProcessor.from_pretrained("microsoft/videoclip-base-finetuned")

# Function to load video and extract frames
def load_video(video_path, num_frames=16):
    """Sample up to ``num_frames`` evenly spaced frames from a video.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to return.

    Returns:
        A list of frames exactly as returned by ``cap.read()``; it is shorter
        than ``num_frames`` when the video has fewer frames or a read fails.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp the stride to 1: for clips shorter than num_frames the integer
        # division yields 0, and range() rejects a zero step.
        step = max(1, total_frames // num_frames)

        for i in range(0, total_frames, step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
            if len(frames) == num_frames:
                break
    finally:
        # Release the capture even if seeking or reading raises.
        cap.release()
    return frames

# Preprocess video
def preprocess_video(video_frames):
    """Convert BGR frames to RGB, stack them and run the global ``processor``.

    Args:
        video_frames: Iterable of frames in OpenCV's BGR channel order.

    Returns:
        Whatever ``processor`` returns for ``videos=<stacked frames>`` with
        ``return_tensors="pt"`` and ``padding=True``.
    """
    rgb_frames = np.array(
        [cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB) for bgr_frame in video_frames]
    )  # one array holding every frame
    return processor(videos=rgb_frames, return_tensors="pt", padding=True)

# Load your video and text
# NOTE: `video_path` is a placeholder and must point at a real file; this also
# rebinds the global names `video_path`, `text`, `inputs` and `similarity`.
video_path = "path_to_your_video.mp4"
text = "A person is performing a backflip"

video_frames = load_video(video_path)
inputs = preprocess_video(video_frames)

# Encode the text
text_inputs = processor(text=[text], return_tensors="pt", padding=True)

# Pass the video and text through the model
# (no_grad: inference only, no gradients are needed)
with torch.no_grad():
    video_embeddings = model.get_video_features(**inputs)
    text_embeddings = model.get_text_features(**text_inputs)

    # Calculate similarity (cosine similarity)
    similarity = torch.nn.functional.cosine_similarity(video_embeddings, text_embeddings)

# Output the similarity score
# (.item() requires `similarity` to hold exactly one value)
print("Similarity Score:", similarity.item())


ImportError: cannot import name 'VideoCLIPModel' from 'transformers' (/opt/miniconda3/envs/xclip/lib/python3.9/site-packages/transformers/__init__.py)

In [23]:
from PIL import Image

# Build a 6x1 RGB image whose pixels are increasingly bright shades of pure red
# (green and blue channels are always 0), then save and display it.
width, height = 6, 1
image = Image.new('RGB', (width, height))

pixels = [(42, 0, 0),  # red channel 42 (darkest)
          (85, 0, 0),  # red channel 85
          (127, 0, 0),  # red channel 127
          (170, 0, 0), # red channel 170
          (212, 0, 0), # red channel 212
          (255, 0, 0)] # red channel 255 (full red)

# One pixel per column of the single row.
for x in range(width):
    image.putpixel((x, 0), pixels[x])

image.save('1_row_image.png')
image.show()