In [30]:
import openai
from openai import OpenAI
import os
import base64
import requests
from key import OPENAI_API_KEY
import math
import numpy as np

In [31]:
# Build the OpenAI SDK client. `api_key` is kept as its own module-level name
# because the raw HTTP request cell reads it for the Authorization header.
api_key = OPENAI_API_KEY
client = OpenAI(api_key=OPENAI_API_KEY)

In [32]:
# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 so the result is a ``str`` that can be embedded in
    text (for example in a ``data:`` URL).
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def list_files(directory):
    """Recursively collect the paths of every file under ``directory``.

    Each path is ``os.path.join(root, name)`` for the directory ``root`` in
    which the file was found. Sub-directories and file names are visited in
    sorted order, so the returned list (and any positional index derived from
    it) is identical on every run; ``os.walk`` on its own yields entries in
    arbitrary filesystem order.

    Args:
        directory: Path of the directory tree to scan.

    Returns:
        A list of file paths: the files of ``directory`` first, followed by
        the files of each sub-directory in sorted, top-down order. The list is
        empty when nothing is found (``os.walk`` ignores scan errors by
        default, so a missing directory also produces an empty list).
    """
    file_list = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place tells os.walk which order to descend in.
        dirs.sort()
        file_list.extend(os.path.join(root, name) for name in sorted(files))
    return file_list

# Resolve the video identifier. Despite its name, `url` holds whatever
# extract_youtube_id returns for the link, not the link itself.
from video import extract_youtube_id
url = extract_youtube_id("https://www.youtube.com/watch?v=L8ECu0f9_kA")

# Collect every frame image for this video. Per the original note, running
# video.py places all the frames inside the frames folder.
frames_root = './frames/'
directory = frames_root + url
image_paths = list_files(directory)

In [33]:
# Show the resolved frames directory as this cell's output.
directory

'./frames/L8ECu0f9_kA'

In [34]:
# Get System Prompt
# Read inside a `with` block so the file handle is closed as soon as the
# prompt text has been loaded, instead of staying open for the whole session.
with open("prompt.txt", "r") as f:
    prompt_text = f.read()

# Get User Input
user_input = "great moments"

In [35]:
results = []

# The request headers are identical for every frame, so build them once.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Upper bound, in seconds, for a single request. Without a timeout,
# requests.post can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 120

# Caption every frame. `results` stays index-aligned with `image_paths`:
# exactly one entry is appended per image, even when the request fails.
for image_path in image_paths:
    # Getting the base64 string
    base64_image = encode_image(image_path)

    payload = {
        "model": "gpt-4o",
        "messages": [
            {"role": "system", "content": prompt_text},
            {"role": "user", "content": [{"type": "text", "text": user_input}, {"type": "image_url","image_url": {"url": f"data:image/jpg;base64,{base64_image}", "detail": "low"}}]}
            # user_input text can be removed
        ],
        "max_tokens": 4096
    }

    # Make the API request
    try:
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        result = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure, timeout, or a body that is not valid JSON. Record
        # an entry without a 'choices' key so one bad frame neither aborts the
        # run nor shifts the indices of the frames that follow.
        result = {"error": {"message": str(exc), "type": type(exc).__name__}}

    # Append the result to the list
    results.append(result)

In [36]:
# Result Sample: display the first entry appended to `results`.
results[0]

{'id': 'chatcmpl-9eRAngSkb7z88XitfGN7CUZc4vo7d',
 'object': 'chat.completion',
 'created': 1719424601,
 'model': 'gpt-4o-2024-05-13',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': '**Caption:** "In a thrilling badminton match, two players intensely compete with one player preparing to return a shot with a swift backhand. The scoreboard in the background indicates the match\'s competitive nature, with closely matched scores highlighting the high stakes of the game."'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 1116,
  'completion_tokens': 52,
  'total_tokens': 1168},
 'system_fingerprint': 'fp_4008e3b719'}

In [40]:
# Iterate through all the results
# Walk `results` itself rather than range(len(image_paths)), so a results
# list of a different length cannot raise IndexError.
for i, result in enumerate(results):
    try:
        print(i, ": ", result['choices'][0]['message']['content'])
    except (KeyError, IndexError, TypeError):
        # No usable caption, e.g. the response has no 'choices' key or an
        # empty 'choices' list.
        print(i, ": missing")

0 :  **Caption:** "In a thrilling badminton match, two players intensely compete with one player preparing to return a shot with a swift backhand. The scoreboard in the background indicates the match's competitive nature, with closely matched scores highlighting the high stakes of the game."
1 :  **Caption:** "In a high-stakes match point scenario, two badminton players intensely engage in a fast-paced rally, with one preparing to return a powerful shot. The scoreboard in the background emphasizes the critical nature of the moment, showcasing the tight scores and the competitive atmosphere of the championship match."
2 :  "Lee Zii Jia Incredible Shots"
3 :  **Caption:** "In a highlight moment of the match, two badminton players are intensely engaged in a rally. The scoreboard shows a tight score, reflecting the high stakes and competitive spirit of the game. The atmosphere is charged with anticipation as fans watch closely."
4 :  "An intense badminton match captures the critical moment

In [42]:
embedding_model = 'text-embedding-3-small'


def get_embedding(text, model=embedding_model):
    """Return the embedding of ``text`` produced by ``model``.

    Sends ``text`` as a one-element input list to ``client.embeddings.create``
    and returns the ``embedding`` attribute of the first item in the
    response's ``data``.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Embed each frame caption, keyed by the frame's position in `results`.
# Every vector is reshaped to a single row, i.e. shape (1, n).
embedded_frame = {}
for i, result in enumerate(results):
    try:
        caption = result['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # The response carries no caption (e.g. no 'choices' key), so there
        # is nothing to embed; this frame simply gets no entry.
        continue
    if not caption:
        # Empty or null content cannot be embedded either.
        continue
    embedded_frame[i] = np.array(get_embedding(caption, model=embedding_model)).reshape(1, -1)

# Embed the user's query the same way so it can be compared with the frames.
embedded_query = get_embedding(user_input, model=embedding_model)
embedded_query = np.array(embedded_query).reshape(1, -1)

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every embedded frame against the query. Iterating over
# `embedded_frame` itself (instead of range(len(image_paths))) means only
# frames that actually have an embedding are looked up.
similarity_result = {}
for i, frame_vector in embedded_frame.items():
    similarity_result[i] = cosine_similarity(frame_vector, embedded_query)

# Both inputs are single-row arrays, so each score is a 1x1 array; sort on its
# scalar value rather than comparing arrays. The order is ascending, so the
# best-matching frames come last.
ranked_frame_dict = dict(sorted(similarity_result.items(), key=lambda x: x[1].item()))

In [44]:
# Frame indices in ranking order: least similar to the query first.
ranked_frame_list = [frame_index for frame_index in ranked_frame_dict]
# The ten best-matching frames sit at the end of the list.
ranked_frame_list[-10:]

[43, 144, 127, 66, 10, 29, 96, 60, 119, 79]

In [45]:
# Print the captions of the ten highest-ranked frames, best match last.
for frame_index in ranked_frame_list[-10:]:
    caption = results[frame_index]['choices'][0]['message']['content']
    print(caption)

**Caption:** "A dynamic moment in a badminton match as a player leaps into the air to execute a powerful smash, displaying agility and precision under the arena's bright lights."
"Two coaches enthusiastically applaud during a pivotal moment in a badminton match, expressing their support and encouragement for their player, with the scoreboard showing a close and competitive score."
***"Two badminton players fiercely compete on the court, with one player lunging forward in an intense effort to return the shuttlecock. The intensity of the match and the agility of the players are clearly showcased in this crucial moment."***
"A badminton player, with a focused expression, is captured during a pivotal moment on the court, indicating an intense and crucial point in the match."
"Two team members energetically celebrate a critical point during a match, displaying their excitement and support from the sidelines. The scoreboard in the background indicates a competitive game in progress."
"During