In [1]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the BLIP large image-captioning checkpoint. The processor turns an image plus a text
# prompt into model inputs and decodes generated token ids; the model generates those ids.
# Both are loaded from the same Hugging Face repository id so they stay in sync.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")  



In [32]:


# Single keyframe used to try out prompt-conditioned captioning (a local file path, despite the name).
img_url = 'keyframes/L10_V017/040.jpg' 
# Alternative for loading a remote image over HTTP (kept for reference):
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
raw_image = Image.open(img_url).convert('RGB')

# conditional image captioning: each `text` below is a prefix that the model continues,
# so every decoded caption starts with the prompt text itself.
# NOTE(review): "sence" looks like a typo for "scene"; because it is part of the prompt string
# it appears verbatim in the generated captions.
text = "a sence in"
sence = processor(raw_image, text, return_tensors="pt")
sence_out = model.generate(**sence)
print(processor.decode(sence_out[0], skip_special_tokens=True))

# Second prompt: steer the caption towards the objects present in the frame.
text = "there are"
objects = processor(raw_image, text, return_tensors="pt")
objects_out = model.generate(**objects)
print(processor.decode(objects_out[0], skip_special_tokens=True))

# Third prompt: ask about text in the frame.
# NOTE(review): the recorded output for this prompt reads like another scene description
# rather than transcribed text — treat it with caution.
text = "the text"
text_in_image = processor(raw_image, text, return_tensors="pt")
text_in_image_out = model.generate(**text_in_image)
print(processor.decode(text_in_image_out[0], skip_special_tokens=True))


a sence in a park with a statue of a man on a stage
there are many people watching a statue of a man on a stage
the text reads,'the statue of king kong is being displayed at the entrance of a temple


In [4]:
# Free-form prompt prefix; the model continues it into a caption of `raw_image`.
# NOTE(review): this cell does not load an image itself — it relies on `raw_image` already
# existing in the kernel, so its output depends on which image-loading cell was run last.
text = "In the image includes"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))


in the image includes a city skyline and a river with boats


In [5]:

# Caption a different keyframe with a single prompt (local file path, despite the name).
img_url = 'keyframes/L01_V001/016.jpg' 
# Alternative for loading a remote image over HTTP (kept for reference):
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
raw_image = Image.open(img_url).convert('RGB')

# conditional image captioning: the prompt is a prefix that the generated caption continues
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt")
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))

a photography of a flooded street with a truck driving through it


In [6]:
from langchain_community.llms import Ollama


In [7]:
# LangChain wrapper for the Ollama model tagged "llama3.1:latest"; used for English -> Vietnamese translation.
llm = Ollama(model="llama3.1:latest")

In [8]:
# Translation prompt template. `{english}` is the only placeholder and is filled in with
# str.format; the model is instructed to answer with the Vietnamese translation only.
prompt = '''
You are an translation assistant for language translation tasks. You need to translate english sentence to vietnamese and response only the Vietnamese result.

English :{english}
Vietnamese: 

'''

In [9]:
def question_llm(english):
  """Translate an English sentence to Vietnamese with the configured LLM.

  Fills the module-level ``prompt`` template with ``english`` and returns
  whatever ``llm.invoke`` produces for that filled-in prompt.
  """
  return llm.invoke(prompt.format(english=english))

In [10]:
# Smoke test of the translation helper on one of the captions generated earlier.
question_llm("a photography of a flooded street with a truck driving through it")

'Một bức ảnh chụp một con đường bị lụt với một xe tải đang đi qua nó.'

In [11]:
from pymilvus.model.hybrid import BGEM3EmbeddingFunction

# Embedding function backed by the BAAI/bge-m3 model; its dense output is what gets
# stored in and searched against the Milvus collection later in the notebook.
bge_m3_ef = BGEM3EmbeddingFunction(
    model_name='BAAI/bge-m3', # Specify the model name
    device='cpu', # Specify the device to use, e.g., 'cpu' or 'cuda:0'
    use_fp16=False # Specify whether to use fp16. Set to `False` if `device` is `cpu`.
)

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 236077.15it/s]
  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [12]:
# Example caption (Vietnamese) to index. A plain string literal is enough here:
# there is nothing to interpolate, so no f-string prefix is needed.
docs = ["Hình ảnh đường phố bị ngập nước với một xe tải đang chạy qua."]
# Encode the documents; the result is indexed by output type.
docs_embeddings = bge_m3_ef.encode_documents(docs)

# Only the dense vectors are stored in Milvus.
vectors = docs_embeddings["dense"]

In [13]:
# Inspect the dense embeddings: a list holding one float32 vector per encoded document.
vectors

[array([-0.04253798,  0.00780115, -0.03375866, ...,  0.0280426 ,
        -0.04685987, -0.02310734], dtype=float32)]

In [14]:
from pymilvus import MilvusClient

# Connect to the Milvus server expected at localhost:19530; the server must already be running.
client = MilvusClient(
    uri="http://localhost:19530"
)

In [15]:
# Start from a clean slate: drop any existing collection with this name (its data is lost),
# then recreate it.
if client.has_collection(collection_name="ai_challenge_collection"):
    client.drop_collection(collection_name="ai_challenge_collection")
client.create_collection(
        collection_name="ai_challenge_collection",
        dimension=1024,  # must equal the length of the dense vectors inserted into this collection
        auto_id=True  # ids are generated on insert rather than supplied with the data
)


In [16]:
# Build the records to insert: each one pairs a dense vector with its source text.
# No "id" is supplied; the insert result reports the generated ids.
data = []
data.append({ "vector": vectors[0], "text": docs[0]} )
res = client.insert(collection_name="ai_challenge_collection", data=data)        
print(res)


{'insert_count': 1, 'ids': [452540126240705763]}


In [17]:

# List what is stored in the collection: no filter condition, capped by `limit`.
res = client.query(
    collection_name="ai_challenge_collection",  # target collection
    filter='',  # empty filter expression: no condition on the entities
    limit=100,  # maximum number of returned entities
    output_fields=["id", 'text'],  # specifies fields to be returned
)
for item in res:
    print(item)

In [18]:
# Vietnamese search query ("find a picture of a flooded city with a truck driving").
queries = ["Tìm ảnh thành phố ngập nước có một xe tải đang chạy"]

# Queries are embedded with encode_queries (documents used encode_documents).
query_embeddings = bge_m3_ef.encode_queries(queries)

res = client.search(
    collection_name="ai_challenge_collection",  # target collection
    data=[query_embeddings['dense'][0]],  # query vectors (only the first query's dense vector)
    limit=5,  # number of returned entities
    output_fields=["id","text"],  # specifies fields to be returned
)
# Only one query vector was sent, so the hits of interest are the first element of `res`.
context_items = res[0]

print(context_items)



[{'id': 452540126240705763, 'distance': 0.8477668762207031, 'entity': {'id': 452540126240705763, 'text': 'Hình ảnh đường phố bị ngập nước với một xe tải đang chạy qua.'}}]


In [37]:
def _caption_with_prompt(raw_image, prompt_text):
  """Generate one caption for ``raw_image`` that continues ``prompt_text``."""
  inputs = processor(raw_image, prompt_text, return_tensors="pt")
  output_ids = model.generate(**inputs)
  return processor.decode(output_ids[0], skip_special_tokens=True)


def detect_caption(img_url, prompts=("a sence in", "there are")):
  """Caption an image once per prompt and join the captions into one string.

  Args:
    img_url: Path of the image file to caption (passed to ``Image.open``).
    prompts: Text prefixes used for conditional captioning; one caption is
      generated per prefix. The defaults reproduce the original two prompts.
      NOTE(review): "sence" looks like a typo for "scene". It is kept as the
      default because the prompt text is echoed into every generated caption,
      and changing it would make new captions inconsistent with already
      saved ones.

  Returns:
    The generated captions joined with ". ", in the order of ``prompts``.
  """
  # Open inside a context manager so the file handle is released promptly;
  # convert() returns a separate image that stays usable after the file is closed.
  with Image.open(img_url) as image_file:
    raw_image = image_file.convert('RGB')

  return ". ".join(_caption_with_prompt(raw_image, prompt_text) for prompt_text in prompts)


In [38]:
import os
import csv
import pandas as pd


def save_to_file(folder, keyframe_info):
  """Write the caption records of one keyframe folder to ``captions/<folder>.csv``.

  Args:
    folder: Name of the keyframe folder; used as the CSV file name.
    keyframe_info: Records (a list of dicts) to write, one CSV row per record.
  """
  output_dir = 'captions'
  # DataFrame.to_csv does not create missing parent directories, so make sure
  # the output directory exists before writing.
  os.makedirs(output_dir, exist_ok=True)

  # Define the output CSV file path using the folder name
  csv_file_path = os.path.join(output_dir, f"{folder}.csv")
  # index=False prevents the DataFrame index from being written to the file
  pd.DataFrame(keyframe_info).to_csv(csv_file_path, index=False)

  print(f"Data from folder '{folder}' has been saved to {csv_file_path}")

In [39]:

# Root directory holding one sub-folder of keyframes per video, and the
# directory where one CSV of captions per folder is stored.
root_dir = 'keyframes'
captions_dir = 'captions'

# Make sure the output directory exists before any CSV is written.
os.makedirs(captions_dir, exist_ok=True)

# Loop through all folders and images
for folder in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, folder)
    if not os.path.isdir(folder_path):
        continue  # Ignore stray files next to the keyframe folders

    # An existing CSV acts as a cache marker: the folder was already captioned.
    csv_file_path = os.path.join(captions_dir, f"{folder}.csv")
    if os.path.exists(csv_file_path):
        print(f"CSV file for folder '{folder}' already exists. Skipping...")
        continue  # Skip to the next folder if CSV file already exists

    # Collect one record per keyframe image of this folder.
    keyframe_info = []
    for image in sorted(os.listdir(folder_path)):
        if not image.endswith('.jpg'):
            continue
        # Build the path from folder_path so it follows root_dir if that changes.
        img_url = os.path.join(folder_path, image)
        caption = detect_caption(img_url)
        new_item = {
            "folder": folder,
            "frame": image,
            "caption": caption
        }
        keyframe_info.append(new_item)
        print(new_item)

    if not keyframe_info:
        # Writing an empty CSV would mark the folder as done forever.
        print(f"No .jpg keyframes found in folder '{folder}'. Skipping...")
        continue

    save_to_file(folder, keyframe_info)

    print(keyframe_info)


{'folder': 'L01_V001', 'frame': '001.jpg', 'caption': 'a sence in a city with a sunset and a city skyline. there are a lot of boats in the water at sunset'}
{'folder': 'L01_V001', 'frame': '002.jpg', 'caption': 'a sence in a news studio with a man and a woman. there are two men sitting at a table in a news studio'}
{'folder': 'L01_V001', 'frame': '003.jpg', 'caption': 'a sence in a news studio with a city in the background. there are two men standing on a news set with a city in the background'}
{'folder': 'L01_V001', 'frame': '004.jpg', 'caption': 'a sence in a red and white striped background with a clock. there are two clocks on a red and white striped background'}
{'folder': 'L01_V001', 'frame': '005.jpg', 'caption': 'a sence in a flooded area with a man sitting on a motorcycle. there are two people sitting on a bench in the middle of a flooded street'}
{'folder': 'L01_V001', 'frame': '006.jpg', 'caption': 'a sence in a river with a boat in the water. there are a lot of trees that 