In [1]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# The preprocessor and the captioning model are both loaded from the same checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)



In [3]:


# Local keyframe to caption. For a remote image, open
# requests.get(url, stream=True).raw instead of the path.
img_url = 'keyframes/L01_V001/001.jpg'
raw_image = Image.open(img_url).convert('RGB')

# Prefix the model continues from in the conditional pass.
text = "a photography of"

# Two passes over the same image:
#   1. conditional   - the processor receives the image plus the text prefix
#   2. unconditional - the processor receives the image only
for processor_args in ((raw_image, text), (raw_image,)):
    inputs = processor(*processor_args, return_tensors="pt")
    out = model.generate(**inputs)
    print(processor.decode(out[0], skip_special_tokens=True))




a photography of a city skyline with a large logo in the middle
a close up of a city skyline with a large orange sign


In [4]:
# Caption the image loaded earlier again, this time with a different text prefix.
text = "In the image includes"
inputs = processor(raw_image, text, return_tensors="pt")
out = model.generate(**inputs)

# Decode the first generated sequence, dropping special tokens.
caption = processor.decode(out[0], skip_special_tokens=True)
print(caption)


in the image includes a city skyline and a river with boats


In [5]:

# Local keyframe to caption. For a remote image, open
# requests.get(url, stream=True).raw instead of the path.
img_url = 'keyframes/L01_V001/016.jpg'
raw_image = Image.open(img_url).convert('RGB')

# Conditional captioning: the model continues from this text prefix.
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt")
out = model.generate(**inputs)

# Decode the first generated sequence, dropping special tokens.
caption = processor.decode(out[0], skip_special_tokens=True)
print(caption)

a photography of a flooded street with a truck driving through it


In [6]:
from langchain_community.llms import Ollama


In [7]:
# LLM handle for the "llama3.1:latest" model, used below to translate captions.
# NOTE(review): presumably needs a reachable Ollama server with this model pulled - confirm.
llm = Ollama(model="llama3.1:latest")

In [8]:
# English -> Vietnamese translation prompt; the `{english}` slot is filled with
# str.format. Written line by line so the leading/trailing newlines and the
# trailing space after "Vietnamese:" are visible in the source.
prompt = (
    "\n"
    "You are an translation assistant for language translation tasks. You need to translate english sentence to vietnamese and response only the Vietnamese result.\n"
    "\n"
    "English :{english}\n"
    "Vietnamese: \n"
    "\n"
)

In [9]:
def question_llm(english):
    """Ask the LLM to translate an English sentence to Vietnamese.

    The sentence is substituted into the ``{english}`` slot of the
    module-level ``prompt`` template and the result is sent to the
    module-level ``llm``.

    Args:
        english: English sentence to translate.

    Returns:
        Whatever ``llm.invoke`` returns for the formatted prompt.
    """
    return llm.invoke(prompt.format(english=english))

In [10]:
# Try the translator on the caption printed for keyframe 016 above.
question_llm("a photography of a flooded street with a truck driving through it")

'Hình ảnh đường phố bị ngập nước với một xe tải đang chạy qua.'

In [12]:
from pymilvus.model.hybrid import BGEM3EmbeddingFunction

# Embedding function used further down for both documents and queries.
bge_m3_ef = BGEM3EmbeddingFunction(
    model_name="BAAI/bge-m3",
    device="cpu",     # e.g. "cuda:0" to run on a GPU instead
    use_fp16=False,   # keep False when the device is the CPU
)

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 44651.92it/s]


In [13]:
# Document(s) to index: the Vietnamese caption to embed.
# (Plain string literal: the previous f-prefix had no placeholders to fill.)
docs = ["Hình ảnh đường phố bị ngập nước với một xe tải đang chạy qua."]

# Encode the documents and keep only the "dense" entry of the result.
docs_embeddings = bge_m3_ef.encode_documents(docs)
vectors = docs_embeddings["dense"]

In [14]:
# Display the dense embeddings computed above as a quick sanity check.
vectors

[array([-0.04253798,  0.00780115, -0.03375866, ...,  0.0280426 ,
        -0.04685987, -0.02310734], dtype=float32)]

In [15]:
from pymilvus import MilvusClient

# Milvus connection used by the collection, insert, query and search cells below.
client = MilvusClient(uri="http://localhost:19530")

In [16]:
# Drop any collection left over from a previous run so that re-running this
# cell always starts from an empty collection.
if client.has_collection(collection_name="ai_challenge_collection"):
    client.drop_collection(collection_name="ai_challenge_collection")

client.create_collection(
    collection_name="ai_challenge_collection",
    # Must equal the length of the dense vectors inserted below. Read it from
    # the embedding function (1024 for BAAI/bge-m3) instead of hard-coding it,
    # so it cannot drift from the model in use.
    dimension=bge_m3_ef.dim["dense"],
    auto_id=True,  # primary keys are generated on insert
)


In [18]:
# One row per document: its dense vector plus the original text. Pairing with
# zip() inserts every document in `docs`, not just the first one.
data = [
    {"vector": vector, "text": doc}
    for vector, doc in zip(vectors, docs)
]
res = client.insert(collection_name="ai_challenge_collection", data=data)
print(res)


{'insert_count': 1, 'ids': [452540126240705757]}


In [19]:

# List what is currently stored in the collection.
res = client.query(
    collection_name="ai_challenge_collection",  # target collection
    filter="",  # empty filter expression
    limit=100,  # maximum number of returned entities
    output_fields=["id", "text"],  # fields to be returned
)
for item in res:
    print(item)

{'id': 452540126240705757, 'text': 'Hình ảnh đường phố bị ngập nước với một xe tải đang chạy qua.'}


In [22]:
# Embed the text query with the same embedding function used for the documents.
queries = ["Tìm ảnh thành phố ngập nước có một xe tải đang chạy"]
query_embeddings = bge_m3_ef.encode_queries(queries)
query_vector = query_embeddings['dense'][0]

# Vector search for the 5 closest stored entries.
res = client.search(
    collection_name="ai_challenge_collection",
    data=[query_vector],
    limit=5,
    output_fields=["id", "text"],
)

# Only one query vector was sent, so keep the first entry of the result.
context_items = res[0]
print(context_items)



[{'id': 452540126240705757, 'distance': 0.8477668762207031, 'entity': {'text': 'Hình ảnh đường phố bị ngập nước với một xe tải đang chạy qua.', 'id': 452540126240705757}}]
