Feedback to shins777@gmail.com

구글은 이미지 해석, 생성 등 멀티모달 처리가 가능한 모델을 제공합니다.
이러한 처리를 위해서는 멀티모달 임베딩이 필요하며, 이 Colab은 멀티모달 임베딩을 수행하는 예제입니다.

In [None]:
# Install the google-cloud-aiplatform package, which provides the
# `google.cloud.aiplatform` module imported by the cells below.
# `--quiet` reduces pip's console output.
!pip install google-cloud-aiplatform --quiet

In [None]:
# Authenticate the current Colab user; this module is only available when the
# notebook runs inside Google Colab.
# NOTE(review): the later Vertex AI calls presumably rely on the credentials
# obtained here — confirm when running outside Colab.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Initialize the aiplatform SDK with the Google Cloud project this notebook
# uses. Replace "ai-hangsik" with your own project ID when reusing the notebook.
from google.cloud import aiplatform
aiplatform.init(project="ai-hangsik")

# Embedding

In [None]:
# Copyright 2023 Google LLC.
# SPDX-License-Identifier: Apache-2.0
from absl import app
from absl import flags
import base64
# Need to do pip install google-cloud-aiplatform for the following two imports.
# Also run: gcloud auth application-default login.
from google.cloud import aiplatform
from google.protobuf import struct_pb2
import sys
import time
import typing


class EmbeddingResponse(typing.NamedTuple):
  """Embeddings produced by a single multimodal embedding request.

  Either field is None when the corresponding input (text or image) was not
  part of the request, so both are typed as optional and default to None.
  """
  # Embedding of the request's text, or None if no text was sent.
  text_embedding: typing.Optional[typing.Sequence[float]] = None
  # Embedding of the request's image, or None if no image was sent.
  image_embedding: typing.Optional[typing.Sequence[float]] = None


In [None]:
def get_embedding(text: typing.Optional[str] = None,
                  image_bytes: typing.Optional[bytes] = None,
                  *,
                  project: str = "ai-hangsik",
                  location: str = "us-central1",
                  client=None) -> "EmbeddingResponse":
  """Request text and/or image embeddings from `multimodalembedding@001`.

  Args:
    text: Text to embed. Skipped when None or empty.
    image_bytes: Raw image file bytes to embed; they are base64-encoded here
      before being sent. Skipped when None or empty.
    project: Google Cloud project ID used in the model resource path.
    location: Region used both for the API host
      (`{location}-aiplatform.googleapis.com`) and the model resource path.
    client: Optional `aiplatform.gapic.PredictionServiceClient` to reuse
      across calls. When None, a new client is created for this call.

  Returns:
    An EmbeddingResponse whose `text_embedding` / `image_embedding` is a list
    of values when the matching input was supplied, and None otherwise.

  Raises:
    ValueError: If neither `text` nor `image_bytes` is provided.
  """
  # Validate first so that a bad call fails with the intended ValueError
  # without needing credentials or a network connection to build a client.
  if not text and not image_bytes:
    raise ValueError('At least one of text or image_bytes must be specified.')

  # A client only needs to be created once and can serve many requests, so
  # callers that embed repeatedly should pass their own via `client`.
  if client is None:
    client_options = {"api_endpoint": f"{location}-aiplatform.googleapis.com"}
    client = aiplatform.gapic.PredictionServiceClient(
        client_options=client_options)

  # Build the single request instance; only the supplied modalities are set.
  instance = struct_pb2.Struct()
  if text:
    instance.fields['text'].string_value = text
  if image_bytes:
    image_struct = instance.fields['image'].struct_value
    image_struct.fields['bytesBase64Encoded'].string_value = (
        base64.b64encode(image_bytes).decode("utf-8"))

  endpoint = (f"projects/{project}/locations/{location}"
              "/publishers/google/models/multimodalembedding@001")
  response = client.predict(endpoint=endpoint, instances=[instance])

  # One instance was sent, so the embeddings are read from the first prediction.
  prediction = response.predictions[0]
  text_embedding = list(prediction['textEmbedding']) if text else None
  image_embedding = list(prediction['imageEmbedding']) if image_bytes else None

  return EmbeddingResponse(
    text_embedding=text_embedding,
    image_embedding=image_embedding)

## Image Embedding

In [None]:
# Image to embed. Another sample previously used here: "./rabbit.jpg".
image_file = "./cat_bike.png"

# Start from None so the name is bound even if reading the file fails.
image_file_contents = None
with open(image_file, "rb") as image_fh:
    image_file_contents = image_fh.read()

# Only image bytes are sent, so `text` keeps its default of None.
response = get_embedding(image_bytes=image_file_contents)

# The name is spelled "embeded_image" because the similarity cell reads it.
embeded_image = response.image_embedding

## Text Embedding & Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Candidate Korean captions for the image, ordered from least to most
# detailed. They are kept as data so that switching between them is a
# one-index change instead of commenting lines in and out.
candidate_texts = (
    "고양이가 오토바이를 탑니다.",
    "고양이가 오토바이를 탑니다. 도로 주위 나무는 단풍이 들었습니다.",
    "파란눈의 고양이가 오토바이를 탑니다. 도로 주위 나무는 단풍이 들었습니다.",
    "파란 하늘날 파란눈의 고양이가 오토바이를 탑니다. 도로 주위 나무는 단풍이 들었습니다.",
    "파란 하늘날 파란눈의 고양이가 오토바이를 탑니다. 도로위에 흰줄이 있고 주위 나무는 단풍이 들었습니다.",
)

# Index 1 is the caption that was active in the original cell.
text_value = candidate_texts[1]

# Text-only request: no image bytes are sent.
response = get_embedding(text=text_value, image_bytes=None)

embeded_text = response.text_embedding

# `embeded_image` must already exist from the image-embedding cell.
# cosine_similarity works on 2-D inputs and returns a matrix, hence the
# single-row wrapping and the [0][0] lookup.
similarity_ratio = cosine_similarity([embeded_text], [embeded_image])[0][0]
print(similarity_ratio)




0.17430826988890968
