## Google Text Embedding models

* Reference : https://docs.cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api

### Install and configuration

In [None]:
%pip install --upgrade --quiet google-genai \
                                numpy \
                                scipy \
                                pandas

In [None]:
# Notebook-wide settings; they are consumed by the gcloud command and the genai.Client call in later cells.
PROJECT_ID = "ai-hangsik"  # Google Cloud project ID; replace with your own project
REGION = "us-central1"  # passed to genai.Client as `location`
USE_VERTEX_AI = True  # passed to genai.Client as `vertexai`


In [None]:
!gcloud auth application-default login
!gcloud auth application-default set-quota-project {PROJECT_ID}

### Execution

In [None]:
from google import genai
from google.genai.types import EmbedContentConfig

import time

In [None]:
# Build the shared Gen AI SDK client from the notebook settings defined earlier.
client = genai.Client(
    location=REGION,
    project=PROJECT_ID,
    vertexai=USE_VERTEX_AI,
)

In [None]:
# Calculate cosine similarity between two embedding arrays
def cosine_similarity(embed_1, embed_2):
  """Print and return the cosine similarity of two embedding vectors.

  Args:
    embed_1: First embedding, a 1-D sequence of numbers.
    embed_2: Second embedding, a 1-D sequence of the same length.

  Returns:
    The similarity as a float, computed as ``1 - cosine distance``.
  """
  import numpy as np
  from scipy.spatial.distance import cosine

  vector_1 = np.array(embed_1)
  vector_2 = np.array(embed_2)

  # The score gets its own name so it does not shadow this function.
  similarity = float(1 - cosine(vector_1, vector_2))
  print(f"Cosine similarity : {similarity:.4f}")

  # Returned as well as printed so callers can reuse the score.
  return similarity


### Google Text Embedding models
* Documentation: https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models
* Task types example notebook: https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/task-type-embedding.ipynb

In [None]:
# Generate a text embedding with the given embedding model
def gemini_embedding_func(model:str,
                          contents,
                          task_type:str="SEMANTIC_SIMILARITY",
                          output_dimensionality:int=768,
                          ):
    """Embed ``contents`` with ``model`` and return the first embedding vector.

    Uses the notebook-level ``client`` and prints the wall-clock latency of the
    call (config construction plus the API request) in milliseconds.

    Args:
        model: Embedding model name passed to ``client.models.embed_content``.
        contents: Content to embed, passed to the API unchanged. Only the
            first returned embedding is used, so pass one text per call.
        task_type: Task type forwarded to ``EmbedContentConfig``.
        output_dimensionality: Requested embedding size, forwarded to
            ``EmbedContentConfig``.

    Returns:
        ``result.embeddings[0].values`` from the API response.
    """
    start_time = time.perf_counter_ns()

    # https://googleapis.github.io/python-genai/genai.html#genai.types.EmbedContentConfig
    embed_config = EmbedContentConfig(
        auto_truncate=True,
        # task types ref : https://docs.cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api#parameter-list
        task_type=task_type,
        mime_type="text/plain",
        output_dimensionality=output_dimensionality,
        # title="title of the text" # when task type is RETRIEVAL_DOCUMENT
    )

    result = client.models.embed_content(
        model=model,
        contents=contents,
        config=embed_config,
    )

    end_time = time.perf_counter_ns()

    # perf_counter_ns() counts nanoseconds; scale by 1e-6 to report milliseconds.
    latency = (end_time - start_time)
    print(f"Latency: {latency*1e-6:.2f} ms")

    return result.embeddings[0].values

#### text-multilingual-embedding-002

In [None]:
MODEL = "text-multilingual-embedding-002"

CONTENT_1 = "고양이가 자전거를 타고 간다"
CONTENT_2 = "호랑이가 차를 차고 가고 있고 고양이도 자전거를 타고 뒤따르고 있다"

# Embed each sentence with identical settings (one API call per sentence).
embed_1 = gemini_embedding_func(
    model=MODEL,
    contents=CONTENT_1,
    task_type="SEMANTIC_SIMILARITY",
    output_dimensionality=768,
)
embed_2 = gemini_embedding_func(
    model=MODEL,
    contents=CONTENT_2,
    task_type="SEMANTIC_SIMILARITY",
    output_dimensionality=768,
)

# Report how close the two sentence embeddings are.
cosine_similarity(embed_1, embed_2)

#### gemini embedding

* Paper (arXiv): https://arxiv.org/pdf/2503.07891


In [None]:
MODEL = "gemini-embedding-001"

CONTENT_1 = "하이라키 마지막 회 틀어줘"
CONTENT_2 = "하이라이트 마지막에 틀어 줘"

# Embed each sentence with identical settings (one API call per sentence).
embed_1 = gemini_embedding_func(
    model=MODEL,
    contents=CONTENT_1,
    task_type="SEMANTIC_SIMILARITY",
    output_dimensionality=3072,
)
embed_2 = gemini_embedding_func(
    model=MODEL,
    contents=CONTENT_2,
    task_type="SEMANTIC_SIMILARITY",
    output_dimensionality=3072,
)

# Report how close the two sentence embeddings are.
cosine_similarity(embed_1, embed_2)

### Find similar texts

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

def find_similar_texts(query_text, df, embedding_column='embedding', text_column='text', top_k=5,
                       model=None, task_type="SEMANTIC_SIMILARITY", output_dimensionality=3072):
    """Return the ``top_k`` entries of ``df`` most similar to ``query_text``.

    The query is embedded with ``gemini_embedding_func`` and compared with each
    stored embedding by cosine similarity (``1 - cosine distance``).

    Args:
        query_text: Text to search for.
        df: DataFrame holding the candidate texts and their embeddings.
        embedding_column: Column of ``df`` holding one embedding vector per row.
        text_column: Column of ``df`` holding the text of each row.
        top_k: Number of best matches to keep.
        model: Embedding model used for the query. ``None`` falls back to the
            notebook-level ``MODEL``; pass it explicitly when ``MODEL`` may
            have been rebound to a non-embedding model by another cell.
        task_type: Task type used for the query embedding.
        output_dimensionality: Size of the query embedding; it has to match
            the size of the vectors stored in ``embedding_column``.

    Returns:
        A list of ``{'text': ..., 'similarity': ...}`` dicts ordered from most
        to least similar.
    """
    # Generate embedding for query text
    query_embedding = gemini_embedding_func(
        model=MODEL if model is None else model,
        task_type=task_type,
        output_dimensionality=output_dimensionality,
        contents=query_text,
    )

    # Walk the two columns in lockstep instead of df.iterrows(), which builds
    # a Series object for every row.
    similarities = [
        {'text': text, 'similarity': 1 - cosine(query_embedding, embedding)}
        for text, embedding in zip(df[text_column], df[embedding_column])
    ]

    # Sort by similarity and get top k results
    return sorted(similarities, key=lambda item: item['similarity'], reverse=True)[:top_k]

In [None]:
# Example usage:
# 1. Load the CSV (it is expected to provide a 'text' column) and show its summary
df = pd.read_csv(
    'data/.audio_truth.csv',
    skipinitialspace=True,
)
df.info()

In [None]:
# 2. Embed every value of the 'text' column and store the vectors in an 'embedding' column

MODEL = "gemini-embedding-001"

df['embedding'] = df['text'].apply(
    lambda text: gemini_embedding_func(
        contents=text,
        model=MODEL,
        output_dimensionality=3072,
        task_type="SEMANTIC_SIMILARITY",
    )
)

In [None]:
# 3. Retrieve the texts most similar to the query
query = "오징어 게임 있어?"
similar_texts = find_similar_texts(query, df)

# 4. Collect the matches for display, with each score formatted to four decimals
search_results = []
for result in similar_texts:
    search_results.append(
        dict(
            text=result['text'],
            similarity=format(result['similarity'], ".4f"),
        )
    )

search_results

In [None]:
# Model used with client.models.generate_content below; this rebinds MODEL,
# which earlier cells set to an embedding model name.
MODEL = "gemini-2.5-flash-lite"

# Korean prompt: tells the model it is an assistant that rewrites the user's
# question ({query}) into a clear question based on its intent, using the
# retrieved similar questions ({search_results}) as reference and changing the
# original wording as little as possible. It ends with one example answer.
PROMPT = f"""
    당신은 사용자의 질문을 이해해서 정확한 질문의 의도를 바탕으로 사용자의 질문을 재작성해주는 AI 어시스턴트입니다.
    사용자의 질문 : {query} 과 검색된 유사한 질문들을 참고하여 최대한 사용자의 질문을 반영한 명확한 질문으로 재작성해 주세요.
    유사한 질문들 : {search_results}    

    답변은 아래와 같이 사용자의 질문을 최소화해서 변경 후 재작성 해주세요.
    답변예제 : "최신 개봉 영화 예고편 모음 틀어줘" 
"""
# Time the generation request with a nanosecond counter.
start_time = time.perf_counter_ns()

response = client.models.generate_content(
    model=MODEL,
    contents=PROMPT,
)

end_time = time.perf_counter_ns()

# The counter difference is in nanoseconds; scale by 1e-6 to report milliseconds
# (the label previously said "ns" while the value printed was in ms).
latency = (end_time - start_time)
print(f"{MODEL} Latency: {latency*1e-6:.2f} ms \n")

print(response.text)

## End of Document