## Create Vector Search Dataset

* Reference: https://docs.cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api

### Installation and configuration

In [None]:
# Install or upgrade the packages this notebook depends on into the active
# kernel environment; --quiet suppresses most of pip's output.
%pip install --upgrade --quiet google-genai \
                                numpy \
                                scipy \
                                pandas

In [None]:
# Notebook-wide settings (plain Python constants, not OS environment variables).
# They are consumed by the gcloud command and the genai.Client call below.
PROJECT_ID = "ai-hangsik"  # Google Cloud project ID
REGION = "us-central1"  # passed as `location=` to the client
USE_VERTEX_AI = True  # passed as `vertexai=` to the client


In [None]:
# Obtain Application Default Credentials (ADC) through the gcloud CLI.
!gcloud auth application-default login
# Attach a quota project to the ADC; IPython expands {PROJECT_ID} from the
# Python variable of the same name before the shell command runs.
!gcloud auth application-default set-quota-project {PROJECT_ID}

### Execution

In [None]:
import time
import numpy as np
import pandas as pd

from google import genai

import embedding as embed_utils

# Build the Gen AI SDK client from the notebook settings: the `vertexai` flag,
# the project and the location all come from the constants defined earlier.
client = genai.Client(
    vertexai=USE_VERTEX_AI,
    project=PROJECT_ID,
    location=REGION,
)

In [None]:
# 1. Load the source table. Later cells read its 'datapoint_id' and 'text'
#    columns. skipinitialspace=True skips spaces that follow a delimiter.
df = pd.read_csv(
    "../embeddings/data/.audio_truth.csv",
    skipinitialspace=True,
)
# Print a summary of the loaded frame.
df.info()

In [None]:
# 2. Embed the text of every row and store the result in 'feature_vector'
# Alternative model: "text-multilingual-embedding-002"
MODEL = "gemini-embedding-001"


def _embed_text(text):
    """Return what embed_utils.gemini_embedding_func yields for one text."""
    return embed_utils.gemini_embedding_func(
        client=client,
        model=MODEL,
        task_type="SEMANTIC_SIMILARITY",
        output_dimensionality=3072,
        contents=text,
    )


# The helper is invoked once per row of the 'text' column.
df["feature_vector"] = df["text"].apply(_embed_text)

In [None]:
# Display the DataFrame as the cell output to inspect the result so far.
df

In [None]:
# 3. Look up texts in df that are similar to the query via the project helper
MODEL = "gemini-embedding-001"
QUERY = "오징어 있어?"

similar_texts = embed_utils.find_similar_texts(client, MODEL, QUERY, df)

# 4. Keep only the text and the similarity (formatted to 4 decimals) per hit
search_results = [
    {"text": hit["text"], "similarity": f"{hit['similarity']:.4f}"}
    for hit in similar_texts
]

# Show the collected hits as the cell output.
search_results

In [None]:
MODEL = "gemini-2.5-flash-lite"

# Query-rewriting prompt (Korean): asks the model to rewrite the user's QUERY
# into a clearer question, using the retrieved similar questions in
# search_results as reference, while changing the original as little as possible.
PROMPT = f"""
    당신은 사용자의 질문을 이해해서 정확한 질문의 의도를 바탕으로 사용자의 질문을 재작성해주는 AI 어시스턴트입니다.
    사용자의 질문 : {QUERY} 과 검색된 유사한 질문들을 참고하여 최대한 사용자의 질문을 반영한 명확한 질문으로 재작성해 주세요.
    유사한 질문들 : {search_results}    

    답변은 아래와 같이 사용자의 질문을 최소화해서 변경 후 재작성 해주세요.
    답변예제 : "최신 개봉 영화 예고편 모음 틀어줘" 
"""

# Time only the generate_content call, using the nanosecond-resolution counter.
start_time = time.perf_counter_ns()

response = client.models.generate_content(
    model=MODEL,
    contents=PROMPT,
)

end_time = time.perf_counter_ns()

# `latency` is in nanoseconds; multiplying by 1e-6 converts it to milliseconds.
# The label previously said "(ns)" although the printed value is in ms.
latency = end_time - start_time
print(f"{MODEL} Latency: {latency * 1e-6:.2f} ms \n")

print(response.text)

### Generate Dataset for Vector Search Index

In [None]:
import json  # used by the following cell to serialize the datapoints

# Build one datapoint record per row of df, with the keys
# 'datapoint_id', 'feature_vector' and 'embedding_metadata'.
# The columns are zipped directly instead of using df.iterrows(): this avoids
# rebinding the builtin name `id` at notebook scope, drops the unused row index,
# and converts the id to str exactly once.
datapoints = [
    {
        "datapoint_id": str(datapoint_id),
        "feature_vector": feature_vector,
        "embedding_metadata": {"text": text},
    }
    for datapoint_id, text, feature_vector in zip(
        df["datapoint_id"], df["text"], df["feature_vector"]
    )
]

# Show the records as the cell output.
datapoints



In [None]:
# Write the datapoints in JSON Lines form: one JSON object per line.
with open("vector_search_dataset.json", "w", encoding="utf-8") as out_file:
    for datapoint in datapoints:
        # ensure_ascii=False keeps non-ASCII text as-is instead of \u escapes.
        serialized = json.dumps(datapoint, ensure_ascii=False)
        # Echo each line so the written content is visible in the cell output.
        print(serialized)
        out_file.write(f"{serialized}\n")


## End of Document