## Create Vector Search Dataset

* Reference : https://docs.cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api

### Install and configuration

In [1]:
%pip install --upgrade --quiet google-genai \
                                numpy \
                                scipy \
                                pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
#Set environment variables
PROJECT_ID = "ai-hangsik" 
REGION = "us-central1"
USE_VERTEX_AI = True 


In [3]:
!gcloud auth application-default login
!gcloud auth application-default set-quota-project {PROJECT_ID}

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=bu5h3IND05e9LVwWGlqqG4FUbeIbRP&access_type=offline&code_challenge=WfaSqt_fQTj8pZ_A_YIzrhbvMoOqBOuZTvpafOq-1s8&code_challenge_method=S256


Credentials saved to file: [/Users/hangsik/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "ai-hangsik" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.

Credentials saved to file: [/Users/hangsik/.config/gc

### Execution

In [4]:
import time
import numpy as np
import pandas as pd

from google import genai

import embedding as embed_utils

# Login to Vertex AI
client = genai.Client(
    vertexai=USE_VERTEX_AI,
    project=PROJECT_ID,
    location=REGION,)

In [None]:
# 1. Read CSV file (assuming you have a CSV with a 'text' column)
df = pd.read_csv('../embeddings/data/.audio_truth.csv',skipinitialspace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   datapoint_id  52 non-null     int64 
 1   text          52 non-null     object
dtypes: int64(1), object(1)
memory usage: 964.0+ bytes


In [6]:
# 2. Generate embeddings for all texts
# MODEL = "text-multilingual-embedding-002"
MODEL = "gemini-embedding-001"

df['feature_vector'] = df['text'].apply(lambda x: embed_utils.gemini_embedding_func(
    client=client,
    model=MODEL,
    task_type="SEMANTIC_SIMILARITY",
    output_dimensionality=3072,
    contents=x
))

Latency (ns): 2167.24 ms
Latency (ns): 441.41 ms
Latency (ns): 629.52 ms
Latency (ns): 535.77 ms
Latency (ns): 532.72 ms
Latency (ns): 538.42 ms
Latency (ns): 629.34 ms
Latency (ns): 435.02 ms
Latency (ns): 306.27 ms
Latency (ns): 297.98 ms
Latency (ns): 300.82 ms
Latency (ns): 298.54 ms
Latency (ns): 720.41 ms
Latency (ns): 440.05 ms
Latency (ns): 308.89 ms
Latency (ns): 433.57 ms
Latency (ns): 299.61 ms
Latency (ns): 443.49 ms
Latency (ns): 296.40 ms
Latency (ns): 297.48 ms
Latency (ns): 295.04 ms
Latency (ns): 286.09 ms
Latency (ns): 292.69 ms
Latency (ns): 307.88 ms
Latency (ns): 299.06 ms
Latency (ns): 302.04 ms
Latency (ns): 289.91 ms
Latency (ns): 305.98 ms
Latency (ns): 306.12 ms
Latency (ns): 320.57 ms
Latency (ns): 309.60 ms
Latency (ns): 353.46 ms
Latency (ns): 287.26 ms
Latency (ns): 286.13 ms
Latency (ns): 353.03 ms
Latency (ns): 298.14 ms
Latency (ns): 308.76 ms
Latency (ns): 314.65 ms
Latency (ns): 293.70 ms
Latency (ns): 292.95 ms
Latency (ns): 298.43 ms
Latency (ns): 2

In [7]:
df

Unnamed: 0,datapoint_id,text,feature_vector
0,1,하이 LG,"[-0.028963137418031693, 0.012049556709825993, ..."
1,2,하이 TV,"[-0.033387742936611176, -0.0003027681377716362..."
2,3,하이 LG 폭군의 셰프 틀어줘,"[-0.018237600103020668, 0.002341760555282235, ..."
3,4,나는 솔로 이어서 보고 싶어,"[-0.004924905952066183, -0.0019802474416792393..."
4,5,드라마 정년이 1화 틀어줘,"[-0.015421690419316292, 0.0034547001123428345,..."
5,6,오징어 게임 시즌 2 지금 바로 재생해 줘,"[-0.0038162621203809977, -0.008096965029835701..."
6,7,새로 나온 지옥 시즌 2 찾아서 보여줘,"[-0.02272782102227211, -0.01309523917734623, 0..."
7,8,고수 나오는 가석방 심사관 이한신 정주행 할래,"[-0.020016593858599663, -0.023133916780352592,..."
8,9,어제 본 새벽 2시의 신데렐라 이어서 틀어줘,"[-0.015501613728702068, 0.0005835340707562864,..."
9,10,요즘 인기있는 돌풍 드라마 좀 보자,"[-0.018468040972948074, -0.00851362757384777, ..."


In [8]:
# 3. Find similar texts for a query
MODEL = "gemini-embedding-001"
QUERY = "오징어 있어?"

similar_texts = embed_utils.find_similar_texts(client, MODEL, QUERY, df)

search_results = []
# 4. Print results
for result in similar_texts:
    
    search_results.append({
        "text": result['text'],
        "similarity": f"{result['similarity']:.4f}"
    })

search_results

Latency (ns): 1176.19 ms


[{'text': '오징어 게임 시즌 2 예고편 있어', 'similarity': '0.8384'},
 {'text': '오늘 새로 업데이트된 콘텐츠 뭐 있어', 'similarity': '0.7903'},
 {'text': '오늘의 주우재 가을 코디 영상 볼래', 'similarity': '0.7850'},
 {'text': 'TV야 볼만한 거 추천해 줘', 'similarity': '0.7808'},
 {'text': '오징어 게임 시즌 2 지금 바로 재생해 줘', 'similarity': '0.7763'}]

In [11]:
MODEL = "gemini-2.5-flash-lite"

PROMPT = f"""
    당신은 사용자의 질문을 이해해서 정확한 질문의 의도를 바탕으로 사용자의 질문을 재작성해주는 AI 어시스턴트입니다.
    사용자의 질문 : {QUERY} 과 검색된 유사한 질문들을 참고하여 최대한 사용자의 질문을 반영한 명확한 질문으로 재작성해 주세요.
    유사한 질문들 : {search_results}    

    답변은 아래와 같이 사용자의 질문을 최소화해서 변경 후 재작성 해주세요.
    답변예제 : "최신 개봉 영화 예고편 모음 틀어줘" 
"""
start_time = time.perf_counter_ns()

response = client.models.generate_content(
    model=MODEL,
    contents=PROMPT,
)

end_time = time.perf_counter_ns()

latency = (end_time - start_time)
print(f"{MODEL} Latency (ns): {latency*1e-6:.2f} ms \n")

print(response.text)

gemini-2.5-flash-lite Latency (ns): 1118.77 ms 

"오징어 게임 시즌 2 예고편 보여줘"


### Generate Dataset for Vector Search Index

In [9]:
import json

# read product-embs.json and put them to a list

datapoints = []
for index, item in df.iterrows():
    id = str(item["datapoint_id"])
    metadata = item["text"]
    embedding = item["feature_vector"]

    datapoints.append(
        {
            "datapoint_id": str(id), 
            "feature_vector": embedding,
            "embedding_metadata": 
                {"text": metadata},
        }     
    )

datapoints



[{'datapoint_id': '1',
  'feature_vector': [-0.028963137418031693,
   0.012049556709825993,
   -0.006816131062805653,
   -0.07344743609428406,
   -0.009694629348814487,
   -0.01742573454976082,
   -0.02613888308405876,
   0.012300304137170315,
   0.0022439893800765276,
   0.012580711394548416,
   -0.008340196684002876,
   -0.0017689877422526479,
   -0.012250063940882683,
   0.02446151152253151,
   0.12362102419137955,
   -0.007907714694738388,
   0.004604454152286053,
   0.008389596827328205,
   -0.009573840536177158,
   -0.0024745287373661995,
   -0.02634391002357006,
   -0.0009189480915665627,
   -0.00220539141446352,
   -0.008932769298553467,
   -0.003138359170407057,
   -0.00407725665718317,
   0.021254248917102814,
   0.00808217003941536,
   0.040438391268253326,
   0.007199564483016729,
   -0.013137942180037498,
   0.007316088769584894,
   0.0026240115985274315,
   0.009585423395037651,
   -0.0022709572222083807,
   0.0011993589578196406,
   0.02767503820359707,
   0.019295634701

In [10]:
# Convert the list to a JSON string and write it to a file
with open("vector_search_dataset.json", "w", encoding='utf-8') as jsonl_file:
    
    for item in datapoints:
        json_line = json.dumps(item, ensure_ascii=False)
        print(json_line)
        jsonl_file.write(json_line +'\n')

    # json.dump(datapoints, json_file, ensure_ascii=False, indent=4)


{"datapoint_id": "1", "feature_vector": [-0.028963137418031693, 0.012049556709825993, -0.006816131062805653, -0.07344743609428406, -0.009694629348814487, -0.01742573454976082, -0.02613888308405876, 0.012300304137170315, 0.0022439893800765276, 0.012580711394548416, -0.008340196684002876, -0.0017689877422526479, -0.012250063940882683, 0.02446151152253151, 0.12362102419137955, -0.007907714694738388, 0.004604454152286053, 0.008389596827328205, -0.009573840536177158, -0.0024745287373661995, -0.02634391002357006, -0.0009189480915665627, -0.00220539141446352, -0.008932769298553467, -0.003138359170407057, -0.00407725665718317, 0.021254248917102814, 0.00808217003941536, 0.040438391268253326, 0.007199564483016729, -0.013137942180037498, 0.007316088769584894, 0.0026240115985274315, 0.009585423395037651, -0.0022709572222083807, 0.0011993589578196406, 0.02767503820359707, 0.01929563470184803, 0.009222270920872688, 0.020787851884961128, -0.009488401003181934, -0.005026204977184534, -0.00620021810755

## End of Document