Feedback to shins777@gmail.com

구글의 BigQuery는 Vector DB로서의 시맨틱 검색뿐 아니라, 키워드 검색이 가능한 SEARCH 기능도 제공합니다.
이 기능을 활용하면 시맨틱 검색 + 키워드 검색 형태로 좀 더 나은 검색 성능을 기대할 수 있습니다.

# 라이브러리 설치

In [None]:
# Install or upgrade (quietly) the LangChain, Vertex AI and BigQuery packages imported by the cells below.
!pip install --upgrade --quiet langchain langchain-google-vertexai google-cloud-aiplatform google-cloud-bigquery

# GCP 인증 및 환경설정

In [None]:
# Colab-only helper module; this cell is expected to run inside Google Colab.
from google.colab import auth
# Run the interactive Colab sign-in. Presumably the Google Cloud clients created
# in later cells pick up these credentials -- verify when running outside Colab.
auth.authenticate_user()

In [None]:
# Placeholder value: replace with your own Google Cloud project ID before running.
PROJECT_ID="PROJECT_ID"
# Region passed to the BigQuery client, the vector store and the Vertex AI model below.
REGION="asia-northeast3"
# Vertex AI model name used for answer generation at the end of the notebook.
MODEL = "gemini-pro"

# Point the gcloud CLI at the project, then print the active project to confirm.
!gcloud config set project {PROJECT_ID}
!gcloud config get-value project

# BigQuery dataset 구성


In [None]:
from google.cloud import bigquery

# Dataset and table names reused by the vector store and the SQL cells below.
DATASET = "search_dataset"
TABLE = "search_table"

# BigQuery client bound to the configured project and region.
client = bigquery.Client(project=PROJECT_ID, location=REGION)
# exists_ok=True keeps this cell re-runnable when the dataset is already there.
client.create_dataset(dataset=DATASET, exists_ok=True)


# Embedding Model & Table 구성

BigQuery SEARCH 기능은 Embedding 했던 테이블의 Context 부분을 키워드로 검색하는 목적으로 사용합니다.


In [None]:
from langchain_google_vertexai import VertexAIEmbeddings

# Vertex AI text-embedding model name (multilingual gecko, "@latest" version alias).
# NOTE(review): the constant name is misspelled (EBEDDING -> EMBEDDING); it is kept
# unchanged here because other cells may reference it.
EBEDDING_MODEL = "textembedding-gecko-multilingual@latest"

# Embedding client that is handed to the BigQuery vector store in the next cell.
embedding = VertexAIEmbeddings(
    model_name=EBEDDING_MODEL, project=PROJECT_ID
)

In [None]:
# Import both names from langchain_community so the vector store and its
# distance enum come from the same package (the `langchain.vectorstores`
# path is only a legacy re-export of the community module).
from langchain_community.vectorstores import BigQueryVectorSearch
from langchain_community.vectorstores.utils import DistanceStrategy

# Vector store handle backed by the BigQuery table `DATASET.TABLE`.
# Texts added through it are embedded with `embedding`; the rows are later
# read back through their `doc_id` and `content` columns.
table = BigQueryVectorSearch(
    project_id=PROJECT_ID,
    dataset_name=DATASET,
    table_name=TABLE,
    location=REGION,
    embedding=embedding,
    # Similarity metric used for vector search. Available strategies:
    # https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.utils.DistanceStrategy.html#langchain_community.vectorstores.utils.DistanceStrategy
    distance_strategy=DistanceStrategy.COSINE,
)

# 테이블에 데이터 저장

In [None]:
import pandas as pd

# Load the pipe-delimited source file; the 'utf-8-sig' codec drops a leading BOM if present.
terms = pd.read_csv('./term1.csv',sep="|", encoding='utf-8-sig')
# Preview the first rows to check that the file parsed as expected.
terms.head()

In [None]:
import json

# Texts to embed and store: every value of the CSV's 'context' column.
all_texts = terms['context'].to_list()
# Disabled variant: attach each row's 'context_title' as metadata when storing.
#metadatas = [ {'context_title': row['context_title'] } for idx, row in terms.iterrows()]
#table.add_texts(all_texts, metadatas=metadatas)
# Store the texts in the BigQuery vector store table without metadata.
table.add_texts(all_texts)

# SEARCH 를 위한 Search index 생성

In [None]:

# DDL for a BigQuery search index on the stored text.
# The indexed column is `content`: that is the column the search cell reads
# back from this table (row['content']); the table has no `context` column.
# IF NOT EXISTS keeps the cell safe to re-run.
query = (
    "CREATE SEARCH INDEX IF NOT EXISTS search_index "
    f"ON `{PROJECT_ID}.{DATASET}.{TABLE}`(content);"
)

print(query)

# Actually create the index (previously the statement was only printed);
# .result() blocks until the DDL job finishes and raises if it failed.
client.query(query).result()


In [None]:

# Keyword search over the stored text with BigQuery's SEARCH() function.
# The table name is built from PROJECT_ID / DATASET / TABLE so the query
# targets the same table the cells above populated, and the searched column
# is `content` to match the column read from each row below.
# Only doc_id and content are selected: nothing else (e.g. the embedding
# vectors) is used afterwards.
query = f"""

SELECT doc_id, content
FROM `{PROJECT_ID}.{DATASET}.{TABLE}`
WHERE SEARCH(content, '검색 대상 키워드');

"""

results = client.query(query)

# Map each matching document id to its text; used as grounding context for
# the Gemini prompt at the end of the notebook.
context = {row['doc_id']: row['content'] for row in results}

context

# Gemini Pro 실행 - BigQuery as a Grounding Service

Responsible AI setting
*   HarmCategory : https://cloud.google.com/vertex-ai/docs/reference/rest/v1/HarmCategory
*   HarmBlockThreshold : https://cloud.google.com/php/docs/reference/cloud-ai-platform/0.31.0/V1.SafetySetting.HarmBlockThreshold

In [None]:
from langchain_google_vertexai import HarmBlockThreshold, HarmCategory

# Blocking threshold per harm category, passed to the Gemini model below.
safety_settings = {
    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
}

*   VertexAI API : https://api.python.langchain.com/en/stable/llms/langchain_google_vertexai.llms.VertexAI.html#langchain_google_vertexai.llms.VertexAI

In [None]:
from langchain_google_vertexai.llms import VertexAI

# Gemini text model client used to answer the grounded prompt below.
gemini_pro = VertexAI(
    model_name=MODEL,
    project=PROJECT_ID,
    location=REGION,
    verbose=True,
    streaming=False,
    # Per-category thresholds defined in the safety settings cell.
    safety_settings=safety_settings,
    # Sampling parameters.
    temperature=0.2,
    top_p=1,
    top_k=40,
)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# The user question to answer (placeholder text: replace with a real question).
query = "질문내용을 넣어주세요."

# Prompt with two slots: the retrieved passages and the user question.
prompt_template = PromptTemplate.from_template("""

  당신은 법률을 상담하는 AI 어시스턴트입니다.
  아래 Question 에 대해서 반드시 Context에 있는 개별 내용을 기반으로 단계적으로 추론해서 근거를 설명하고 답변해주세요.
  Context : {context}
  Question : {question}

  """)

# `context` maps doc_id -> passage text. Feed the model the passages as plain
# text separated by blank lines: interpolating the dict directly would insert
# its repr, i.e. the doc-id keys plus escaped newlines/quotes inside each passage.
context_text = "\n\n".join(str(passage) for passage in context.values())

# `prompt` ends up as the fully rendered prompt string.
prompt = prompt_template.format(context=context_text,
                                question=query)

print(f"Prompt : {prompt}")
print(f"Answer : {gemini_pro.invoke(prompt)}")
