Copyright 2024 Forusone : shins777@gmail.com

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

## BigQuery SEARCH Function for Keyword search

* Google's BigQuery provides a SEARCH function that supports keyword search in addition to vector-DB-integrated semantic search.
* By combining the two, you can expect excellent search performance from semantic search plus keyword matching.

* Reference :
  * https://cloud.google.com/bigquery/docs/search-intro?hl=ko
  * https://cloud.google.com/bigquery/docs/search?hl=ko

In [None]:
# Install/upgrade the Vertex AI LangChain integration and the Google Cloud client libraries.
# NOTE(review): later cells also import `langchain` and `langchain_community`, which are not
# listed here — confirm they are already available in the target environment.
%pip install --upgrade --quiet langchain-google-vertexai google-cloud-aiplatform google-cloud-bigquery

### GCP 사용자 인증 / 환경설정

아래 URL의 GCP 인증 방법을 참고하여 GCP에 접근하는 환경을 구성해야 합니다.
* https://cloud.google.com/docs/authentication?hl=ko
* 자세한 정보는 [README.md](https://github.com/shins777/google_gen_ai_sample/blob/main/notebook/gemini/README.md) 파일을 참고하세요.

In [None]:
# Run this cell only in Colab; it does not work in other environments.
# The sys.modules check makes the cell a no-op when google.colab is not loaded.
import sys
if "google.colab" in sys.modules:
    from google.colab import auth
    # Authenticate the current Colab user.
    auth.authenticate_user()

### GCP 프로젝트 및 리전 설정
본인의 GCP 환경에 맞게 아래 설정을 구성하세요.  
* 구글의 최신버전인 gemini pro 사용을 권고드립니다.   
* 만일, 기본 버전 text bison 을 사용하려한다면, 참조하는 class 가 다르므로 주의하세요.  
* 현재 Gemini는 한국리전(asia-northeast3)을 통해서 접근이 가능합니다.
* 아래 SEARCH_URL은 Vertex AI Search를 구성한 후 Integration 메뉴 항목에서 추출한 값입니다.

In [None]:
# Notebook-wide settings read by the later cells.
# PROJECT_ID is a placeholder: replace it with your own GCP project id.
PROJECT_ID="PROJECT_ID"
# Region passed to the BigQuery client, the vector store and the Vertex AI model.
REGION="asia-northeast3"
# Vertex AI model name passed to the VertexAI LLM wrapper.
MODEL = "gemini-pro"

# Set the active gcloud project, then print it back to confirm the setting.
!gcloud config set project {PROJECT_ID}
!gcloud config get-value project

### BigQuery dataset 구성


In [None]:
from google.cloud import bigquery

# Names of the BigQuery dataset and table used by the rest of the notebook.
DATASET = "search_dataset"
TABLE = "search_table"

# BigQuery client bound to the configured project and region.
client = bigquery.Client(location=REGION, project=PROJECT_ID)

# exists_ok=True keeps this cell re-runnable when the dataset already exists.
client.create_dataset(DATASET, exists_ok=True)

### Embedding Model & Table 구성

BigQuery SEARCH 기능은 Embedding을 저장한 테이블의 Context 부분을 검색하는 목적으로 사용합니다.


In [None]:
from langchain_google_vertexai import VertexAIEmbeddings

# Vertex AI text-embedding model identifier.
# (The constant keeps its original spelling because other cells may reference it.)
EBEDDING_MODEL = "textembedding-gecko-multilingual@latest"

# Embedding client handed to the BigQuery vector store in a later cell.
embedding = VertexAIEmbeddings(project=PROJECT_ID, model_name=EBEDDING_MODEL)

In [None]:
# DistanceStrategy is imported from langchain_community, the same package that
# provides BigQueryVectorSearch, instead of the legacy `langchain.vectorstores`
# re-export path.
# Reference: https://api.python.langchain.com/en/stable/vectorstores/langchain_community.vectorstores.utils.DistanceStrategy.html#langchain_community.vectorstores.utils.DistanceStrategy
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.vectorstores import BigQueryVectorSearch

# Vector store backed by `{PROJECT_ID}.{DATASET}.{TABLE}`; texts added through
# it are embedded with `embedding` and compared using cosine distance.
table = BigQueryVectorSearch(
    project_id=PROJECT_ID,
    dataset_name=DATASET,
    table_name=TABLE,
    location=REGION,
    embedding=embedding,
    distance_strategy=DistanceStrategy.COSINE,
)

### 테이블에 데이터 저장

In [None]:
import pandas as pd

# Load the pipe-delimited source documents from the working directory.
terms = pd.read_csv("./term1.csv", delimiter="|", encoding="utf-8-sig")

# Preview the first rows as the cell output.
terms.head(5)

In [None]:
import json

# Collect the `context` column of the loaded CSV as a plain Python list.
all_texts = terms['context'].tolist()

# Store the texts in the BigQuery-backed vector store.
# Optional: to keep a title with each row, pass
#   metadatas=[{'context_title': row['context_title']} for idx, row in terms.iterrows()]
# as the `metadatas` argument of add_texts.
table.add_texts(all_texts)

### SEARCH 를 위한 Search index 생성

In [None]:
# Build the DDL that creates a search index on the vector-store table.
# The indexed column is `content`: that is the column the search cell reads
# back as row['content'], whereas `context` is only the column name in the
# source CSV and is not a column of this table.
# NOTE: this cell only prints the statement; run the printed DDL in BigQuery
# to actually create the index.
query = f"CREATE SEARCH INDEX search_index ON `{PROJECT_ID}.{DATASET}.{TABLE}`(content);"
print(query)

In [None]:
# Keyword search over the text stored in the vector-store table.
# The table is addressed through PROJECT_ID / DATASET / TABLE so the query
# follows the notebook configuration instead of a hard-coded project, and it
# searches the `content` column, which is the column read from each row below.
query = f"""
SELECT *
FROM `{PROJECT_ID}.{DATASET}.{TABLE}`
WHERE SEARCH(content, '검색 대상 키워드');

"""

results = client.query(query)

# Map each matching document id to its text; this dict is injected into the
# LLM prompt as grounding context in a later cell.
context = {row['doc_id']: row['content'] for row in results}

context

### Gemini Pro 실행 - BigQuery as a Grounding Service

*   VertexAI API : https://api.python.langchain.com/en/stable/llms/langchain_google_vertexai.llms.VertexAI.html#langchain_google_vertexai.llms.VertexAI

In [None]:
from langchain_google_vertexai.llms import VertexAI

# LLM client for the model, project and region configured at the top of the notebook.
gemini_pro = VertexAI(
    model_name=MODEL,
    project=PROJECT_ID,
    location=REGION,
    # Sampling parameters.
    temperature=0.2,
    top_p=1,
    top_k=40,
    # Runtime options.
    streaming=False,
    verbose=True,
)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Question to ask the model (placeholder text; replace with a real question).
query = "질문내용을 넣어주세요."

# Create the template and fill it in a single expression. `context` is the
# {doc_id: content} dict built by the keyword search cell and `query` is the
# question above; the result is the final prompt string.
prompt = PromptTemplate.from_template("""

  당신은 법률을 상담하는 AI 어시스턴트입니다.
  아래 Question 에 대해서 반드시 Context에 있는 개별 내용을 기반으로 단계적으로 추론해서 근거를 설명하고 답변해주세요.
  Context : {context}
  Question : {question}

  """).format(context=context, question=query)

# Show the exact prompt first, then the model's answer to it.
print(f"Prompt : {prompt}")
print(f"Answer : {gemini_pro.invoke(prompt)}")
