Using LangChain's vector store plugin for the SAP HANA Vector Engine
to store embeddings generated by AI Core.

Prerequisites:
- langchain >= 0.1.4
- generative-ai-hub-sdk 1.2.0
- OpenAI `text-embedding-ada-002` (embeddings) and `gpt-35-turbo` (chat) deployments on AI Core

See:<br>
https://pypi.org/project/generative-ai-hub-sdk/<br>
https://github.wdf.sap.corp/AI/generative-ai-hub-sdk/blob/main/docs/gen_ai_hub/examples/gen_ai_hub.ipynb<br>
https://python.langchain.com/docs/integrations/vectorstores/sap_hanavector<br>


In [1]:
import os
from getpass import getpass
from importlib import metadata as importlib_metadata

import langchain
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

import langchain_community
from langchain_community.document_loaders import TextLoader, PyPDFLoader, PyPDFDirectoryLoader, SitemapLoader
from langchain_community.vectorstores.hanavector import HanaDB

from gen_ai_hub.proxy.langchain.openai import OpenAIEmbeddings
import nest_asyncio

# Patch asyncio so an already-running event loop (as in a notebook kernel)
# can be re-entered by libraries that start their own loop.
nest_asyncio.apply()

# Record the library versions this notebook was executed with.
print('langchain version:', langchain.__version__)
print('langchain_community version:', langchain_community.__version__)

# The SDK version is read from the installed distribution's metadata
# (PyPI distribution name: generative-ai-hub-sdk).
try:
    print('generative-ai-hub-sdk version:', importlib_metadata.version('generative-ai-hub-sdk'))
except importlib_metadata.PackageNotFoundError:
    # Keep the cell runnable even if the package was installed under another name.
    print('generative-ai-hub-sdk version: unknown (distribution metadata not found)')

langchain version: 0.1.6
langchain_community version: 0.0.19


In [2]:
# Step 1: load every PDF found in the data folder.
filePath = "./data/PDFs"
loader = PyPDFDirectoryLoader(filePath)
documents = loader.load()

# Step 2: cut the loaded documents into chunks of up to 500 characters with a
# 10-character overlap, trying the separators in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=10,
    chunk_size=500,
)
text_chunks = text_splitter.split_documents(documents)
print(f"Number of document chunks: {len(text_chunks)}")

# Step 3: embedding client backed by the AI Core deployment of the ada model.
embeddings = OpenAIEmbeddings(proxy_model_name='text-embedding-ada-002')


Number of document chunks: 2818


In [3]:
# Open a connection to SAP HANA Cloud using hana-ml.
#
# Connection details are read from the environment so that no credentials are
# stored in the notebook:
#   HANA_ADDRESS   host name        (falls back to the placeholder below)
#   HANA_PORT      SQL port         (default '443')
#   HANA_USER      database user    (falls back to the placeholder below)
#   HANA_PASSWORD  password         (prompted for interactively when unset)
from hana_ml import ConnectionContext

# Alternative: connect through a secure user store key instead.
# cc = ConnectionContext(userkey='VDB_BETA', encrypt=True)
cc = ConnectionContext(
    address=os.environ.get('HANA_ADDRESS', '[somehost].hanacloud.ondemand.com'),
    port=os.environ.get('HANA_PORT', '443'),
    user=os.environ.get('HANA_USER', '[your user]'),
    # getpass keeps the password out of the notebook source and its outputs.
    password=os.environ.get('HANA_PASSWORD') or getpass('HANA password: '),
    encrypt=True,
)

# Underlying DB-API connection; this is what the vector store is given later.
connection = cc.connection

# Sanity check: show the server version and the schema the session works in.
print(cc.hana_version())
print(cc.get_current_schema())

4.00.000.00.1710236938 (fa/CE2024.2)
YAMAHA_USER


In [4]:
# Vector store on top of the open HANA connection.
# The table is created if it does not exist yet.
table = "PDF_SAMPLE_"
db = HanaDB(
    connection=connection,
    embedding=embeddings,
    table_name=table,
)

In [5]:
# Optional reset: uncomment the next line to remove the documents that are
# already stored in the table before loading again.
# db.delete(filter={})

# Store the document chunks in the vector table.
db.add_documents(documents=text_chunks)

[]

In [10]:
# Take a look at the table the vector store writes to.
# TO_NVARCHAR turns the vector column into text so it can be fetched and shown.
hdf = cc.sql(''' SELECT "VEC_TEXT", "VEC_META", TO_NVARCHAR("VEC_VECTOR") AS "VEC_VECTOR" FROM "{table}" '''.format(table=table))

# Total number of stored rows (runs a COUNT on the database side).
print(hdf.count())

# Fetch only the first five rows to the client and render them as the cell output.
df = hdf.head(5).collect()
df



2818

In [64]:
# Candidate questions for the retrieval step.
# In the original cell every `query = ...` line was commented out, so `query`
# was undefined on a fresh kernel; exactly one candidate is bound below.
SAMPLE_QUERIES = [
    "How to use Actions in SAP Build Process Automation?",
    "What is the process of configuring API triggers?",
    "When to use an automation and when to use a process in SBPA?",
    "What is the usage of visibility scenarios?",
    "Is it possible to use SAP Build Workzone with processes? How?",
    "How to use SAPUI5 forms?",
    "What are the different roles available to access SAP Build process automation?",
    "What are the versions of Desktop Agent?",
    "What kind of interactions does the screen record capture?",
    "What is the Expression Editor?",
    "How to use SAP HANA Vector DB?",
]

# Change the index to try another question.
# NOTE(review): the last candidate is picked here; confirm which question the
# saved answer at the end of the notebook was produced with.
query = SAMPLE_QUERIES[-1]

# Retrieve the 20 stored chunks most similar to the question.
docs = db.similarity_search(query, k=20)

# Concatenate the chunk texts, each followed by a single space, into one
# context string for the prompt.
context = "".join(doc.page_content + " " for doc in docs)

In [65]:
# Prompt text with two placeholders, {context} and {query}, filled in later.
# Written line by line; the resulting string starts and ends with a newline.
promptTemplate_fstring = (
    "\n"
    "You are an Analyzing Given context.\n"
    "You are provided multiple context items that are related to the prompt you have to answer.\n"
    "Use the following pieces of context to answer the question at the end. If the query is not part of the given context, reply 'not under my scope'\n"
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question:\n"
    "{query}\n"
)

In [66]:

from langchain.prompts import PromptTemplate

# Wrap the raw template text in a PromptTemplate object.
promptTemplate = PromptTemplate.from_template(template=promptTemplate_fstring)
 

In [67]:
from gen_ai_hub.proxy.langchain import ChatOpenAI

# Chat model served through the AI Core proxy, called with temperature 0.
llm = ChatOpenAI(proxy_model_name='gpt-35-turbo', temperature=0)

# Fill the template with the question and the retrieved context.
prompt = promptTemplate.format(query=query, context=context)

# `predict` is deprecated in recent langchain-core releases; `invoke` is the
# supported entry point. For chat models it returns a message object whose
# `.content` holds the reply text, so `response` is still a plain string.
response = llm.invoke(prompt).content

print(response)

not under my scope
