In [None]:
import dotenv
import os

from genai.model import GenerateParams
from genai import Credentials
from genai.extensions.langchain import LangChainInterface

from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Milvus
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import PDFMinerLoader
from langchain import LLMChain
from langchain import PromptTemplate

In [None]:
# Load variables from a local .env file (if present) into the process environment.
dotenv.load_dotenv()

# API key and endpoint for the genai service; either is None when the variable is unset.
api_key = os.getenv("GENAI_KEY", None)
api_endpoint = os.getenv("GENAI_API", None)

# Credentials object passed to the LangChainInterface LLM wrapper later in the notebook.
creds = Credentials(api_key, api_endpoint)

# Generation settings sent along with every model request.
params = GenerateParams(
    decoding_method="greedy",
    temperature=0.05,
    max_new_tokens=300,
    min_new_tokens=15,
    repetition_penalty=2,
)

# Chunking configuration for the PDF text (presumably measured in characters,
# the splitter's default length function -- TODO confirm).
chunk_size = 500
chunk_overlap = 50

# The second separator is written as a raw string: "\." is not a valid Python
# escape sequence, so the non-raw form triggers an invalid-escape warning on
# recent Python versions. The string value itself is unchanged.
# NOTE(review): "(?=>\.)" looks like a mistyped lookbehind "(?<=\.)" (split
# after a full stop) -- confirm the intended pattern before changing it.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", r"(?=>\.)"],
)

In [None]:
# Embedding model shared by the ingestion step and the query-side Milvus handle.
# An earlier experiment used model_name="hkunlp/instructor-large" instead.
# NOTE(review): a sentence-transformers checkpoint is loaded through the
# Instructor embeddings wrapper here -- confirm this pairing is intentional.
embeddings = HuggingFaceInstructEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")

Ingest the PDFs into the vector DB

As the number of documents grows, you will want to filter searches to narrow their scope. Metadata lets you do that, but first each document has to be tagged with the corresponding metadata.

Reset the connection

In [None]:
from pymilvus import connections, utility

# Read the Milvus settings from the environment (after loading .env) and echo
# them so a misconfigured environment is visible in the cell output.
dotenv.load_dotenv()
COLLECTION_NAME = os.getenv("COLLECTION_NAME", None)
MILVUS_HOST = os.getenv("MILVUS_HOST", None)
MILVUS_PORT = os.getenv("MILVUS_PORT", None)
print(COLLECTION_NAME)
print(MILVUS_HOST)
print(MILVUS_PORT)

# The first positional argument of connect() is the connection *alias*; here
# the collection name doubles as that alias.
connections.connect(COLLECTION_NAME, host=MILVUS_HOST, port=MILVUS_PORT)
try:
    # The utility helpers look up the "default" alias unless told otherwise,
    # so they must be pointed at the alias registered above. Without `using=`
    # the lookup fails and the old collection is never dropped.
    if utility.has_collection(COLLECTION_NAME, using=COLLECTION_NAME):
        utility.drop_collection(COLLECTION_NAME, using=COLLECTION_NAME)
except Exception as exc:
    # Dropping the old collection stays best-effort, but the failure is
    # reported instead of being silently swallowed.
    print(f"Could not reset collection {COLLECTION_NAME!r}: {exc}")

In [None]:
# LangChain-compatible LLM for bigscience/mt0-xxl, built from the credentials
# and generation parameters defined earlier in the notebook.
llm = LangChainInterface(
    model="bigscience/mt0-xxl",
    credentials=creds,
    params=params,
)

In [None]:
# One category per line. The names are later concatenated into directory
# paths and stored as metadata, so surrounding whitespace (including the
# trailing newline that readlines() would keep) is stripped and blank lines
# are skipped. The `with` block closes the file handle.
with open("category.txt", "r") as file:
    categories = [line.strip() for line in file if line.strip()]

print(len(categories))
print(categories)

In [None]:
from pathlib import Path
from dotenv import load_dotenv
import time

# Root folder holding one sub-folder of PDFs per category.
MENU_DIR = Path('../../menu')

docs = []

for category in categories:
    # Category names may still carry whitespace or a trailing newline from the
    # text file; a dirty name would point at a non-existent directory.
    category = category.strip()
    if not category:
        continue
    for path in (MENU_DIR / category).rglob('*.pdf'):
        # rglob() is recursive, so load via the full matched path; rebuilding
        # it from path.name would lose any intermediate sub-directories.
        loader = PDFMinerLoader(str(path))
        pages = loader.load()
        # Tag every loaded document with its category so searches can later be
        # filtered by product.
        for page in pages:
            page.metadata['product'] = category
        docs.extend(text_splitter.split_documents(pages))

# Embed all chunks and store them in the Milvus collection.
db = Milvus.from_documents(
    docs,
    embeddings,
    collection_name=COLLECTION_NAME,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
)

Query the vector DB

In [None]:
# Re-read the Milvus connection settings from the environment and echo them.
dotenv.load_dotenv()
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")
MILVUS_HOST = os.environ.get("MILVUS_HOST")
MILVUS_PORT = os.environ.get("MILVUS_PORT")
print(COLLECTION_NAME, MILVUS_HOST, MILVUS_PORT, sep="\n")

# Vector-store handle for the collection; drop_old=False is passed so an
# existing collection is not dropped.
db = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
    drop_old=False,
)

In [None]:
# One question per line. Surrounding whitespace (including the trailing
# newline that readlines() would keep) is stripped so it does not leak into
# the queries or the result table, and blank lines are skipped. The `with`
# block closes the file handle.
with open("sample.txt", "r") as file:
    questions = [line.strip() for line in file if line.strip()]

print(len(questions))
print(questions)

You can also apply filtering in `similarity_search`.

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


# Prompt used by the retrieval chain. The trailing space before each "\n" on
# the first three lines is part of the prompt text and is kept on purpose.
template = (
    "Use the following pieces of context to answer the question at the end. \n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. \n"
    "Use three sentences maximum and keep the answer as concise as possible. \n"
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Retrieval QA chain: the vector store's retriever supplies the context, the
# custom prompt is forwarded to the underlying chain, and the source documents
# are returned alongside the answer.
# Example call: qa_chain({"query": "<your question>"})["result"]
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    retriever=db.as_retriever(),
)

In [None]:
import time
import pandas as pd


# "stuff" QA chain that answers from an explicitly supplied list of documents.
chain = load_qa_chain(llm, chain_type="stuff")

start = time.time()

# One dict per question; the DataFrame is built once after the loop instead of
# being re-concatenated on every iteration.
records = []

for query in questions:
    print("Q:"+query)

    # Path 1: explicit similarity search (top 3 chunks) fed into the stuff chain.
    resultdocs = db.similarity_search(query, k=3)
    for doc in resultdocs:
        print(doc)
    answerss = chain.run(input_documents=resultdocs, question=query)
    print("A(SS):"+answerss)

    # Path 2: the RetrievalQA chain, which does its own retrieval.
    result = qa_chain({"query": query})
    answerr = result["result"]
    print("A(R):"+answerr)

    records.append({'Question': query,
                    'Answer_Similarity_Search': answerss,
                    'Answer_Retriever': answerr,
                    })

# Taken after the loop so `end` is defined even when `questions` is empty.
end = time.time()

data = pd.DataFrame(records)

print("Duration: ", end - start, "Count: ", len(questions))

In [None]:
# Display the table of questions with the answers from both approaches.
data