In [None]:
import os
import time
from pathlib import Path

import dotenv
from genai import Credentials
from genai.extensions.langchain import LangChainInterface
from genai.model import GenerateParams
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Milvus

In [None]:
# Splitter settings; both values are passed to RecursiveCharacterTextSplitter.
chunk_size = 2_000
chunk_overlap = 100

In [None]:
# Pull variables from a local .env file into the process environment first,
# so the lookups below can see them.
dotenv.load_dotenv()

# GENAI credentials; either value is None when its variable is not set.
api_key = os.environ.get("GENAI_KEY")
api_endpoint = os.environ.get("GENAI_API")
creds = Credentials(api_key, api_endpoint)

# Generation settings used for every LLM call in this notebook.
params = GenerateParams(
    decoding_method="greedy",
    min_new_tokens=15,
    max_new_tokens=300,
    repetition_penalty=2,
)

# Splitter applied to the loaded PDF documents before they are embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
# LangChain-compatible wrapper around the hosted model, using the credentials
# and generation parameters configured above.
llm = LangChainInterface(
    model="meta-llama/llama-2-13b",
    credentials=creds,
    params=params,
)

In [None]:
# Embedding model shared by the ingest and query steps.
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")

Ingest the PDFs into the vector database

In [None]:
# Show the current working directory; the relative '../../menu/' paths used
# when loading the PDFs are resolved against it.
!pwd

In [None]:
# Load every PDF found (recursively) under ../../menu/<product>/, tag each
# loaded document with its product name, and split the result into chunks.
docs = []
products = ['apple', 'orange', 'banana']

for product in products:
    # rglob() descends into sub-folders, so load from the matched path itself.
    # Rebuilding the path from `path.name` would point at a non-existent file
    # for any PDF that lives in a nested folder.
    # sorted() keeps the ingest order reproducible across runs.
    for path in sorted(Path('../../menu/' + product).rglob('*.pdf')):
        loader = PyPDFLoader(str(path))
        data = loader.load()
        for doc in data:
            # The product tag is what the query step filters on.
            doc.metadata['product'] = product
        docs += text_splitter.split_documents(data)

# Read the Milvus settings from the environment (.env is loaded first);
# each value is None when its variable is not set.
dotenv.load_dotenv()
COLLECTION_NAME, DIMENSION, COUNT, MAX, MILVUS_HOST, MILVUS_PORT = map(
    os.getenv,
    ("COLLECTION_NAME", "DIMENSION", "COUNT", "MAX", "MILVUS_HOST", "MILVUS_PORT"),
)

# Build the Milvus vector store from the split documents.
db = Milvus.from_documents(
    docs,
    embeddings,
    collection_name=COLLECTION_NAME,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
)

Query the vector database

In [None]:
# Read the Milvus settings from the environment (.env is loaded first);
# each value is None when its variable is not set.
dotenv.load_dotenv()
COLLECTION_NAME, DIMENSION, COUNT, MAX, MILVUS_HOST, MILVUS_PORT = map(
    os.getenv,
    ("COLLECTION_NAME", "DIMENSION", "COUNT", "MAX", "MILVUS_HOST", "MILVUS_PORT"),
)

# Open the collection for querying; drop_old=False keeps whatever is already
# stored in it.
db = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
    drop_old=False,
)

In [None]:
# Read the questions, one per line. The `with` block guarantees the file is
# closed even if reading fails. Each entry keeps its trailing newline, exactly
# as readlines() returns it.
with open("sample.txt", "r") as file:
    questions = file.readlines()

print(questions)

In [None]:
# Answer every question with a "stuff" QA chain over the 3 most similar chunks
# retrieved from Milvus for the selected product, then report the total time.
chain = load_qa_chain(llm, chain_type="stuff")

product = "apple"

# perf_counter() is monotonic, so the measured duration cannot go negative or
# jump if the system clock is adjusted mid-run.
start = time.perf_counter()

for query in questions:
    print("Q:"+query)
    # Restrict retrieval to chunks tagged with the chosen product.
    resultdocs = db.similarity_search(query, k=3, filter={'product': product})
    answer = chain.run(input_documents=resultdocs, question=query)
    print("A:"+answer)

# Take the end timestamp once, after the loop, so it is defined even when
# `questions` is empty.
end = time.perf_counter()

print("Duration: ", end - start, "Count: ", len(questions))