In [None]:
import dotenv
import os

from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM


from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Milvus
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import PDFMinerLoader
from langchain import LLMChain
from langchain import PromptTemplate

In [None]:
# Load settings (API_KEY, PROJECT_ID, ...) from a local .env file into the environment.
dotenv.load_dotenv()

api_key = os.environ.get("API_KEY")
project_id = os.environ.get("PROJECT_ID")

# Endpoint URL and API key passed to the watsonx foundation model client.
creds = {"url": "https://us-south.ml.cloud.ibm.com", "apikey": api_key}

# Text-generation parameters passed to the foundation model.
params = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 500,  # alternatives noted: 150 / 200
    GenParams.MIN_NEW_TOKENS: 50,  # alternatives noted: 30 / 50
    GenParams.REPETITION_PENALTY: 1.7,
}

# Chunking configuration for the PDF text.
chunk_size = 500
chunk_overlap = 50

# Only blank lines ("\n\n") are used as split points; the regex separator
# "(?=>\.)" that was noted alongside it stays disabled.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n"],
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
# Embedding function shared by the ingestion and query cells below.
# Previously considered alternatives (disabled): HuggingFaceEmbeddings() and
# HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large").
# NOTE(review): an Instruct wrapper is paired with a plain sentence-transformers
# checkpoint here; confirm this combination is intentional.
embeddings = HuggingFaceInstructEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")

Ingest the PDFs into the vector DB

As the number of documents grows, you will want to filter results to narrow the search scope. You can use metadata to do this: first, tag each document with the corresponding metadata.

Reset the connection

In [None]:
# Read the Milvus connection settings from the environment (.env file).
dotenv.load_dotenv()
COLLECTION_NAME = os.getenv("COLLECTION_NAME", None)
MILVUS_HOST = os.getenv("MILVUS_HOST", None)
MILVUS_PORT = os.getenv("MILVUS_PORT", None)
print(COLLECTION_NAME)
print(MILVUS_HOST)
print(MILVUS_PORT)

from pymilvus import connections, utility

# The first positional argument of connections.connect() is the connection
# *alias*, so this connection is registered under the name COLLECTION_NAME.
connections.connect(COLLECTION_NAME, host=MILVUS_HOST, port=MILVUS_PORT)

# Drop any existing collection so that re-running the ingestion starts clean.
# The utility helpers use the "default" alias unless told otherwise, so the
# alias registered above must be passed explicitly; without it the calls fail
# and the collection is never dropped.
try:
    if utility.has_collection(COLLECTION_NAME, using=COLLECTION_NAME):
        utility.drop_collection(COLLECTION_NAME, using=COLLECTION_NAME)
        print(f"Dropped existing collection: {COLLECTION_NAME}")
except Exception as exc:
    # Best-effort reset: keep the notebook running, but make the failure visible.
    print(f"Could not reset collection {COLLECTION_NAME!r}: {exc}")

In [None]:
# Instantiate the watsonx foundation model with the credentials and generation
# parameters defined earlier, then wrap it for use as a LangChain LLM.
model = Model(
    model_id=ModelTypes.MT0_XXL,
    credentials=creds,
    params=params,
    project_id=project_id,
)
llm = WatsonxLLM(model)

In [None]:
# Read one category per line from category.txt.
# The context manager closes the file handle. Each line is stripped because
# readlines() keeps the trailing newline, which would otherwise end up inside
# the directory paths and metadata built from these names. Blank lines are skipped.
with open("category.txt", "r") as category_file:
    categories = [line.strip() for line in category_file if line.strip()]

print(len(categories))
print(categories)

In [None]:
from pathlib import Path
from dotenv import load_dotenv
import time

# Load every PDF under ../../menu/<category>/, tag each loaded document with
# its category, split it into chunks and store the chunks in Milvus.
docs = []

for raw_category in categories:
    # Category names read with readlines() still carry a trailing newline;
    # strip it so the directory lookup and the metadata value are clean.
    category = raw_category.strip()
    if not category:
        continue
    for pdf_path in Path('../../menu/' + category).rglob('*.pdf'):
        # rglob() is recursive, so use the full matched path; rebuilding the
        # path from pdf_path.name would lose any intermediate sub-directories.
        loader = PDFMinerLoader(str(pdf_path))
        data = loader.load()
        for doc in data:
            # The 'product' metadata field records the category of the document.
            doc.metadata['product'] = category
        docs.extend(text_splitter.split_documents(data))

# Disabled experiment kept for reference: summarise content with an LLMChain.
# prompt_template = "Summarize the message \"{content}\""
# llm_chain = LLMChain(
#     llm=llm,
#     prompt=PromptTemplate.from_template(prompt_template)
# )

# Embed all chunks and insert them into the configured Milvus collection.
db = Milvus.from_documents(
    docs,
    embeddings,
    collection_name=COLLECTION_NAME,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
)

Query the vector DB

In [None]:
# Re-read the Milvus settings so this cell can be run without the ingestion cells.
dotenv.load_dotenv()
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")
MILVUS_HOST = os.environ.get("MILVUS_HOST")
MILVUS_PORT = os.environ.get("MILVUS_PORT")
for setting in (COLLECTION_NAME, MILVUS_HOST, MILVUS_PORT):
    print(setting)

# Attach to the existing collection; drop_old=False keeps its current contents.
milvus_connection = {"host": MILVUS_HOST, "port": MILVUS_PORT}
db = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=milvus_connection,
    drop_old=False,
)

In [None]:
# Read the sample questions, one per line (trailing newlines are kept, as
# returned by readlines()). The context manager closes the file handle.
with open("sample.txt", "r") as sample_file:
    questions = sample_file.readlines()

print(len(questions))
print(questions)

You can apply filtering in `similarity_search`.

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


# Prompt for the retrieval chain: the retrieved context is inserted first,
# followed by the answering instructions and the user's question.
template = """Use the following pieces of context to answer the question at the end. 
{context}
Use three sentences maximum and keep the answer as concise as possible. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Retriever over the Milvus store, requesting the top 3 matches per query.
top3_retriever = db.as_retriever(search_kwargs={'k': 3})

# Retrieval QA chain that also returns the source documents it used.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=top3_retriever,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
)
# Example usage:
#   result = qa_chain({"query": "how to clean the refrigerator?"})
#   result["result"]

In [None]:
import time
import pandas as pd


# "stuff" QA chain: answers a question from the documents passed to it directly.
chain = load_qa_chain(llm=llm, chain_type="stuff")

# Wall-clock start of the evaluation run.
start = time.time()

# Collects one row per evaluated question.
data = pd.DataFrame()

# Hard-coded evaluation prompts. Each entry is a short agent/customer dialogue
# followed by a "Query:" line restating the customer's request.
# Note: this rebinds `questions`, replacing the list read from sample.txt in an
# earlier cell.
questions = [
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] There's noise after installation. Is this normal?
[Agent] What product is it?
[Customer] Dryer
Query: Is it normal for my dryer to make a lot of noise after installation?""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] It seems my clothes aren't drying properly.
[Agent] What product are you using?
[Customer] Dryer
Query: What can I do if my clothes aren't drying properly?""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] The washing machine isn't working. How can I fix it?
[Agent] Can you elaborate so that I can help with troubleshooting?
[Customer] After each wash cycle, I find that the load is still wet.
Query: How can I fix my washing machine? I have a Samsung washer.""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] How can I set the water temperature?
[Agent] Can you provide the product you are using?
[Customer] Washing Machine
Query: How can I set the water temperature? on the washing machine?""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] My step count isn't accurate when I run.
[Agent] What is the product you are using?
[Customer] Galaxy watch6
Query: How do I get my step count to be accurate when I run? I have a Samsung Galaxy watch6.""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] The watch vibration feels too strong.
[Agent] Can you provide the product?
[Customer] Galaxy watch6
Query: How do I turn off the vibration on my Galaxy watch6?""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] What should I do if lose the Buds?
[Agent] Can you tell me the model?
[Customer] Galaxy Buds2 Pro
Query: What should I do if lose the Galaxy Buds2 Pro?""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] I can't hear the tablet's sound well.
[Agent] Which product are you using?
[Customer] Galaxy Note20
Query: How do I turn up the volume on my Galaxy Note20?""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] Can I edit videos taken with the tablet?
[Agent] What is the model?
[Customer] Galaxy Tab A8
Query: Can I edit videos taken with the Galaxy Tab A8?""",
 
"""Dialogue: [Agent] Hello, my name is Lee, how can I help you today?
[Customer] The facial recognition isn't working.
[Agent] What is the product you are using?
[Customer] Galaxy Note20
Query: How do I use facial recognition on my Galaxy Note20?"""]

# Answer every question two ways and collect the results for comparison:
#   A(SS) - "stuff" chain over the score-filtered similarity-search hits
#   A(R)  - RetrievalQA chain using its own retriever
# Hits whose score is not below this threshold are discarded.
# NOTE(review): presumably the score is a distance (lower = closer); confirm
# against the metric configured for the Milvus collection.
SCORE_THRESHOLD = 1

records = []

for query in questions:
    scored_hits = db.similarity_search_with_score(query, k=3)
    indocs = []
    for doc, score in scored_hits:
        print(f"\n{doc}")
        print(f"score:{score}")
        if score < SCORE_THRESHOLD:
            indocs.append(doc)
    print(f"len: {len(indocs)}")

    # Pass the filtered hits as context; an empty list here would make the
    # chain answer without any retrieved context at all.
    answerss = chain.run(input_documents=indocs, question=query)
    print("\n\nQ:"+query)
    print("\nA(SS):"+answerss)

    result = qa_chain({"query": query})
    answerr = result["result"]
    print("A(R):"+answerr)

    # result["source_documents"] holds the retriever's sources if they need
    # to be inspected.
    records.append({'Question': query,
                    'Answer_Similarity_Search': answerss,
                    'Answer_Retriever': answerr})

# Build the frame once instead of concatenating inside the loop, and take the
# end timestamp after the loop so it is defined even when there are no questions.
data = pd.DataFrame(records,
                    columns=['Question', 'Answer_Similarity_Search', 'Answer_Retriever'])
end = time.time()

print("Duration: ", end - start, "Count: ", len(questions))

In [None]:
data