In [62]:
import os
import time

from pymilvus import connections
from pymilvus import FieldSchema
from pymilvus import CollectionSchema
from pymilvus import DataType
from pymilvus import Collection
from pymilvus import utility

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

from dotenv import load_dotenv
import genai.extensions.langchain
from genai.extensions.langchain import LangChainInterface
from genai.schemas import GenerateParams
from genai import Credentials
from genai import Model
from genai import PromptPattern

from langchain.vectorstores import FAISS

In [None]:
load_dotenv()
api_key = os.getenv("GENAI_KEY", None)
api_endpoint = os.getenv("GENAI_API", None)
COLLECTION_NAME = os.getenv("COLLECTION_NAME", None)
EMBEDDING_DIMENSION = os.getenv("EMBEDDING_DIMENSION", None)
EMBEDDING_COUNT = os.getenv("EMBEDDING_COUNT", None)
MILVUS_HOST = os.getenv("MILVUS_HOST", None)
MILVUS_PORT = os.getenv("MILVUS_PORT", None)

In [64]:
chunk_size = 1000
chunk_overlap = 150
separator = "\n"

r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator=separator)

In [37]:
COLLECTION_NAME = "menu_db"
DIMENSION = 768
COUNT = 11127
MILVUS_HOST = "52.116.143.197"
MILVUS_PORT = "19530"

In [38]:
load_dotenv()
api_key = os.getenv("GENAI_KEY", None)
creds = Credentials(api_key)
params = GenerateParams(
    decoding_method = "greedy"
)

In [57]:
connections.connect(host = MILVUS_HOST, port = MILVUS_PORT)

# if utility.has_collection(COLLECTION_NAME):
#    utility.drop_collection(COLLECTION_NAME)

fields = [
    FieldSchema(name = "id", dtype = DataType.INT64, description = "Ids", is_primary = True, auto_id = False),
    FieldSchema(name = "menu", dtype = DataType.VARCHAR, description = "menu texts", max_length = 3000),
    FieldSchema(name = "embedding", dtype = DataType.FLOAT_VECTOR, description = "Embedding vectors", dim = DIMENSION)
]
schema = CollectionSchema(fields = fields, description = "menu collection")
collection = Collection(name = COLLECTION_NAME, schema = schema)

index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 1024}
}
collection.create_index(field_name = "embedding", index_params = index_params)

Status(code=0, message=)

In [58]:
embeddings = HuggingFaceEmbeddings()

In [72]:
def loadPDF(filename):
    loader = PyPDFLoader(filename)
    pages = loader.load()

    len(pages)

    # page= pages[10]

    # page.metadata

    # print(page.page_content[:500])

    docs = r_splitter.split_documents(pages)

    print(docs[1].page_content)
    print(docs[1].metadata)
    len(docs)
    return docs

In [73]:
def storeToMilvus(docs):
    start = time.time()
    data = [[], [], []]
    if docs:
        for idx, text in enumerate(docs):
            data[0].append(idx)
            data[1].append(text.page_content)
            if len(text.page_content) > 768:
                tt = text.page_content[:766] + ".."
                data[2].append(embeddings.embed_query(tt))
            else:
                data[2].append(embeddings.embed_query(text.page_content))

    collection.insert(data)
    end = time.time()
    print("Duration: ", end - start)

In [74]:
from pathlib import Path

for path in Path('menu').rglob('*.pdf'):
    print(os.curdir + '/menu/'+ path.name)
    docs = loadPDF(os.curdir + '/menu/'+ path.name)
    storeToMilvus(docs)

# print("Number of entities: ", collection.num_entities)

./menu/TMO_SM-N970U_SM-N975U_EN_UM_P_9.0_081219_FINAL.pdf
Table ofcontents
Features 1
S Pen 1
Wireless PowerShare 1
Meet Bixby 1
Camera 1
Mobile continuity 1
Cinematic infinity display 1
Getting started 2
Galaxy Note10 3
Galaxy Note10+ 4
Assemble your device 5
Charge the battery 6
Wireless PowerShare 7
Accessories 8
Start using your device 8
Use the Setup Wizard 8
Transfer data from an old device 9
Lock or unlock your device 10
i TMO_N970U_N975U_EN_UM_TN_SGO_081219_FINAL
{'source': './menu/TMO_SM-N970U_SM-N975U_EN_UM_P_9.0_081219_FINAL.pdf', 'page': 1}
Duration:  22.229619026184082
./menu/SAM_G991U_G996U_G998U_EN_UM_R_11.0_011421_FINAL_AC.pdf
2 Contents 
Features 
S Pen | Mobile continuity | Bixby | Biometric secur ity | Dark mode 
Getting started 
Device layout: Galaxy S21 5G | Galaxy S21+ 5G | Galaxy S21 Ultr a 5G 
Set up your device: Charge the battery | Wireless power
 sharing 
Start using your device: Turn on your device | Use the Setup Wi
zard | Transfer data f
rom 
an old device

#searching

In [77]:
collection.load()

def search(text, embeddings):
    results = collection.search(
        data = [embeddings.embed_query(text)],
        anns_field = "embedding",
        param = {"metric_type": "L2"},
        limit = 5,
        output_fields = ["menu"]
    )

    ret = []
    for hit in results[0]:
        row = []
        row.extend([hit.id, hit.score, hit.entity.get("menu")])
        ret.append(row)
    return ret

search_terms = ["samsung", "galaxy", "camera","fold","5G"]
for x in search_terms:
    print("Searching for term:", x)
    s1 = time.time()
    for result in search(x, embeddings):
        print(result)
    e1 = time.time()
    print("Search time: ", e1 - s1)

Searching for term: samsung
[324, 0.7903430461883545, 'to ensure it is working properly and is safe for children to use. \nSamsung Electronics America, Inc. \nAddress: \n85 Challenger Road \nRidgefield Park \nNew Jersey 07660 \nPhone: 1-800-SAMSUNG (726-7864) \nInternet: samsung.com \n©2021 Samsung Electronics America, Inc. Samsung, Samsung Galaxy, Multi Window, \nS Pen, Samsung Health, Samsung Pay, and Smart Switch are all trademarks of \nSamsung Electronics Co., Ltd. \nOther marks are the property of their respective owners. \nManufactured under license from Dolby Laboratories. Dolby, Dolby Atmos, and the \ndouble-D symbol are trademarks of Dolby Laboratories.']
[134, 0.8046634197235107, 'Internet\nSamsungInternetisasimple,fast,andreliablewebbrowserforyourdevice.\nExperiencemoresecureWebbrowsingfeatureswithSecretMode,BiometricWeb\nLogin,andContentsBlocker.\nAccess BookmarksR\neload webpage Add to Bookmarks\nNavigateAccess Tabs\nTIP\xa0 \xa0Visitsamsung.com/us/support/owners/app/samsu