In [1]:
import os
import time

from pymilvus import connections
from pymilvus import FieldSchema
from pymilvus import CollectionSchema
from pymilvus import DataType
from pymilvus import Collection
from pymilvus import utility

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

from dotenv import load_dotenv
import genai.extensions.langchain
from genai.extensions.langchain import LangChainInterface
from genai.schemas import GenerateParams
from genai import Credentials
from genai import Model
from genai import PromptPattern

from langchain.vectorstores import FAISS

In [2]:
# Pull variables from a local .env file into the process environment, then
# read the settings this notebook uses. Each value is a string when the
# variable is set and None when it is missing.
load_dotenv()

# genai service access
api_key = os.environ.get("GENAI_KEY")
api_endpoint = os.environ.get("GENAI_API")

# Milvus collection and embedding settings
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")
EMBEDDING_DIMENSION = os.environ.get("EMBEDDING_DIMENSION")
EMBEDDING_COUNT = os.environ.get("EMBEDDING_COUNT")
EMBEDDING_MAX = os.environ.get("EMBEDDING_MAX")

# Milvus server location
MILVUS_HOST = os.environ.get("MILVUS_HOST")
MILVUS_PORT = os.environ.get("MILVUS_PORT")

In [3]:
# Chunking settings shared by both text splitters.
chunk_size = 1000
chunk_overlap = 150
separator = "\n"

_chunking = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}

# Recursive splitter: no separators are passed, so the library defaults apply.
r_splitter = RecursiveCharacterTextSplitter(**_chunking)
# Plain character splitter: splits on the single separator defined above.
c_splitter = CharacterTextSplitter(separator=separator, **_chunking)

In [4]:
# Read the API key from the environment again (the .env file is loaded first)
# and build the genai credentials from it.
load_dotenv()
api_key = os.environ.get("GENAI_KEY")
creds = Credentials(api_key)

# Generation settings: greedy decoding.
params = GenerateParams(decoding_method="greedy")

In [6]:
# Connect to Milvus and rebuild the collection from scratch.
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

# Settings read from the environment are strings (or None when unset), while
# `dim` and `max_length` are integer type parameters. Validate and convert
# them BEFORE dropping anything, so a configuration mistake cannot destroy an
# existing collection.
if EMBEDDING_DIMENSION is None or EMBEDDING_MAX is None:
    raise ValueError(
        "EMBEDDING_DIMENSION and EMBEDDING_MAX must be set in the environment"
    )
embedding_dim = int(EMBEDDING_DIMENSION)
content_max_length = int(EMBEDDING_MAX)

# NOTE: destructive - any existing collection with this name is removed.
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

# Schema: caller-supplied integer id, the chunk text, and its embedding vector.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, description="Ids", is_primary=True, auto_id=False),
    FieldSchema(name="content", dtype=DataType.VARCHAR, description="Content texts", max_length=content_max_length),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, description="Embedding vectors", dim=embedding_dim),
]
schema = CollectionSchema(fields=fields, description="content collection")
collection = Collection(name=COLLECTION_NAME, schema=schema)

# IVF_FLAT index with L2 distance on the vector field.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 1024},
}
collection.create_index(field_name="embedding", index_params=index_params)

Status(code=0, message=)

In [7]:
# Embedding model used both when storing chunks and when searching. No model
# name is passed, so the library's default model is used. Its vectors are
# inserted into the "embedding" field, so their length has to match the `dim`
# that field was created with (EMBEDDING_DIMENSION).
embeddings = HuggingFaceEmbeddings()

In [8]:
def loadPDF(filename):
    """Load a PDF and split it into chunks with the shared recursive splitter.

    Args:
        filename: Path of the PDF file, passed to ``PyPDFLoader``.

    Returns:
        The list of chunk documents returned by
        ``r_splitter.split_documents``; it may be empty.
    """
    pages = PyPDFLoader(filename).load()
    docs = r_splitter.split_documents(pages)

    # Preview the second chunk as a quick sanity check, but only when it
    # exists: a short PDF can produce fewer than two chunks, and indexing
    # docs[1] unconditionally would raise IndexError.
    if len(docs) > 1:
        print(docs[1].page_content)
        print(docs[1].metadata)
    return docs

In [9]:
def storeToMilvus(docs):
    """Embed chunk documents and insert them into the Milvus ``collection``.

    Each chunk is stored as (id, full text, embedding). The text sent to the
    embedding model is capped at 768 characters (766 characters plus ".."),
    while the full text is what gets stored in the "content" field.

    Args:
        docs: Sequence of documents exposing ``page_content``. When empty or
            None, nothing is inserted and the function returns immediately.

    Returns:
        None. Prints the elapsed time of the embed + insert + flush.
    """
    if not docs:
        # Nothing to store; avoid sending empty columns to Milvus.
        return

    start = time.time()
    max_embed_chars = 768

    # The primary key is supplied by us (auto_id=False). Numbering every
    # document from 0 would reuse the same ids for each PDF, so continue after
    # the entities already stored. This relies on the flush() below having
    # made the previous inserts visible to num_entities.
    # NOTE(review): assumes entities are never deleted from the collection.
    first_id = collection.num_entities

    texts = [doc.page_content for doc in docs]
    ids = list(range(first_id, first_id + len(texts)))

    # Cap the text passed to the embedding model; longer chunks are cut and
    # marked with "..".
    embed_inputs = [
        text[:max_embed_chars - 2] + ".." if len(text) > max_embed_chars else text
        for text in texts
    ]
    # One batched call instead of one model call per chunk.
    vectors = embeddings.embed_documents(embed_inputs)

    collection.insert([ids, texts, vectors])
    # Seal the inserted rows so the next call's num_entities includes them.
    collection.flush()

    end = time.time()
    print("Duration: ", end - start)

In [10]:
from pathlib import Path

# Ingest every PDF under ./content, including sub-directories.
# rglob() recurses, so use each match's own relative path: rebuilding the path
# as "./content/<name>" would point at the wrong location for nested files.
# Sorting makes the ingestion order (and therefore the ids) reproducible.
for path in sorted(Path('content').rglob('*.pdf')):
    pdf_path = os.path.join(os.curdir, str(path))
    print(pdf_path)
    docs = loadPDF(pdf_path)
    storeToMilvus(docs)

# print("Number of entities: ", collection.num_entities)

#searching

In [11]:
# The collection is loaded before any search is issued against it.
collection.load()


def search(text, embeddings, limit=5):
    """Return the stored chunks closest to ``text``.

    Args:
        text: Query string; it is embedded with ``embeddings.embed_query``.
        embeddings: Object providing ``embed_query(text)``.
        limit: Maximum number of hits to return (defaults to 5).

    Returns:
        A list of ``[id, score, content]`` rows, one per hit, in the order
        Milvus returned them.
    """
    results = collection.search(
        data=[embeddings.embed_query(text)],
        anns_field="embedding",
        param={"metric_type": "L2"},
        limit=limit,
        output_fields=["content"],
    )

    # A single query vector was sent, so only the first result set is relevant.
    return [[hit.id, hit.score, hit.entity.get("content")] for hit in results[0]]

# Run a few sample queries, printing every hit row and the wall-clock time
# taken per query (search plus printing).
search_terms = ["camera", "phone", "5G"]
for term in search_terms:
    print("Searching for term:", term)
    started = time.time()
    hit_rows = search(term, embeddings)
    for hit_row in hit_rows:
        print(hit_row)
    finished = time.time()
    print("Search time: ", finished - started)

Searching for term: camera
Search time:  0.2746272087097168
Searching for term: phone
Search time:  0.2411642074584961
Searching for term: 5G
Search time:  0.2411508560180664
