In [1]:
import os
from qdrant_client import QdrantClient
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client.http.models import Distance, VectorParams
from langchain_cohere import CohereEmbeddings
from langchain.schema import Document

In [2]:
# Load variables from a local .env file into the process environment.
# Later cells read their API keys, Qdrant host and collection name from os.environ.
from dotenv import load_dotenv
load_dotenv()

In [84]:
# read the document
def read_doc(directory):
    """Load the PDF files found in ``directory``.

    Args:
        directory: Path passed straight to ``PyPDFDirectoryLoader``.

    Returns:
        The result of ``PyPDFDirectoryLoader(directory).load()``, i.e. the
        loaded documents.
    """
    return PyPDFDirectoryLoader(directory).load()

In [None]:
# Load the PDFs under documents/ (relative to the notebook's working directory).
doc=read_doc('documents/')
# Number of loaded documents -- presumably one per PDF page; confirm against the loader.
len(doc)

In [86]:
# Divide the docs into chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)

In [3]:
 # Split the loaded PDFs into chunks using chunk_data's default size and overlap.
 documents=chunk_data(docs=doc)
 # Show the chunk count and a short preview instead of dumping every chunk into the output.
 len(documents), documents[:3]


In [6]:
# Cohere embedding model; the API key is read from the environment rather than hardcoded,
# so a missing COHERE_API_KEY raises KeyError here.
embeddings_model = CohereEmbeddings(
    model='embed-english-v3.0',
    cohere_api_key=os.environ['COHERE_API_KEY'],
)
embeddings_model

In [9]:
# Optional sanity checks, currently disabled:
# embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")
# len(embedded_query)
# Generate embeddings for each document chunk
# embeddings = embeddings_model.embed_documents([doc.page_content for doc in documents])

In [7]:
# Connect to Qdrant; host and API key are read from the environment
# (a missing variable raises KeyError here).
qdrant_client = QdrantClient(os.environ['QDRANT_HOST'], api_key=os.environ['QDRANTDB_API_KEY'])

# Listing the existing collections doubles as a quick connectivity check.
print(qdrant_client.get_collections())

In [8]:
# The vector size must equal the length of the vectors produced by the embedding
# model; 1024 is the output size of Cohere's embed-english-v3.0.
vector_config = VectorParams(
    size=1024,
    distance=Distance.COSINE
)

# create_collection fails if the collection already exists, so only create it
# when it is missing. This keeps the cell safe to re-run.
if not qdrant_client.collection_exists(os.environ['COLLECTION_NAME']):
    qdrant_client.create_collection(
        collection_name=os.environ['COLLECTION_NAME'],
        vectors_config=vector_config
    )

In [9]:
# uuid4 is no longer used in this cell but stays imported in case later cells rely on it.
from uuid import NAMESPACE_URL, uuid4, uuid5


vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=os.environ['COLLECTION_NAME'],
    embedding=embeddings_model,
)

# Deterministic ids (position + chunk text) instead of random uuid4 values:
# re-running this cell on the same chunks overwrites the same points rather
# than inserting a second copy of every chunk into the collection.
uuids = [
    str(uuid5(NAMESPACE_URL, f"{index}:{chunk.page_content}"))
    for index, chunk in enumerate(documents)
]

vector_store.add_documents(
    documents=documents,
    ids=uuids
)

In [None]:
# Retrieve the 5 stored chunks most similar to the query, together with their scores.
query = 'what is loadbalancer'
results = vector_store.similarity_search_with_score(query, k=5)
results