In [None]:
# https://python.langchain.com/docs/tutorials/retrievers/
import os
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http import models

In [None]:
# Base URL of the Ollama server, taken from the environment (None when unset).
OLLAMA_SERVER = os.environ.get("OLLAMA_SERVER")

In [None]:
# 1. Load the source document
# ---------------------------
file_path = "./docs/test.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()
# Show how many entries the loader returned.
print(len(docs))

In [None]:
# Preview the first loaded entry: its first 100 characters, then its metadata.
first_doc = docs[0]
print(f"{first_doc.page_content[:100]}\n")
first_doc.metadata

In [None]:
# Split the loaded document into chunks
# -------------------------------------
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=10,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)
# Number of chunks produced.
len(all_splits)

In [None]:
# Select the embedding model and embed every chunk
# ------------------------------------------------
embeddings = OllamaEmbeddings(base_url=OLLAMA_SERVER, model="mxbai-embed-large")
# Use the batch document API rather than one embed_query call per chunk:
# these texts are documents, not queries, and a single batched call avoids
# issuing one request per chunk. `vectors` stays a list with one vector per chunk.
vectors = embeddings.embed_documents([chunk.page_content for chunk in all_splits])

In [None]:
# Connect to Qdrant and drop the target collection so it can be recreated
# -----------------------------------------------------------------------
qdrant_collection = "example"
qdrant_client = QdrantClient(host="lawboxai_qdrant")
qdrant_client.delete_collection(collection_name=qdrant_collection)

In [None]:
# Create the collection; the vector size is read from the first embedded chunk.
embedding_dim = len(vectors[0])
vector_params = models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE)
qdrant_client.create_collection(
    collection_name=qdrant_collection,
    vectors_config=vector_params,
)

# Wrap the collection in a LangChain vector store and add all chunks to it.
vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=qdrant_collection,
    embedding=embeddings,
)
vector_store.add_documents(all_splits)

# Query string reused by the search cells that follow.
query = "Les défauts de payement"

In [None]:
# similarity_search: returns the top documents based solely on vector similarity.
# The hits get their own name so the `docs` loaded from the PDF is not overwritten;
# otherwise re-running the splitting cell would chunk search results instead of
# the original document.
similar_docs = vector_store.similarity_search(query, k=10)
for doc in similar_docs:
    print(50 * '-')
    print(doc.page_content)

In [None]:
# Similar to similarity_search, but also returns relevance scores
docs  = await vector_store.asimilarity_search_with_relevance_scores(query,k=5)   #.similarity_search(query, k=10,)
for doc in docs:
    print(50*'-',doc[1])
    print(doc[0].page_content)

In [None]:
# The LangChain way, using a Retriever.
# The number of results is configured on the retriever via `search_kwargs`.
# NOTE(review): whether extra keyword arguments given to `invoke()` reach the
# underlying search appears to depend on the installed langchain-core version,
# so `k` is set here instead of at call time — confirm against the pinned version.
retriever = vector_store.as_retriever(search_kwargs={"k": 7})
query = "Les défauts de payement"
retrieved_docs = retriever.invoke(query)

# Display the results
for doc in retrieved_docs:
    print(50 * '-')
    print(doc.page_content)