In [None]:
import asyncio
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_core.messages import HumanMessage
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, SparseVectorParams, VectorParams
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from pi_heif import register_heif_opener
import os
import logging

In [None]:
# Lower pdfminer's logger to ERROR *before* the PDF is parsed. The notebook
# only did this in a later cell, i.e. after load() had already run, so any
# parser chatter emitted during the first load was not suppressed.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# Absolute path of the report to ingest (machine-specific; edit as needed).
pdf_path = r"C:\Users\Lenovo\Desktop\rameen\AnnualReport.pdf"

# Build the loader and read the file into a list of Document objects.
# NOTE(review): with the loader's default mode this is presumably a single
# Document holding the whole file rather than one per page -- confirm.
loader = UnstructuredPDFLoader(pdf_path)
pages = loader.load()

In [None]:
# Report how many Document objects the loader produced.
print(f"Total pages: {len(pages)}\n")

# Preview up to four documents, starting at index 1 (index 0 is skipped).
# NOTE(review): if the loader returned a single Document this loop prints
# nothing -- confirm whether pages[:5] was intended.
for page in pages[1:5]:
    # Metadata line followed by one blank line.
    print("Metadata:", page.metadata, end="\n\n")
    # First 500 characters of the text, then an ellipsis marker.
    print(page.page_content[:500], "…\n")

# Only errors from pdfminer should reach the log output from here on.
logging.getLogger("pdfminer").setLevel(logging.ERROR)


In [None]:
# Make sure OPENAI_API_KEY exists in the process environment without
# destroying a key that is already there. The previous unconditional
# assignment of "" wiped any real key exported by the shell or a .env loader.
# For local use, replace the "" placeholder with a key (never commit it);
# a key already present in the environment takes precedence.
os.environ.setdefault("OPENAI_API_KEY", "")

In [None]:
# Text of the first loaded Document; this is the only input that gets chunked.
page_text = pages[0].page_content

# Semantic splitter backed by OpenAI embeddings (default embedding model).
# Breakpoints use the "percentile" threshold type with an amount of 95.0.
text_splitter = SemanticChunker(
    embeddings=OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95.0,
)

In [None]:
# Split the extracted text into Document chunks with the semantic splitter.
chunked_docs = text_splitter.create_documents(texts=[page_text])

# Optional debugging aid: uncomment to dump the first 20 chunks.
# for i, doc in enumerate(chunked_docs[:20], start=1):
#     print(f"--- Chunk {i} ---")
#     print(doc.page_content)
#     print()

# Summary of how many chunks were produced.
print(f"Total semantic chunks: {len(chunked_docs)}\n")

In [None]:
# Qdrant connection settings. Values come from the environment when present,
# so credentials need not be pasted into the notebook; the fallback is the
# same empty string the cell used before.
QDRANT_URL = os.environ.get("QDRANT_URL", "")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")

# Keep an OpenAI key that is already configured. The earlier unconditional
# assignment of "" erased it right before the embedding client was built.
os.environ.setdefault("OPENAI_API_KEY", "")

# Dense vectors come from OpenAI; sparse vectors from FastEmbed's BM25 model.
dense_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Index the chunks into the "annualreport" collection in hybrid retrieval
# mode, passing both the dense and the sparse embedding models.
# The client is asked to prefer gRPC for communication with the server.
qdrant = QdrantVectorStore.from_documents(
    chunked_docs,
    dense_embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    url=QDRANT_URL,
    prefer_grpc=True,
    api_key=QDRANT_API_KEY,
    collection_name="annualreport",
)