In [None]:
from langchain_community.document_loaders import ArxivLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode
import re, textwrap, pprint, os

In [None]:
# One limit for both full-text papers and summaries.
# ArxivLoader forwards its keyword arguments to the underlying arXiv API
# wrapper, where `load_max_docs` caps `load()` and `top_k_results` caps
# `get_summaries_as_docs()`. With different values (previously 10 vs 20) the
# two result lists have different lengths, so papers beyond the shorter list
# can never be paired with a summary when the metadata is enriched later on.
MAX_PAPERS = 20

loader = ArxivLoader(
    query="Artificial intelligence and machine learning",
    top_k_results=MAX_PAPERS,
    load_max_docs=MAX_PAPERS,
)
docs      = loader.load()                    # full paper text
summ_docs = loader.get_summaries_as_docs()   # abstracts + summary metadata
# NOTE(review): load() may still return fewer documents than summaries if a
# PDF cannot be fetched or parsed — do not assume the two lists line up by index.
print(f"Total docs loaded: {len(docs)}")
print(f"Total summaries loaded: {len(summ_docs)}")

In [None]:
def _summary_url(metadata):
    """Derive a paper URL from a summary's metadata dict.

    Returns the first string value that starts with ``http://`` or
    ``https://``; failing that, builds an arxiv.org abstract link from an
    ``"id"`` entry when one is present. Returns ``None`` if neither exists.
    """
    for value in metadata.values():
        if isinstance(value, str) and re.match(r"^https?://", value):
            return value
    if metadata.get("id"):
        return f"https://arxiv.org/abs/{metadata['id']}"
    return None


print("\n── summaries ──")
for i, summ in enumerate(summ_docs, start=1):
    print(f"\nPaper {i} metadata keys: {list(summ.metadata.keys())}")

# Index summary URLs by paper title so each full-text document is matched to
# its own summary. Pairing by position (zip) silently mislabels papers when
# the two lists differ in length or order, and leaves trailing documents
# without any "url" key at all.
# NOTE(review): assumes both document kinds carry a "Title" metadata entry —
# confirm against the key listing printed above.
url_by_title = {}
for summ in summ_docs:
    title = summ.metadata.get("Title")
    if title is not None:
        # setdefault keeps the first summary seen for a duplicated title.
        url_by_title.setdefault(title, _summary_url(summ.metadata))

for position, raw_doc in enumerate(docs):
    title = raw_doc.metadata.get("Title")
    if title is not None and url_by_title:
        # Preferred path: exact title match (None when there is no match).
        url = url_by_title.get(title)
    elif position < len(summ_docs):
        # No titles to match on: fall back to the summary at the same index.
        url = _summary_url(summ_docs[position].metadata)
    else:
        url = None

    # Every document gets a "url" key, even when nothing could be resolved.
    raw_doc.metadata["url"] = url or "Unknown"

if docs:
    print("\nExample enriched metadata for doc #1:")
    pprint.pprint(docs[0].metadata)
else:
    # Avoid an IndexError on docs[0] when the query returned nothing.
    print("\nNo documents were loaded; nothing to enrich.")

In [None]:
# Dense embedding model (BAAI/bge-base-en-v1.5).
# The device is no longer hard-coded to "cuda", which made this cell fail on
# machines without a CUDA GPU. Set the EMBEDDING_DEVICE environment variable
# (e.g. "cuda", "cpu", "mps") to force a device; when it is unset, no device
# is passed and the embedding backend chooses one itself.
# NOTE(review): relies on sentence-transformers auto-selecting a GPU when one
# is available — confirm for the installed version.
embedding_device = os.environ.get("EMBEDDING_DEVICE")

dense_embedder = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": embedding_device} if embedding_device else {},
)

# Sparse embedding model, used alongside the dense one for hybrid retrieval.
sparse_embedder = FastEmbedSparse(model_name="Qdrant/bm25")

In [None]:
# Semantic text splitter driven by the dense embedding model.
# NOTE(review): in langchain_experimental's SemanticChunker, supplying
# `number_of_chunks` appears to take precedence over
# `breakpoint_threshold_type` when breakpoints are computed — confirm which of
# the two settings is actually intended to control the split.
chunker_options = {
    "buffer_size": 4,
    "breakpoint_threshold_type": "gradient",
    "min_chunk_size": 5,
    "number_of_chunks": 20,
}
splitter = SemanticChunker(embeddings=dense_embedder, **chunker_options)

In [None]:
# Split each paper on its own and flatten the per-paper chunk lists into a
# single list, preserving paper order.
all_chunks = [
    chunk
    for paper in docs
    for chunk in splitter.split_documents([paper])
]
print(f"Total semantic chunks generated: {len(all_chunks)}")

In [None]:
# Qdrant connection settings are read from the environment so that real
# endpoints and API keys never have to be typed into (and committed with) the
# notebook. Both fall back to an empty string when the variable is unset.
QDRANT_URL = os.environ.get("QDRANT_URL", "")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")

In [None]:
# Embed every chunk with both the dense and sparse models and upload the
# result to Qdrant, configured for hybrid retrieval.
# NOTE(review): the collection name is spelled "axRiv" rather than "arXiv";
# left untouched because it is a runtime identifier — confirm it is intended.
hybrid_index_options = {
    "embedding": dense_embedder,
    "sparse_embedding": sparse_embedder,
    "retrieval_mode": RetrievalMode.HYBRID,
}
qdrant_connection = {
    "url": QDRANT_URL,
    "api_key": QDRANT_API_KEY,
    "prefer_grpc": True,
}

qdrant_store = QdrantVectorStore.from_documents(
    documents=all_chunks,
    collection_name="axRiv_research_papers",
    **hybrid_index_options,
    **qdrant_connection,
)
print(f"Upserted {len(all_chunks)} chunks into Qdrant collection 'axRiv_research_papers'.")
