In [None]:
#%pip install verbatim-rag

In [13]:
import json
import pandas as pd
from pathlib import Path
from verbatim_rag.document import Document, Chunk, ProcessedChunk, DocumentType, ChunkType
from verbatim_rag.ingestion import DocumentProcessor 
from verbatim_rag.vector_stores import LocalMilvusStore
from verbatim_rag import VerbatimIndex
from verbatim_rag.embedding_providers import SpladeProvider

In [None]:
# Location of the JSON corpus, relative to this notebook's working directory.
corpus_path = Path("../../corpus_json/corpus.json")

# Read the file as UTF-8 text and parse it in one step.
corpus = json.loads(corpus_path.read_text(encoding="utf-8"))

In [15]:
# Tabular view of the corpus records, used only for quick inspection.
df = pd.DataFrame(data=corpus)
# Preview the first five rows as the cell's rich output.
df.head(5)

Unnamed: 0,id,title,text
0,2509.20913v1,Deep Learning for Crime Forecasting: The Role ...,Deep Learning for Crime Forecasting: The Role ...
1,2509.23158v1,Deep Learning-Based Detection of Cognitive Imp...,Deep Learning-Based Detection of Cognitive Imp...
2,2510.05163v1,Deep Learning-Based Multi-Factor Authenticatio...,Deep Learning-Based Multi-Factor Authenticatio...
3,2510.05736v1,Convolution and Graph-based Deep Learning Appr...,Convolution and Graph-based Deep Learning Appr...
4,2510.07320v1,Deep Learning Based Approach to Enhanced Recog...,Deep Learning Based Approach to Enhanced Recog...


### Chunking

In [16]:
# This replicates the private method '_add_document_metadata' from the repo
def create_enhanced_content(text, doc):
    """Append a document-metadata footer to a chunk of text.

    The result is `text`, a blank line, a `---` separator, then one line each
    for the document title and source (falling back to 'Unknown' when the
    attribute is falsy), followed by one `key: value` line per entry in
    `doc.metadata`, in the mapping's iteration order.

    Args:
        text: Chunk text to place above the footer.
        doc: Object exposing `title`, `source` and a `metadata` mapping.

    Returns:
        The newline-joined string described above.
    """
    title_line = f"Document: {doc.title or 'Unknown'}"
    source_line = f"Source: {doc.source or 'Unknown'}"
    metadata_lines = [f"{key}: {value}" for key, value in doc.metadata.items()]
    return "\n".join([text, "", "---", title_line, source_line, *metadata_lines])

In [None]:
def _build_document(paper, processor):
    """Turn one corpus record into a Document populated with its chunks.

    Args:
        paper: Mapping with 'id', 'title' and 'text' keys.
        processor: DocumentProcessor whose `chunker_provider` splits the text.

    Returns:
        The Document, with one Chunk (and attached ProcessedChunk) per piece
        yielded by the chunker.
    """
    # Shell Document carrying the raw text plus the id/title metadata.
    document = Document(
        title=paper['title'],
        source="json_corpus",
        content_type=DocumentType.TXT,
        raw_content=paper['text'],
        metadata={"id": paper['id'], "title": paper['title']},
    )

    # The chunker yields (raw_text, structure-enhanced text) pairs.
    pieces = processor.chunker_provider.chunk(paper['text'])
    for position, (plain_text, structured_text) in enumerate(pieces):
        # Footer with document info is appended to the structure-enhanced text.
        embed_text = create_enhanced_content(structured_text, document)

        # Basic chunk holds the raw text and its position in the document.
        chunk = Chunk(
            document_id=document.id,
            content=plain_text,
            chunk_number=position,
            chunk_type=ChunkType.PARAGRAPH,
        )

        # Processed chunk holds the enhanced text (presumably what gets embedded).
        chunk.add_processed_chunk(
            ProcessedChunk(chunk_id=chunk.id, enhanced_content=embed_text)
        )
        document.add_chunk(chunk)

    return document


# One processor is shared by all papers; only its 'chunker_provider' is used.
processor = DocumentProcessor()

documents_for_index = [_build_document(paper, processor) for paper in corpus]

### Building the Index

In [None]:
# we explicitly tell the store we are using Sparse only to save memory
store = LocalMilvusStore("./milvus_classifier.db", enable_sparse=True, enable_dense=False)

# we use a standard SPLADE model that works well on CPUs
sparse_embedder = SpladeProvider(
    model_name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill",
    device="cpu"
)

index = VerbatimIndex(vector_store=store, sparse_provider=sparse_embedder)

print(f"Adding {len(documents_for_index)} documents to Milvus.")
index.add_documents(documents_for_index)

2025-11-19 15:40:15,934 - INFO - Connected to Milvus Lite: ./milvus_classifier.db
2025-11-19 15:40:15,938 - INFO - Load pretrained SparseEncoder: opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill
2025-11-19 15:40:18,841 - INFO - Loaded SPLADE model: opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill


Adding 20 documents to Milvus.


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  6.69it/s]s]
2025-11-19 15:40:19,150 - INFO - Added 1 vectors to Milvus
2025-11-19 15:40:19,151 - INFO - Added 1 documents to Milvus
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  9.81it/s] 3.31it/s]
2025-11-19 15:40:19,401 - INFO - Added 1 vectors to Milvus
2025-11-19 15:40:19,402 - INFO - Added 1 documents to Milvus
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00, 10.12it/s] 3.68it/s]
2025-11-19 15:40:19,640 - INFO - Added 1 vectors to Milvus
2025-11-19 15:40:19,641 - INFO - Added 1 documents to Milvus
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  7.26it/s] 3.89it/s]
2025-11-19 15:40:19,923 - INFO - Added 1 vectors to Milvus
2025-11-19 15:40:19,923 - INFO - Added 1 documents to Milvus
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00, 10.20it/s] 3.74it/s]
2025-11-19 15:40:20,164 - INFO - Added 1 vectors to Milvus
2025-11-19 15:40:20,165 - INFO - Added 1 doc

### Query

In [None]:
from collections import Counter

def _result_field(res, name):
    """Read `name` from a search result that may be an object or a mapping."""
    value = getattr(res, name, None)
    if value is None and hasattr(res, 'get'):
        value = res.get(name)
    return value


def find_best_paper(query_text, top_k=5):
    """Classify a query by majority vote over the retrieved chunks.

    1. Searches the module-level `index` for the top_k chunks matching the query.
    2. Labels each chunk with its paper (metadata 'title', else 'paper_title',
       else 'id', else 'Unknown Source') and counts the labels.
    3. Prints the ranked hits and the vote summary, then returns the winner.

    Args:
        query_text: Free-text query passed to `index.query`.
        top_k: Number of chunks to retrieve and vote over.

    Returns:
        The most frequent paper label, None when the index returns no results,
        or the string "No Metadata Found" when no result could be iterated.
    """
    print(f"ðŸ”Ž Querying: '{query_text}'")

    # Retrieve top_k chunks so there is a reasonable sample to vote over.
    results = index.query(query_text, k=top_k)

    if not results:
        print("No matches found in the index.")
        return None

    votes = []

    print(f"\n--- Raw Retrieved Chunks (Top {top_k}) ---")
    for rank, res in enumerate(results, start=1):
        # Metadata may live on an attribute or under a mapping key.
        meta = _result_field(res, 'metadata') or {}

        # Paper label used as this chunk's vote.
        paper_label = meta.get('title', meta.get('paper_title', meta.get('id', 'Unknown Source')))
        votes.append(paper_label)

        # Resolve the text for THIS result only. Previously a result without a
        # `.text` attribute either raised UnboundLocalError (first hit) or
        # silently showed the previous hit's text.
        raw_text = _result_field(res, 'text')
        if raw_text is None:
            raw_text = ""

        snippet = str(raw_text)[:100].replace('\n', ' ')
        print(f"Rank {rank}: [{paper_label}] ...{snippet}...")

    # Only reachable when `results` is truthy but yields nothing on iteration.
    if not votes:
        return "No Metadata Found"

    vote_counts = Counter(votes)

    # The label with the most votes wins.
    winner, count = vote_counts.most_common(1)[0]

    print("\n--- Classification Result ---")
    print(f"Predicted Paper: {winner}")
    # One vote is cast per retrieved chunk, so len(votes) is the number of hits.
    print(f"Confidence: {count}/{len(votes)} retrieved chunks belong to this paper.")
    print(f"All candidates: {dict(vote_counts)}")

    return winner

### Enter Query

In [26]:
# Free-text question to classify against the indexed papers.
query = "How can we detect sarcasm using deep learning?"

# Retrieve with the default top_k and keep the majority-vote winner.
predicted_paper = find_best_paper(query_text=query)

ðŸ”Ž Querying: 'How can we detect sarcasm using deep learning?'


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00, 38.96it/s]


--- Raw Retrieved Chunks (Top 5) ---
Rank 1: [Sarcasm Detection Using Deep Convolutional Neural Networks: A Modular Deep Learning] ...Sarcasm is a nuanced and often misinterpreted form of communication especially in text where tone an...
Rank 2: [Sarcasm Detection Using Deep Convolutional Neural Networks: A Modular Deep Learning] ...Sarcasm is a nuanced and often misinterpreted form of communication especially in text where tone an...
Rank 3: [Deep Learning-Based Detection of Cognitive Impairment from Passive Smartphone Sensing with Routine-Aware Augmentation and Demographic Personalization] ...Deep Learning-Based Detection of Cognitive Impairment from Passive Smartphone Sensing with Routine-A...
Rank 4: [Deep Learning-Based Detection of Cognitive Impairment from Passive Smartphone Sensing with Routine-Aware Augmentation and Demographic Personalization] ...Deep Learning-Based Detection of Cognitive Impairment from Passive Smartphone Sensing with Routine-A...
Rank 5: [From Detection to


