In [10]:
#%pip install verbatim-rag

In [11]:
import json
import pandas as pd
import os
from pathlib import Path
from verbatim_rag.document import Document, Chunk, ProcessedChunk, DocumentType, ChunkType
from verbatim_rag.ingestion import DocumentProcessor 
from verbatim_rag.vector_stores import LocalMilvusStore
from verbatim_rag import VerbatimIndex
from verbatim_rag.embedding_providers import SpladeProvider
from collections import defaultdict

In [12]:
# Relative path to the JSON file holding the paper corpus.
corpus_path = Path("../../corpus_json/corpus.json")

# Read and parse the whole corpus; later cells iterate it as a sequence of
# per-paper records with 'id', 'title' and 'text' keys.
with open(corpus_path, mode="r", encoding="utf-8") as corpus_file:
    corpus = json.loads(corpus_file.read())

# Accumulator for the Document objects built in the chunking cell.
documents_for_index = list()

print(f"Loading {len(corpus)} papers...")

Loading 20 papers...


In [13]:
# Sanity check: tabulate the loaded corpus and preview its first five rows.
df = pd.DataFrame(data=corpus)
df.head(n=5)

Unnamed: 0,id,title,text
0,2509.20913v1,Deep Learning for Crime Forecasting: The Role ...,Deep Learning for Crime Forecasting: The Role ...
1,2509.23158v1,Deep Learning-Based Detection of Cognitive Imp...,Deep Learning-Based Detection of Cognitive Imp...
2,2510.05163v1,Deep Learning-Based Multi-Factor Authenticatio...,Deep Learning-Based Multi-Factor Authenticatio...
3,2510.05736v1,Convolution and Graph-based Deep Learning Appr...,Convolution and Graph-based Deep Learning Appr...
4,2510.07320v1,Deep Learning Based Approach to Enhanced Recog...,Deep Learning Based Approach to Enhanced Recog...


### Chunking

In [14]:
# replicates the private method '_add_document_metadata' from the repo
def create_enhanced_content(text, doc):
    """Append a document-metadata footer to a chunk's text.

    Args:
        text: The chunk text the footer is attached to.
        doc: Object exposing ``title``, ``source`` and a ``metadata`` mapping.

    Returns:
        A single newline-joined string: ``text``, a blank line, a ``---``
        separator, the ``Document:`` and ``Source:`` lines (falling back to
        ``Unknown`` when the attribute is falsy), then one ``key: value``
        line per metadata entry.
    """
    footer_head = [
        text,
        "",
        "---",
        f"Document: {doc.title or 'Unknown'}",
        f"Source: {doc.source or 'Unknown'}",
    ]
    metadata_lines = [f"{name}: {entry}" for name, entry in doc.metadata.items()]
    return "\n".join(footer_head + metadata_lines)

In [15]:
# We initialize the processor and use its 'chunker_provider'
processor = DocumentProcessor()

for paper in corpus:
    # Create the shell Document object
    doc_obj = Document(
        title=paper['title'],
        source="json_corpus", 
        content_type=DocumentType.TXT, 
        raw_content=paper['text'],
        metadata={
            "id": paper['id'],
            "title": paper['title']
        }
    )
    # Manually Chunk the text using the processor's tool
    # This breaks the text into semantic pieces
    chunk_tuples = processor.chunker_provider.chunk(paper['text'])

    # Build Chunk objects
    for i, (raw_text, struct_enhanced) in enumerate(chunk_tuples):
        
        # Create the footer/header info
        enhanced_content = create_enhanced_content(struct_enhanced, doc_obj)

        # Create the Basic Chunk
        doc_chunk = Chunk(
            document_id=doc_obj.id,
            content=raw_text,
            chunk_number=i,
            chunk_type=ChunkType.PARAGRAPH,
        )

        # Create the Processed Chunk (The part that gets embedded)
        processed_chunk = ProcessedChunk(
            chunk_id=doc_chunk.id,
            enhanced_content=enhanced_content,
        )

        # Link them
        doc_chunk.add_processed_chunk(processed_chunk)
        doc_obj.add_chunk(doc_chunk)

    documents_for_index.append(doc_obj)

### Building the Index

In [16]:
DB_FILE = "./milvus_final.db"

# Record whether the database file was already on disk *before* the store is
# opened (presumably opening the store creates the file — TODO confirm).
db_exists = os.path.exists(DB_FILE)

# Setup Store
# we explicitly tell the store we are using Sparse only to save memory
store = LocalMilvusStore(DB_FILE, enable_sparse=True, enable_dense=False)

# we use a standard SPLADE model that works well on CPUs
sparse_embedder = SpladeProvider(
    model_name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill",
    device="cpu"
)
index = VerbatimIndex(vector_store=store, sparse_provider=sparse_embedder)

if db_exists:
    # The file exists on disk, so we check if Milvus can read it
    print("Database file found.")
    try:
        # We use a valid filter 'id != ""' instead of empty string.
        # Only this probe is guarded: an ingestion failure must not be
        # mistaken for a corrupted database and trigger a drop.
        res = store.client.query(store.collection_name, filter='id != ""', limit=1)
    except Exception as e:
        print(f"Database seems corrupted: {e}")
        print("deleting and rebuilding...")
        store.client.drop_collection(store.collection_name)
        # The collection no longer exists after the drop, so re-open the store
        # and rebuild the index on top of it before ingesting.
        # NOTE(review): assumes LocalMilvusStore creates its collection when
        # constructed — confirm against the library.
        store = LocalMilvusStore(DB_FILE, enable_sparse=True, enable_dense=False)
        index = VerbatimIndex(vector_store=store, sparse_provider=sparse_embedder)
        index.add_documents(documents_for_index)
    else:
        if len(res) > 0:
            print("Index is already populated. SKIPPING ingestion.")
        else:
            print("Database exists but seems empty. Adding documents...")
            index.add_documents(documents_for_index)
else:
    print("New Database. Indexing documents...")
    index.add_documents(documents_for_index)


2025-11-19 21:44:44,516 - INFO - Connected to Milvus Lite: ./milvus_final.db
2025-11-19 21:44:44,579 - INFO - PyTorch version 2.9.1 available.
2025-11-19 21:44:44,932 - INFO - Load pretrained SparseEncoder: opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill
2025-11-19 21:44:47,887 - INFO - Loaded SPLADE model: opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill


Database file found.
Index is already populated. SKIPPING ingestion.


### Query

In [17]:
def _result_field(res, name, default=None):
    """Read ``name`` from a search result that is either an object or a mapping."""
    value = getattr(res, name, None)
    if value is None and hasattr(res, 'get'):
        value = res.get(name)
    return default if value is None else value


def find_best_paper(query_text, top_k=5):
    """Query the index and return the id of the paper with the highest total score.

    The top ``top_k`` chunks are retrieved from the module-level ``index``,
    each chunk's score is added to the running total of the paper it belongs
    to, and the paper with the largest total wins. Per-chunk details and the
    final result are printed along the way.

    Args:
        query_text: The natural-language query.
        top_k: Number of chunks requested from the index.

    Returns:
        The winning paper id (``'Unknown'`` when a result carries no id), or
        ``None`` when the index returns no results.
    """
    print(f"Querying: '{query_text}'")

    results = index.query(query_text, k=top_k)

    if not results:
        print("No matches found.")
        return None

    # Accumulated score per paper id, e.g. {'2509.20913v1': 14.53}
    paper_scores = defaultdict(float)

    print(f"\n--- Top {top_k} Chunks & Actual Similarity Scores ---")

    for rank, res in enumerate(results, start=1):
        # 1. Metadata (id / title); results may be objects or dict-like.
        meta = _result_field(res, 'metadata', {})
        paper_id = meta.get('id', 'Unknown')
        # str() keeps the slice below safe if the title is not a string.
        title = str(meta.get('title', paper_id))

        # 2. Score: prefer .score, fall back to .distance, then 0.0.
        # NOTE(review): totals are maximised below, which is only right if the
        # value is a similarity (higher is better) — confirm for .distance.
        score = _result_field(res, 'score')
        if score is None:
            score = _result_field(res, 'distance', 0.0)

        # 3. Add to the paper's total
        paper_scores[paper_id] += score

        # 4. Print the chunk with its score to 4 decimal places
        text = _result_field(res, 'text')
        if text is None:
            text = _result_field(res, 'content', '')
        snippet = str(text)[:40].replace('\n', '')
        print(f"Rank {rank}: Score {score:.4f} | Paper: {paper_id} [{title[:20]}...] | Text: {snippet}...")

    # 5. Winner: highest total; ties go to the paper seen first.
    winner, total_score = max(paper_scores.items(), key=lambda item: item[1])

    print("\n--- Classification Result ---")
    print(f"Predicted Paper: {winner}")
    print(f"Total Similarity Score: {total_score:.4f}")

    return winner  # id of the paper

### Enter Query

In [18]:
# Natural-language question to match against the indexed papers.
query = "How can we detect sarcasm using deep learning?"

# Uncomment to run the retrieval: find_best_paper returns the id of the
# best-scoring paper, or None when the index yields no matches.
#predicted_paper = find_best_paper(query)