In [1]:
import sys
from pathlib import Path

In [2]:
# Make the project root importable so that `app.*` packages resolve from here.
project_root = Path.cwd().parent  # Assuming notebook is in project_root/notebooks/
# Guard the append: re-running this cell must not pile up duplicate entries
# on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [3]:
# Report the directory that was added, then list every entry of the import
# search path, one per line.
print(f"Added {project_root} to Python path")
print("\nCurrent Python path:")
for path_entry in sys.path:
    print(f"- {path_entry}")


Added /Users/robinsingh/Desktop/LLM_GEN_AI /api-docs-ai/retrieval_service to Python path

Current Python path:
- /Users/robinsingh/miniconda3/envs/nextjs/lib/python39.zip
- /Users/robinsingh/miniconda3/envs/nextjs/lib/python3.9
- /Users/robinsingh/miniconda3/envs/nextjs/lib/python3.9/lib-dynload
- 
- /Users/robinsingh/miniconda3/envs/nextjs/lib/python3.9/site-packages
- /Users/robinsingh/Desktop/LLM_GEN_AI /api-docs-ai/retrieval_service


In [3]:
from app.core.enums import  DocSource

In [21]:
from app.retrieval.embeddings.dense import DenseEmbedder  # Adjust the import path as needed

In [4]:
from app.retrieval.base import RetrievalPipeline

Using device: mps


In [5]:
# Import smoke test: check that the project packages resolve from this notebook.
try:
    from app.core.enums import DocSource
    from app.retrieval.base import RetrievalPipeline
except ImportError as import_error:
    # Show the failure together with the directory layout these imports expect.
    print(f"\nImport error: {str(import_error)}")
    print("\nMake sure your directory structure is correct:")
    print("""
    project_root/
    ├── app/
    │   ├── __init__.py
    │   ├── core/
    │   │   ├── __init__.py
    │   │   └── enums.py
    │   └── retrieval/
    │       ├── __init__.py
    │       └── base.py
    └── notebooks/
        └── test_pipeline.ipynb
    """)
else:
    # Reached only when both imports above succeeded.
    print("\nImports successful!")


Imports successful!


In [7]:
# Initialize the retrieval pipeline.
# RetrievalPipeline is constructed with no arguments; the print below only
# confirms that construction completed without raising.
pipeline = RetrievalPipeline()
print("Pipeline initialized successfully!")

Pipeline initialized successfully!


In [8]:
# Process the Flutter documentation source through the pipeline.
# NOTE(review): per the recorded log output this single call clears existing
# data, fetches the sitemap and pages, chunks, builds dense and sparse
# embeddings and saves the result; the recorded run took about 32 minutes
# (19:57 -> 20:29), so avoid re-running it casually.
print("Processing Flutter documentation...")
pipeline.process_documents(DocSource.FLUTTER)

Processing Flutter documentation...
2025-02-11 19:57:19 | INFO     | retrieval_pipeline:process_documents:34 - Processing documents for flutter
2025-02-11 19:57:19 | INFO     | retrieval_pipeline:clear_data:137 - Cleared all data for flutter
2025-02-11 19:57:19 | INFO     | retrieval_pipeline:fetch_sitemap:24 - Fetching sitemap from https://docs.flutter.dev/sitemap.xml
2025-02-11 19:57:19 | INFO     | retrieval_pipeline:fetch_sitemap:31 - Found 564 URLs in sitemap


Fetching URLs: 100%|██████████| 564/564 [03:32<00:00,  2.65it/s]


2025-02-11 20:00:52 | INFO     | retrieval_pipeline:fetch_all_contents:76 - Successfully fetched 564 out of 564 URLs
2025-02-11 20:00:57 | INFO     | retrieval_pipeline:chunk_documents:56 - Generated 5815 chunks from 564 documents
2025-02-11 20:00:57 | INFO     | retrieval_pipeline:embed_texts:29 - Generating dense embeddings for 5815 texts


Generating dense embeddings: 100%|██████████| 5815/5815 [12:06<00:00,  8.00it/s]


2025-02-11 20:13:04 | INFO     | retrieval_pipeline:build_index:60 - Building FAISS index...
2025-02-11 20:13:04 | INFO     | retrieval_pipeline:build_index:63 - FAISS index built successfully
2025-02-11 20:13:04 | INFO     | retrieval_pipeline:embed_texts:24 - Generating sparse embeddings for 5815 texts


Generating sparse embeddings: 100%|██████████| 5815/5815 [16:35<00:00,  5.84it/s]


2025-02-11 20:29:39 | INFO     | retrieval_pipeline:save_data:36 - Saving data for flutter to data/flutter_docs
2025-02-11 20:29:42 | INFO     | retrieval_pipeline:save_data:56 - Successfully saved all data for flutter
2025-02-11 20:29:42 | INFO     | retrieval_pipeline:process_documents:73 - Successfully processed documents for flutter


In [9]:
# Load the Flutter source into the pipeline and print a short summary.
# The stats mapping returned by get_source_stats() is read for three keys:
# 'total_chunks', 'total_urls' and 'device'.
# NOTE(review): the recorded log shows the data being read from
# data/flutter_docs - confirm that path is relative to the kernel's cwd.
pipeline.load_source(DocSource.FLUTTER)
stats = pipeline.get_source_stats()
print("\nFlutter Documentation Stats:")
print(f"- Total chunks: {stats['total_chunks']}")
print(f"- Total URLs: {stats['total_urls']}")
print(f"- Device: {stats['device']}")

2025-02-11 20:30:03 | INFO     | retrieval_pipeline:load_source:87 - Loading data for flutter
2025-02-11 20:30:03 | INFO     | retrieval_pipeline:load_data:66 - Loading data for flutter from data/flutter_docs
2025-02-11 20:30:03 | INFO     | retrieval_pipeline:load_data:100 - Successfully loaded all data for flutter
2025-02-11 20:30:03 | INFO     | retrieval_pipeline:load_source:97 - Successfully loaded data for flutter

Flutter Documentation Stats:
- Total chunks: 5815
- Total URLs: 564
- Device: mps


In [10]:
# Fifth cell - Test search
def test_search(query: str):
    print(f"Searching for: {query}\n")
    results = pipeline.search_documents(query)
    
    print(f"Status: {results['status']}")
    print(f"Confidence: {results['confidence']}\n")
    
    if results['status'] == 'success':
        for i, result in enumerate(results['results'][:3], 1):
            print(f"Result {i}:")
            print(f"URL: {result['url']}")
            print(f"Text: {result['text'][:200]}...")
            print(f"Scores: {result['scores']}\n")

In [11]:
# Smoke-test the search helper with a sample Flutter question.
# NOTE(review): the recorded output of this run is an AttributeError
# ('HybridSearch' object has no attribute 'rerank').
test_search("How to create a StatefulWidget?")


Searching for: How to create a StatefulWidget?

2025-02-11 20:30:43 | ERROR    | retrieval_pipeline:search:108 - Error in hybrid search: 'HybridSearch' object has no attribute 'rerank'
2025-02-11 20:30:43 | ERROR    | retrieval_pipeline:search_documents:151 - Error searching documents: 'HybridSearch' object has no attribute 'rerank'


AttributeError: 'HybridSearch' object has no attribute 'rerank'

In [12]:
# First, clear any existing instances (if you've run the code before)
# This rebinds the class attribute `_instances` on Singleton to a new empty
# dict. `_instances` is private by naming convention.
# NOTE(review): presumably this is the cache of already-created singleton
# objects, so the next construction builds a fresh one - confirm in
# app/core/singleton.py.
from app.core.singleton import Singleton
Singleton._instances = {}

In [5]:
# Then run your test
def test_search(query: str):
    print(f"Searching for: {query}\n")
    results = pipeline.search_documents(query)
    
    print(f"Status: {results['status']}")
    print(f"Confidence: {results['confidence']}\n")
    
    if results['status'] == 'success':
        for i, result in enumerate(results['results'][:3], 1):
            print(f"Result {i}:")
            print(f"URL: {result['url']}")
            print(f"Text: {result['text'][:200]}...")
            print(f"Scores: {result['scores']}\n")


In [6]:
# Import the pipeline dependencies (the pipeline itself is created in the next cell)
from app.core.enums import DocSource
from app.retrieval.base import RetrievalPipeline

Using device: mps


In [7]:
# Create a new pipeline object and load the Flutter source into it.
# NOTE(review): the recorded log shows the embedders and reranker being
# initialized here and the data being read from data/flutter_docs.
pipeline = RetrievalPipeline()
pipeline.load_source(DocSource.FLUTTER)

2025-02-11 20:39:58 | INFO     | retrieval_pipeline:__init__:19 - Initializing dense embedder with model BAAI/bge-small-en-v1.5 on mps
2025-02-11 20:39:58 | INFO     | retrieval_pipeline:__init__:15 - Initializing sparse embedder with model prithivida/Splade_PP_en_v1
2025-02-11 20:39:59 | INFO     | retrieval_pipeline:__init__:14 - Initializing reranker with model Xenova/ms-marco-MiniLM-L-6-v2
2025-02-11 20:39:59 | INFO     | retrieval_pipeline:load_source:88 - Loading data for flutter
2025-02-11 20:39:59 | INFO     | retrieval_pipeline:load_data:66 - Loading data for flutter from data/flutter_docs
2025-02-11 20:39:59 | INFO     | retrieval_pipeline:load_data:100 - Successfully loaded all data for flutter
2025-02-11 20:39:59 | INFO     | retrieval_pipeline:load_source:98 - Successfully loaded data for flutter


In [8]:
# Run the sample query against the `pipeline` object created in the cell above.
test_search("How to create a StatefulWidget?")


Searching for: How to create a StatefulWidget?

2025-02-11 20:40:02 | INFO     | retrieval_pipeline:rerank:23 - Reranking 20 texts
2025-02-11 20:40:04 | INFO     | retrieval_pipeline:check_relevance:90 - Relevance metrics - Similarity: 0.824, Term overlap: 1.000
Status: success
Confidence: 0.82

Result 1:
URL: https://docs.flutter.dev/cookbook/animation/opacity-animation
Text: Fade a widget in and out | Flutterdocs.flutter.dev uses cookies from Google to deliver and enhance the quality of its services and to analyze traffic . Learn more.OK , got it1 . Create a box to fade i...
Scores: {'final': 0.928, 'dense': 0.794, 'sparse': 1.0, 'rerank': 1.0}

Result 2:
URL: https://docs.flutter.dev/cookbook/animation/opacity-animation
Text: const MyApp ( ) ) ; class MyApp extends StatelessWidget { Widget build ( BuildContext context ) { const appTitle = 'Opacity Demo ' ; home : MyHomePage ( title : appTitle ) , // The StatefulWidget 's j...
Scores: {'final': 0.867, 'dense': 0.728, 'sparse': 0.937,

In [None]:
# Install the project in editable mode into the kernel's own environment.
# The explicit `%pip` magic is used so the cell does not depend on IPython's
# automagic setting. "." resolves against the kernel's working directory.
# NOTE(review): the path-setup cell assumes the notebook lives in
# project_root/notebooks/ - confirm that "." is the directory holding the
# project's packaging metadata.
%pip install -e .