# üîß Data Preparation & Index Testing Notebook

This notebook helps you:
1. Set up the environment for both local and Google Colab
2. Ingest PDF protocols into markdown
3. Build a FAISS vector index
4. Test the index with various queries

---

## Environment Setup
Run this cell first to configure the environment.

In [2]:
# üîß Install dependencies
import sys
import subprocess
from pathlib import Path

def install_requirements():
    # Get the path to requirements.txt (parent directory of notebooks/)
    project_root = Path.cwd().parent
    requirements_path = project_root / 'requirements.txt'
    
    try:
        import google.colab
        print("üîÑ Installing in Google Colab environment...")
        # Colab: use magic with correct path
        get_ipython().system(f'pip install -q -r {requirements_path}')
    except ImportError:
        print("üîÑ Installing in local environment...")
        print(f"üìÅ Looking for requirements.txt at: {requirements_path}")
        if not requirements_path.exists():
            print(f"‚ùå Requirements file not found at {requirements_path}")
            # Try alternative paths
            alt_paths = [Path.cwd() / 'requirements.txt', Path.cwd().parent / 'requirements.txt']
            for alt_path in alt_paths:
                if alt_path.exists():
                    print(f"‚úÖ Found requirements.txt at: {alt_path}")
                    requirements_path = alt_path
                    break
            else:
                print("‚ùå Could not find requirements.txt in any expected location")
                return
        
        result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', str(requirements_path)], capture_output=True, text=True)
        if result.returncode == 0:
            print("‚úÖ Dependencies installed successfully")
        else:
            print(f"‚ö†Ô∏è  Installation issues: {result.stderr}")

install_requirements()

üîÑ Installing in local environment...
üìÅ Looking for requirements.txt at: /Users/rsliusarchuk/www/genai/llm_family_doctor/requirements.txt
‚úÖ Dependencies installed successfully


In [22]:
# üîß Config | run once per session
import os, pathlib, textwrap, pprint

# Get project root
project_root = Path.cwd().parent

os.environ["MODEL_ID"]   = "intfloat/multilingual-e5-base"
os.environ["INDEX_PATH"] = str(project_root / "data/faiss_index")
os.environ["MAP_PATH"]   = str(project_root / "data/doc_map.pkl")

# Replace this with your actual OpenAI API key
os.environ["OPENAI_API_KEY"] = "ENTER_YOUR_OPENAI_API_KEY"
os.environ["OPENAI_MODEL"] = "gpt-4.1-nano"

env_path = project_root / ".env"
env_path.write_text(textwrap.dedent(f"""
    MODEL_ID={os.environ['MODEL_ID']}
    INDEX_PATH={os.environ['INDEX_PATH']}
    MAP_PATH={os.environ['MAP_PATH']}
    OPENAI_API_KEY={os.environ['OPENAI_API_KEY']}
    OPENAI_MODEL={os.environ['OPENAI_MODEL']}
""").strip())

print("‚úÖ  Environment configured")
pprint.pprint({k: os.environ[k] for k in ("MODEL_ID", "INDEX_PATH", "MAP_PATH", "OPENAI_API_KEY", "OPENAI_MODEL")})

‚úÖ  Environment configured
{'INDEX_PATH': '/Users/rsliusarchuk/www/genai/llm_family_doctor/data/faiss_index',
 'MAP_PATH': '/Users/rsliusarchuk/www/genai/llm_family_doctor/data/doc_map.pkl',
 'MODEL_ID': 'intfloat/multilingual-e5-base',
 'OPENAI_API_KEY': 'ENTER_YOUR_OPENAI_API_KEY',
 'OPENAI_MODEL': 'gpt-4.1-nano'}


## üìÅ Check Data Structure
Let's verify our data directories exist and see what we have.

In [14]:
from pathlib import Path

def check_data_structure():
    # Get the project root (parent of notebooks directory)
    project_root = Path.cwd().parent
    directories = {
        'raw_pdfs': project_root / 'data/raw_pdfs',
        'protocols': project_root / 'data/protocols',
        'index': project_root / 'data/faiss_index',
    }
    for name, path in directories.items():
        print(f'üìÅ {name.upper()}: {path}')
        if path.exists():
            if path.is_dir():
                files = list(path.glob('*'))
                print(f'   ‚úÖ Exists with {len(files)} items')
                for file in files[:5]:
                    print(f'   - {file.name}')
                if len(files) > 5:
                    print(f'   ... and {len(files) - 5} more')
            else:
                print('   ‚úÖ File exists')
        else:
            print('   ‚ùå Does not exist')
        print()

check_data_structure()

üìÅ RAW_PDFS: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/raw_pdfs
   ‚úÖ Exists with 37 items
   - 3803.pdf
   - 2937.pdf
   - 3354.pdf
   - 3744.pdf
   - 3023.pdf
   ... and 32 more

üìÅ PROTOCOLS: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/protocols
   ‚úÖ Exists with 35 items
   - nastanova_00172_nudota_ta_blyuvannya.md
   - nastanova_00047_vaktsynatsiya.md
   - nastanova_00745_zapamorochennya.md
   - nastanova_01026_enterovirusni_infektsiyi.md
   - nastanova_00015_hryp.md
   ... and 30 more

üìÅ INDEX: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/faiss_index
   ‚úÖ File exists



## üìÑ Ingest PDF Protocols
Convert PDF files to markdown format for processing.

In [15]:
import subprocess
import sys

def run_ingest():
    # Get the project root (parent of notebooks directory)
    project_root = Path.cwd().parent
    pdf_dir = project_root / 'data/raw_pdfs'
    if not pdf_dir.exists():
        print('‚ö†Ô∏è  No data/raw_pdfs directory found. Creating it...')
        pdf_dir.mkdir(parents=True, exist_ok=True)
        print('üìù Please add PDF files to data/raw_pdfs/ and run this cell again.')
        return False
    pdf_files = list(pdf_dir.glob('*.pdf')) + list(pdf_dir.glob('**/*.pdf'))
    if not pdf_files:
        print('‚ö†Ô∏è  No PDF files found in data/raw_pdfs/')
        print('üìù Please add PDF files and run this cell again.')
        return False
    print(f'üìÑ Found {len(pdf_files)} PDF files to process')
    
    # Change to project root directory before running the script
    original_cwd = Path.cwd()
    try:
        os.chdir(project_root)
        result = subprocess.run([sys.executable, 'scripts/ingest_protocol.py', '--dir', 'data/raw_pdfs', '--recursive'], capture_output=True, text=True)
    finally:
        os.chdir(original_cwd)  # Always restore original directory
    
    if result.returncode == 0:
        print('‚úÖ PDF ingestion completed successfully')
        print(result.stdout)
        return True
    else:
        print(f'‚ùå Ingestion failed: {result.stderr}')
        return False

run_ingest()

üìÑ Found 70 PDF files to process
‚úÖ PDF ingestion completed successfully
Done.



True

## üîç Build Vector Index
Create FAISS index from the markdown protocols.

In [16]:
def run_build_index():
    # Get the project root (parent of notebooks directory)
    project_root = Path.cwd().parent
    md_dir = project_root / 'data/protocols'
    if not md_dir.exists():
        print('‚ö†Ô∏è  No data/protocols directory found.')
        print('üìù Please run the ingestion step first.')
        return False
    md_files = list(md_dir.glob('*.md'))
    if not md_files:
        print('‚ö†Ô∏è  No markdown files found in data/protocols/')
        print('üìù Please run the ingestion step first.')
        return False
    print(f'üìÑ Found {len(md_files)} markdown files to index')
    
    # Change to project root directory before running the script
    original_cwd = Path.cwd()
    try:
        os.chdir(project_root)
        result = subprocess.run([sys.executable, 'src/indexing/build_index.py', '--hf-model', os.environ['MODEL_ID']], capture_output=True, text=True)
    finally:
        os.chdir(original_cwd)  # Always restore original directory
    
    if result.returncode == 0:
        print('‚úÖ Index building completed successfully')
        print(result.stdout)
        return True
    else:
        print(f'‚ùå Index building failed: {result.stderr}')
        return False

run_build_index()

üìÑ Found 34 markdown files to index
‚úÖ Index building completed successfully
üîπ Loading intfloat/multilingual-e5-base ‚Ä¶
üîπ Encoding 34 documents
‚úÖ  Saved index ‚Üí /Users/rsliusarchuk/www/genai/llm_family_doctor/data/faiss_index  (vectors: 34)



True

## üß™ Test the Index Database
Load the index and test it with various queries.

In [17]:
import faiss, pickle, numpy as np
from sentence_transformers import SentenceTransformer

def load_index():
    try:
        # Get the project root (parent of notebooks directory)
        project_root = Path.cwd().parent
        index_path = project_root / 'data/faiss_index'
        map_path = project_root / 'data/doc_map.pkl'
        
        if not index_path.exists():
            print(f'‚ùå Index file not found: {index_path}')
            return None, None, None
        if not map_path.exists():
            print(f'‚ùå Map file not found: {map_path}')
            return None, None, None
        model = SentenceTransformer(os.environ['MODEL_ID'])
        print(f'‚úÖ Model loaded: {os.environ["MODEL_ID"]}')
        index = faiss.read_index(str(index_path))
        print(f'‚úÖ Index loaded: {index.ntotal} documents, {index.d} dimensions')
        with open(map_path, 'rb') as f:
            doc_map = pickle.load(f)
        print(f'‚úÖ Document map loaded: {len(doc_map)} entries')
        if index.ntotal != len(doc_map):
            print(f'‚ö†Ô∏è  Warning: Index has {index.ntotal} docs, map has {len(doc_map)} entries')
        return model, index, doc_map
    except Exception as e:
        print(f'‚ùå Error loading index: {e}')
        import traceback; traceback.print_exc()
        return None, None, None

def search_documents(model, index, doc_map, query, k=3):
    try:
        vec = model.encode(query, normalize_embeddings=True).astype('float32')[None]
        D, I = index.search(vec, k)
        print(f'üîç Query: "{query}"')
        print(f'üìä Found {len(I[0])} results:')
        for rank, (idx, score) in enumerate(zip(I[0], D[0]), 1):
            if 0 <= idx < len(doc_map):
                content = doc_map[idx]
                lines = content.split('\n')
                title = lines[0] if lines else 'No title'
                preview = content[:200].replace('\n', ' ').strip()
                print(f'   {rank}. Similarity: {score:.3f}')
                print(f'      üìÑ {title}')
                print(f'      üìù {preview}...')
            else:
                print(f'   ‚ùå Index {idx} out of range!')
            print()
    except Exception as e:
        print(f'‚ùå Error during search: {e}')

model, index, doc_map = load_index()
if model and index and doc_map:
    print('‚úÖ Index loaded successfully! Ready for testing.')
else:
    print('‚ùå Failed to load index. Please run the previous cells first.')

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Model loaded: intfloat/multilingual-e5-base
‚úÖ Index loaded: 34 documents, 768 dimensions
‚úÖ Document map loaded: 34 entries
‚úÖ Index loaded successfully! Ready for testing.


## üîç Test Queries
Try different queries to test the index functionality.

In [18]:
if model and index and doc_map:
    test_queries = [
        '–≥–æ–ª–æ–≤–Ω–∏–π –±—ñ–ª—å –≤ —Å–∫—Ä–æ–Ω–µ–≤—ñ–π –¥—ñ–ª—è–Ω—Ü—ñ'
    ]
    print('üß™ Testing index with various medical queries...\n')
    for i, query in enumerate(test_queries, 1):
        print(f'{"="*60}')
        search_documents(model, index, doc_map, query, k=2)
        if i < len(test_queries):
            print()
else:
    print('‚ùå Index not loaded. Please run the previous cells first.')

üß™ Testing index with various medical queries...

üîç Query: "–≥–æ–ª–æ–≤–Ω–∏–π –±—ñ–ª—å –≤ —Å–∫—Ä–æ–Ω–µ–≤—ñ–π –¥—ñ–ª—è–Ω—Ü—ñ"
üìä Found 2 results:
   1. Similarity: 0.871
      üìÑ # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00791. –ì–æ–ª–æ–≤–Ω–∏–π –±—ñ–ª—å –Ω–∞–ø—Ä—É–≥–∏
      üìù # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00791. –ì–æ–ª–æ–≤–Ω–∏–π –±—ñ–ª—å –Ω–∞–ø—Ä—É–≥–∏  –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00791. –ì–æ–ª–æ–≤–Ω–∏–π –±—ñ–ª—å –Ω–∞–ø—Ä—É–≥–∏ –î–ª—è –∫–æ–º–µ–Ω—Ç–∞—Ä—ñ–≤ —á–∏ —ñ–Ω—à–æ–≥–æ –∑–≤–æ—Ä–æ—Ç–Ω–æ–≥–æ –∑–≤'—è–∑–∫—É –∑–∞–ø–æ–≤–Ω—ñ—Ç—å —Ñ–æ—Ä–º—É: —Ñ–æ—Ä–º–∞ –∑–≤–æ—Ä–æ—Ç–Ω–æ–≥–æ –∑–≤'—è–∑–∫—É —â–æ–¥–æ —Ü—ñ—î—ó –≤–µ—Ä—Å—ñ—ó –Ω–∞—Å—Ç–∞–Ω–æ–≤–∏ –í–µ—Ä—Å—ñ—è...

   2. Similarity: 0.863
      üìÑ # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00743. –ì–æ–ª–æ–≤–Ω–∏–π –±—ñ–ª—å
      üìù # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00743. –ì–æ–ª–æ–≤–Ω–∏–π –±—ñ–ª—å  –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00743. –ì–æ–ª–æ–≤–Ω–∏–π –±—ñ–ª—å –î–ª—è –∫–æ–º–µ–Ω—Ç–∞—Ä—ñ–≤ —á–∏ —ñ–Ω—à–æ–≥–æ –∑–≤–æ—Ä–æ—Ç–Ω–æ–≥–æ –∑–≤'—è–∑–∫—É –∑–∞–ø–æ–≤–Ω—ñ—Ç—å —Ñ–æ—Ä–º—É: —Ñ–æ—Ä–º–∞ –∑–≤–æ—Ä–æ—Ç–Ω–æ–≥–æ –∑–≤'—è–∑–∫—É —â–æ–¥–

## üéØ Interactive Search
Test your own queries here.

In [19]:
def interactive_search():
    if not (model and index and doc_map):
        print('‚ùå Index not loaded. Please run the previous cells first.')
        return
    print('üéØ Interactive Search Mode')
    print('Enter your medical query (or \'quit\' to exit):')
    while True:
        try:
            query = input('\nüîç Query: ').strip()
            if query.lower() in ['quit', 'exit', 'q']:
                print('üëã Goodbye!')
                break
            if not query:
                print('‚ö†Ô∏è  Please enter a query.')
                continue
            try:
                k = int(input('üìä Number of results (default 3): ') or '3')
                k = max(1, min(k, 10))
            except ValueError:
                k = 3
            print()
            search_documents(model, index, doc_map, query, k)
        except KeyboardInterrupt:
            print('\nüëã Goodbye!')
            break
        except Exception as e:
            print(f'‚ùå Error: {e}')
# Uncomment to enable interactive search in notebook
# interactive_search()
if model and index and doc_map:
    print('üéØ Test your own query:')
    print("Example: search_documents(model, index, doc_map, 'your query here', k=3)")
else:
    print('‚ùå Index not loaded. Please run the previous cells first.')

üéØ Test your own query:
Example: search_documents(model, index, doc_map, 'your query here', k=3)


## üìä Index Statistics
Get detailed information about the index.

In [20]:
def show_index_stats():
    if not (model and index and doc_map):
        print('‚ùå Index not loaded. Please run the previous cells first.')
        return
    print('üìä Index Statistics')
    print('=' * 50)
    print(f'üìÑ Total documents: {index.ntotal}')
    print(f'üî¢ Vector dimensions: {index.d}')
    print(f'üóÇÔ∏è  Document map entries: {len(doc_map)}')
    print(f'ü§ñ Model: {os.environ["MODEL_ID"]}')
    print(f'üíæ Index file: {os.environ["INDEX_PATH"]}')
    print(f'üó∫Ô∏è  Map file: {os.environ["MAP_PATH"]}')
    doc_lengths = [len(doc) for doc in doc_map]
    print(f'\nüìè Document Length Statistics:')
    print(f'   Min: {min(doc_lengths):,} characters')
    print(f'   Max: {max(doc_lengths):,} characters')
    print(f'   Avg: {sum(doc_lengths)/len(doc_lengths):,.0f} characters')
    print(f'\nüìã Sample Document Titles:')
    for i, doc in enumerate(doc_map[:5]):
        title = doc.split('\n')[0] if doc else 'No title'
        print(f'   {i+1}. {title[:60]}...')
    if len(doc_map) > 5:
        print(f'   ... and {len(doc_map) - 5} more documents')
show_index_stats()

üìä Index Statistics
üìÑ Total documents: 34
üî¢ Vector dimensions: 768
üóÇÔ∏è  Document map entries: 34
ü§ñ Model: intfloat/multilingual-e5-base
üíæ Index file: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/faiss_index
üó∫Ô∏è  Map file: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/doc_map.pkl

üìè Document Length Statistics:
   Min: 2,000 characters
   Max: 2,000 characters
   Avg: 2,000 characters

üìã Sample Document Titles:
   1. # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00006. –Ü–Ω—Ñ–µ–∫—Ü—ñ—ó –¥–∏—Ö–∞–ª—å–Ω–∏—Ö —à–ª—è—Ö—ñ–≤ —É –¥–æ—Ä–æ—Å–ª–∏—Ö...
   2. # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00007. –§–∞—Ä–∏–Ω–≥—ñ—Ç —ñ —Ç–æ–Ω–∑–∏–ª—ñ—Ç...
   3. # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00015. –ì—Ä–∏–ø...
   4. # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00047. –í–∞–∫—Ü–∏–Ω–∞—Ü—ñ—è...
   5. # –ù–∞—Å—Ç–∞–Ω–æ–≤–∞ 00099. –ù–∞–±—Ä—è–∫ –Ω—ñ–≥...
   ... and 29 more documents
