# 🔧 Data Preparation & Index Testing Notebook

This notebook helps you:
1. Set up the environment for both local and Google Colab
2. Ingest PDF protocols into markdown
3. Build a FAISS vector index
4. Test the index with various queries

---

## Environment Setup
Run this cell first to configure the environment.

In [2]:
# 🔧 Install dependencies
import sys
import subprocess
from pathlib import Path

def install_requirements():
    # Get the path to requirements.txt (parent directory of notebooks/)
    project_root = Path.cwd().parent
    requirements_path = project_root / 'requirements.txt'
    
    try:
        import google.colab
        print("🔄 Installing in Google Colab environment...")
        # Colab: use magic with correct path
        get_ipython().system(f'pip install -q -r {requirements_path}')
    except ImportError:
        print("🔄 Installing in local environment...")
        print(f"📁 Looking for requirements.txt at: {requirements_path}")
        if not requirements_path.exists():
            print(f"❌ Requirements file not found at {requirements_path}")
            # Try alternative paths
            alt_paths = [Path.cwd() / 'requirements.txt', Path.cwd().parent / 'requirements.txt']
            for alt_path in alt_paths:
                if alt_path.exists():
                    print(f"✅ Found requirements.txt at: {alt_path}")
                    requirements_path = alt_path
                    break
            else:
                print("❌ Could not find requirements.txt in any expected location")
                return
        
        result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', str(requirements_path)], capture_output=True, text=True)
        if result.returncode == 0:
            print("✅ Dependencies installed successfully")
        else:
            print(f"⚠️  Installation issues: {result.stderr}")

install_requirements()

🔄 Installing in local environment...
📁 Looking for requirements.txt at: /Users/rsliusarchuk/www/genai/llm_family_doctor/requirements.txt
✅ Dependencies installed successfully


In [22]:
# 🔧 Config | run once per session
import os, pathlib, textwrap, pprint

# Get project root
project_root = Path.cwd().parent

os.environ["MODEL_ID"]   = "intfloat/multilingual-e5-base"
os.environ["INDEX_PATH"] = str(project_root / "data/faiss_index")
os.environ["MAP_PATH"]   = str(project_root / "data/doc_map.pkl")

# Replace this with your actual OpenAI API key
os.environ["OPENAI_API_KEY"] = "ENTER_YOUR_OPENAI_API_KEY"
os.environ["OPENAI_MODEL"] = "gpt-4.1-nano"

env_path = project_root / ".env"
env_path.write_text(textwrap.dedent(f"""
    MODEL_ID={os.environ['MODEL_ID']}
    INDEX_PATH={os.environ['INDEX_PATH']}
    MAP_PATH={os.environ['MAP_PATH']}
    OPENAI_API_KEY={os.environ['OPENAI_API_KEY']}
    OPENAI_MODEL={os.environ['OPENAI_MODEL']}
""").strip())

print("✅  Environment configured")
pprint.pprint({k: os.environ[k] for k in ("MODEL_ID", "INDEX_PATH", "MAP_PATH", "OPENAI_API_KEY", "OPENAI_MODEL")})

✅  Environment configured
{'INDEX_PATH': '/Users/rsliusarchuk/www/genai/llm_family_doctor/data/faiss_index',
 'MAP_PATH': '/Users/rsliusarchuk/www/genai/llm_family_doctor/data/doc_map.pkl',
 'MODEL_ID': 'intfloat/multilingual-e5-base',
 'OPENAI_API_KEY': 'ENTER_YOUR_OPENAI_API_KEY',
 'OPENAI_MODEL': 'gpt-4.1-nano'}


## 📁 Check Data Structure
Let's verify our data directories exist and see what we have.

In [14]:
from pathlib import Path

def check_data_structure():
    # Get the project root (parent of notebooks directory)
    project_root = Path.cwd().parent
    directories = {
        'raw_pdfs': project_root / 'data/raw_pdfs',
        'protocols': project_root / 'data/protocols',
        'index': project_root / 'data/faiss_index',
    }
    for name, path in directories.items():
        print(f'📁 {name.upper()}: {path}')
        if path.exists():
            if path.is_dir():
                files = list(path.glob('*'))
                print(f'   ✅ Exists with {len(files)} items')
                for file in files[:5]:
                    print(f'   - {file.name}')
                if len(files) > 5:
                    print(f'   ... and {len(files) - 5} more')
            else:
                print('   ✅ File exists')
        else:
            print('   ❌ Does not exist')
        print()

check_data_structure()

📁 RAW_PDFS: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/raw_pdfs
   ✅ Exists with 37 items
   - 3803.pdf
   - 2937.pdf
   - 3354.pdf
   - 3744.pdf
   - 3023.pdf
   ... and 32 more

📁 PROTOCOLS: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/protocols
   ✅ Exists with 35 items
   - nastanova_00172_nudota_ta_blyuvannya.md
   - nastanova_00047_vaktsynatsiya.md
   - nastanova_00745_zapamorochennya.md
   - nastanova_01026_enterovirusni_infektsiyi.md
   - nastanova_00015_hryp.md
   ... and 30 more

📁 INDEX: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/faiss_index
   ✅ File exists



## 📄 Ingest PDF Protocols
Convert PDF files to markdown format for processing.

In [15]:
import subprocess
import sys

def run_ingest():
    # Get the project root (parent of notebooks directory)
    project_root = Path.cwd().parent
    pdf_dir = project_root / 'data/raw_pdfs'
    if not pdf_dir.exists():
        print('⚠️  No data/raw_pdfs directory found. Creating it...')
        pdf_dir.mkdir(parents=True, exist_ok=True)
        print('📝 Please add PDF files to data/raw_pdfs/ and run this cell again.')
        return False
    pdf_files = list(pdf_dir.glob('*.pdf')) + list(pdf_dir.glob('**/*.pdf'))
    if not pdf_files:
        print('⚠️  No PDF files found in data/raw_pdfs/')
        print('📝 Please add PDF files and run this cell again.')
        return False
    print(f'📄 Found {len(pdf_files)} PDF files to process')
    
    # Change to project root directory before running the script
    original_cwd = Path.cwd()
    try:
        os.chdir(project_root)
        result = subprocess.run([sys.executable, 'scripts/ingest_protocol.py', '--dir', 'data/raw_pdfs', '--recursive'], capture_output=True, text=True)
    finally:
        os.chdir(original_cwd)  # Always restore original directory
    
    if result.returncode == 0:
        print('✅ PDF ingestion completed successfully')
        print(result.stdout)
        return True
    else:
        print(f'❌ Ingestion failed: {result.stderr}')
        return False

run_ingest()

📄 Found 70 PDF files to process
✅ PDF ingestion completed successfully
Done.



True

## 🔍 Build Vector Index
Create FAISS index from the markdown protocols.

In [16]:
def run_build_index():
    # Get the project root (parent of notebooks directory)
    project_root = Path.cwd().parent
    md_dir = project_root / 'data/protocols'
    if not md_dir.exists():
        print('⚠️  No data/protocols directory found.')
        print('📝 Please run the ingestion step first.')
        return False
    md_files = list(md_dir.glob('*.md'))
    if not md_files:
        print('⚠️  No markdown files found in data/protocols/')
        print('📝 Please run the ingestion step first.')
        return False
    print(f'📄 Found {len(md_files)} markdown files to index')
    
    # Change to project root directory before running the script
    original_cwd = Path.cwd()
    try:
        os.chdir(project_root)
        result = subprocess.run([sys.executable, 'src/indexing/build_index.py', '--hf-model', os.environ['MODEL_ID']], capture_output=True, text=True)
    finally:
        os.chdir(original_cwd)  # Always restore original directory
    
    if result.returncode == 0:
        print('✅ Index building completed successfully')
        print(result.stdout)
        return True
    else:
        print(f'❌ Index building failed: {result.stderr}')
        return False

run_build_index()

📄 Found 34 markdown files to index
✅ Index building completed successfully
🔹 Loading intfloat/multilingual-e5-base …
🔹 Encoding 34 documents
✅  Saved index → /Users/rsliusarchuk/www/genai/llm_family_doctor/data/faiss_index  (vectors: 34)



True

## 🧪 Test the Index Database
Load the index and test it with various queries.

In [17]:
import faiss, pickle, numpy as np
from sentence_transformers import SentenceTransformer

def load_index():
    try:
        # Get the project root (parent of notebooks directory)
        project_root = Path.cwd().parent
        index_path = project_root / 'data/faiss_index'
        map_path = project_root / 'data/doc_map.pkl'
        
        if not index_path.exists():
            print(f'❌ Index file not found: {index_path}')
            return None, None, None
        if not map_path.exists():
            print(f'❌ Map file not found: {map_path}')
            return None, None, None
        model = SentenceTransformer(os.environ['MODEL_ID'])
        print(f'✅ Model loaded: {os.environ["MODEL_ID"]}')
        index = faiss.read_index(str(index_path))
        print(f'✅ Index loaded: {index.ntotal} documents, {index.d} dimensions')
        with open(map_path, 'rb') as f:
            doc_map = pickle.load(f)
        print(f'✅ Document map loaded: {len(doc_map)} entries')
        if index.ntotal != len(doc_map):
            print(f'⚠️  Warning: Index has {index.ntotal} docs, map has {len(doc_map)} entries')
        return model, index, doc_map
    except Exception as e:
        print(f'❌ Error loading index: {e}')
        import traceback; traceback.print_exc()
        return None, None, None

def search_documents(model, index, doc_map, query, k=3):
    try:
        vec = model.encode(query, normalize_embeddings=True).astype('float32')[None]
        D, I = index.search(vec, k)
        print(f'🔍 Query: "{query}"')
        print(f'📊 Found {len(I[0])} results:')
        for rank, (idx, score) in enumerate(zip(I[0], D[0]), 1):
            if 0 <= idx < len(doc_map):
                content = doc_map[idx]
                lines = content.split('\n')
                title = lines[0] if lines else 'No title'
                preview = content[:200].replace('\n', ' ').strip()
                print(f'   {rank}. Similarity: {score:.3f}')
                print(f'      📄 {title}')
                print(f'      📝 {preview}...')
            else:
                print(f'   ❌ Index {idx} out of range!')
            print()
    except Exception as e:
        print(f'❌ Error during search: {e}')

model, index, doc_map = load_index()
if model and index and doc_map:
    print('✅ Index loaded successfully! Ready for testing.')
else:
    print('❌ Failed to load index. Please run the previous cells first.')

  from .autonotebook import tqdm as notebook_tqdm


✅ Model loaded: intfloat/multilingual-e5-base
✅ Index loaded: 34 documents, 768 dimensions
✅ Document map loaded: 34 entries
✅ Index loaded successfully! Ready for testing.


## 🔍 Test Queries
Try different queries to test the index functionality.

In [18]:
if model and index and doc_map:
    test_queries = [
        'головний біль в скроневій ділянці'
    ]
    print('🧪 Testing index with various medical queries...\n')
    for i, query in enumerate(test_queries, 1):
        print(f'{"="*60}')
        search_documents(model, index, doc_map, query, k=2)
        if i < len(test_queries):
            print()
else:
    print('❌ Index not loaded. Please run the previous cells first.')

🧪 Testing index with various medical queries...

🔍 Query: "головний біль в скроневій ділянці"
📊 Found 2 results:
   1. Similarity: 0.871
      📄 # Настанова 00791. Головний біль напруги
      📝 # Настанова 00791. Головний біль напруги  Настанова 00791. Головний біль напруги Для коментарів чи іншого зворотного зв'язку заповніть форму: форма зворотного зв'язку щодо цієї версії настанови Версія...

   2. Similarity: 0.863
      📄 # Настанова 00743. Головний біль
      📝 # Настанова 00743. Головний біль  Настанова 00743. Головний біль Для коментарів чи іншого зворотного зв'язку заповніть форму: форма зворотного зв'язку щодо цієї версії настанови Версія цього документу...



## 🎯 Interactive Search
Test your own queries here.

In [19]:
def interactive_search():
    if not (model and index and doc_map):
        print('❌ Index not loaded. Please run the previous cells first.')
        return
    print('🎯 Interactive Search Mode')
    print('Enter your medical query (or \'quit\' to exit):')
    while True:
        try:
            query = input('\n🔍 Query: ').strip()
            if query.lower() in ['quit', 'exit', 'q']:
                print('👋 Goodbye!')
                break
            if not query:
                print('⚠️  Please enter a query.')
                continue
            try:
                k = int(input('📊 Number of results (default 3): ') or '3')
                k = max(1, min(k, 10))
            except ValueError:
                k = 3
            print()
            search_documents(model, index, doc_map, query, k)
        except KeyboardInterrupt:
            print('\n👋 Goodbye!')
            break
        except Exception as e:
            print(f'❌ Error: {e}')
# Uncomment to enable interactive search in notebook
# interactive_search()
if model and index and doc_map:
    print('🎯 Test your own query:')
    print("Example: search_documents(model, index, doc_map, 'your query here', k=3)")
else:
    print('❌ Index not loaded. Please run the previous cells first.')

🎯 Test your own query:
Example: search_documents(model, index, doc_map, 'your query here', k=3)


## 📊 Index Statistics
Get detailed information about the index.

In [20]:
def show_index_stats():
    if not (model and index and doc_map):
        print('❌ Index not loaded. Please run the previous cells first.')
        return
    print('📊 Index Statistics')
    print('=' * 50)
    print(f'📄 Total documents: {index.ntotal}')
    print(f'🔢 Vector dimensions: {index.d}')
    print(f'🗂️  Document map entries: {len(doc_map)}')
    print(f'🤖 Model: {os.environ["MODEL_ID"]}')
    print(f'💾 Index file: {os.environ["INDEX_PATH"]}')
    print(f'🗺️  Map file: {os.environ["MAP_PATH"]}')
    doc_lengths = [len(doc) for doc in doc_map]
    print(f'\n📏 Document Length Statistics:')
    print(f'   Min: {min(doc_lengths):,} characters')
    print(f'   Max: {max(doc_lengths):,} characters')
    print(f'   Avg: {sum(doc_lengths)/len(doc_lengths):,.0f} characters')
    print(f'\n📋 Sample Document Titles:')
    for i, doc in enumerate(doc_map[:5]):
        title = doc.split('\n')[0] if doc else 'No title'
        print(f'   {i+1}. {title[:60]}...')
    if len(doc_map) > 5:
        print(f'   ... and {len(doc_map) - 5} more documents')
show_index_stats()

📊 Index Statistics
📄 Total documents: 34
🔢 Vector dimensions: 768
🗂️  Document map entries: 34
🤖 Model: intfloat/multilingual-e5-base
💾 Index file: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/faiss_index
🗺️  Map file: /Users/rsliusarchuk/www/genai/llm_family_doctor/data/doc_map.pkl

📏 Document Length Statistics:
   Min: 2,000 characters
   Max: 2,000 characters
   Avg: 2,000 characters

📋 Sample Document Titles:
   1. # Настанова 00006. Інфекції дихальних шляхів у дорослих...
   2. # Настанова 00007. Фарингіт і тонзиліт...
   3. # Настанова 00015. Грип...
   4. # Настанова 00047. Вакцинація...
   5. # Настанова 00099. Набряк ніг...
   ... and 29 more documents
