# Saka-14B RAG System on Google Colab

**Efficient Setup for Arabic Mental Health QA**

## Quick Start
1. Select **A100 GPU**: Runtime → Change runtime type → GPU → A100
2. Run all cells sequentially
3. The notebook will:
   - Install dependencies
   - Build/load knowledge base once
   - Build vector database once  
   - Initialize RAG pipeline
   - Ready to answer questions!

## Important Notes
- KB and Vector DB are built once and reused
- No chunk filtering by default - the generator is instructed to avoid Quranic content (Step 5.5 can optionally patch a chunk-level Quran filter into `rag_qa.py`)
- Generator uses only Arabic language


## Step 1: Install Dependencies & Setup


In [None]:
# Install required packages
%pip install -q torch>=2.0.0 transformers>=4.40.0 accelerate>=0.20.0 bitsandbytes>=0.41.0
%pip install -q sentence-transformers>=2.2.2 chromadb>=0.4.0 rank-bm25
%pip install -q numpy pandas tqdm

# Setup Python path
import sys
import os

# Add current directory to Python path
if 'colab_files' in os.listdir('.'):
    sys.path.insert(0, 'colab_files')
elif 'knowldege_base' in os.listdir('.'):
    sys.path.insert(0, '.')
else:
    print("‚ö†Ô∏è  Please upload colab_files/ directory or ensure knowldege_base/ is in current directory")

print(f"‚úÖ Python path configured: {sys.path[0]}")


## Step 2: Verify GPU


In [None]:
import torch

# Report whether CUDA is usable and, if so, which device 0 is and how much
# memory it exposes (bytes converted to GiB).
print(f"CUDA Available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("‚ö†Ô∏è  No GPU detected! Please select A100 GPU runtime.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_bytes / 1024**3:.2f} GB")


## Step 3: Build Knowledge Base Chunks (One Time)


In [None]:
# Create the knowledge-base chunk file unless it is already on disk.
# The existence check makes this cell safe to re-run: the build is skipped
# whenever the chunk file is found.

from knowldege_base.rag_staging.kb_chunker import build_kb_chunks
import os

chunks_file = 'knowldege_base/data/processed/kb_chunks.jsonl'

if not os.path.exists(chunks_file):
    print("Building KB chunks...")
    print("This will read articles, books, and QA pairs and create chunks.")
    # NOTE(review): only the file name is passed; presumably build_kb_chunks
    # writes into the same directory as `chunks_file` - confirm.
    build_kb_chunks(output_filename="kb_chunks.jsonl")
    print(f"‚úÖ KB chunks built: {chunks_file}")
else:
    print(f"‚úÖ KB chunks already exist: {chunks_file}")
    print("   Skipping rebuild. Delete the file if you want to rebuild.")


## Step 4: Build Vector Database (One Time)


In [None]:
# Build the on-disk vector database, or reuse it when it already exists.
# An existing directory is inspected with chromadb; it is rebuilt when the
# inspection fails or finds no collections.

from knowldege_base.rag_staging.vector_db import VectorDB
import os

vector_db_path = 'knowldege_base/data/vector_db'

if os.path.exists(vector_db_path):
    # Only the inspection is guarded. The rebuild runs outside the try so a
    # failing build surfaces as its own error instead of being reported as
    # "Error checking vector DB" and attempted a second time.
    try:
        import chromadb
        client = chromadb.PersistentClient(path=vector_db_path)
        collections = client.list_collections()
    except Exception as e:
        print(f"‚ö†Ô∏è  Error checking vector DB: {e}")
        print("   Rebuilding...")
        needs_rebuild = True
    else:
        needs_rebuild = not collections
        if needs_rebuild:
            print("‚ö†Ô∏è  Vector DB directory exists but no collections found. Rebuilding...")
        else:
            print(f"‚úÖ Vector database exists with {len(collections)} collection(s)")
            print(f"   Location: {vector_db_path}")
            print("   Skipping rebuild. Delete the directory if you want to rebuild.")

    if needs_rebuild:
        VectorDB.build(force_rebuild=True)
        print("‚úÖ Vector database rebuilt!")
else:
    print("Building vector database on disk...")
    print("This will take a few minutes but will persist across sessions.")
    VectorDB.build(force_rebuild=False)
    print(f"‚úÖ Vector database built: {vector_db_path}")


## Step 5: Initialize RAG Pipeline with Saka-14B


In [None]:
from knowldege_base.rag_staging.rag_qa import RAGQAPipeline

rule = "=" * 80

# Tell the user what the (slow) initialization is about to do.
for banner_line in (
    rule,
    "Initializing Saka-14B RAG Pipeline...",
    rule,
    "This will:",
    "  1. Load the Saka-14B model (~28GB) - may take 10-15 minutes on first run",
    "  2. Load the knowledge base (articles, books, QA pairs)",
    "  3. Load the vector database",
    "  4. Ready to answer questions!",
    rule,
):
    print(banner_line)

# Build the pipeline around Saka-14B with both quantization modes disabled.
build_options = {
    "model_name": "Sakalti/Saka-14B",
    "use_gpu": True,
    "load_in_4bit": False,       # A100 has enough VRAM
    "load_in_8bit": False,
    "max_new_tokens": 512,       # Longer, more complete answers
    "download_to_local": False,  # Don't save to local (Colab disk is limited)
}
rag = RAGQAPipeline.build(**build_options)

# Summarize how the generator is expected to behave.
for banner_line in (
    "\n" + rule,
    "‚úÖ RAG Pipeline Ready!",
    rule,
    "The generator is configured to:",
    "  - Use only Arabic language",
    "  - Avoid Quranic verses (instructed in prompt)",
    "  - Focus on medical/psychological information",
    rule,
):
    print(banner_line)


## Step 5.5: Apply Critical Fixes (Device Mismatch & Quran Filtering)


In [None]:
# Patch the text of rag_qa.py in memory. This part loads the file and applies
# two textual rewrites (prompt-token device placement, and a min_length
# argument for model.generate). Each rewrite that succeeds is recorded in
# `fixes_applied`; `content` holds the possibly-modified source.
import importlib
import sys
import re

rag_qa_path = 'knowldege_base/rag_staging/rag_qa.py'

# Read the whole module source as text
with open(rag_qa_path, 'r', encoding='utf-8') as f:
    content = f.read()

# Human-readable names of the rewrites that actually changed `content`
fixes_applied = []

# Fix 1: Device mismatch - ensure tokens are on model's device.
# Skipped when the replacement line is already present in the file.
if 'model_device = next(model.parameters()).device' not in content:
    # First attempt: whitespace-tolerant regex for the original
    # "move prompt_tokens to self.device" block.
    pattern = r'(if self\.device != "cpu":\s+# Try to move to device, but if using device_map="auto", model\.generate handles it\s+if not self\._using_device_map_auto:\s+prompt_tokens = prompt_tokens\.to\(self\.device\))'
    replacement = '''if self.device != "cpu":
                    # Get the device of the model's first parameter
                    model_device = next(model.parameters()).device
                    prompt_tokens = prompt_tokens.to(model_device)'''
    
    if re.search(pattern, content, re.MULTILINE | re.DOTALL):
        content = re.sub(pattern, replacement, content, flags=re.MULTILINE | re.DOTALL)
        fixes_applied.append("Device mismatch fix")
    else:
        # Second attempt: exact-text match with fixed indentation.
        # If neither attempt matches, nothing is changed and nothing is reported.
        old_code = '''if self.device != "cpu":
                    # Try to move to device, but if using device_map="auto", model.generate handles it
                    if not self._using_device_map_auto:
                        prompt_tokens = prompt_tokens.to(self.device)'''
        new_code = '''if self.device != "cpu":
                    # Get the device of the model's first parameter
                    model_device = next(model.parameters()).device
                    prompt_tokens = prompt_tokens.to(model_device)'''
        if old_code in content:
            content = content.replace(old_code, new_code)
            fixes_applied.append("Device mismatch fix")

# Fix 2: Add min_length for longer answers.
# Skipped when a min_length argument is already passed.
if 'min_length=min_length' not in content:
    pattern = r'(generated_tokens = model\.generate\(\s+prompt_tokens,\s+max_new_tokens=self\.max_new_tokens,)'
    # NOTE(review): the inserted code reads `input_length`, which this cell
    # does not define - presumably it exists at the patched location in
    # rag_qa.py; confirm before relying on this fix.
    replacement = '''# Calculate minimum length to ensure substantial answers
                    min_length = input_length + max(50, int(self.max_new_tokens * 0.4))
                    
                    generated_tokens = model.generate(
                        prompt_tokens,
                        max_new_tokens=self.max_new_tokens,
                        min_length=min_length,'''
    if re.search(pattern, content):
        content = re.sub(pattern, replacement, content)
        fixes_applied.append("Minimum length for longer answers")

# Fix 3: Add Quran filtering.
# Inserts a block of source text into `content` immediately before the
# "# Skip chunks with obviously corrupted text" marker. The inserted code
# skips a chunk when any of three regexes matches its text.
#
# The diacritics pattern must be a plain alternation of whole words. Written
# as a character class ("[word1|word2|...]") it matches any SINGLE character
# from those words, which flags almost every chunk as Quranic.
diacritic_words = 'ÿ®ŸéÿßÿØŸéÿ™Ÿê€å|ÿ≥Ÿé€åŸéÿØ€°ÿÆŸèŸÑŸèŸàŸÜŸé|ÿ¨ŸéŸáŸéŸÜŸéŸëŸÖŸé|ÿØŸéÿßÿÆŸêÿ±Ÿê€åŸÜŸé'

# Repair a file that was already patched with the character-class form, since
# the insertion below is skipped once the filter is present.
broken_diacritics = "quran_diacritics = r'[" + diacritic_words + "]'"
if broken_diacritics in content:
    content = content.replace(
        broken_diacritics,
        "quran_diacritics = r'" + diacritic_words + "'",
    )
    fixes_applied.append("Quran diacritics pattern repair")

if 'is_quranic =' not in content or 'quran_verse_pattern' not in content:
    # This is a non-raw string, so each "\\" below is written to the file as a
    # single backslash inside the generated raw-string regex.
    # NOTE(review): the inserted lines use a fixed 8/12-space indentation and
    # reference `text` and `i`; confirm both match the insertion point.
    quran_filter = '''        # CRITICAL: Filter out Quranic content to avoid misuse
        import re
        # Pattern 1: Verse markers like [ÿ∫ÿßŸÅÿ± Ÿ¶Ÿ†], [ÿßŸÑÿ®ŸÇÿ±ÿ© Ÿ¢Ÿ•Ÿ•], etc.
        quran_verse_pattern = r'\\[[^\\]]*(?:ÿ∫ÿßŸÅÿ±|ÿßŸÑÿ®ŸÇÿ±ÿ©|ÿßŸÑŸÜÿ≥ÿßÿ°|ÿßŸÑŸÖÿßÿ¶ÿØÿ©|ÿßŸÑÿ£ŸÜÿπÿßŸÖ|ÿßŸÑÿ£ÿπÿ±ÿßŸÅ|ÿßŸÑÿ™Ÿàÿ®ÿ©|ŸäŸàŸÜÿ≥|ŸáŸàÿØ|ŸäŸàÿ≥ŸÅ|ÿ•ÿ®ÿ±ÿßŸáŸäŸÖ|ÿßŸÑŸÜÿ≠ŸÑ|ŸÖÿ±ŸäŸÖ|ÿ∑Ÿá|ÿßŸÑÿ£ŸÜÿ®Ÿäÿßÿ°|ÿßŸÑÿ≠ÿ¨|ÿßŸÑŸÜŸàÿ±|ÿßŸÑŸÅÿ±ŸÇÿßŸÜ|ÿßŸÑÿ¥ÿπÿ±ÿßÿ°|ÿßŸÑŸÜŸÖŸÑ|ÿßŸÑŸÇÿµÿµ|ÿßŸÑÿπŸÜŸÉÿ®Ÿàÿ™|ÿßŸÑÿ±ŸàŸÖ|ŸÑŸÇŸÖÿßŸÜ|ÿßŸÑÿ≥ÿ¨ÿØÿ©|ÿßŸÑÿ£ÿ≠ÿ≤ÿßÿ®|ÿ≥ÿ®ÿ£|ŸÅÿßÿ∑ÿ±|Ÿäÿ≥|ÿßŸÑÿµÿßŸÅÿßÿ™|ÿµ|ÿßŸÑÿ≤ŸÖÿ±|ŸÅÿµŸÑÿ™|ÿßŸÑÿ¥Ÿàÿ±Ÿâ|ÿßŸÑÿ≤ÿÆÿ±ŸÅ|ÿßŸÑÿØÿÆÿßŸÜ|ÿßŸÑÿ¨ÿßÿ´Ÿäÿ©|ÿßŸÑÿ£ÿ≠ŸÇÿßŸÅ|ŸÖÿ≠ŸÖÿØ|ÿßŸÑŸÅÿ™ÿ≠|ÿßŸÑÿ≠ÿ¨ÿ±ÿßÿ™|ŸÇ|ÿßŸÑÿ∞ÿßÿ±Ÿäÿßÿ™|ÿßŸÑÿ∑Ÿàÿ±|ÿßŸÑŸÜÿ¨ŸÖ|ÿßŸÑŸÇŸÖÿ±|ÿßŸÑÿ±ÿ≠ŸÖŸÜ|ÿßŸÑŸàÿßŸÇÿπÿ©|ÿßŸÑÿ≠ÿØŸäÿØ|ÿßŸÑŸÖÿ¨ÿßÿØŸÑÿ©|ÿßŸÑÿ≠ÿ¥ÿ±|ÿßŸÑŸÖŸÖÿ™ÿ≠ŸÜÿ©|ÿßŸÑÿµŸÅ|ÿßŸÑÿ¨ŸÖÿπÿ©|ÿßŸÑŸÖŸÜÿßŸÅŸÇŸàŸÜ|ÿßŸÑÿ™ÿ∫ÿßÿ®ŸÜ|ÿßŸÑÿ∑ŸÑÿßŸÇ|ÿßŸÑÿ™ÿ≠ÿ±ŸäŸÖ|ÿßŸÑŸÖŸÑŸÉ|ÿßŸÑŸÇŸÑŸÖ|ÿßŸÑÿ≠ÿßŸÇÿ©|ÿßŸÑŸÖÿπÿßÿ±ÿ¨|ŸÜŸàÿ≠|ÿßŸÑÿ¨ŸÜ|ÿßŸÑŸÖÿ≤ŸÖŸÑ|ÿßŸÑŸÖÿØÿ´ÿ±|ÿßŸÑŸÇŸäÿßŸÖÿ©|ÿßŸÑÿ•ŸÜÿ≥ÿßŸÜ|ÿßŸÑŸÖÿ±ÿ≥ŸÑÿßÿ™|ÿßŸÑŸÜÿ®ÿ£|ÿßŸÑŸÜÿßÿ≤ÿπÿßÿ™|ÿπÿ®ÿ≥|ÿßŸÑÿ™ŸÉŸàŸäÿ±|ÿßŸÑÿßŸÜŸÅÿ∑ÿßÿ±|ÿßŸÑŸÖÿ∑ŸÅŸÅŸäŸÜ|ÿßŸÑÿßŸÜÿ¥ŸÇÿßŸÇ|ÿßŸÑÿ®ÿ±Ÿàÿ¨|ÿßŸÑÿ∑ÿßÿ±ŸÇ|ÿßŸÑÿ£ÿπŸÑŸâ|ÿßŸÑÿ∫ÿßÿ¥Ÿäÿ©|ÿßŸÑŸÅÿ¨ÿ±|ÿßŸÑÿ®ŸÑÿØ|ÿßŸÑÿ¥ŸÖÿ≥|ÿßŸÑŸÑŸäŸÑ|ÿßŸÑÿ∂ÿ≠Ÿâ|ÿßŸÑÿ¥ÿ±ÿ≠|ÿßŸÑÿ™ŸäŸÜ|ÿßŸÑÿπŸÑŸÇ|ÿßŸÑŸÇÿØÿ±|ÿßŸÑÿ®ŸäŸÜÿ©|ÿßŸÑÿ≤ŸÑÿ≤ŸÑÿ©|ÿßŸÑÿπÿßÿØŸäÿßÿ™|ÿßŸÑŸÇÿßÿ±ÿπÿ©|ÿßŸÑÿ™ŸÉÿßÿ´ÿ±|ÿßŸÑÿπÿµÿ±|ÿßŸÑŸáŸÖÿ≤ÿ©|ÿßŸÑŸÅŸäŸÑ|ŸÇÿ±Ÿäÿ¥|ÿßŸÑŸÖÿßÿπŸàŸÜ|ÿßŸÑŸÉŸàÿ´ÿ±|ÿßŸÑŸÉÿßŸÅÿ±ŸàŸÜ|ÿßŸÑŸÜÿµÿ±|ÿßŸÑŸÖÿ≥ÿØ|ÿßŸÑÿ•ÿÆŸÑÿßÿµ|ÿßŸÑŸÅŸÑŸÇ|ÿßŸÑŸÜÿßÿ≥)[^\\]]*\\]'
        # Pattern 2: Quranic verse markers Ô¥æ and Ô¥ø
        quran_markers = r'[Ô¥æÔ¥ø]'
        # Pattern 3: Common Quranic diacritics patterns
        quran_diacritics = r'ÿ®ŸéÿßÿØŸéÿ™Ÿê€å|ÿ≥Ÿé€åŸéÿØ€°ÿÆŸèŸÑŸèŸàŸÜŸé|ÿ¨ŸéŸáŸéŸÜŸéŸëŸÖŸé|ÿØŸéÿßÿÆŸêÿ±Ÿê€åŸÜŸé'
        
        is_quranic = (
            re.search(quran_verse_pattern, text, re.IGNORECASE) or
            re.search(quran_markers, text) or
            re.search(quran_diacritics, text)
        )
        
        if is_quranic:
            print(f"‚ö† Skipping chunk {i} - contains Quranic content (to avoid misuse)")
            continue
        
        '''
    # Insert before corruption check (first occurrence only)
    if '# Skip chunks with obviously corrupted text' in content and 'is_quranic =' not in content:
        content = content.replace(
            '# Skip chunks with obviously corrupted text',
            quran_filter + '# Skip chunks with obviously corrupted text',
            1
        )
        fixes_applied.append("Quran content filtering")

# Write back if changes were made
if fixes_applied:
    with open(rag_qa_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"‚úÖ Applied fixes: {', '.join(fixes_applied)}")
else:
    # An empty list means either the fixes were already present or the
    # expected code was not found, so the message must not claim success.
    print("‚ÑπÔ∏è  No changes written (fixes already applied, or the expected code was not found)")

# Force reload ALL related modules (only those already imported)
modules_to_reload = [
    'knowldege_base.rag_staging.rag_qa',
    'knowldege_base.rag_staging',
    'knowldege_base.rag_staging.hybrid_retriever',
]

for mod_name in modules_to_reload:
    if mod_name in sys.modules:
        importlib.reload(sys.modules[mod_name])
        print(f"‚úÖ Reloaded: {mod_name}")

# importlib.reload() re-executes the module but does not rebind names that
# were imported from it earlier. Re-import so that `RAGQAPipeline` in this
# notebook refers to the patched class rather than the pre-reload one.
from knowldege_base.rag_staging.rag_qa import RAGQAPipeline

# Also clear any cached bytecode
import os
import py_compile
pyc_files = []
for root, dirs, files in os.walk('knowldege_base'):
    for file in files:
        # Compiled files are the *.pyc entries; `__pycache__` is a directory
        # name and never appears in `files`.
        if file.endswith('.pyc'):
            pyc_files.append(os.path.join(root, file))

for pyc in pyc_files:
    try:
        os.remove(pyc)
    except OSError as e:
        # Best effort: a leftover cache file is not fatal, but say so.
        print(f"   Could not remove {pyc}: {e}")

print("\n‚úÖ All modules reloaded! You MUST re-initialize RAG pipeline now:")
print("   rag = RAGQAPipeline.build(...)")


## Step 5.6: Build Vector Database (Remove In-Memory Fallback)


In [None]:
# Build vector database on disk (removes in-memory fallback).
# Reuses an existing database when chromadb reports at least one collection;
# otherwise builds it. `vector_db` is only assigned when a build runs.
import os
import shutil
from knowldege_base.rag_staging.vector_db import VectorDB

# Check if vector DB exists
vector_db_path = 'knowldege_base/data/vector_db'
if os.path.exists(vector_db_path):
    print(f"Found existing vector database at: {vector_db_path}")
    print("Checking if it's complete...")

    # Only the inspection is guarded. The rebuild runs outside the try so a
    # failing build is not reported as "Error checking vector DB" and then
    # attempted a second time.
    try:
        import chromadb
        client = chromadb.PersistentClient(path=vector_db_path)
        collections = client.list_collections()
    except Exception as e:
        print(f"‚ö†Ô∏è  Error checking vector DB: {e}")
        print("   Rebuilding...")
        needs_rebuild = True
    else:
        needs_rebuild = not collections
        if needs_rebuild:
            print("‚ö†Ô∏è  Vector database directory exists but no collections found")
            print("   Rebuilding...")
        else:
            print(f"‚úÖ Vector database exists with {len(collections)} collection(s)")
            print("   Using existing database (no rebuild needed)")

    if needs_rebuild:
        vector_db = VectorDB.build(force_rebuild=True)
else:
    print("Building vector database on disk...")
    print("This will take a few minutes but will persist across sessions.")
    vector_db = VectorDB.build(force_rebuild=False)

print("\n‚úÖ Vector database ready on disk!")
print(f"   Location: {vector_db_path}")
print("   This will be used instead of in-memory fallback.")
print("\nüí° When you initialize RAG, it will use this disk-based vector DB.")


## Step 5.7: Diagnostic - Check Why Chunks Are Being Filtered


In [None]:
# Diagnostic: Check what's actually in the retrieved chunks.
# Runs one query through the hybrid retriever and, for the first three hits,
# prints a preview plus which Quran-detection patterns match the chunk text.
import re

from knowldege_base.rag_staging.hybrid_retriever import HybridKBRetriever

# Initialize retriever
retriever = HybridKBRetriever.build()

# Test query
test_query = "ÿ£ÿπÿßŸÜŸä ŸÖŸÜ ÿßŸÑŸÇŸÑŸÇ ŸàÿßŸÑÿ™Ÿàÿ™ÿ± ÿßŸÑŸÖÿ≥ÿ™ŸÖÿ±ÿå ŸÖÿß ŸáŸä ÿ∑ÿ±ŸÇ ÿßŸÑÿ™ÿπÿßŸÖŸÑ ŸÖÿπŸáÿü"

# Get top 5 chunks
results = retriever.search(test_query, top_k=5)

# Compiled once, outside the loop. The verse-marker pattern here only covers
# two surah names; the filter injected by Step 5.5 uses a longer list.
verse_marker_re = re.compile(r'\[[^\]]*(?:ÿ∫ÿßŸÅÿ±|ÿßŸÑÿ®ŸÇÿ±ÿ©)[^\]]*\]', re.IGNORECASE)
quran_marker_re = re.compile(r'[Ô¥æÔ¥ø]')
diacritics_re = re.compile(r'ÿ®ŸéÿßÿØŸéÿ™Ÿê€å|ÿ≥Ÿé€åŸéÿØ€°ÿÆŸèŸÑŸèŸàŸÜŸé|ÿ¨ŸéŸáŸéŸÜŸéŸëŸÖŸé|ÿØŸéÿßÿÆŸêÿ±Ÿê€åŸÜŸé')

print("=" * 80)
print("DIAGNOSTIC: Checking Retrieved Chunks")
print("=" * 80)

for i, result in enumerate(results[:3], 1):  # Check first 3
    chunk = result.get("chunk", {})
    full_text = chunk.get("text", "")
    text = full_text[:500]  # First 500 chars, for display only
    kb_family = chunk.get("kb_family", "unknown")

    print(f"\nChunk {i} ({kb_family}):")
    print(f"Score: {result.get('score_hybrid', 0):.4f}")
    print(f"Text preview: {text}...")

    # Match against the whole chunk text, not just the preview, so markers
    # beyond the first 500 characters are not missed.
    has_verse_marker = bool(verse_marker_re.search(full_text))
    has_markers = bool(quran_marker_re.search(full_text))
    has_diacritics = bool(diacritics_re.search(full_text))

    print(f"  Has verse marker: {has_verse_marker}")
    print(f"  Has Quran markers: {has_markers}")
    print(f"  Has diacritics: {has_diacritics}")
    # The Step 5.5 filter skips a chunk when ANY of its three patterns
    # matches, so the diacritics check belongs in this verdict too.
    print(f"  Would be filtered: {has_verse_marker or has_markers or has_diacritics}")

print("\n" + "=" * 80)


## Step 6: Test Single Query


In [None]:
# Ask the pipeline one sample question and print the answer together with
# the retrieval statistics exposed on the result object.
test_query = "ÿ£ÿπÿßŸÜŸä ŸÖŸÜ ÿßŸÑŸÇŸÑŸÇ ŸàÿßŸÑÿ™Ÿàÿ™ÿ± ÿßŸÑŸÖÿ≥ÿ™ŸÖÿ±ÿå ŸÖÿß ŸáŸä ÿ∑ÿ±ŸÇ ÿßŸÑÿ™ÿπÿßŸÖŸÑ ŸÖÿπŸáÿü"

print(f"Question: {test_query}\n")
print("Generating answer...\n")

answer_options = {"top_k": 5, "relevance_threshold": 0.5}
result = rag.answer(query=test_query, **answer_options)

rule = "=" * 80
print(rule)
print("ANSWER:")
print(rule)
print(result.answer)
print("\n" + rule)
print(f"Used KB: {result.used_kb}")
print(f"Top Score: {result.top_score:.4f}")
print(f"Avg Top Score: {result.avg_top_score:.4f}")


## Step 7: Batch Processing (Optional)

Process multiple questions from a file. (The batch-processing code cell appears after the Step 6.5 diagnostic cells below.)


## Step 6.5: Verify QA Pairs Are Loaded (Diagnostic)


In [None]:
# Diagnostic: confirm the QA-pair file is on disk and that the loader
# returns documents of every family (article / book / qa_pair).
from knowldege_base.rag_staging import loader

import os

# Report whether the QA source file is present at the expected location
qa_path = "/content/MentalQA_PLM/MentalQA_PLM/knowldege_base/data/processed/shifaa_qa_pairs_all.jsonl"
print(f"QA file exists: {os.path.exists(qa_path)}")
print(f"QA file path: {qa_path}")

# Load all three document sources through the project loader
source_files = {
    "articles_filename": "articles_all.jsonl",
    "books_filename": "books_all_ragclean.jsonl",
    "qa_filename": "shifaa_qa_pairs_all.jsonl",
}
docs = loader.load_all_documents(**source_files)


def _count_family(family):
    """Return how many loaded documents carry the given kb_family value."""
    return sum(1 for doc in docs if doc.get("kb_family") == family)


article_count = _count_family("article")
book_count = _count_family("book")
qa_count = _count_family("qa_pair")

print("\nLoaded documents:")
print(f"  Articles: {article_count}")
print(f"  Books: {book_count}")
print(f"  QA Pairs: {qa_count}")
print(f"  Total: {len(docs)}")

# Show the first QA pair, if any, to check its fields look sane
qa_samples = [doc for doc in docs if doc.get("kb_family") == "qa_pair"][:3]
if not qa_samples:
    print("\n‚ö†Ô∏è  No QA pairs found in loaded documents!")
else:
    first_qa = qa_samples[0]
    print("\nSample QA pair:")
    print(f"  doc_id: {first_qa.get('doc_id')}")
    print(f"  has clean_text: {'clean_text' in first_qa}")
    print(f"  clean_text length: {len(first_qa.get('clean_text', ''))}")
    print(f"  question: {first_qa.get('question', '')[:100]}...")


In [None]:
# Verify QA chunks are actually in the kb_chunks.jsonl file.
# Tallies kb_family over the first 1000 lines; if no QA chunk shows up in
# that sample, scans the whole file and reports the first QA chunk found.
import json

chunks_file = "/content/MentalQA_PLM/MentalQA_PLM/knowldege_base/data/processed/kb_chunks.jsonl"

qa_chunks_in_file = 0
article_chunks_in_file = 0
book_chunks_in_file = 0

# Sample first 1000 chunks to check
with open(chunks_file, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 1000:  # Sample first 1000
            break
        # Blank lines are not valid JSON; skip them, as the batch-processing
        # cell in this notebook does when reading JSONL.
        if not line.strip():
            continue
        chunk = json.loads(line)
        kb_family = chunk.get("kb_family", "")
        if kb_family == "qa_pair":
            qa_chunks_in_file += 1
        elif kb_family == "article":
            article_chunks_in_file += 1
        elif kb_family == "book":
            book_chunks_in_file += 1

print(f"Chunks in file (first 1000 sampled):")
print(f"  QA Pairs: {qa_chunks_in_file}")
print(f"  Articles: {article_chunks_in_file}")
print(f"  Books: {book_chunks_in_file}")

# Count total QA chunks
if qa_chunks_in_file == 0:
    # The sample may simply precede the QA section, so scan everything.
    print("\n‚ö†Ô∏è  No QA chunks found! Checking all chunks...")
    total_qa = 0
    with open(chunks_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            chunk = json.loads(line)
            if chunk.get("kb_family") == "qa_pair":
                total_qa += 1
                if total_qa == 1:
                    print(f"\nFound first QA chunk:")
                    print(f"  chunk_id: {chunk.get('chunk_id')}")
                    print(f"  doc_id: {chunk.get('parent_doc_id')}")
                    print(f"  text preview: {chunk.get('text', '')[:200]}...")
    print(f"\nTotal QA chunks in file: {total_qa}")
else:
    print(f"\n‚úÖ QA chunks are present in the file!")


In [None]:
import json
from tqdm import tqdm

# Example: Process questions from a JSONL file
def process_batch(input_file, output_file, rag_pipeline, top_k=5, relevance_threshold=0.5):
    """Answer every question in a JSONL file and save the answers as JSON.

    Args:
        input_file: Path to a JSONL file with one JSON object per line. The
            question is read from the 'text' key, falling back to 'question';
            the identifier is read from 'id'. Blank lines are skipped.
        output_file: Path of the JSON file to write (a single list of records).
        rag_pipeline: Object exposing
            ``answer(query=..., top_k=..., relevance_threshold=...)`` whose
            result has ``answer``, ``used_kb``, ``top_score`` and
            ``avg_top_score`` attributes.
        top_k: Forwarded to ``rag_pipeline.answer`` for every question.
        relevance_threshold: Forwarded to ``rag_pipeline.answer`` for every
            question.

    Returns:
        The list of result dictionaries that was written to ``output_file``.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        questions = [json.loads(line) for line in f if line.strip()]

    results = []
    for q_data in tqdm(questions, desc="Processing"):
        question = q_data.get('text', q_data.get('question', ''))
        q_id = q_data.get('id', '')

        result = rag_pipeline.answer(
            query=question,
            top_k=top_k,
            relevance_threshold=relevance_threshold,
        )

        results.append({
            'id': q_id,
            'question': question,
            'answer': result.answer,
            'used_kb': result.used_kb,
            'top_score': result.top_score,
            'avg_top_score': result.avg_top_score,
        })

    # Save results; ensure_ascii=False keeps non-ASCII answers readable
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n‚úÖ Processed {len(results)} questions. Saved to {output_file}")
    return results

# Uncomment to use:
# process_batch('data/qtypes/test.jsonl', 'saka14b_answers.json', rag)


## Step 8: Download Results (Optional)

Download generated answers from Colab


In [None]:
from google.colab import files

# Download results file
# files.download('saka14b_answers.json')


## Troubleshooting

### Out of Memory (OOM)
- Set `load_in_4bit=True` or `load_in_8bit=True` in the `RAGQAPipeline.build(...)` call (Step 5)
- Reduce `max_new_tokens`
- Restart runtime and try again

### Import Errors
- Ensure `colab_files/` directory is uploaded
- Check that `knowldege_base/rag_staging/__init__.py` exists
- Verify Python path includes the directory

### Vector DB Not Found
- Upload the `knowldege_base/data/vector_db/` directory if you have it
- Or rebuild it by re-running Step 4 (which calls `VectorDB.build(...)` from the `vector_db` module)

### Slow Generation
- Normal for 14B model (even on A100)
- First generation is slower (model loading)
- Subsequent queries should be faster
