# PDF RAG Parameter Optimization Exploration

This notebook demonstrates the parameter optimization system for RAG with Qdrant cloud integration. The focus is on finding optimal parameters for production RAG systems.

## Setup and Configuration

In [None]:
import sys
import os
from pathlib import Path

# Add project root to path
# The current working directory is treated as the project root and is placed
# first on sys.path. NOTE(review): this presumably requires the notebook to be
# launched from the project root — confirm.
project_root = Path().absolute()
sys.path.insert(0, str(project_root))

# Import our modules
# These imports must stay below the sys.path.insert() call above: they are
# project-local modules that appear to rely on the path entry it adds.
from config import Config
from processor import QdrantProcessor
from simple_rag import QdrantRAG
from parameter_tuning import ParameterTuner

# Echo the resolved root and the first sys.path entry so a wrong working
# directory is visible immediately in the cell output.
print(f"Project root: {project_root}")
print(f"Python path: {sys.path[0]}")

## Environment Validation

In [None]:
# Validate environment variables
# A ValueError raised during validation is reported in the cell output rather
# than aborting the notebook run.
try:
    Config.validate_env_vars()
    print("✅ Environment variables validated")
    # Never echo any part of the API key: cell output is stored inside the
    # notebook file and is easily committed or shared along with it. Report
    # only whether a key is present.
    api_key_status = "configured (hidden)" if Config.OPENAI_API_KEY else "not set"
    print(f"OpenAI API Key: {api_key_status}")
    print(f"Qdrant URL: {Config.QDRANT_URL}")
except ValueError as e:
    print(f"❌ Environment validation failed: {e}")

## PDF Processing

In [None]:
# Build the processor from one explicit settings mapping, then echo the values
# the processor object itself reports.
processor_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "embedding_model": "text-embedding-3-small",
    "collection_name": "pdf_documents_notebook",
}
processor = QdrantProcessor(**processor_settings)

print("Processor initialized with:")
for label, value in (
    ("Chunk size", processor.chunk_size),
    ("Chunk overlap", processor.chunk_overlap),
    ("Embedding model", processor.embedding_model),
    ("Collection", processor.collection_name),
):
    print(f"  {label}: {value}")

In [None]:
# Push the sample PDF through the processor and report the outcome.
pdf_path = Config.DATA_DIR / "random_machine_learing_pdf.pdf"

if not pdf_path.exists():
    # Nothing to process: report the path that was looked up.
    print(f"❌ PDF file not found: {pdf_path}")
else:
    print(f"Processing PDF: {pdf_path}")
    success = processor.process_pdf_to_qdrant(str(pdf_path))

    if not success:
        print("❌ Failed to process PDF")
    else:
        print("✅ PDF processed successfully")
        # Show what the processor reports about the collection after the run.
        collection_info = processor.get_collection_info()
        print(f"Collection info: {collection_info}")

## RAG System Testing

In [None]:
# Create the RAG front end; it is given the same collection name that the
# processor cell uses.
rag_settings = {
    "collection_name": "pdf_documents_notebook",
    "llm_model": "gpt-3.5-turbo",
    "temperature": 0.0,
    "top_k": 5,
}
rag = QdrantRAG(**rag_settings)

print("RAG system initialized")
# Stats are fetched only after the confirmation line has been printed.
collection_stats = rag.get_collection_stats()
print(f"Collection stats: {collection_stats}")

In [None]:
# Sample questions used to exercise the RAG pipeline end to end.
test_questions = [
    "What is machine learning?",
    "What are the types of learning?",
    "What is PAC learning?",
    "Describe the goal of reinforcement learning.",
    "What is entropy in decision trees?"
]

# One answer record per question, kept for the analysis cells further down.
results = []
separator = "=" * 60

for question in test_questions:
    print("\n" + separator)
    print(f"Question: {question}")
    print(separator)

    result = rag.answer_question(question)
    results.append(result)

    print(f"Answer: {result['answer']}")
    print("\nMetrics:")
    print(f"  Retrieved documents: {result['retrieved_documents']}")
    print(f"  Average relevance: {result['average_relevance_score']:.3f}")
    # Only the first two sources are shown to keep the output short.
    print(f"  Sources: {result['sources'][:2]}...")

## Results Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Create results dataframe
# One summary row per answered question: a shortened question label, the
# answer length in characters, and the retrieval metrics from the result.
QUESTION_DISPLAY_CHARS = 50  # maximum question characters shown in the table

results_data = []
for result in results:
    question_label = result['question']
    # Append an ellipsis only when the question was actually cut; adding it
    # unconditionally made short, complete questions look truncated.
    if len(question_label) > QUESTION_DISPLAY_CHARS:
        question_label = question_label[:QUESTION_DISPLAY_CHARS] + '...'

    results_data.append({
        'question': question_label,
        'answer_length': len(result['answer']),
        'retrieved_docs': result['retrieved_documents'],
        'avg_relevance': result['average_relevance_score']
    })

df_results = pd.DataFrame(results_data)
print("Results Summary:")
print(df_results)

In [None]:
# Two side-by-side bar charts, one bar per test question: average relevance
# on the left, retrieved-document count on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

bar_positions = list(range(len(df_results)))
question_labels = [f'Q{number}' for number in range(1, len(df_results) + 1)]

# (dataframe column, panel title, y-axis label) for each panel, left to right
panels = (
    ('avg_relevance', 'Average Relevance Scores', 'Relevance Score'),
    ('retrieved_docs', 'Retrieved Documents Count', 'Document Count'),
)

for ax, (column, title, y_label) in zip(axes, panels):
    ax.bar(bar_positions, df_results[column])
    ax.set_title(title)
    ax.set_xlabel('Question Index')
    ax.set_ylabel(y_label)
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(question_labels)

fig.tight_layout()
plt.show()

## Parameter Tuning (Small Scale)

In [None]:
# Demonstration-sized tuning run: a reduced search space and a small cap on
# the number of parameter combinations that get generated.
MAX_COMBINATIONS = 4  # used for both the config entry and the generator call

tuner = ParameterTuner()

# Replace the tuner's config with the smaller grid below.
tuner.config = {
    "chunk_sizes": [500, 1000],
    "chunk_overlaps": [100, 200],
    "embedding_models": ["text-embedding-3-small"],
    "llm_models": ["gpt-3.5-turbo"],
    "temperatures": [0.0, 0.3],
    "top_k_retrieval": [3, 5],
    "experiment_settings": {
        "max_combinations": MAX_COMBINATIONS,
        "random_seed": 42
    }
}

print("Starting small-scale parameter tuning...")
print(f"Will test {tuner.config['experiment_settings']['max_combinations']} combinations")

# List the combinations the tuner produces, numbered from 1.
combinations = tuner.generate_parameter_combinations(max_combinations=MAX_COMBINATIONS)
print("\nParameter combinations to test:")
for position, combo in enumerate(combinations, start=1):
    print(f"  {position}: {combo}")

## Custom Query Testing

In [None]:
# Interactive query testing
def test_custom_query(question, top_k=5):
    """Run one question through retrieval and answering, printing each stage.

    Args:
        question: The question text passed to the notebook-level ``rag`` object.
        top_k: Forwarded as ``top_k`` to both the retrieval call and the
            answer call.

    Returns:
        Whatever ``rag.answer_question`` returns; its ``'answer'`` entry is
        printed before returning.
    """
    print(f"\n🔍 Testing Query: {question}")
    print("=" * 60)

    # Stage 1: show what the retriever hands back for this question.
    hits = rag.retrieve_documents(question, top_k=top_k)

    print(f"\n📄 Retrieved {len(hits)} documents:")
    for rank, hit in enumerate(hits, start=1):
        print(f"\nDocument {rank} (Score: {hit.score:.3f}):")
        # Content is cut to its first 200 characters for readability.
        print(f"  Content: {hit.content[:200]}...")
        print(f"  Metadata: {hit.metadata}")

    # Stage 2: produce the full answer for the same question and top_k.
    answer_result = rag.answer_question(question, top_k=top_k)

    print("\n💡 Final Answer:")
    print(answer_result['answer'])

    return answer_result

# Test a custom question
# Runs the helper once on a sample question with top_k=3 (instead of its
# default of 5) and keeps the returned value in custom_result.
custom_result = test_custom_query("What is the difference between supervised and unsupervised learning?", top_k=3)

## System Performance Analysis

In [None]:
import time

# Performance testing
# Times one answer_question() call per question and collects the latency
# together with the answer length and retrieval metrics from the result.
performance_questions = [
    "What is machine learning?",
    "Define entropy.",
    "What is reinforcement learning?"
]

performance_results = []

for question in performance_questions:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump when the
    # system time is adjusted, which would corrupt the measured duration.
    start_time = time.perf_counter()
    result = rag.answer_question(question)
    end_time = time.perf_counter()

    performance_results.append({
        'question': question,
        'response_time': end_time - start_time,
        'answer_length': len(result['answer']),
        'retrieved_docs': result['retrieved_documents'],
        'relevance_score': result['average_relevance_score']
    })

perf_df = pd.DataFrame(performance_results)
print("Performance Analysis:")
print(perf_df)
print(f"\nAverage response time: {perf_df['response_time'].mean():.2f} seconds")
print(f"Average relevance score: {perf_df['relevance_score'].mean():.3f}")

## Summary and Next Steps

In [None]:
# Closing recap: headline figures from the earlier cells, then follow-up steps.
print("📊 RAG System Summary")
print("=" * 50)
print("✅ PDF processed and stored in Qdrant")
print("✅ RAG system operational")
print(f"✅ {len(test_questions)} test questions processed")
print(f"✅ Average relevance score: {df_results['avg_relevance'].mean():.3f}")
print(f"✅ Average response time: {perf_df['response_time'].mean():.2f}s")

# Follow-up actions, printed as a numbered list starting at 1.
next_steps = (
    "Run full parameter tuning: python main.py tune",
    "Launch Streamlit app: python main.py app",
    "Try interactive CLI: python main.py query --interactive",
    "Add more PDFs to data/ folder and reprocess",
    "Customize prompts and evaluation metrics",
)

print("\n🚀 Next Steps:")
for step_number, step in enumerate(next_steps, start=1):
    print(f"{step_number}. {step}")