# Experiment 4: RAG + Tool Use

In [None]:
# Setup
import sys
import json
import os
from pathlib import Path
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Wolfram Alpha
import requests

# RAG
import chromadb
from chromadb.utils import embedding_functions

# LLM
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Make the parent directory importable. Presumably this is where the
# project-local `common` module imported later in the notebook lives -- verify.
sys.path.append('..')
# Plot defaults for any figures produced in this notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

print("Imports loaded")

In [None]:
# Configuration
# Relative paths are resolved against the notebook's working directory.
DB_PATH = Path("../data/vector_db")
MODEL_PATH = Path("/home/sskaplun/study/genAI/kaggle/models/gemma-2-9b-it")
OUTPUT_DIR = Path("../evaluation/experiment_04")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

COLLECTION_NAME = "ukrainian_math"
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Wolfram Alpha API
# SECURITY: the App ID is a credential and must not be committed with the
# notebook. It is read from the WOLFRAM_APP_ID environment variable; when the
# variable is unset or blank the notebook falls back to 'DEMO' mode, in which
# the Wolfram tool only returns simulated, unsuccessful responses.
# NOTE(review): the App ID that used to be hard-coded here should be treated
# as leaked and rotated.
WOLFRAM_APP_ID = os.environ.get('WOLFRAM_APP_ID', '').strip() or 'DEMO'

# RAG parameters
TOP_K = 5               # number of chunks retrieved per question
TEMPERATURE = 0.7       # sampling temperature passed to model.generate
MAX_NEW_TOKENS = 600    # generation length cap passed to model.generate

print(f"Wolfram API: {'Configured' if WOLFRAM_APP_ID != 'DEMO' else 'DEMO mode (limited)'}")
print(f"CUDA: {torch.cuda.is_available()}")

In [None]:
@dataclass
class ToolCall:
    """Record of a single external tool invocation and its outcome."""
    tool_name: str  # tool identifier, e.g. "wolfram_alpha"
    query: str  # query text that was sent to the tool
    result: str  # tool output on success, otherwise an error/diagnostic message
    success: bool  # True only when the tool returned a usable result

@dataclass
class RAGToolResponse:
    """Outcome of one RAG + tool-use run for a single question."""
    question: str
    answer: str
    citations: List[str]
    tool_calls: List[ToolCall]  # one record per tool invocation
    avg_relevance: float
    answer_length: int
    verified: bool  # whether the answer was marked as computationally verified

    def to_dict(self):
        """Return a plain dict of the response, with each ToolCall expanded to a dict."""
        serialized_calls = list(map(asdict, self.tool_calls))
        return dict(
            question=self.question,
            answer=self.answer,
            citations=self.citations,
            tool_calls=serialized_calls,
            avg_relevance=self.avg_relevance,
            answer_length=self.answer_length,
            verified=self.verified,
        )


print("Dataclasses defined")

## 1. Load Vector Database

In [None]:
# Banner for the vector-database loading step.
print("=" * 80, "LOADING VECTOR DATABASE", "=" * 80, sep="\n")

# Open the Chroma client at DB_PATH, then fetch the existing collection with
# the sentence-transformer embedding function attached to it.
client = chromadb.PersistentClient(path=str(DB_PATH))
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL,
)
collection = client.get_collection(
    name=COLLECTION_NAME, embedding_function=embedding_function
)

print("Collection loaded: {:,} chunks".format(collection.count()))

## 2. Load LLM

In [None]:
# Banner for the model loading step.
print("=" * 80, "LOADING LLM", "=" * 80, sep="\n")

# 4-bit NF4 quantization with double quantization and float16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer and model are both loaded from the same local directory.
_model_dir = str(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(_model_dir)
model = AutoModelForCausalLM.from_pretrained(
    _model_dir,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

print("Model loaded")

## 3. Wolfram Alpha Tool

In [None]:
def query_wolfram_alpha(query: str, timeout: int = 10) -> ToolCall:
    """
    Query the Wolfram Alpha ``/v1/result`` endpoint for a computation.

    The function never raises for network problems: every outcome is reported
    as a ToolCall so the calling pipeline can keep running.

    Args:
        query: Natural language or symbolic math query
        timeout: Request timeout in seconds

    Returns:
        ToolCall with ``success=True`` and the response text on HTTP 200;
        otherwise ``success=False`` with a diagnostic message (DEMO mode,
        empty query, non-200 status, or a request failure).
    """
    def _outcome(result: str, success: bool) -> ToolCall:
        # Single place that fills in the fields shared by every return path.
        return ToolCall(
            tool_name="wolfram_alpha",
            query=query,
            result=result,
            success=success
        )

    if WOLFRAM_APP_ID == 'DEMO':
        # DEMO mode: simulate responses
        return _outcome(
            f"DEMO: Would compute '{query}' (set WOLFRAM_APP_ID for real queries)",
            False
        )

    if not query or not query.strip():
        # Nothing to compute; avoid spending an API call on a blank query.
        return _outcome("Error: empty query", False)

    try:
        # Wolfram Alpha Simple API (HTTPS required)
        response = requests.get(
            "https://api.wolframalpha.com/v1/result",
            params={'appid': WOLFRAM_APP_ID, 'i': query},
            timeout=timeout
        )
    except requests.RequestException as e:
        # requests error messages can embed the full request URL, including
        # the ``appid`` query parameter. Redact it so the credential does not
        # end up in ToolCall.result (which is later written to results.json).
        message = str(e)
        if WOLFRAM_APP_ID:
            message = message.replace(WOLFRAM_APP_ID, "***")
        return _outcome(f"Exception: {message}", False)

    if response.status_code == 200:
        return _outcome(response.text, True)
    return _outcome(f"Error {response.status_code}: {response.text}", False)

print("Wolfram Alpha tool defined")

In [None]:
# Smoke-test the Wolfram Alpha tool with a few sample queries
test_queries = [
    "integrate x^2",
    "solve x^2 + 5x + 6 = 0",
    "volume of sphere with radius 5",
]

print("Testing Wolfram Alpha API:", "=" * 80, sep="\n")
for query in test_queries:
    result = query_wolfram_alpha(query)
    # One combined write per query; the emitted text is unchanged.
    print(
        f"\nQuery: {query}\n"
        f"Success: {result.success}\n"
        f"Result: {result.result}"
    )

## 4. RAG + Tool Pipeline

In [None]:
def retrieve_chunks(query: str, k: int = TOP_K) -> tuple:
    """Retrieve the top-k chunks for *query* from the vector DB.

    Args:
        query: Text used for the similarity search.
        k: Number of results requested from the collection.

    Returns:
        A ``(context, citations, avg_relevance)`` tuple:
        - context: retrieved chunks, each prefixed with a numbered source
          header, joined by blank lines (empty string when nothing matched);
        - citations: one citation string per chunk, in retrieval order;
        - avg_relevance: mean of ``1 - distance`` over the results, or 0.0
          when nothing was retrieved.
    """
    results = collection.query(query_texts=[query], n_results=k)

    # A single query text was sent, so only the first result list is relevant.
    documents = results['documents'][0]
    metadatas = results['metadatas'][0]
    distances = results['distances'][0]

    chunks = []
    citations = []
    for rank, (doc, meta) in enumerate(zip(documents, metadatas), start=1):
        citation = f"[{meta['filename']}, с. {meta['page_start']}-{meta['page_end']}]"
        header = f"[Джерело {rank}] {citation} | Тип: {meta['content_type']}"
        chunks.append(f"{header}\n{doc}")
        citations.append(citation)

    context = "\n\n".join(chunks)

    # np.mean([]) is NaN (and emits a RuntimeWarning); NaN would then flow
    # into the metrics and the saved JSON, so report 0.0 for an empty result.
    # NOTE(review): ``1 - distance`` presumes a metric where smaller means
    # closer and values stay near [0, 1] (e.g. cosine) -- confirm against the
    # collection's configuration.
    if len(distances) > 0:
        avg_relevance = float(np.mean([1 - d for d in distances]))
    else:
        avg_relevance = 0.0

    return context, citations, avg_relevance

print("Retrieval function defined")

In [None]:
# Instruction prompt (in Ukrainian) placed at the start of every generation
# request. It casts the model as a math teacher for grades 10-11, tells it to
# use the textbook context, and asks it to request computations by writing
# `[WOLFRAM: <query>]` markers in its answer. Those markers are what
# extract_wolfram_queries() parses, so the marker syntax in this text must
# stay in sync with that function's regex.
SYSTEM_PROMPT = """Ти — досвідчений викладач математики для українських учнів 10-11 класів.

Ти маєш доступ до інструментів:
1. **Підручники** (контекст нижче) - для теорії та формул
2. **Wolfram Alpha** - для обчислень та перевірки

Твоє завдання:
- Згенерувати математичну задачу з ПЕРЕВІРЕНИМ розв'язанням
- Використовувати формули з підручників
- ОБОВ'ЯЗКОВО використати Wolfram Alpha для перевірки обчислень
- Надати крок-за-кроком розв'язання українською мовою

Як використовувати Wolfram Alpha:
Напиши: [WOLFRAM: твій запит]
Приклад: [WOLFRAM: volume of sphere with radius 5]

Формат відповіді:
**Задача:** [текст задачі]

**Розв'язання:**
1. [крок 1]
2. [крок 2]
[WOLFRAM: обчислення для перевірки]
3. [крок 3]

**Відповідь:** [фінальна відповідь]

ВАЖЛИВО: 
- Використай контекст з підручників
- ОБОВ'ЯЗКОВО використай Wolfram Alpha хоча б раз
- Відповідай ТІЛЬКИ українською"""

print("System prompt defined")

In [None]:
def extract_wolfram_queries(text: str) -> List[str]:
    """Extract Wolfram Alpha queries from ``[WOLFRAM: ...]`` markers in text.

    The keyword is matched case-insensitively and whitespace around the query
    is stripped. Markers whose query is blank are ignored, and empty or None
    text yields an empty list.

    Args:
        text: Model output that may contain ``[WOLFRAM: query]`` markers.

    Returns:
        Non-empty query strings in order of appearance (duplicates kept).
    """
    if not text:
        return []

    pattern = r'\[WOLFRAM:\s*([^\]]+)\]'
    stripped = (m.strip() for m in re.findall(pattern, text, re.IGNORECASE))
    # "[WOLFRAM:   ]" still matches (the group captures a single space), which
    # would otherwise produce an empty query and a pointless API call.
    return [q for q in stripped if q]

def generate_with_llm(prompt: str) -> str:
    """Run one sampled chat completion and return only the newly generated text."""
    # The whole prompt is sent as a single user turn of the chat template.
    chat_text = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )

    encoded = tokenizer(chat_text, return_tensors="pt").to(model.device)
    prompt_len = encoded['input_ids'].shape[1]

    sampling = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling)

    # Drop the prompt tokens so only the completion is decoded.
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

print("Helper functions defined")

In [None]:
def _substitute_tool_result(answer: str, query: str, result: str) -> str:
    """Replace the first ``[WOLFRAM: query]`` marker for *query* with its result."""
    # Extraction matches the keyword case-insensitively and strips whitespace
    # around the query, so the marker must be located with the same tolerance;
    # a literal "[WOLFRAM: {query}]" lookup misses "[wolfram:query]" or
    # "[WOLFRAM: query ]". Only the keyword is case-insensitive, and the query
    # is escaped so math symbols such as ^ + ( ) are matched literally.
    marker = re.compile(r'\[(?i:WOLFRAM):\s*' + re.escape(query) + r'\s*\]')
    replacement = f"[WOLFRAM RESULT: {result}]"
    # A callable replacement keeps backslashes in the tool output literal.
    # count=1 pairs each tool call with exactly one marker, in order.
    return marker.sub(lambda _match: replacement, answer, count=1)


def rag_tool_generate(
    question: str,
    verbose: bool = False
) -> RAGToolResponse:
    """
    RAG + Tool pipeline:
    1. Retrieve context from textbooks
    2. Generate an answer that may contain ``[WOLFRAM: ...]`` tool markers
    3. Execute each extracted Wolfram query
    4. Substitute each marker in the answer with ``[WOLFRAM RESULT: ...]``
       (the model is not called a second time)

    Args:
        question: Question/task passed to retrieval and to the model.
        verbose: When True, print progress for every step.

    Returns:
        RAGToolResponse holding the substituted answer, citations, the list of
        tool calls, average retrieval relevance and the answer length.
        ``verified`` is True when at least one tool call succeeded; it does not
        check that the tool result agrees with the generated answer.
    """
    if verbose:
        print(f"\nQuestion: {question}")
        print("  Step 1: Retrieving context...")

    # 1. Retrieve
    context, citations, avg_relevance = retrieve_chunks(question)

    if verbose:
        print(f"    Retrieved {len(citations)} chunks")
        print("  Step 2: Generating with tool calls...")

    # 2. Generate with tool instructions
    prompt = f"{SYSTEM_PROMPT}\n\nКОНТЕКСТ З ПІДРУЧНИКІВ:\n{context}\n\nЗАПИТАННЯ:\n{question}\n\nТВОЯ ВІДПОВІДЬ:"
    answer = generate_with_llm(prompt)

    if verbose:
        print(f"    Generated {len(answer)} chars")
        print("  Step 3: Extracting tool calls...")

    # 3. Extract and execute Wolfram queries
    wolfram_queries = extract_wolfram_queries(answer)
    tool_calls = []

    if verbose and wolfram_queries:
        print(f"    Found {len(wolfram_queries)} Wolfram queries")

    for wq in wolfram_queries:
        if verbose:
            print(f"      Querying: {wq}")
        tool_call = query_wolfram_alpha(wq)
        tool_calls.append(tool_call)

        # 4. Replace this query's marker with the tool result
        answer = _substitute_tool_result(answer, wq, tool_call.result)

    verified = any(tc.success for tc in tool_calls)

    if verbose:
        print(f"    Verified: {verified}")

    return RAGToolResponse(
        question=question,
        answer=answer,
        citations=citations,
        tool_calls=tool_calls,
        avg_relevance=avg_relevance,
        answer_length=len(answer),
        verified=verified
    )

print("RAG+Tool pipeline defined")

## 5. Test Questions

In [None]:
from common import TOOL_TEST_QUESTIONS, EVALUATION_DATASET

# This experiment runs on the tool-oriented question set.
TEST_QUESTIONS = TOOL_TEST_QUESTIONS
print("Test set: {} questions".format(len(TEST_QUESTIONS)))

# Look-up table from question text to its expected answer
question_to_expected = dict(
    (item['input'], item['expected_answer']) for item in EVALUATION_DATASET
)
print("Expected answers loaded for {} questions".format(len(question_to_expected)))

## 6. Run Experiment

In [None]:
# Run the pipeline over every test question, echoing the answer and tool usage.
print("=" * 80, "RUNNING RAG + TOOL USE EXPERIMENT", "=" * 80, sep="\n")

responses = []
_n_questions = len(TEST_QUESTIONS)

for i, question in enumerate(TEST_QUESTIONS, 1):
    print(f"\n[{i}/{_n_questions}] {question}\n{'-' * 80}")

    response = rag_tool_generate(question, verbose=True)
    responses.append(response)

    print(f"\nAnswer:\n{response.answer}\n\nTool Calls: {len(response.tool_calls)}")
    for tc in response.tool_calls:
        print(f"  - {tc.tool_name}: {tc.query[:50]}... → {tc.success}")
    print(f"\nVerified: {response.verified}")

print(
    "\n" + "=" * 80,
    f"Completed {len(responses)} RAG+Tool responses",
    "=" * 80,
    sep="\n",
)

## 7. Evaluation

In [None]:
# Bind the module name itself: the evaluation cell below calls
# common.evaluate_rag_tools through it.
import common

print("Evaluation functions loaded from common.py")

In [None]:
# Evaluate
print("=" * 80, "EVALUATION", "=" * 80, sep="\n")

evaluations = []

for i, response in enumerate(responses, 1):
    # Questions without a reference answer are scored with expected_answer=None.
    expected_answer = question_to_expected.get(response.question)
    metrics = common.evaluate_rag_tools(
        response.answer,
        response.answer_length,
        response.avg_relevance,
        len(response.tool_calls) > 0,
        response.verified,
        expected_answer,
    )
    evaluations.append(dict(
        question=response.question,
        metrics=metrics,
        answer_length=response.answer_length,
        num_tool_calls=len(response.tool_calls),
    ))

    print(f"\n{i}. {response.question[:50]}...")
    print(f"   Overall: {metrics['overall_score']:.3f} | "
          f"Tools: {metrics['tool_usage']} | "
          f"Verified: {metrics['verified']}")

# Summary
print("\n" + "=" * 80, "SUMMARY", "=" * 80, sep="\n")


def _metric_mean(name):
    """Mean of one per-question metric across all evaluations."""
    return np.mean([e['metrics'][name] for e in evaluations])


def _metric_rate(name):
    """Sum of one per-question metric divided by the number of evaluations."""
    return sum(e['metrics'][name] for e in evaluations) / len(evaluations)


avg_metrics = {
    'overall_score': _metric_mean('overall_score'),
    'retrieval_quality': _metric_mean('retrieval_quality'),
    'ukrainian_ratio': _metric_mean('ukrainian_ratio'),
    'completeness': _metric_mean('completeness'),
    'correctness': _metric_mean('correctness'),
    'structure_rate': _metric_rate('has_structure'),
    'citation_rate': _metric_rate('has_citations'),
    'tool_usage_rate': _metric_rate('tool_usage'),
    'verification_rate': _metric_rate('verified'),
    'avg_tool_calls': np.mean([e['num_tool_calls'] for e in evaluations]),
}

for key, value in avg_metrics.items():
    print("  {:20s}: {:.3f}".format(key, value))


## 8. Save Results

In [None]:
# Record whether the Wolfram tool ran with the 'DEMO' placeholder App ID
# or with a real one.
if WOLFRAM_APP_ID == 'DEMO':
    _wolfram_mode = 'DEMO'
else:
    _wolfram_mode = 'API'

results = dict(
    experiment='rag_with_tools',
    description='RAG + Wolfram Alpha for verified computations',
    model='gemma-2-9b-it',
    tools=['wolfram_alpha'],
    wolfram_mode=_wolfram_mode,
    avg_metrics=avg_metrics,
    responses=[r.to_dict() for r in responses],
    evaluations=evaluations,
)

# ensure_ascii=False keeps the Ukrainian text readable in the output file.
_results_file = OUTPUT_DIR / 'results.json'
with _results_file.open('w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Results saved to {OUTPUT_DIR}")
print("\n" + "=" * 80, "EXPERIMENT 4 COMPLETE", "=" * 80, sep="\n")
print(f"\nOverall Score: {avg_metrics['overall_score']:.3f}")
print(f"Tool Usage Rate: {avg_metrics['tool_usage_rate']:.3f}")
print(f"Verification Rate: {avg_metrics['verification_rate']:.3f}")
print(f"Avg Tool Calls: {avg_metrics['avg_tool_calls']:.1f}")