# Experiment 2: Basic RAG

In [None]:
# Setup
import json
import os
import sys
from collections import Counter
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Any, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import chromadb
from chromadb.utils import embedding_functions

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Put the parent directory on the import path; later cells import the
# project-local `common` module (presumably located there — confirm).
sys.path.append('..')
# Plot defaults applied to every figure created in this notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("Imports loaded")

In [None]:
# Configuration
# Every tunable of the experiment lives in this cell so that a fresh
# "Restart Kernel -> Run All" is driven from a single place.
DB_PATH = Path("../data/vector_db")
# The model location is machine-specific. It can be overridden with the
# RAG_MODEL_PATH environment variable; the fallback is the original path.
MODEL_PATH = Path(
    os.environ.get(
        "RAG_MODEL_PATH",
        "/home/sskaplun/study/genAI/kaggle/models/gemma-2-9b-it",
    )
)
OUTPUT_DIR = Path("../evaluation/experiment_02")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

COLLECTION_NAME = "ukrainian_math"
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# RAG parameters
TOP_K = 5
TEMPERATURE = 0.7
MAX_NEW_TOKENS = 512

# Generation samples tokens (TEMPERATURE > 0), so seed the torch RNG to make
# re-runs on the same machine repeatable (best effort on GPU).
SEED = 42
torch.manual_seed(SEED)

print(f"Database: {DB_PATH}")
print(f"Model: {MODEL_PATH}")
print(f"Top-K: {TOP_K}")
print(f"Seed: {SEED}")
print(f"CUDA: {torch.cuda.is_available()}")

In [None]:
@dataclass
class RetrievedChunk:
    """One document chunk returned by a vector-store query, with its metadata.

    Instances are built in `retrieve_chunks` from the parallel `documents`,
    `metadatas` and `distances` lists of a query result.
    """
    text: str  # chunk body (the stored document text)
    content_type: str  # `content_type` metadata value; shown in the prompt context header
    confidence: float  # `confidence` metadata value (how it is produced is not visible here)
    filename: str  # `filename` metadata value
    page_start: int  # `page_start` metadata value
    page_end: int  # `page_end` metadata value
    distance: float  # distance reported by the query for this chunk
    relevance: float  # computed by the retriever as 1 - distance
    citation: str  # "[<filename>, с. <page_start>-<page_end>]"

@dataclass
class RAGResponse:
    """Result of a single RAG run: the question, the generated answer and
    the retrieval evidence that was placed in the prompt."""
    question: str
    answer: str
    citations: List[str]
    retrieved_chunks: List[RetrievedChunk]
    avg_relevance: float
    answer_length: int

    def to_dict(self):
        """Return a compact summary dict.

        The retrieved chunks themselves are not included; only their count
        is reported under ``num_chunks``.
        """
        copied_fields = (
            'question',
            'answer',
            'citations',
            'avg_relevance',
            'answer_length',
        )
        summary = {name: getattr(self, name) for name in copied_fields}
        summary['num_chunks'] = len(self.retrieved_chunks)
        return summary

print("Dataclasses defined")

## 1. Load Vector Database

In [None]:
print("=" * 80, "LOADING VECTOR DATABASE", "=" * 80, sep="\n")

# Open the on-disk Chroma store.
client = chromadb.PersistentClient(path=str(DB_PATH))

# Query texts are embedded with this sentence-transformers model; it is
# presumably the same model the collection was indexed with — confirm.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)

collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedding_function)

count = collection.count()
metadata = collection.metadata

# One print call, one line per item.
print(
    "\nVector database loaded",
    f"  Collection: {COLLECTION_NAME}",
    f"  Total chunks: {count:,}",
    f"  Metadata: {metadata}",
    sep="\n",
)

## 2. Load LLM

In [None]:
print("=" * 80, "LOADING LLM", "=" * 80, sep="\n")

# 4-bit NF4 weights with double quantisation; compute dtype is float16.
# NOTE(review): Gemma 2 checkpoints are commonly run in bfloat16 — confirm
# that float16 is intended for this model.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

_model_dir = str(MODEL_PATH)

tokenizer = AutoTokenizer.from_pretrained(_model_dir)
print("Tokenizer loaded")

model = AutoModelForCausalLM.from_pretrained(
    _model_dir,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
)
print("Model loaded")

# Report the value of torch.cuda.memory_allocated(), converted from bytes to GiB.
if torch.cuda.is_available():
    memory = torch.cuda.memory_allocated() / 1024 ** 3
    print(f"GPU Memory: {memory:.2f} GB")

## 3. RAG Pipeline

In [None]:
# Instruction text (in Ukrainian) placed at the top of every prompt built by
# `rag_generate`. It is sent inside the single user message rather than as a
# separate system turn. In short, it asks the model to act as a mathematics
# teacher for grades 10-11 and to produce a problem with a step-by-step
# solution, in Ukrainian only, based only on the supplied context, citing
# sources as "[Джерело N]" (the same labels `format_context` writes into the
# context headers), in a fixed Problem / Solution / Answer layout.
SYSTEM_PROMPT = """Ти — досвідчений викладач математики для українських учнів 10-11 класів.

Твоє завдання:
- Згенерувати математичну задачу з розв'язанням на основі ТІЛЬКИ наданого контексту
- Використовувати ТІЛЬКИ українську мову
- Використовувати математичну термінологію з підручників
- Обов'язково посилатися на джерела (наприклад: "За формулою [Джерело 1]...")
- Надати чітке покрокове розв'язання

Формат відповіді:
**Задача:** [текст задачі на основі контексту]

**Розв'язання:**
[покрокове рішення з посиланнями на джерела]

**Відповідь:** [фінальна відповідь]

ВАЖЛИВО: Використовуй ТІЛЬКИ інформацію з наданого контексту!"""

print("System prompt defined")

In [None]:
def retrieve_chunks(query: str, k: int = TOP_K) -> List[RetrievedChunk]:
    """Query the collection for the `k` nearest chunks to `query`.

    Args:
        query: Free-text query; it is sent as the only entry of `query_texts`.
        k: Number of results requested from the collection.

    Returns:
        A list of `RetrievedChunk` objects in the order the query returned them.

    Raises:
        KeyError: if a result's metadata lacks one of the expected fields
            (`content_type`, `confidence`, `filename`, `page_start`, `page_end`).
    """
    hits = collection.query(query_texts=[query], n_results=k)

    # Exactly one query text was sent, so only result row 0 is read.
    rows = zip(hits['documents'][0], hits['metadatas'][0], hits['distances'][0])

    # NOTE(review): `1 - dist` is a bounded similarity only for a cosine-style
    # distance; the collection's metric is not visible here — confirm.
    return [
        RetrievedChunk(
            text=doc,
            content_type=meta['content_type'],
            confidence=meta['confidence'],
            filename=meta['filename'],
            page_start=meta['page_start'],
            page_end=meta['page_end'],
            distance=dist,
            relevance=1 - dist,
            citation=f"[{meta['filename']}, с. {meta['page_start']}-{meta['page_end']}]",
        )
        for doc, meta, dist in rows
    ]

print("Retrieval function defined")

In [None]:
def format_context(chunks: List[RetrievedChunk]) -> str:
    """Render retrieved chunks as the context section of the prompt.

    Each chunk becomes a header line "[Джерело N] <citation> | Тип: <content_type>"
    (N counts from 1) followed by the chunk text; blocks are separated by a
    blank line. An empty list yields an empty string.
    """
    return "\n\n".join(
        f"[Джерело {idx}] {chunk.citation} | Тип: {chunk.content_type}\n{chunk.text}"
        for idx, chunk in enumerate(chunks, start=1)
    )

print("Context formatter defined")

In [None]:
def generate_with_context(
    prompt: str,
    temperature: float = TEMPERATURE,
    max_new_tokens: int = MAX_NEW_TOKENS
) -> str:
    """Generate a completion for `prompt` with the loaded chat model.

    The prompt is wrapped as a single user turn, rendered through the
    tokenizer's chat template and passed to `model.generate`.

    Args:
        prompt: Full prompt text (instructions, context and question).
        temperature: Sampling temperature. A value <= 0 switches to greedy
            decoding, in which case no sampling parameters are passed.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    messages = [{"role": "user", "content": prompt}]
    formatted = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # A rendered chat template normally already contains the special tokens
    # it needs (e.g. BOS). Tokenizing it again with special tokens enabled
    # would duplicate them, so they are switched off here.
    inputs = tokenizer(
        formatted,
        return_tensors="pt",
        add_special_tokens=False
    ).to(model.device)

    do_sample = temperature > 0
    generation_kwargs = {
        'max_new_tokens': max_new_tokens,
        'do_sample': do_sample,
        'pad_token_id': tokenizer.eos_token_id,
    }
    if do_sample:
        # temperature / top_p only apply to sampling; passing them with
        # do_sample=False (e.g. temperature=0) is invalid or ignored.
        generation_kwargs['temperature'] = temperature
        generation_kwargs['top_p'] = 0.9

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + continuation; drop the prompt tokens.
    prompt_length = inputs['input_ids'].shape[1]
    answer = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True
    ).strip()

    return answer

print("Generation function defined")

In [None]:
def rag_generate(
    question: str,
    k: int = TOP_K,
    temperature: float = TEMPERATURE,
    verbose: bool = False
) -> RAGResponse:
    """Complete RAG pipeline: retrieve, format, generate.

    Args:
        question: The user question; also used verbatim as the retrieval query.
        k: Number of chunks to retrieve.
        temperature: Sampling temperature forwarded to `generate_with_context`
            (the token limit uses that function's default).
        verbose: When True, print progress information for each stage.

    Returns:
        A `RAGResponse` with the answer, the citations of all retrieved
        chunks, the chunks themselves, their mean relevance and the answer
        length in characters. If nothing was retrieved, `avg_relevance` is
        0.0 instead of NaN.
    """
    if verbose:
        print(f"Retrieving {k} chunks for: {question}")

    chunks = retrieve_chunks(question, k=k)

    # Computed once and reused. np.mean([]) would return NaN with a
    # RuntimeWarning, and NaN is not valid JSON when results are saved.
    if chunks:
        avg_relevance = float(np.mean([c.relevance for c in chunks]))
    else:
        avg_relevance = 0.0

    if verbose:
        print(f"  Avg relevance: {avg_relevance:.3f}")

    context = format_context(chunks)

    prompt = f"{SYSTEM_PROMPT}\n\nКОНТЕКСТ З ПІДРУЧНИКІВ:\n{context}\n\nЗАПИТАННЯ:\n{question}\n\nТВОЯ ВІДПОВІДЬ:"

    if verbose:
        print(f"  Prompt length: {len(prompt)} chars")
        print("  Generating...")

    answer = generate_with_context(prompt, temperature=temperature)

    if verbose:
        print(f"  Answer: {len(answer)} chars")

    return RAGResponse(
        question=question,
        answer=answer,
        citations=[c.citation for c in chunks],
        retrieved_chunks=chunks,
        avg_relevance=avg_relevance,
        answer_length=len(answer)
    )

print("RAG pipeline defined")

## 4. Test Questions

In [None]:
# `common` is a project-local module; it is imported here rather than in the
# first cell because the import path is extended during setup.
from common import EVALUATION_DATASET, STANDARD_TEST_QUESTIONS

TEST_QUESTIONS = STANDARD_TEST_QUESTIONS
print(f"Test set: {len(TEST_QUESTIONS)} questions")

# Lookup table: question text -> reference answer, consumed by the evaluation cell.
question_to_expected = dict(
    (entry['input'], entry['expected_answer']) for entry in EVALUATION_DATASET
)
print(f"Expected answers loaded for {len(question_to_expected)} questions")

## 5. Run RAG Experiment

In [None]:
print("=" * 80, "RUNNING BASIC RAG EXPERIMENT", "=" * 80, sep="\n")

# One RAGResponse per test question, in question order.
responses = []
_total = len(TEST_QUESTIONS)

for i, question in enumerate(TEST_QUESTIONS, start=1):
    print(f"\n[{i}/{_total}] {question}", "-" * 80, sep="\n")

    response = rag_generate(question, verbose=True)
    responses.append(response)

    print(f"\nAnswer:\n{response.answer}")
    print(f"\nCitations: {len(response.citations)}")
    # Only the first three citations are echoed to keep the log short.
    for rank, cite in enumerate(response.citations[:3], start=1):
        print(f"  {rank}. {cite}")

print(f"\n{'=' * 80}", f"Generated {len(responses)} RAG responses", "=" * 80, sep="\n")

## 6. Evaluation

In [None]:
# Project-local module; the evaluation cell calls `common.evaluate_basic_rag`.
import common

print("Evaluation functions loaded from common.py")

In [None]:
# Evaluate
print("=" * 80, "EVALUATION", "=" * 80, sep="\n")

# One record per response: the question, its metric dict and a few raw values.
evaluations = []

for i, response in enumerate(responses, start=1):
    # Reference answer for this question; None when the dataset has none.
    expected = question_to_expected.get(response.question)

    # The reference answer is passed so the evaluator can score correctness.
    metrics = common.evaluate_basic_rag(
        response.answer,
        response.answer_length,
        response.avg_relevance,
        expected_answer=expected,
    )

    evaluations.append(dict(
        question=response.question,
        metrics=metrics,
        answer_length=response.answer_length,
        avg_relevance=response.avg_relevance,
        expected_answer=expected,
    ))

    print(f"\n{i}. {response.question[:50]}...")
    _score_line = " | ".join([
        f"Overall: {metrics['overall_score']:.3f}",
        f"Retrieval: {metrics['retrieval_quality']:.3f}",
        f"Correctness: {metrics['correctness']:.3f}",
        f"Ukrainian: {metrics['ukrainian_ratio']:.3f}",
    ])
    print(f"   {_score_line}")

# Summary
print(f"\n{'=' * 80}", "SUMMARY", "=" * 80, sep="\n")


def _mean_of(metric_name):
    """Mean of one numeric metric across all evaluated responses."""
    return np.mean([e['metrics'][metric_name] for e in evaluations])


def _rate_of(flag_name):
    """Fraction of evaluated responses for which a boolean metric is set."""
    return sum(e['metrics'][flag_name] for e in evaluations) / len(evaluations)


avg_metrics = {
    'overall_score': _mean_of('overall_score'),
    'retrieval_quality': _mean_of('retrieval_quality'),
    'ukrainian_ratio': _mean_of('ukrainian_ratio'),
    'completeness': _mean_of('completeness'),
    'structure_rate': _rate_of('has_structure'),
    'citation_rate': _rate_of('has_citations'),
    'correctness': _mean_of('correctness'),
}

for metric_name, metric_value in avg_metrics.items():
    print(f"  {metric_name:20s}: {metric_value:.3f}")

## 7. Visualization

In [None]:
# Create DataFrame
def _evaluation_row(number, entry):
    """Flatten one evaluation record into a table row (question cut to 40 chars)."""
    scores = entry['metrics']
    return {
        'question_num': number,
        'question': entry['question'][:40] + '...',
        'overall': scores['overall_score'],
        'retrieval': scores['retrieval_quality'],
        'ukrainian': scores['ukrainian_ratio'],
        'citations': int(scores['has_citations']),
        'length': entry['answer_length'],
    }


# Question numbers are 1-based.
df = pd.DataFrame(
    [_evaluation_row(number, entry) for number, entry in enumerate(evaluations, start=1)]
)

print(df.to_string())

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# Panel 1: overall score per question, with the average as a dashed line.
ax = axes[0]
_mean_overall = avg_metrics['overall_score']
bars = ax.bar(df['question_num'], df['overall'], color='green', alpha=0.7, edgecolor='black')
ax.axhline(y=_mean_overall, color='red', linestyle='--', label=f"Avg: {_mean_overall:.3f}")
ax.set_xlabel('Question')
ax.set_ylabel('Overall Score')
ax.set_title('Basic RAG: Overall Scores', fontweight='bold')
ax.set_ylim(0, 1.0)
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Panel 2: retrieval quality vs overall score, coloured by Ukrainian ratio.
ax = axes[1]
scatter = ax.scatter(
    df['retrieval'],
    df['overall'],
    s=100,
    alpha=0.7,
    c=df['ukrainian'],
    cmap='RdYlGn',
    edgecolors='black',
)
ax.set_xlabel('Retrieval Quality')
ax.set_ylabel('Overall Score')
ax.set_title('Retrieval vs Overall Quality', fontweight='bold')
ax.grid(alpha=0.3)
plt.colorbar(scatter, ax=ax, label='Ukrainian Ratio')

# Panel 3: averaged metrics as labelled bars.
ax = axes[2]
_panel = {
    'Retrieval': avg_metrics['retrieval_quality'],
    'Ukrainian': avg_metrics['ukrainian_ratio'],
    'Citations': avg_metrics['citation_rate'],
    'Structure': avg_metrics['structure_rate'],
}
labels = list(_panel.keys())
metrics_data = list(_panel.values())
colors = ['#4CAF50', '#2196F3', '#FF9800', '#9C27B0']
bars = ax.bar(labels, metrics_data, color=colors, alpha=0.7, edgecolor='black')
ax.set_ylabel('Score')
ax.set_title('Average Metrics', fontweight='bold')
ax.set_ylim(0, 1.0)
ax.grid(axis='y', alpha=0.3)

# Write each value just above its bar, centred horizontally.
for rect, shown in zip(bars, metrics_data):
    x_center = rect.get_x() + rect.get_width() / 2.
    ax.text(x_center, rect.get_height(), f'{shown:.3f}',
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'rag_metrics.png', dpi=150, bbox_inches='tight')
plt.show()

print("Visualization saved")

## 8. Save Results

In [None]:
# Everything needed to compare this run with other experiments.
results = {
    'experiment': 'basic_rag',
    'description': 'Vanilla RAG with semantic search and context injection',
    # Taken from the configured path so the recorded name cannot drift from
    # the model that was actually loaded.
    'model': MODEL_PATH.name,
    'embedding_model': EMBEDDING_MODEL,
    'top_k': TOP_K,
    'temperature': TEMPERATURE,
    # The generation length cap is recorded alongside the other run parameters.
    'max_new_tokens': MAX_NEW_TOKENS,
    'avg_metrics': avg_metrics,
    'responses': [r.to_dict() for r in responses],
    'evaluations': evaluations
}

# ensure_ascii=False keeps the Ukrainian text readable in the JSON file.
with open(OUTPUT_DIR / 'results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

df.to_csv(OUTPUT_DIR / 'evaluation.csv', index=False)

print(f"Results saved to {OUTPUT_DIR}")
print("\n" + "="*80)
print("EXPERIMENT 2 COMPLETE")
print("="*80)
print(f"\nOverall Score: {avg_metrics['overall_score']:.3f}")
print(f"Retrieval Quality: {avg_metrics['retrieval_quality']:.3f}")
print(f"Correctness: {avg_metrics['correctness']:.3f}")
print(f"Ukrainian Ratio: {avg_metrics['ukrainian_ratio']:.3f}")
print(f"Citation Rate: {avg_metrics['citation_rate']:.3f}")