# Experiment 1: Baseline (No RAG)

In [None]:
# Setup
import sys
import json
from pathlib import Path
from typing import Dict, List, Any
from dataclasses import dataclass, asdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Add '..' to the module search path. The later `from common import ...`
# presumably relies on this — confirm where common.py lives.
sys.path.append('..')
# Plot defaults for the whole notebook: seaborn 'whitegrid' style and a
# 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("Imports loaded")

In [None]:
# Configuration

# Generation parameters; generate_baseline uses these as its defaults.
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7

# Where the model weights are read from and where results are written.
OUTPUT_DIR = Path("../evaluation/experiment_01")
MODEL_PATH = Path("/home/sskaplun/study/genAI/kaggle/models/gemma-2-9b-it")

# Create the output directory (and any missing parents) up front so later
# cells can write into it without checking.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Model: {MODEL_PATH}")
print(f"Output: {OUTPUT_DIR}")
print(f"CUDA Available: {torch.cuda.is_available()}")

In [None]:
@dataclass
class BaselineResponse:
    """A single answer produced by the model without retrieval context."""

    question: str        # question text the answer was generated for
    answer: str          # decoded model output
    temperature: float   # sampling temperature used for this answer
    answer_length: int   # len(answer), i.e. length in characters

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, in declaration order."""
        field_values = asdict(self)
        return field_values


print("Dataclass defined")

## 1. Load Model

In [None]:
print("="*80)
print("LOADING GEMMA-2-9B-INSTRUCT")
print("="*80)

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
# NOTE(review): Gemma 2 checkpoints are usually run in bfloat16 — confirm
# that float16 is intentional here.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Both loaders receive the model directory as a plain string.
model_dir = str(MODEL_PATH)

tokenizer = AutoTokenizer.from_pretrained(model_dir)
print("Tokenizer loaded")

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
)
print("Model loaded")

# Report the memory PyTorch has allocated on the GPU, converted from bytes.
if torch.cuda.is_available():
    bytes_allocated = torch.cuda.memory_allocated()
    memory_used = bytes_allocated / 1024**3
    print(f"\nGPU Memory: {memory_used:.2f} GB")

## 2. Define System Prompt & Generation

In [None]:
# Instruction text (in Ukrainian) that generate_baseline places in front of
# every question. It casts the model as a mathematics teacher for Ukrainian
# students of grades 10-11, asks for a problem with a step-by-step solution
# written only in Ukrainian with correct terminology, and fixes the answer
# layout: "**Задача:**", "**Розв'язання:**", "**Відповідь:**".
SYSTEM_PROMPT = """Ти — досвідчений викладач математики для українських учнів 10-11 класів.

Твоє завдання:
- Згенерувати математичну задачу з рішенням
- Використовувати ТІЛЬКИ українську мову
- Надати чітке пояснення та крок-за-кроком розв'язання
- Використовувати коректну українську математичну термінологію

Формат відповіді:
**Задача:** [текст задачі]

**Розв'язання:**
[покрокове рішення]

**Відповідь:** [фінальна відповідь]"""

print("System prompt defined")

In [None]:
def generate_baseline(
    question: str,
    temperature: float = TEMPERATURE,
    max_new_tokens: int = MAX_NEW_TOKENS
) -> BaselineResponse:
    """Generate an answer using the LLM only (no RAG context).

    The system prompt and the question are merged into one user message,
    rendered through the tokenizer's chat template and completed by the
    module-level ``model``.

    Args:
        question: Question text inserted after the system prompt.
        temperature: Sampling temperature. A value ``<= 0`` selects greedy
            decoding; in that case no sampling arguments are passed to
            ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A ``BaselineResponse`` with the question, the decoded completion
        (prompt tokens removed, surrounding whitespace stripped), the
        temperature, and the completion length in characters.
    """
    prompt = f"{SYSTEM_PROMPT}\n\nЗАПИТАННЯ:\n{question}\n\nТВОЯ ВІДПОВІДЬ:"

    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # The rendered chat template already contains the special tokens the
    # model expects. Tokenizing with the default add_special_tokens=True
    # would add them a second time (e.g. a duplicated BOS token).
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        add_special_tokens=False
    ).to(model.device)
    prompt_length = inputs['input_ids'].shape[1]

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
        )
    else:
        # Greedy decoding: sampling-only arguments are left out, since they
        # would be ignored and flagged as inconsistent.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + completion; keep only the new tokens.
    answer = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True
    ).strip()

    return BaselineResponse(
        question=question,
        answer=answer,
        temperature=temperature,
        answer_length=len(answer)
    )

print("Generation function defined")

## 3. Test Questions

In [None]:
from common import STANDARD_TEST_QUESTIONS, EVALUATION_DATASET

# The baseline runs over the shared standard question list.
TEST_QUESTIONS = STANDARD_TEST_QUESTIONS
print(f"Test set: {len(TEST_QUESTIONS)} questions")

# Index the evaluation dataset by question text so the expected answer can
# be looked up per response during evaluation.
question_to_expected = dict(
    (entry['input'], entry['expected_answer'])
    for entry in EVALUATION_DATASET
)
print(f"Expected answers loaded for {len(question_to_expected)} questions")

## 4. Run Baseline Experiment

In [None]:
print("="*80)
print("RUNNING BASELINE EXPERIMENT (NO RAG)")
print("="*80)

# One BaselineResponse per test question, in question order.
responses = []
total_questions = len(TEST_QUESTIONS)

for idx, question in enumerate(TEST_QUESTIONS, start=1):
    print(f"\n[{idx}/{total_questions}] {question}")
    print("-"*80)

    response = generate_baseline(question)
    responses.append(response)

    # Echo the answer and its size so progress is visible while running.
    print(f"\n{response.answer}")
    print(f"\nLength: {response.answer_length} chars")

print(f"\n{'='*80}")
print(f"Generated {len(responses)} responses")
print("="*80)

## 5. Evaluation

In [None]:
import common

print("Evaluation functions loaded from common.py")

In [None]:
# Evaluate all responses
print("="*80)
print("EVALUATION")
print("="*80)

evaluations = []

for i, response in enumerate(responses, start=1):
    # Expected answer for this question; None when the question is not in
    # the evaluation dataset.
    expected = question_to_expected.get(response.question)

    # Score the answer; the expected answer is passed for the correctness
    # metric.
    metrics = common.evaluate_baseline(
        response.answer,
        response.answer_length,
        expected_answer=expected,
    )

    record = {
        'question': response.question,
        'metrics': metrics,
        'answer_length': response.answer_length,
        'expected_answer': expected,
    }
    evaluations.append(record)

    # One-line score summary per question.
    score_summary = " | ".join([
        f"Overall: {metrics['overall_score']:.3f}",
        f"Ukrainian: {metrics['ukrainian_ratio']:.3f}",
        f"Correctness: {metrics['correctness']:.3f}",
        f"Structure: {metrics['has_structure']}",
    ])
    print(f"\n{i}. {response.question[:50]}...")
    print(f"   {score_summary}")

# Summary statistics
print(f"\n{'='*80}")
print("SUMMARY")
print("="*80)

# Per-response metric dicts, pulled out once for the averages below.
all_metrics = [entry['metrics'] for entry in evaluations]

avg_metrics = {
    'overall_score': np.mean([m['overall_score'] for m in all_metrics]),
    'ukrainian_ratio': np.mean([m['ukrainian_ratio'] for m in all_metrics]),
    'completeness': np.mean([m['completeness'] for m in all_metrics]),
    # Share of responses whose has_structure flag is truthy.
    'structure_rate': sum(m['has_structure'] for m in all_metrics) / len(all_metrics),
    'correctness': np.mean([m['correctness'] for m in all_metrics]),
}

for metric_name, mean_value in avg_metrics.items():
    print(f"  {metric_name:20s}: {mean_value:.3f}")

## 6. Visualization

In [None]:
# Create dataframe: one row per evaluated response, numbered from 1.
rows = []
for question_num, evaluation in enumerate(evaluations, start=1):
    scores = evaluation['metrics']
    rows.append({
        'question_num': question_num,
        # Shorten the question to 40 characters for a compact table.
        'question': evaluation['question'][:40] + '...',
        'overall': scores['overall_score'],
        'ukrainian': scores['ukrainian_ratio'],
        'structure': int(scores['has_structure']),
        'length': evaluation['answer_length'],
    })

df = pd.DataFrame(rows)

print(df.to_string())

In [None]:
# Target file for the figure and the metrics to plot.
# NOTE(review): presumably create_metrics_visualization saves the figure to
# output_path — confirm in common.py.
metrics_plot_path = OUTPUT_DIR / 'baseline_metrics.png'
plotted_metrics = ['ukrainian_ratio', 'completeness', 'correctness', 'structure_rate']

common.create_metrics_visualization(
    evaluations=evaluations,
    avg_metrics=avg_metrics,
    output_path=metrics_plot_path,
    experiment_name='Baseline',
    metric_names=plotted_metrics,
)

plt.show()

## 7. Save Results

In [None]:
# Everything needed to reproduce or compare this run: experiment metadata,
# averaged metrics, raw responses and per-response evaluations.
serialized_responses = [resp.to_dict() for resp in responses]
results = {
    'experiment': 'baseline_no_rag',
    'description': 'LLM-only generation without retrieval context',
    'model': 'gemma-2-9b-it',
    'temperature': TEMPERATURE,
    'avg_metrics': avg_metrics,
    'responses': serialized_responses,
    'evaluations': evaluations,
}

# ensure_ascii=False keeps the Ukrainian text readable in the JSON file.
results_path = OUTPUT_DIR / 'results.json'
with results_path.open('w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

csv_path = OUTPUT_DIR / 'evaluation.csv'
df.to_csv(csv_path, index=False)

print(f"Results saved to {OUTPUT_DIR}")
print("\n" + "="*80)
print("EXPERIMENT 1 COMPLETE")
print("="*80)
print(f"\nOverall Score: {avg_metrics['overall_score']:.3f}")
print(f"Ukrainian Ratio: {avg_metrics['ukrainian_ratio']:.3f}")
print(f"Correctness: {avg_metrics['correctness']:.3f}")
print(f"Structure Rate: {avg_metrics['structure_rate']:.3f}")