# Talk to LLMs & evaluate them on a simple benchmark

Adapted from https://github.com/simecek/MiniCzechBenchmark/blob/main/minicz_bench.ipynb

In [None]:
# parameters
import os  # imported here because this cell runs before the notebook's import cell

MODEL = 'unsloth/gemma-2-2b' # hf hub model, e.g. mistralai/Mistral-7B-Instruct-v0.3

# HF token needed to access gated models. It is read from the HF_TOKEN
# environment variable so the secret never has to be typed into (and saved
# with) the notebook; falls back to '' when the variable is not set.
HF_TOKEN = os.environ.get('HF_TOKEN', '')

## Text-generation Pipeline

In [None]:
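# Uncomment the two lines below to log in to the Hugging Face Hub with HF_TOKEN
# (only needed when MODEL is a gated model).
#from huggingface_hub import login
#login(HF_TOKEN)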

In [None]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline

# The pipeline below samples (do_sample=True), so seed torch's global RNG once
# here. Without a fixed seed every Restart & Run All produces different
# generations and therefore different benchmark scores.
# NOTE(review): results may still differ across hardware / library versions.
SEED = 42
torch.manual_seed(SEED)

tok = AutoTokenizer.from_pretrained(MODEL)  # needed by granite models

# Text-generation pipeline for MODEL: weights loaded in bfloat16, device
# placement left to device_map="auto", sampling at temperature 0.7.
pipe = pipeline(
    "text-generation",
    model=MODEL,
    tokenizer=tok,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    do_sample=True,
    temperature=0.7,
    pad_token_id=tok.eos_token_id,
)

# Explicitly set pad_token_id to eos_token_id to prevent the warning
pipe.model.config.pad_token_id = pipe.model.config.eos_token_id

In [None]:
# Smoke test: generate up to 50 new tokens for a short Czech prompt
# ("Tell me a joke") and show the prompt together with the continuation.
output = pipe(
    "Řekni mi vtip",
    max_new_tokens=50,
    return_full_text=True,
)

output[0]["generated_text"]

## Evaluation

In [None]:
# Generation budget (max_new_tokens) per benchmark; every benchmark currently
# allows 2 new tokens.
MAX_NEW_TOKENS = dict.fromkeys(('agree', 'czech_news', 'klokanek', 'ctkfacts'), 2)

def message_function(user_prompts, system_prompts):
    """Build one plain-text prompt per example.

    Each prompt is the system prompt, a blank line, then the user prompt.
    The two inputs are paired positionally; pairing stops at the shorter one.

    Args:
        user_prompts: iterable of user prompts.
        system_prompts: iterable of system prompts, in the same order.

    Returns:
        list[str]: the combined prompts.
    """
    merged = []
    for user, system in zip(user_prompts, system_prompts):
        merged.append("{}\n\n{}".format(system, user))
    return merged

def cleaning_function(raw_outputs):
    """Reduce each raw generation to a bare answer label.

    For every item, the first three characters of the first candidate's
    'generated_text' are kept, ')' and '.' are removed, and surrounding
    whitespace is stripped.

    Stripping happens after the punctuation is removed, so outputs such as
    '1 )' or ') 1' become '1' instead of keeping a stray space that would
    make the answer count as invalid.

    Args:
        raw_outputs: iterable whose items are indexable, with item[0] being a
            mapping that has a 'generated_text' string (presumably the
            pipeline's per-prompt list of candidates — verify against caller).

    Returns:
        list[str]: one cleaned answer per item, in input order.
    """
    cleaned = []
    for candidates in raw_outputs:
        head = candidates[0]['generated_text'][:3]
        cleaned.append(head.replace(")", "").replace(".", "").strip())
    return cleaned

# Benchmark name -> dataset repository id passed to load_dataset.
DATASETS = dict(
    agree='simecek/mini_agree',
    czech_news='simecek/mini_czech_news',
    klokanek='simecek/mini_klokanek',
    ctkfacts='simecek/mini_ctkfacts',
)

In [None]:
# Result stores, each keyed by dataset name:
#   raw_outputs   - raw outputs from llm
#   clean_outputs - after cleaning
#   dfs           - dataframe comparing clean_outputs to correct answers
#   metrics       - overall summaries for each dataset
raw_outputs, clean_outputs, dfs, metrics = {}, {}, {}, {}

### Czech News benchmark - 200 questions

In [None]:
# Pick the benchmark for this section and load the dataset registered for it
# in DATASETS.
dataset_name = "czech_news"

dt = load_dataset(path=DATASETS[dataset_name])

In [None]:
# First 300 characters of the first example's system prompt.
print(dt["train"][0]["system_prompt"][:300])

In [None]:
# User prompt of the first example.
print(dt["train"][0]["user_prompt"])

In [None]:
# Reference label ('category') of the first example.
print(dt["train"][0]["category"])

In [None]:
# One combined prompt per example of the train split.
messages = message_function(
    user_prompts=dt["train"]["user_prompt"],
    system_prompts=dt["train"]["system_prompt"],
)

In [None]:
# Dry run on the first five prompts before generating for the whole benchmark.
tmp = pipe(
    messages[:5],
    max_new_tokens=MAX_NEW_TOKENS[dataset_name],
    return_full_text=False,
)
tmp

In [None]:
# Generate a continuation for every prompt; only the newly generated text is
# returned (return_full_text=False).
raw_outputs[dataset_name] = pipe(
    messages,
    max_new_tokens=MAX_NEW_TOKENS[dataset_name],
    return_full_text=False,
)

In [None]:
# Reduce each raw generation to a bare answer label.
clean_outputs[dataset_name] = cleaning_function(
    raw_outputs=raw_outputs[dataset_name],
)

In [None]:
# Pair each reference label ('category') with the model's cleaned answer,
# one row per example.
dfs[dataset_name] = pd.DataFrame({
    'correct_answer': dt['train']['category'],
    'answer': clean_outputs[dataset_name],
})

dfs[dataset_name]

In [None]:
# valid:   the answer is one of the five allowed labels '1'..'5'
# correct: the answer equals the reference label once both are stringified
dfs[dataset_name]['valid'] = dfs[dataset_name]['answer'].isin(list('12345'))
dfs[dataset_name]['correct'] = (
    dfs[dataset_name]['answer'].map(str)
    .eq(dfs[dataset_name]['correct_answer'].map(str))
)

In [None]:
# (share of valid answers, share of correct answers) for this dataset
metrics[dataset_name] = tuple(
    dfs[dataset_name][column].mean().item() for column in ('valid', 'correct')
)
metrics[dataset_name]

Now try to improve the results:
* use `unsloth/gemma-2-2b`
* turn off sampling (`do_sample=False`) and set `temperature` to `0`