# Mini Czech Benchmark

This notebook evaluates models by running a text generation pipeline over four datasets: `agree`, `czech_news`, `klokanek` and `ctkfacts`, either *full* data or *mini* (random 200 rows) or *tiny* ([selected](https://arxiv.org/abs/2402.14992) 100 rows).

It can be run through [papermill](https://github.com/nteract/papermill) as follows:

```
  model="meta-llama/Llama-3.2-1B-Instruct"  # or other HuggingFace model
  hf_token="** Your HuggingFace read access token **"
  papermill minicz_bench.ipynb output.ipynb -p MODEL "$model" -p HF_TOKEN "$hf_token" -p DATA_SIZE "mini" -p OUTPUT_DIR "" -k python3
```

[MiniCzechBenchmark](https://github.com/simecek/MiniCzechBenchmark) is a small subset of the [CzechBench](https://gitlab.com/jirkoada/czech-bench) benchmark, suited for fast model assessment.

In [None]:
# papermill parameters
# Default values for the notebook's run parameters; the usage example at the
# top of the notebook overrides them with papermill's `-p NAME VALUE` options.

MODEL = 'mistralai/Mistral-7B-Instruct-v0.3' # HuggingFace Hub model id passed to the text-generation pipeline

MESSAGES  = 'simplemessages' # prompt format used by message_function: 'simplemessages', 'justprompt' or 'useronly'

DATA_SIZE = 'mini' # 'full' or 'mini'; 'tiny' is listed but the dataset-selection cell raises for it

OUTPUT_DIR = 'bench_runs'  # folder to export metrics and outputs (None or '' to avoid saving)

PIPELINE_TYPE = 'noSampling'  # label of the pipeline configuration; within this notebook it is only used in RUN_NAME

HF_TOKEN = ''  # HF token needed to access gated models

## Text-generation Pipeline

In [None]:
from huggingface_hub import login
# HF_TOKEN defaults to '' (no token). Only authenticate when a token was
# actually supplied, so runs against public models do not attempt a login
# with an empty credential.
if HF_TOKEN:
    login(HF_TOKEN)

In [None]:
import bitsandbytes
import torch
from transformers import pipeline
from datasets import load_dataset
import pandas as pd

# Generation budget (max_new_tokens) per dataset; every dataset currently
# uses the same limit of two new tokens.
MAX_NEW_TOKENS = dict.fromkeys(
    ('agree', 'czech_news', 'klokanek', 'ctkfacts'),
    2,
)

def message_function(message_strategy, user_prompts, system_prompts):
    """Build one model input per (system prompt, user prompt) pair.

    Args:
        message_strategy: How the two prompts are combined:
            'simplemessages' - chat with a system message and a user message;
            'justprompt'     - plain string "<system>\\n\\n<user>";
            'useronly'       - chat with a single user message holding
                               "<system>\\n\\n<user>".
        user_prompts: Iterable of user prompt strings.
        system_prompts: Iterable of system prompt strings, paired with
            ``user_prompts`` by position (pairing stops at the shorter one).

    Returns:
        A list with one entry per pair: a list of role/content dicts for the
        chat strategies, or a string for 'justprompt'.

    Raises:
        ValueError: If ``message_strategy`` is not one of the supported names.
    """
    if message_strategy == 'simplemessages':
        messages = [[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ] for system_prompt, user_prompt in zip(system_prompts, user_prompts)]
    elif message_strategy == 'justprompt':
        messages = [
            f"{system_prompt}\n\n{user_prompt}"
            for system_prompt, user_prompt in zip(system_prompts, user_prompts)
        ]
    elif message_strategy == 'useronly':
        messages = [[
            {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"},
        ] for system_prompt, user_prompt in zip(system_prompts, user_prompts)]
    else:
        # Raising a bare string is itself a TypeError in Python 3; raise a
        # real exception so the message reaches the user.
        raise ValueError('Message strategy not implemeted')

    return messages

def cleaning_function(raw_outputs):
    """Turn raw pipeline results into short answer strings.

    For every item, the 'generated_text' of its first entry is cut to the
    first three characters, surrounding whitespace is stripped, and any
    ")" and "." characters are removed.
    """
    answers = []
    for generations in raw_outputs:
        head = generations[0]['generated_text'][:3]
        answer = head.strip()
        for unwanted in (")", "."):
            answer = answer.replace(unwanted, "")
        answers.append(answer)
    return answers

# Map each benchmark name to its HuggingFace dataset repository for the
# requested DATA_SIZE.
if DATA_SIZE == "full":
    # NOTE(review): the 'full' size points at the 'small_*' repositories;
    # confirm this is the intended data.
    DATASETS = {
        'agree': 'simecek/small_agree',
        'czech_news': 'simecek/small_czech_news',
        'klokanek': 'simecek/small_klokanek',
        'ctkfacts': 'simecek/small_ctkfacts'}
elif DATA_SIZE == "mini":
    DATASETS = {
        'agree': 'simecek/mini_agree',
        'czech_news': 'simecek/mini_czech_news',
        'klokanek': 'simecek/mini_klokanek',
        'ctkfacts': 'simecek/mini_ctkfacts'}
elif DATA_SIZE == "tiny":
    # Documented option that has no dataset mapping yet. Raise a real
    # exception: raising a bare string is a TypeError in Python 3.
    raise NotImplementedError(f"Data size {DATA_SIZE} not implemeted")
else:
    raise ValueError(f"Data size {DATA_SIZE} not implemeted")

In [None]:
# define the pipeline
# Text-generation pipeline for MODEL, loaded in bfloat16 with automatic device
# placement; sampling is disabled (do_sample=False).

pipe = pipeline(
    "text-generation", 
    model=MODEL, 
    model_kwargs={"torch_dtype": torch.bfloat16}, 
    device_map="auto",
    do_sample=False,
    temperature=0,
    # NOTE(review): the id 2 is hard-coded; whether it equals the EOS token id
    # depends on the chosen MODEL - confirm for models other than the default.
    pad_token_id=2  # TODO: Instead of hard-coded value, find better way to set pad_token_id to eos_token_id 
)

# Explicitly set pad_token_id to eos_token_id to prevent the warning
pipe.model.config.pad_token_id = pipe.model.config.eos_token_id

## Evaluation

In [None]:
# Per-dataset result stores, keyed by dataset name:
#   raw_outputs   - raw outputs from the llm
#   clean_outputs - outputs after cleaning
#   dfs           - dataframe comparing clean_outputs to correct answers
#   metrics       - overall summaries for each dataset
raw_outputs, clean_outputs, dfs, metrics = {}, {}, {}, {}

### AGREE

In [None]:
# Name of the dataset handled by the following cells; it is the key into
# DATASETS, MAX_NEW_TOKENS and the result dictionaries.
dataset_name = 'agree'

# Load the dataset registered under that name in DATASETS.
dt = load_dataset(DATASETS[dataset_name])

In [None]:
# Build one model input per row from the 'user_prompt' and 'system_prompt'
# columns of the 'train' split, formatted according to MESSAGES.
messages = message_function(MESSAGES, dt['train']['user_prompt'], dt['train']['system_prompt'])

In [None]:
# Run the pipeline on the first five inputs only and display the result
# (presumably a quick sanity check before the full run).
tmp = pipe(messages[:5], return_full_text=False, max_new_tokens=MAX_NEW_TOKENS[dataset_name])
tmp

In [None]:
# Run the pipeline over every input and store the unprocessed results.
raw_outputs[dataset_name] = pipe(messages, return_full_text=False, max_new_tokens=MAX_NEW_TOKENS[dataset_name])

In [None]:
# Reduce each raw generation to a short answer string with cleaning_function.
clean_outputs[dataset_name] = cleaning_function(raw_outputs[dataset_name])

In [None]:
# Put the reference values next to the cleaned model answers. The reference
# comes from the 'answer_idx' column; the scoring cell compares the model
# answer with this value plus one.
dfs[dataset_name] = pd.DataFrame({
    'correct_answer_minus_1': dt['train']['answer_idx'],
    'answer': clean_outputs[dataset_name],
})

# Display the table.
dfs[dataset_name]

In [None]:
# An answer counts as valid when it is exactly one of the digits '1'..'5'.
dfs[dataset_name]['valid'] = dfs[dataset_name]['answer'].isin(list('12345'))
# A valid answer is correct when it equals the stored reference index plus one.
dfs[dataset_name]['correct'] = [
    answer in ('1', '2', '3', '4', '5') and int(answer) == int(index_minus_1) + 1
    for answer, index_minus_1 in zip(
        dfs[dataset_name]['answer'],
        dfs[dataset_name]['correct_answer_minus_1'],
    )
]

In [None]:
# Summary for this dataset as a tuple:
# (share of valid answers, share of correct answers)
metrics[dataset_name] = tuple(
    dfs[dataset_name][column].mean().item() for column in ('valid', 'correct')
)
metrics[dataset_name]

### CZECH NEWS

In [None]:
# Name of the dataset handled by the following cells; it is the key into
# DATASETS, MAX_NEW_TOKENS and the result dictionaries.
dataset_name = 'czech_news'

# Load the dataset registered under that name in DATASETS.
dt = load_dataset(DATASETS[dataset_name])

In [None]:
# Build one model input per row from the 'user_prompt' and 'system_prompt'
# columns of the 'train' split, formatted according to MESSAGES.
messages = message_function(MESSAGES, dt['train']['user_prompt'], dt['train']['system_prompt'])

In [None]:
# Run the pipeline on the first five inputs only and display the result
# (presumably a quick sanity check before the full run).
tmp = pipe(messages[:5], return_full_text=False, max_new_tokens=MAX_NEW_TOKENS[dataset_name])
tmp

In [None]:
# Run the pipeline over every input and store the unprocessed results.
raw_outputs[dataset_name] = pipe(messages, return_full_text=False, max_new_tokens=MAX_NEW_TOKENS[dataset_name])

In [None]:
# Reduce each raw generation to a short answer string with cleaning_function.
clean_outputs[dataset_name] = cleaning_function(raw_outputs[dataset_name])

In [None]:
# Put the reference values (the 'category' column of the 'train' split) next
# to the cleaned model answers.
dfs[dataset_name] = pd.DataFrame({
    'correct_answer': dt['train']['category'],
    'answer': clean_outputs[dataset_name],
})

# Display the table.
dfs[dataset_name]

In [None]:
# An answer counts as valid when it is exactly one of the digits '1'..'5'.
dfs[dataset_name]['valid'] = dfs[dataset_name]['answer'].isin(list('12345'))
# Correct when answer and reference agree after both are converted to strings.
dfs[dataset_name]['correct'] = dfs[dataset_name]['answer'].apply(str).eq(
    dfs[dataset_name]['correct_answer'].apply(str)
)

In [None]:
# Summary for this dataset as a tuple:
# (share of valid answers, share of correct answers)
metrics[dataset_name] = tuple(
    dfs[dataset_name][column].mean().item() for column in ('valid', 'correct')
)
metrics[dataset_name]

### KLOKANEK

In [None]:
# Name of the dataset handled by the following cells; it is the key into
# DATASETS, MAX_NEW_TOKENS and the result dictionaries.
dataset_name = 'klokanek'

# Load the dataset registered under that name in DATASETS.
dt = load_dataset(DATASETS[dataset_name])

In [None]:
# Build one model input per row from the 'user_prompt' and 'system_prompt'
# columns of the 'train' split, formatted according to MESSAGES.
messages = message_function(MESSAGES, dt['train']['user_prompt'], dt['train']['system_prompt'])


In [None]:
# Run the pipeline on the first five inputs only and display the result
# (presumably a quick sanity check before the full run).
tmp = pipe(messages[:5], return_full_text=False, max_new_tokens=MAX_NEW_TOKENS[dataset_name])
tmp

In [None]:
# Run the pipeline over every input and store the unprocessed results.
raw_outputs[dataset_name] = pipe(messages, return_full_text=False, max_new_tokens=MAX_NEW_TOKENS[dataset_name])

In [None]:
# Reduce each raw generation to a short answer string with cleaning_function.
clean_outputs[dataset_name] = cleaning_function(raw_outputs[dataset_name])

In [None]:
# Put the reference values (the 'correct_answer' column of the 'train' split)
# next to the cleaned model answers.
dfs[dataset_name] = pd.DataFrame({
    'correct_answer': dt['train']['correct_answer'],
    'answer': clean_outputs[dataset_name],
})

# Display the table.
dfs[dataset_name]

In [None]:
# An answer counts as valid when, ignoring case, it is one of the letters a-e.
dfs[dataset_name]['valid'] = dfs[dataset_name]['answer'].str.lower().isin(list('abcde'))
# Correct when answer and reference match case-insensitively.
dfs[dataset_name]['correct'] = dfs[dataset_name]['answer'].str.lower().eq(
    dfs[dataset_name]['correct_answer'].str.lower()
)

In [None]:
# Summary for this dataset as a tuple:
# (share of valid answers, share of correct answers)
metrics[dataset_name] = tuple(
    dfs[dataset_name][column].mean().item() for column in ('valid', 'correct')
)
metrics[dataset_name]

### CTK Facts

In [None]:
# Name of the dataset handled by the following cells; it is the key into
# DATASETS, MAX_NEW_TOKENS and the result dictionaries.
dataset_name = 'ctkfacts'

# Load the dataset registered under that name in DATASETS.
dt = load_dataset(DATASETS[dataset_name])

In [None]:
# Build one model input per row from the 'user_prompt' and 'system_prompt'
# columns of the 'train' split, formatted according to MESSAGES.
messages = message_function(MESSAGES, dt['train']['user_prompt'], dt['train']['system_prompt'])

In [None]:
# Run the pipeline on the first five inputs only and display the result
# (presumably a quick sanity check before the full run).
tmp = pipe(messages[:5], return_full_text=False, max_new_tokens=MAX_NEW_TOKENS[dataset_name])
tmp

In [None]:
# Run the pipeline over every input and store the unprocessed results.
raw_outputs[dataset_name] = pipe(messages, return_full_text=False, max_new_tokens=MAX_NEW_TOKENS[dataset_name])

In [None]:
# Reduce each raw generation to a short answer string with cleaning_function.
clean_outputs[dataset_name] = cleaning_function(raw_outputs[dataset_name])

In [None]:
# Put the reference values (the 'label' column of the 'train' split) next to
# the cleaned model answers.
dfs[dataset_name] = pd.DataFrame({
    'correct_answer': dt['train']['label'],
    'answer': clean_outputs[dataset_name],
})

# Display the table.
dfs[dataset_name]

In [None]:
# An answer counts as valid when it is exactly one of '0', '1' or '2'.
dfs[dataset_name]['valid'] = dfs[dataset_name]['answer'].isin(list('012'))
# Correct when the answer equals the reference label converted to a string.
dfs[dataset_name]['correct'] = dfs[dataset_name]['answer'].eq(
    dfs[dataset_name]['correct_answer'].apply(str)
)

In [None]:
# Summary for this dataset as a tuple:
# (share of valid answers, share of correct answers)
metrics[dataset_name] = tuple(
    dfs[dataset_name][column].mean().item() for column in ('valid', 'correct')
)
metrics[dataset_name]

## Metrics & Export

In [None]:
# Display the per-dataset summaries; each value is the tuple
# (mean of 'valid', mean of 'correct') stored by the evaluation cells.
metrics

In [None]:
# Identifier of this run, used as the export file name. The model part is the
# last path component of MODEL, so ids without an organisation prefix
# (no '/') no longer raise IndexError; "org/name" ids give the same result
# as before.
RUN_NAME = f"{MODEL.rstrip('/').split('/')[-1]}_{DATA_SIZE}_{MESSAGES}_{PIPELINE_TYPE}"
RUN_NAME

In [None]:
import gzip
import os
import pickle

# Everything that is written to disk for this run.
objects_to_save = {
    'model': MODEL,
    'datasets': DATASETS,
    'raw_outputs': raw_outputs,
    'dfs': dfs,
    'metrics': metrics
}

# Saving is skipped when OUTPUT_DIR is None or ''.
if OUTPUT_DIR:
    # Create the folder first: otherwise a missing directory makes gzip.open
    # fail with FileNotFoundError only after the whole evaluation has run.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with gzip.open(f"{OUTPUT_DIR}/{RUN_NAME}.pkl.gz", "wb") as file:
        pickle.dump(objects_to_save, file)

In [None]:
#with gzip.open(f"{OUTPUT_DIR}/{RUN_NAME}.pkl.gz", "rb") as file:
#    loaded_objects = pickle.load(file)