# Model Eval
This notebook loads finance eval datasets and runs a model on each entry to gather a quantitative evaluation.

In [1]:
import torch
from pathlib import Path
from tqdm import tqdm
from datasets import load_dataset
from peft import PeftModel
from transformers import BatchEncoding, AutoTokenizer
from transformers.models.llama.modeling_llama import LlamaForCausalLM


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16").to(device)
model = base_model

In [3]:
ckpt_path = Path(r"D:/models/basic-Llama-3_2-LoRA") / "checkpoint-12660"
lora_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16")
lora_model.eval()

model = lora_model

## Tokenizer
Using eos token as padding, can batch test cases to improve efficiency of tests

In [5]:
def tokenize(text: str | list[str]) -> BatchEncoding:
    return tokenizer(text,
                     truncation=True,
                     padding=True,
                     return_tensors="pt")

def decode(tokens: torch.Tensor) -> str:
    return tokenizer.decode(tokens, skip_special_tokens=True)

## Dataset

In [18]:
# subsets = ["FiQA_SA", "Headline", "ConvFinQA", "FPB", "NER"]

eval_dataset = load_dataset("AdaptLLM/finance-tasks", "Headline")
testset = eval_dataset["test"]
num_samples = len(testset)

def get_batch(testset, offset: int, n: int) -> dict[str, list[str]]:
    test_batch = testset[offset:offset+n]
    return test_batch

testset

Dataset({
    features: ['id', 'input', 'options', 'gold_index', 'class_id'],
    num_rows: 20547
})

In [7]:
batch = get_batch(testset, 0, 5)
text, options, ids_ = batch["input"], batch["options"], batch["gold_index"]

for sample, opts, class_ in zip(text, options, ids_):
    print(f"answer={opts[class_]}, text={sample[:50]}")

answer=Yes, text=Headline: "Gold falls to Rs 30,800; silver down at
answer=No, text=Headline: february gold rallies to intraday high o
answer=No, text=Please answer a question about the following headl
answer=Yes, text=Read this headline: "gold closes lower as dollar r
answer=No, text=gold adds $42, or 2.4%, to trade at $1,833.30/oz
Q


## Evaluation

In [8]:
model.eval()

batch = get_batch(testset, 0, 5)

## tokenize batch
tokenized_input = tokenize(batch["input"])
input_ids = tokenized_input["input_ids"].to(model.device)
attn_mask = tokenized_input["attention_mask"]

## indices of the generated tokens for test samples in batch
gen_idx = attn_mask.sum(dim=1).long() - 1

## forward pass of LLM
logits = (model
            .forward(input_ids=input_ids, attention_mask=attn_mask.to(model.device))
            .logits
            .cpu()
         ) # (B,T,C)

## get generated output for each test sample
gen_logits = logits[torch.arange(logits.size(0)), gen_idx, :] # (B, C)
gen_tokens = torch.argmax(gen_logits, dim=-1)

## decode generated tokens
decode(gen_tokens)

' No No No No No'

In [9]:
num_correct = 0
for tok, opts, class_id in zip(gen_tokens, batch["options"], batch["gold_index"]):
    pred_opt = tokenizer.decode(tok).strip(" ").lower()
    num_correct += (pred_opt == opts[class_id].lower())
num_correct

3

In [10]:
def eval_on_batch(model: LlamaForCausalLM, test_batch: dict) -> int:
    """Run eval on model for a given test batch, returning the number of correct answers"""

    tokenized_input = tokenize(test_batch["input"])
    input_ids = tokenized_input["input_ids"].to(model.device)
    attn_mask = tokenized_input["attention_mask"]

    gen_idx = attn_mask.sum(dim=1).long() - 1
    logits = (model
                .forward(input_ids=input_ids, attention_mask=attn_mask.to(model.device))
                .logits
                .cpu()
            )

    gen_logits = logits[torch.arange(logits.size(0)), gen_idx, :] # (B, C)
    gen_tokens = torch.argmax(gen_logits, dim=-1).tolist()
    
    num_correct = 0
    for tok, opts, class_id in zip(gen_tokens, test_batch["options"], test_batch["gold_index"]):
        pred_opt = tokenizer.decode(tok).strip(" ").lower()
        num_correct += (pred_opt == opts[class_id].lower())
    
    return num_correct

In [23]:
def evaluate(model: LlamaForCausalLM, testset, batch_size = 4, desc: str = "") -> float:
    model.eval()
    n = len(testset)
    n_iters = n // batch_size

    num_total = 0
    num_correct = 0
    prog_bar = tqdm(range(n_iters), desc=desc)
    for k in prog_bar:
        batch = get_batch(testset, k*batch_size, batch_size)
        num_correct += eval_on_batch(model, batch)
        num_total += len(batch)
        prog_bar.set_description(f"{desc} | {100*num_correct/num_total:.3f}")
    
    if n % batch_size:
        batch = get_batch(testset, n_iters*batch_size, n)
        num_correct += eval_on_batch(model, batch)
    
    return num_correct / n

In [21]:
eval_dataset = load_dataset("AdaptLLM/finance-tasks", "Headline")
evaluate(model, eval_dataset["test"])

  0%|          | 0/5136 [00:00<?, ?it/s]


KeyboardInterrupt: 

## Eval all subsets

In [24]:
from datetime import datetime

now_time = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
scores_file = open(f"scores-{now_time}.txt", "w")

subsets = ["FiQA_SA", "Headline", "ConvFinQA", "FPB", "NER"]
for subset in subsets:
    eval_dataset = load_dataset("AdaptLLM/finance-tasks", subset)
    eval_score = evaluate(model, eval_dataset["test"], desc=subset)

    out_txt = f"{subset} = {eval_score*100:.2f}"

    print(out_txt)
    scores_file.write(out_txt+"\n")

scores_file.close()

FiQA_SA | 35.776: 100%|██████████| 58/58 [00:14<00:00,  4.08it/s]


FiQA_SA = 36.17


Headline | 52.492: 100%|██████████| 5136/5136 [17:54<00:00,  4.78it/s]


Headline = 65.62


test.json:   0%|          | 0.00/5.82M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1490 [00:00<?, ? examples/s]

ConvFinQA:   0%|          | 0/372 [00:01<?, ?it/s]


KeyError: 'options'