# Model Eval
This notebook loads finance eval datasets and runs a model on each entry to gather a quantitative evaluation.

In [11]:
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import BatchEncoding, pipeline
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from typing import cast


# Hugging Face Hub identifier of the checkpoint under evaluation.
model_id = "meta-llama/Llama-3.2-1B"

# Text-generation pipeline bundling the model and its tokenizer. Weights are
# requested in bfloat16 and device placement is delegated to device_map="auto".
pipe = pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Some parameters are on the meta device because they were offloaded to the cpu.


In [2]:
# Smoke test: generate a short continuation to confirm the pipeline runs.
# The EOS token id is supplied explicitly as the padding token id.
pipe(
    "To be or not to be",
    max_new_tokens=30,
    num_return_sequences=1,
    pad_token_id=pipe.tokenizer.eos_token_id,
)

[{'generated_text': 'To be or not to be: The impact of the pandemic on the future of the workplace\nThe pandemic has forced the world to rethink its business strategies and the way it operates.'}]

## Tokenizer
Using the EOS token as the padding token lets us batch test cases, which makes the evaluation more efficient.

In [12]:
# Reuse the tokenizer that the pipeline loaded together with the model.
tokenizer = pipe.tokenizer
# Fail fast if the pipeline has no tokenizer; tokenize() and decode() call it directly.
assert tokenizer is not None

def tokenize(text: str | list[str]) -> BatchEncoding:
    """Encode one string or a batch of strings as PyTorch tensors.

    The EOS token is (re)assigned as the padding token on every call. Each
    sequence is padded to the longest one in the batch, with truncation enabled.
    """
    tokenizer.pad_token = tokenizer.eos_token
    encoding_options = {"padding": "longest", "truncation": True, "return_tensors": "pt"}
    return tokenizer(text, **encoding_options)

def decode(tokens: torch.Tensor) -> str:
    """Turn token ids back into text, leaving special tokens out."""
    text = tokenizer.decode(tokens, skip_special_tokens=True)
    return text

## Dataset

In [None]:
# Subsets of AdaptLLM/finance-tasks evaluated in this notebook:
# "FiQA_SA", "Headline", "ConvFinQA", "FPB", "NER".
# The "Headline" subset is loaded here to develop the evaluation loop.
eval_dataset = load_dataset(path="AdaptLLM/finance-tasks", name="Headline")
testset = eval_dataset["test"]
num_samples = len(testset)

def get_batch(testset, offset: int, n: int) -> dict[str, list[str]]:
    """Return up to `n` consecutive entries of `testset`, starting at `offset`.

    The result is whatever slicing `testset` yields. Callers in this notebook
    index it by column name ("input", "options", "gold_index").
    """
    window = slice(offset, offset + n)
    return testset[window]

<class 'datasets.arrow_dataset.Dataset'>


20547

In [14]:
# Peek at the first five test samples: the gold answer and the start of each prompt.
# get_batch takes the dataset as its first argument; omitting it raises a TypeError.
batch = get_batch(testset, 0, 5)
text, options, ids_ = batch["input"], batch["options"], batch["gold_index"]

for sample, opts, class_ in zip(text, options, ids_):
    print(f"answer={opts[class_]}, text={sample[:50]}")

answer=Yes, text=Headline: "Gold falls to Rs 30,800; silver down at
answer=No, text=Headline: february gold rallies to intraday high o
answer=No, text=Please answer a question about the following headl
answer=Yes, text=Read this headline: "gold closes lower as dollar r
answer=No, text=gold adds $42, or 2.4%, to trade at $1,833.30/oz
Q


## Evaluation

In [15]:
# Step-by-step walkthrough of scoring one batch; eval_on_batch packages the same steps.
model = cast(LlamaForCausalLM, pipe.model)
model.eval()

# get_batch takes the dataset as its first argument.
batch = get_batch(testset, 0, 5)

## tokenize batch
tokenized_input = tokenize(batch["input"])
input_ids = tokenized_input["input_ids"].to(pipe.device)
attn_mask = tokenized_input["attention_mask"]

## index of the last attended token of each sample; the logits at that position
## predict the first generated token.
## NOTE(review): assumes the tokenizer pads on the right -- confirm padding_side.
gen_idx = attn_mask.sum(dim=1).long() - 1

## forward pass of LLM. Gradients are not needed for evaluation, so autograd is
## disabled to avoid keeping activations alive. The module is called instead of
## .forward so that registered hooks run.
with torch.no_grad():
    logits = (
        model(input_ids=input_ids, attention_mask=attn_mask.to(pipe.device))
        .logits
        .cpu()
    )  # (B,T,C)

## get generated output for each test sample
gen_logits = logits[torch.arange(logits.size(0)), gen_idx, :]  # (B, C)
gen_tokens = torch.argmax(gen_logits, dim=-1)

## decode generated tokens
decode(gen_tokens)

' Yes No Yes Yes No'

In [16]:
# Count how many predicted tokens match the gold option, ignoring case and
# surrounding spaces.
num_correct = 0
for tok, opts, class_id in zip(gen_tokens, batch["options"], batch["gold_index"]):
    predicted = tokenizer.decode(tok).strip(" ").lower()
    expected = opts[class_id].lower()
    if predicted == expected:
        num_correct += 1
num_correct

4

In [17]:
def eval_on_batch(model: LlamaForCausalLM, test_batch: dict) -> int:
    """Run eval on model for a given test batch, returning the number of correct answers.

    The prediction for a sample is the single most likely next token after its
    prompt. It counts as correct when, stripped of spaces and lower-cased, it
    equals the lower-cased gold option.

    Args:
        model: Causal LM to evaluate.
        test_batch: Mapping with "input" (prompts), "options" (candidate
            answers per sample) and "gold_index" (index of the correct option
            per sample).

    Returns:
        Number of samples in the batch whose predicted token matches the gold option.
    """
    tokenized_input = tokenize(test_batch["input"])
    input_ids = tokenized_input["input_ids"].to(pipe.device)
    attn_mask = tokenized_input["attention_mask"]

    # Position of the last attended token per sample; its logits predict the
    # first generated token.
    # NOTE(review): assumes the tokenizer pads on the right -- confirm padding_side.
    gen_idx = attn_mask.sum(dim=1).long() - 1

    # Evaluation needs no gradients: disabling autograd avoids retaining
    # activations. Calling the module (not .forward) lets registered hooks run.
    with torch.no_grad():
        logits = model(
            input_ids=input_ids,
            attention_mask=attn_mask.to(pipe.device),
        ).logits  # (B, T, C)

    # Pick the one needed row per sample on the logits' device and move only
    # that (B, C) slice to the CPU instead of the full (B, T, C) tensor.
    rows = torch.arange(logits.size(0), device=logits.device)
    gen_logits = logits[rows, gen_idx.to(logits.device), :].cpu()  # (B, C)
    gen_tokens = torch.argmax(gen_logits, dim=-1).tolist()

    num_correct = 0
    for tok, opts, class_id in zip(gen_tokens, test_batch["options"], test_batch["gold_index"]):
        pred_opt = tokenizer.decode(tok).strip(" ").lower()
        num_correct += int(pred_opt == opts[class_id].lower())

    return num_correct

In [None]:
def evaluate(model: LlamaForCausalLM, testset, n: int, batch_size: int = 4, desc: str = "") -> float:
    """Evaluate `model` on the first `n` samples of `testset` and return its accuracy.

    Samples are processed in consecutive batches of `batch_size`; the final
    batch is shorter when `n` is not a multiple of `batch_size`.

    Args:
        model: Causal LM to evaluate; it is switched to eval mode.
        testset: Sliceable dataset accepted by `get_batch`.
        n: Number of samples to evaluate, counted from the start of `testset`.
            Should not exceed the number of samples available.
        batch_size: Number of samples per forward pass.
        desc: Label shown on the progress bar.

    Returns:
        Fraction of the `n` samples answered correctly.

    Raises:
        ValueError: If `n` or `batch_size` is not positive.
    """
    if n <= 0:
        raise ValueError(f"n must be positive, got {n}")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    model.eval()

    num_correct = 0
    for start in tqdm(range(0, n, batch_size), desc=desc):
        # Cap the last batch at the samples still owed, so that no more than
        # `n` samples are scored even when `testset` is longer than `n`.
        size = min(batch_size, n - start)
        batch = get_batch(testset, start, size)
        num_correct += eval_on_batch(model, batch)

    return num_correct / n

In [None]:
# Full evaluation on the Headline test split. The sample count is taken from the
# split being evaluated rather than from a variable defined in an earlier cell.
eval_dataset = load_dataset("AdaptLLM/finance-tasks", "Headline")
evaluate(pipe.model, eval_dataset["test"], len(eval_dataset["test"]))

100%|██████████| 5136/5136 [39:11<00:00,  2.18it/s]


0.6832140945150144

## Eval all subsets

In [None]:
from datetime import datetime

# Timestamp used to give each run its own scores file.
now_time = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

subsets = ["FiQA_SA", "Headline", "ConvFinQA", "FPB", "NER"]

# NOTE(review): scoring compares a single predicted token with the gold option;
# presumably this only suits subsets with one-word options -- confirm for each subset.
# The file is opened for writing, and the `with` block closes it even if a
# subset fails midway.
with open(f"scores-{now_time}.txt", "w", encoding="utf-8") as scores_file:
    for subset in subsets:
        eval_dataset = load_dataset("AdaptLLM/finance-tasks", subset)
        subset_test = eval_dataset["test"]
        # Each subset is sized by its own test split; `desc` is passed by
        # keyword so the subset name does not land in `batch_size`.
        eval_score = evaluate(pipe.model, subset_test, len(subset_test), desc=subset)

        out_txt = f"{subset} = {eval_score*100:.2f}"

        print(out_txt)
        scores_file.write(out_txt+"\n")
        # Persist each score right away so that an interruption keeps earlier results.
        scores_file.flush()