# Introduction
We will be evaluating two models: a base Llama 3.2 Instruct model (zero-shot and few-shot in-context learning) and a fine-tuned Llama 3.2 Instruct model (zero-shot). All models are run locally on my machine via KoboldCPP.

# Precision
We will evaluate both models on macro-averaged precision, accuracy, and macro-averaged F1. Let's first test the base model in the zero-shot setting.

In [4]:
import evaluate
from datasets import load_dataset

# Metrics from the Hugging Face `evaluate` library; each is used via .compute() below.
precision = evaluate.load("precision")
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

# The JSON-lines file is read through the "train" split of the returned dataset.
dataset = load_dataset("json", data_files="test.jsonl")

# Only a small sample is needed: every example costs one request to the LLM.
NUM_SAMPLES = 50
# Slice the rows first so only the sample (not the whole column) is materialised.
_sample = dataset["train"][:NUM_SAMPLES]

inputs = list(_sample["text"])  # The inputs going to the LLM.
references = list(_sample["label"])  # The real expected values from the test dataset.

# Instruct special tokens specifically for Llama. The user and assistant tags
# start with <|eot_id|>, so each one also closes the turn that precedes it.
_END_OF_TURN = "<|eot_id|>"
_ROLE_HEADER = "<|start_header_id|>{role}<|end_header_id|>\n\n"

system_tag = _ROLE_HEADER.format(role="system")
user_tag = _END_OF_TURN + _ROLE_HEADER.format(role="user")
assistant_tag = _END_OF_TURN + _ROLE_HEADER.format(role="assistant")

# System prompt: the system header followed by the classification instruction.
instruction_prompt = "".join(
    (
        system_tag,
        "Determine if the user's sentiment is 0 (negative), 1 (neutral), or 2 (positive). ",
        "Your response should only be a number and nothing else.\n",
    )
)

# koboldcpp endpoints. These are raw text-completion routes, NOT chat completions!
KOBOLD_ENDPOINT = "http://127.0.0.1:5001/api/v1/generate"  # the base model's endpoint
FT_KOBOLD_ENDPOINT = "http://127.0.0.1:5002/api/v1/generate"  # the fine-tuned model's endpoint

# Sampler settings shared by every request.
max_length = 3
temperature = 0.1
stop_sequence = ["\n", "<|eot_id|>", "<"]
memory = instruction_prompt
# GBNF grammar to force the model to only output 0, 1, or 2.
grammar = 'root ::= "0" | "1" | "2"'

### Base Model (Zero Shot)

In [5]:
import requests

# Zero-shot evaluation of the base model: one request per input, with the
# instruction prompt sent as `memory` and the user turn sent as `prompt`.
predictions = []

for _input in inputs:
    body = {
        "memory": memory,
        "prompt": user_tag + _input + assistant_tag,
        "temperature": temperature,
        "max_length": max_length,
        "stop_sequence": stop_sequence,
        "grammar": grammar,
    }

    try:
        # The timeout keeps a hung server from blocking the notebook forever;
        # raise_for_status surfaces HTTP errors instead of failing later on .json().
        response = requests.post(KOBOLD_ENDPOINT, json=body, timeout=120)
        response.raise_for_status()
    except requests.RequestException as e:
        # Chain the original error so the real cause stays visible in the traceback.
        raise RuntimeError(
            "Could not get response from kobold. Are you sure it's on and running?"
        ) from e

    # The grammar limits the output to "0", "1" or "2"; convert it explicitly
    # because the metrics take integer class labels.
    predictions.append(int(response.json()["results"][0]["text"].strip()))

prec_result = precision.compute(references=references, predictions=predictions, average="macro")
acc_result = accuracy.compute(references=references, predictions=predictions)
f1_result = f1.compute(references=references, predictions=predictions, average="macro")
print(prec_result)
print(acc_result)
print(f1_result)

{'precision': 0.2818627450980392}
{'accuracy': 0.4}
{'f1': 0.3235294117647059}


### Base Model (Few Shot)
We will test one-shot and three-shot prompting (k = 1 and k = 3 in-context examples).

In [6]:
def eval_k_shot(k: int) -> None:
    """Evaluate the base model with k in-context examples and print the metrics.

    The first ``k`` items of the module-level ``inputs``/``references`` are
    written into the system prompt as worked examples. The remaining items are
    sent to the model one at a time and scored with macro precision, accuracy
    and macro F1. Because the examples are excluded from scoring, runs with
    different ``k`` are scored on slightly different subsets.

    Args:
        k: Number of leading examples to place in the prompt.

    Raises:
        ValueError: If ``k`` is negative, or leaves no inputs to score.
        RuntimeError: If the request to the KoboldCPP endpoint fails.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k >= len(inputs):
        raise ValueError(f"k={k} leaves no inputs to evaluate (only {len(inputs)} available)")

    # Build the example section in one pass instead of repeated string +=.
    examples = "Examples:\n" + "".join(
        f"\"{example}\" => {output}\n\n"
        for example, output in zip(inputs[:k], references[:k])
    )
    many_shot_system_prompt = instruction_prompt + "\n\n" + examples

    predictions = []
    for _input in inputs[k:]:
        body = {
            "memory": many_shot_system_prompt,
            "prompt": user_tag + _input + assistant_tag,
            "temperature": temperature,
            "max_length": max_length,
            "stop_sequence": stop_sequence,
            "grammar": grammar,
        }

        try:
            # The timeout keeps a hung server from blocking the notebook forever;
            # raise_for_status surfaces HTTP errors instead of failing later on .json().
            response = requests.post(KOBOLD_ENDPOINT, json=body, timeout=120)
            response.raise_for_status()
        except requests.RequestException as e:
            # Chain the original error so the real cause stays visible in the traceback.
            raise RuntimeError(
                "Could not get response from kobold. Are you sure it's on and running?"
            ) from e

        # The grammar limits the output to "0", "1" or "2"; convert it explicitly
        # because the metrics take integer class labels.
        predictions.append(int(response.json()["results"][0]["text"].strip()))

    # Score only the items that were not used as in-context examples.
    scored_references = references[k:]
    prec_result = precision.compute(references=scored_references, predictions=predictions, average="macro")
    acc_result = accuracy.compute(references=scored_references, predictions=predictions)
    f1_result = f1.compute(references=scored_references, predictions=predictions, average="macro")
    print(f"k={k} results:")
    print(prec_result)
    print(acc_result)
    print(f1_result)

# Run the one-shot and three-shot evaluations in turn.
for shots in (1, 3):
    eval_k_shot(shots)

k=1 results:
{'precision': 0.6203044267560397}
{'accuracy': 0.5306122448979592}
{'f1': 0.5198008898286419}
k=3 results:
{'precision': 0.3790322580645162}
{'accuracy': 0.3829787234042553}
{'f1': 0.3405026210299105}


### Fine-tuned (Zero-shot)

In [7]:
# Zero-shot evaluation of the fine-tuned model: same prompt and sampler
# settings as the base model run, but sent to the fine-tuned endpoint.
predictions = []

for _input in inputs:
    body = {
        "memory": memory,
        "prompt": user_tag + _input + assistant_tag,
        "temperature": temperature,
        "max_length": max_length,
        "stop_sequence": stop_sequence,
        "grammar": grammar,
    }

    try:
        # The timeout keeps a hung server from blocking the notebook forever;
        # raise_for_status surfaces HTTP errors instead of failing later on .json().
        response = requests.post(FT_KOBOLD_ENDPOINT, json=body, timeout=120)
        response.raise_for_status()
    except requests.RequestException as e:
        # Chain the original error so the real cause stays visible in the traceback.
        raise RuntimeError(
            "Could not get response from kobold. Are you sure it's on and running?"
        ) from e

    # The grammar limits the output to "0", "1" or "2"; convert it explicitly
    # because the metrics take integer class labels.
    predictions.append(int(response.json()["results"][0]["text"].strip()))

prec_result = precision.compute(references=references, predictions=predictions, average="macro")
acc_result = accuracy.compute(references=references, predictions=predictions)
f1_result = f1.compute(references=references, predictions=predictions, average="macro")
print(prec_result)
print(acc_result)
print(f1_result)

{'precision': 0.7024864024864025}
{'accuracy': 0.7}
{'f1': 0.6929693961952026}
