##### What is this notebook about?
- This notebook shows how to use an LLM as an evaluator for an arbitrary task with Hugging Face Transformers.
- The example compares the meaning of two sentences using the Llama 3.2-3B Instruct model.


In [1]:
import os

# Restrict the process to a single GPU. This has to run before the cell that
# imports `transformers`, so the restriction is in place when the model loads.
# `setdefault` keeps a value already exported by the shell or a job scheduler
# and only falls back to device "5" (the original hard-coded choice) when
# nothing is set.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "5")


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_name = "meta-llama/Llama-3.2-3B-Instruct"

# `trust_remote_code` is deliberately left at its default (False): enabling it
# lets Python code shipped in the Hub repository run locally, and a Llama
# checkpoint does not need it because the architecture is implemented inside
# `transformers` itself.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",   # use the dtype stored in the checkpoint
    device_map="auto",    # let the library place the weights on available devices
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]


In [3]:
# Decoding settings shared by every evaluation call.
# - `do_sample=False` selects deterministic (greedy) decoding, so scores are
#   repeatable across runs.
# - `max_new_tokens` bounds only the generated continuation, not the prompt.
# - The pad id reuses the EOS id (presumably because the tokenizer defines no
#   dedicated pad token; confirm with `tokenizer.pad_token`).
# `skip_special_tokens` is intentionally not set here: it is a decoding
# argument and is already passed to `tokenizer.batch_decode` inside
# `tokenize_generate_response`.
generation_config = GenerationConfig(
    max_new_tokens=256,
    do_sample=False,
    use_cache=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

def tokenize_generate_response(tokenizer, model, generation_config, messages):
    """Generate a reply for one chat conversation, print it, and return it.

    The messages are rendered with the tokenizer's chat template (with the
    generation prompt appended), tokenized, and passed to ``model.generate``.
    Both the full decoded sequence (prompt + reply) and the reply alone are
    printed.

    Args:
        tokenizer: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        model: Object providing ``generate`` and a ``device`` attribute.
        generation_config: Forwarded unchanged to ``model.generate``.
        messages: Chat messages handed to ``apply_chat_template``; the caller
            in this notebook passes a list of ``{"role", "content"}`` dicts.

    Returns:
        The decoded reply with the prompt tokens removed, so callers can
        parse the score instead of reading it from stdout.
    """
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    output_ids = model.generate(
        **model_inputs,
        generation_config=generation_config,
    )

    full_response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    print("Full response (including prompt):")
    print(full_response)

    # Each generated sequence starts with its prompt tokens; slice them off so
    # only the newly generated continuation is decoded.
    completion_ids = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(model_inputs.input_ids, output_ids)
    ]
    response = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]
    print("Generated response:")
    print(response)
    return response


In [4]:
# Sentence pairs to compare: reference_segs[i] is judged against target_segs[i].
reference_segs = [
    "relax on couch and watch cartoons",
    "i prefer a large sized shirt",
    "my street has not been plowed yet"
]
target_segs = [
    "relax on couch and watch a cartoon",
    "i prefer a size large shirt",
    "my street is not plowed"
]

# zip() stops at the shorter list, so a length mismatch would silently drop
# pairs; fail loudly instead.
assert len(reference_segs) == len(target_segs), (
    "reference_segs and target_segs must have the same length"
)

# The system prompt does not depend on the pair being scored, so it is built
# once outside the loop. Plain string literals are used because nothing is
# interpolated.
# NOTE(review): the prompt asks for a "continuous scale" but also for an
# integer score; the wording is kept unchanged so the recorded outputs remain
# comparable. Confirm which one is intended.
instruction = (
    "You are an NLP evaluation assistant. "
    "Compare the meaning of translation sentence with the given reference sentence and "
    "score the translation on a continuous scale 0 to 5 using following rules: "
    "5 means meaning is perfectly preserved. "
    "4 means more than half of the meaning is preserved. "
    "3 means half of the meaning is preserved. "
    "2 means less than half of the meaning is preserved. "
    "1 means less than quarter of the meaning is preserved. "
    "0 means meaning is not at all preserved. "
    "Think through it and respond with a integer number for the score as `Score:`."
)

for reference_seg, target_seg in zip(reference_segs, target_segs):
    # User turn: the candidate sentence followed by the reference sentence.
    prompt = (
        f"translation: {target_seg}\n"
        f"reference: {reference_seg} "
    )

    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": prompt},
    ]
    print(messages)
    tokenize_generate_response(tokenizer, model, generation_config, messages)
    print("===\n")

[{'role': 'system', 'content': 'You are an NLP evaluation assistant. Compare the meaning of translation sentence with the given reference sentence and score the translation on a continuous scale 0 to 5 using following rules: 5 means meaning is perfectly preserved. 4 means more than half of the meaning is preserved. 3 means half of the meaning is preserved. 2 means less than half of the meaning is preserved. 1 means less than quarter of the meaning is preserved. 0 means meaning is not at all preserved. Think through it and respond with a integer number for the score as `Score:`.'}, {'role': 'user', 'content': 'translation: relax on couch and watch a cartoon\nreference: relax on couch and watch cartoons '}]
Full response (including prompt):
system

Cutting Knowledge Date: December 2023
Today Date: 28 Mar 2025

You are an NLP evaluation assistant. Compare the meaning of translation sentence with the given reference sentence and score the translation on a continuous scale 0 to 5 using foll