## Model Evaluation

In [1]:
from unsloth import FastLanguageModel
from transformers import TextStreamer

2024-12-03 12:15:04.194897: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-03 12:15:04.304904: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-12-03 12:15:04.304946: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-12-03 12:15:04.942828: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
max_seq_length = 2048     
dtype = None            
load_in_4bit = False      
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "LightFury9/Llama-3.2-1B-Instruct-pii-gen",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.46.3.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.536 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Unsloth 2024.10.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
                (b

In [4]:
from peft import PeftModel
from transformers import AutoModelForCausalLM,AutoTokenizer

base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = PeftModel.from_pretrained(base_model, "rohan2810/llama-pii-ori")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
from tqdm import tqdm
import torch

base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
model = PeftModel.from_pretrained(base_model, "rohan2810/llama-pii-ori").to("cuda")

dataset_path = "processed_questions - pii.csv"
df = pd.read_csv(dataset_path)
results = []
def generate_prompt(question, options):
    prompt = f"""
    You are a helpful assistant answering multiple-choice questions.

    Question: {question}
    Options:
    A) {options['A']}
    B) {options['B']}
    C) {options['C']}
    D) {options['D']}
    
    Please choose the best option (A, B, C, or D):
    """
    return prompt.strip()

for idx, row in tqdm(df.iterrows(), total=len(df)):
    question = row['question']
    options = {
        "A": row['option_a'],
        "B": row['option_b'],
        "C": row['option_c'],
        "D": row['option_d']
    }
    correct_answer = row['correct_answer']

    option_probs = {}

    for key, option in options.items():        
        prompt = generate_prompt(question, options)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to("cuda")
        outputs = model(**inputs)
        logits = outputs.logits  # Shape: [batch_size, sequence_length, vocab_size]
        option_tokens = tokenizer(option, add_special_tokens=False)["input_ids"]
        option_start_idx = len(tokenizer(question, return_tensors="pt", truncation=True, padding=True)["input_ids"][0]) - 1
        option_end_idx = option_start_idx + len(option_tokens)

        option_logits = logits[0, option_start_idx:option_end_idx, :]  # Shape: [len(option_tokens), vocab_size]

        option_probs_tensor = torch.softmax(option_logits, dim=-1)
        avg_prob = torch.mean(option_probs_tensor[torch.arange(len(option_tokens)), option_tokens]).item()
        
        option_probs[key] = avg_prob

    predicted_answer = max(option_probs, key=option_probs.get)

    results.append({
        "question": question,
        "correct_answer": correct_answer.strip()[0].upper(),  # Extract the option key (A, B, C, or D)
        "predicted_answer": predicted_answer.strip().upper(),
        "is_correct": predicted_answer.strip().upper() == correct_answer.strip()[0].upper(),
        "option_probabilities": option_probs
    })


results_df = pd.DataFrame(results)
results_df.to_csv("evaluation_results_with_probabilities-syn.csv", index=False)

# Print overall accuracy
accuracy = results_df['is_correct'].mean()
print(f"Evaluation completed. Accuracy: {accuracy:.2%}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 147/147 [00:38<00:00,  3.78it/s]

Evaluation completed. Accuracy: 23.13%



