In [1]:
import os

# Expose only GPU index 3 to this process. Set here, in the first cell,
# ahead of the later cell that imports torch.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

### Checking only the test set

In [4]:
import pandas as pd
from eval_utils import compute_model_utility_retain, compute_forget_efficacy, compute_model_utility_test
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from config import Config
from peft import PeftModel
from utils import update_json_dict
from template import LLAMA3_CHAT_TEMPLATE
import warnings
from transformers import logging as hf_logging

# Limit Hugging Face transformers logging to errors only.
hf_logging.set_verbosity_error()

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings.
warnings.filterwarnings("ignore")

In [5]:
# Build the experiment configuration and read the test split from cfg.test_path.
cfg = Config()
# NOTE(review): the message mentions forget and retain sets, but only the test CSV is read here.
print('loading forget, retain and test set')
test = pd.read_csv(cfg.test_path)

loading forget, retain and test set


In [6]:
# Device string handed to the evaluation helper further down.
device = 'cuda'

# Announce which experiment type (taken from the config) is being evaluated.
print('\n\n Conducting evaluation on:', cfg.exp_type)




 Conducting evaluation on: cyclic_gd


In [7]:
# Override config fields for this run: model checkpoint, results file and adapter directory.
cfg.model_id = 'praveensonu/llama_3_1_8b_finetuned'
# NOTE(review): results_path is reassigned in a later cell before update_json_dict reads it.
cfg.results_path = f'/home/praveen/theoden/emnlp25/results/scores/{cfg.exp_type}_results.json'
# NOTE(review): the adapter directory hard-codes 'cyclic_gd' instead of using cfg.exp_type —
# confirm the two are meant to stay in sync.
cfg.save_dir = '/home/praveen/theoden/emnlp25/outputs/cyclic_gd'

In [8]:
# ---- Loading tokenizer -----------
# Pass the same access token that the model load uses, so a gated or private
# checkpoint resolves for the tokenizer as well as for the weights.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, token=cfg.access_token)
# Reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

In [9]:
# ---- Loading model -----------
# Load the base causal LM in bfloat16, with device_map="auto" deciding weight placement.
base_model = AutoModelForCausalLM.from_pretrained(cfg.model_id, token = cfg.access_token, device_map = "auto", torch_dtype=torch.bfloat16)
# Attach the PEFT adapter stored in cfg.save_dir on top of the base model.
model = PeftModel.from_pretrained(base_model, cfg.save_dir, device_map="auto", torch_dtype=torch.bfloat16) 

# Merge the adapter into the base weights and drop the PEFT wrapper; this merged
# model is the one passed to the evaluation below.
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
def make_template_format(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``question`` column is wrapped in the chat template.

    Each question is substituted into ``LLAMA3_CHAT_TEMPLATE`` through its
    ``{question}`` placeholder. The input frame is left untouched, so calling
    this function again on the same raw frame cannot stack the template twice.
    All other columns are carried over unchanged.

    Args:
        df: Frame with a ``question`` column holding the raw question text.

    Returns:
        A new DataFrame with the templated questions.

    Raises:
        KeyError: If ``df`` has no ``question`` column.
    """
    # No EOS token is appended to the answers: evaluation does not need it.
    templated = df['question'].apply(lambda q: LLAMA3_CHAT_TEMPLATE.format(question=q))
    return df.assign(question=templated)

In [11]:
# Wrap every test question in the chat template.
# NOTE: `test` is overwritten, so running this cell twice applies the template twice.
test = make_template_format(test)

In [12]:
# Run the test-set evaluation with the merged model. The four returned values are
# used below as: a results frame (saved to CSV), the individual scores, the
# aggregate test utility and the test perplexity.
# NOTE(review): exact semantics of each value come from eval_utils — confirm there.
test_df, all_test_scores, test_model_utility, ppl_test = compute_model_utility_test(
    test = test,
    model = model,
    tokenizer = tokenizer,
    retriever_model= cfg.retriever_model,
    device = device,
)


Computing test utility:   0%|          | 0/738 [00:00<?, ?row/s]

test utility (H-mean): 0.6956


In [13]:
# Report the two headline numbers as plain Python scalars.
utility_value = test_model_utility.item()
perplexity_value = ppl_test.item()
print(f'model utility test {utility_value}')
print(f'\ntest ppl {perplexity_value}')

model utility test 0.6955807709465662

test ppl 9.089646060658888


In [14]:
# Save the evaluated test frame to a per-experiment CSV. index=False is not passed,
# so the DataFrame index is written as well.
test_df.to_csv(f'/home/praveen/theoden/emnlp25/results/datasets/check_{cfg.exp_type}.csv')

In [15]:
# Point results_path at a separate "check_" file, replacing the path set earlier in the notebook.
cfg.results_path = f'/home/praveen/theoden/emnlp25/results/scores/check_{cfg.exp_type}_results.json'

In [16]:
# Gather this run's test metrics together with the hyper-parameters that produced them.
run_summary = {
    'model_utility_test': test_model_utility.item(),
    'test_scores': all_test_scores.tolist(),
    'test_perplexity': ppl_test.item(),
    'exp_type': cfg.exp_type,
    'model_id': cfg.model_id,
    'batch_size': cfg.batch_size,
    'num_epochs': cfg.num_epochs,
    'lr': cfg.lr,
    'weight_decay': cfg.weight_decay,
    'LoRA_r': cfg.LoRA_r,
    'LoRA_alpha': cfg.LoRA_alpha,
}

# The summary is stored under the loss type used for this run.
results = {cfg.loss_type: run_summary}

# Write the entry into the JSON results file at cfg.results_path.
update_json_dict(cfg.results_path, results)