In [1]:
import os

# Expose only GPU 0 to this process. This is set in the first cell, before
# `torch` is imported in the next one.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from eval_utils import compute_model_utility_retain, compute_forget_efficacy, compute_model_utility_test
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from config import Config
from peft import PeftModel
from perplexity import Perplexity_QA_from_df
from utils import update_json_dict

In [3]:
# Build the project configuration object, then pull out the three dataset
# paths that the evaluation cells below read from.
cfg = Config()
print('loading the paths to forget, retain and test set')
forget_path, retain_path, test_path = (
    cfg.forget_path,
    cfg.retain_path,
    cfg.test_path,
)


loading the paths to forget, retain and test set


In [6]:
# Directory later handed to `PeftModel.from_pretrained` as the adapter location.
# NOTE(review): absolute, machine-specific path overriding the Config value --
# consider deriving it from a configurable output root.
cfg.save_dir = "/home/praveen/theoden/emnlp_25/outputs/wpu_batch_npo_6_2"
print(cfg.save_dir)

/home/praveen/theoden/emnlp_25/outputs/wpu_batch_npo_6_2


In [7]:
# The experiment tag and the loss-type label are the same string for this run.
cfg.exp_type = cfg.loss_type = 'batch_npo_6_2'
# JSON file (named after the experiment tag) that the last cell passes to
# `update_json_dict`.
cfg.results_path = (
    f'/home/praveen/theoden/emnlp_25/results/scores/{cfg.exp_type}_test_results.json'
)

In [8]:
# `device` is passed to every evaluation call below; `batch_size` and
# `max_length` are only passed to the perplexity calls.
device, batch_size, max_length = 'cuda', 4, 256
# Hub repo id used for both the tokenizer and the base model.
cfg.model_id = 'praveensonu/llama_3_1_8b_finetuned'

In [9]:
# Load the tokenizer from the same Hub repo as the model. The access token is
# passed here too (the model load already does this) so that a private or
# gated repo can be fetched without relying on a cached login.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, token=cfg.access_token)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [10]:
# Load the fine-tuned checkpoint in bfloat16, authenticating with the configured
# access token and letting `device_map="auto"` choose device placement.
base_model = AutoModelForCausalLM.from_pretrained(
    cfg.model_id,
    token=cfg.access_token,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# Attach the adapter stored in `cfg.save_dir` to the already-loaded base model.
model = PeftModel.from_pretrained(base_model, cfg.save_dir, device_map="auto", torch_dtype=torch.bfloat16)

# Fold the adapter into the base weights so the rest of the notebook works with
# a plain causal-LM model.
model = model.merge_and_unload()

# Without an explicit pad id, `generate()` falls back to EOS and logs
# "Setting `pad_token_id` to `eos_token_id`" on every call, which floods the
# output of the efficacy/utility cells. Set it once here, matching the
# tokenizer's pad token (which is EOS). Guarded so an already-configured value
# is never overwritten.
# NOTE(review): this only silences the warning if the eval helpers rely on the
# model's default generation config -- confirm in eval_utils.
if model.generation_config.pad_token_id is None:
    model.generation_config.pad_token_id = tokenizer.pad_token_id

In [12]:
# QA perplexity of the unlearned model on the forget split.
print(f'Calculating perplexity on forget set after {cfg.loss_type} unlearning')

# The helper returns a pair; only its first element (the perplexity) is kept.
qa_perplexity_forget, _ = Perplexity_QA_from_df(
    model=model,
    tokenizer=tokenizer,
    df_path=forget_path,
    device=device,
    batch_size=batch_size,
    max_length=max_length,
)
print('\nForget Perplexity', qa_perplexity_forget)

Calculating perplexity on forget set after batch_npo_6_2 unlearning
Loading data from: /home/praveen/theoden/emnlp_25/dpo_forget_idk.csv
Preparing inputs...
Calculating perplexity with batch size 4...


Calculating Perplexity:   0%|          | 0/25 [00:00<?, ?it/s]

Total Loss: 593.7164, Total Tokens: 389
Average Loss: 1.5263, Perplexity: 4.6010

Forget Perplexity 4.600951671600342


  perplexity = torch.exp(torch.tensor(average_loss)).item() # Use .item() to get float


In [13]:
# QA perplexity of the unlearned model on the retain split.
print(f'Calculating perplexity on retain set after {cfg.loss_type} unlearning')

# The helper returns a pair; only its first element (the perplexity) is kept.
qa_perplexity_retain, _ = Perplexity_QA_from_df(
    model=model,
    tokenizer=tokenizer,
    df_path=retain_path,
    device=device,
    batch_size=batch_size,
    max_length=max_length,
)
print('\nRetain Perplexity', qa_perplexity_retain)

Calculating perplexity on retain set after batch_npo_6_2 unlearning
Loading data from: /home/praveen/theoden/emnlp_25/full_retain_qa.csv
Preparing inputs...


Calculating perplexity with batch size 4...


Calculating Perplexity:   0%|          | 0/451 [00:00<?, ?it/s]

Total Loss: 16596.0781, Total Tokens: 13909
Average Loss: 1.1932, Perplexity: 3.2976

Retain Perplexity 3.29758358001709


  perplexity = torch.exp(torch.tensor(average_loss)).item() # Use .item() to get float


In [14]:
# QA perplexity of the unlearned model on the held-out test split.
print(f'\nCalculating perplexity on test set after {cfg.loss_type} unlearning')

# The helper returns a pair; only its first element (the perplexity) is kept.
qa_perplexity_test, _ = Perplexity_QA_from_df(
    model=model,
    tokenizer=tokenizer,
    df_path=test_path,
    device=device,
    batch_size=batch_size,
    max_length=max_length,
)
print('\nTest set Perplexity', qa_perplexity_test)


Calculating perplexity on test set after batch_npo_6_2 unlearning
Loading data from: /home/praveen/theoden/emnlp_25/full_test_set.csv
Preparing inputs...


Calculating perplexity with batch size 4...


Calculating Perplexity:   0%|          | 0/185 [00:00<?, ?it/s]

Total Loss: 4948.4619, Total Tokens: 4440
Average Loss: 1.1145, Perplexity: 3.0481

Test set Perplexity 3.048100233078003


  perplexity = torch.exp(torch.tensor(average_loss)).item() # Use .item() to get float


In [15]:
# Evaluate the unlearned model on the forget split. Three values come back:
# a table written to CSV later, an array-like of scores (`.tolist()` is called
# on it later), and a scalar-like aggregate (`.item()` is called on it later).
forget_df, all_forget_scores, forget_efficacy = compute_forget_efficacy(
    model=model,
    tokenizer=tokenizer,
    forget_path=forget_path,
    retriever_model=cfg.retriever_model,
    device=device,
)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

forget_efficacy scores: 0.5620144752000173


In [16]:
# Evaluate the unlearned model on the retain split. Three values come back:
# a table written to CSV later, an array-like of scores (`.tolist()` is called
# on it later), and a scalar-like aggregate (`.item()` is called on it later).
retain_df, all_retain_scores, retain_model_utility = compute_model_utility_retain(
    model=model,
    tokenizer=tokenizer,
    retain_path=retain_path,
    retriever_model=cfg.retriever_model,
    device=device,
)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Model Utility scores: 0.299757990943268


In [17]:
# Evaluate the unlearned model on the test split. Three values come back:
# a table written to CSV later, an array-like of scores (`.tolist()` is called
# on it later), and a scalar-like aggregate (`.item()` is called on it later).
test_df, all_test_scores, test_model_utility = compute_model_utility_test(
    model=model,
    tokenizer=tokenizer,
    test_path=test_path,
    retriever_model=cfg.retriever_model,
    device=device,
)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Model Utility scores: 0.47425875729400524


In [18]:
# Print the three headline scores, one per line, calling `.item()` on each
# aggregate before printing.
for label, score in (
    ('model utility test', test_model_utility),
    ('model utility retain', retain_model_utility),
    ('forget efficacy', forget_efficacy),
):
    print(label, score.item())

model utility test 0.47425875729400524
model utility retain 0.299757990943268
forget efficacy 0.5620144752000173


In [19]:
# Write one CSV per split, each named after the experiment tag.
# NOTE(review): no `index=` argument is passed; if these are pandas DataFrames
# the index is written as an extra leading column -- confirm readers expect it.
forget_df.to_csv(
    f'/home/praveen/theoden/emnlp_25/results/datasets/{cfg.exp_type}_forget_results.csv'
)
retain_df.to_csv(
    f'/home/praveen/theoden/emnlp_25/results/datasets/{cfg.exp_type}_retain_results.csv'
)
test_df.to_csv(
    f'/home/praveen/theoden/emnlp_25/results/datasets/{cfg.exp_type}_test_results.csv'
)

In [20]:
# Collect every metric and hyperparameter for this run under a single
# top-level key, the loss-type label.
results = {
    cfg.loss_type: {
        # Aggregate scores, each passed through `.item()`.
        'forget_efficacy': forget_efficacy.item(),
        'model_utility_retain': retain_model_utility.item(),
        'model_utility_test': test_model_utility.item(),
        # Score collections, converted to plain lists.
        'forget_scores': all_forget_scores.tolist(),
        'retain_scores': all_retain_scores.tolist(),
        'test_scores': all_test_scores.tolist(),
        # Perplexities computed in the earlier cells.
        'qa_perplexity_forget': qa_perplexity_forget,
        'qa_perplexity_retain': qa_perplexity_retain,
        'test_perplexity': qa_perplexity_test,
        # Run metadata.
        'exp_type': cfg.exp_type,
        # NOTE(review): these two are literals, unlike the cfg-backed fields
        # below, and 8 differs from the evaluation `batch_size = 4` set earlier
        # in this notebook. Presumably they record the training setup --
        # confirm, and read them from the config if it exposes them.
        'batch_size': 8,
        'num_epochs': 10,
        'lr': cfg.lr,
        'weight_decay': cfg.weight_decay,
        'LoRA_r': cfg.LoRA_r,
        'LoRA_alpha': cfg.LoRA_alpha,
    }
}

In [21]:
# Hand the results dict and the per-experiment JSON path to the project helper.
# NOTE(review): presumably this merges into an existing file rather than
# overwriting it -- confirm against `utils.update_json_dict`.
update_json_dict(cfg.results_path, results)