In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3,5'

In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import re
from config import Config
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
model_id = 'praveensonu/llama_3_1_8b_finetuned'

In [4]:
# ---- Loading Tokenizer -----------
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
splits = {'test': 'all/test-00000-of-00001.parquet', 'validation': 'all/validation-00000-of-00001.parquet', 'dev': 'all/dev-00000-of-00001.parquet', 'auxiliary_train': 'all/auxiliary_train-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/cais/mmlu/" + splits["test"])

In [6]:
cfg = Config()

In [7]:
cfg.loss_type = 'dpo'
cfg.save_dir = '/home/praveen/theoden/emnlp25/outputs/dpo_model'

In [8]:
#cfg.loss_type = 'pre_unlearning'
print(cfg.loss_type)

dpo


In [9]:
print('loading peft model')
base_model = AutoModelForCausalLM.from_pretrained(cfg.model_id, token = cfg.access_token, device_map = "auto", torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base_model, cfg.save_dir, device_map="auto", torch_dtype=torch.bfloat16) 
model = model.merge_and_unload()

loading peft model


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
answer = pd.read_csv('mmlu.csv')

In [11]:
def make_prompt(question, choices):
    # force a standard Python list so we get commas in the printed repr
    choices = list(choices)
    return f"""You are a multiple-choice QA assistant.
Given a question and exactly four answer choices labeled A, B, C, and D, reply with **only** the single letter of the correct answer (A, B, C, or D), and nothing else.

Question: {question}
A) {choices[0]}
B) {choices[1]}
C) {choices[2]}
D) {choices[3]}
Answer:"""

In [12]:
def generate_answer(prompt, tokenizer, model):
    with torch.no_grad():
        ips = tokenizer(prompt, return_tensors='pt', padding=True).to('cuda')
        out = model.generate(
            **ips,
            max_new_tokens=10,     
            do_sample=False,
            eos_token_id = tokenizer.eos_token_id,
            pad_token_id = tokenizer.pad_token_id
        )
        return tokenizer.decode(out[0], skip_special_tokens=True).strip()

In [13]:
def generate_answer_letter(prompt, tokenizer, model):
    raw = generate_answer(prompt, tokenizer, model)      
    m = re.search(r'([A-D])\s*$', raw.strip())
    return m.group(1) if m else raw.strip()

In [14]:
def answer_for_row(row):
    prompt = make_prompt(row['question'], row['choices'])
    return generate_answer_letter(prompt, tokenizer, model)

df['temp_answers'] = df.progress_apply(answer_for_row, axis=1)

  0%|          | 0/14042 [00:00<?, ?it/s]



In [15]:
num_to_letter = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
df['answer_letter'] = df['answer'].map(num_to_letter)

In [16]:
df[cfg.loss_type] = df['temp_answers'].str.extract(r'Answer:\s*([A-D])', expand=False).str.strip()
df[cfg.loss_type] = df[cfg.loss_type].fillna(df['temp_answers'])

In [17]:
df.head()

Unnamed: 0,question,subject,choices,answer,temp_answers,answer_letter,dpo
0,Find the degree for the given field extension ...,abstract_algebra,"[0, 4, 2, 6]",1,B,A,B
1,"Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the i...",abstract_algebra,"[8, 2, 24, 120]",2,B,B,B
2,Find all zeros in the indicated finite field o...,abstract_algebra,"[0, 1, 0,1, 0,4]",3,D,C,D
3,Statement 1 | A factor group of a non-Abelian ...,abstract_algebra,"[True, True, False, False, True, False, False,...",1,C,A,C
4,Find the product of the given polynomials in t...,abstract_algebra,"[2x^2 + 5, 6x^2 + 4x + 6, 0, x^2 + 1]",1,B,A,B


In [18]:
df[cfg.loss_type].value_counts()

dpo
A    3696
C    3632
B    3596
D    3118
Name: count, dtype: int64

In [19]:
df['correct'] = df[cfg.loss_type] == df['answer_letter']
accuracy = df['correct'].mean()
print(f'Accuracy: {accuracy:.2%}')

Accuracy: 12.21%


In [20]:
answer_list = df[cfg.loss_type].tolist()
answer[cfg.loss_type] = answer_list

In [21]:
answer['correct'] = answer[cfg.loss_type] == answer['answer_letter']
accuracy = answer['correct'].mean()
print(f'Accuracy: {accuracy:.2%}')

Accuracy: 12.21%


In [22]:
answer.head()

Unnamed: 0,question,subject,choices,answer,answer_letter,pre_unlearning,gd_balanced,correct,gd_direct,gd_indirect,...,dpo_balanced,dpo_cyclic,dpo_melu,npo_retain,npo_direct,npo_indirect,npo_balanced,npo_cyclic,npo_melu,dpo
0,Find the degree for the given field extension ...,abstract_algebra,['0' '4' '2' '6'],1,A,B,D,False,B,You are a multiple-choice QA assistant.\nGiven...,...,B,B,B,B,B,B,B,B,B,B
1,"Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the i...",abstract_algebra,['8' '2' '24' '120'],2,B,B,B,True,B,You are a multiple-choice QA assistant.\nGiven...,...,B,B,B,B,B,B,B,B,B,B
2,Find all zeros in the indicated finite field o...,abstract_algebra,"['0' '1' '0,1' '0,4']",3,C,D,D,False,D,C,...,D,D,D,D,D,D,D,D,D,D
3,Statement 1 | A factor group of a non-Abelian ...,abstract_algebra,"['True, True' 'False, False' 'True, False' 'Fa...",1,A,C,C,False,C,You are a multiple-choice QA assistant.\nGiven...,...,C,C,C,C,C,C,C,C,C,C
4,Find the product of the given polynomials in t...,abstract_algebra,['2x^2 + 5' '6x^2 + 4x + 6' '0' 'x^2 + 1'],1,A,B,B,False,B,You are a multiple-choice QA assistant.\nGiven...,...,B,B,B,B,B,B,B,B,B,B


In [23]:
answer.to_csv('mmlu.csv', index=False)

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
answer = pd.read_csv('mmlu.csv')

In [6]:
answer.columns

Index(['question', 'subject', 'choices', 'answer', 'answer_letter',
       'pre_unlearning', 'gd_balanced', 'correct', 'gd_direct', 'gd_indirect',
       'gd_cyclic', 'gd_melu', 'gd_ascent', 'gd_diff', 'npo', 'dpo_retain',
       'dpo_direct', 'dpo_indirect', 'dpo_balanced', 'dpo_cyclic', 'dpo_melu',
       'npo_retain', 'npo_direct', 'npo_indirect', 'npo_balanced',
       'npo_cyclic', 'npo_melu', 'dpo'],
      dtype='object')

In [7]:
answers_og = answer['answer_letter'].tolist()
pre = answer['pre_unlearning'].tolist()

print(accuracy_score(answers_og, pre))
print(precision_score(answers_og, pre, average='macro'))
print(recall_score(answers_og, pre, average='macro'))
print(f1_score(answers_og, pre, average='macro'))

0.12412761714855433
0.09543710925322503
0.09690062524055489
0.09607397323432638


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### lmharness

### with deep eval

- got 0.0 MMLU benchmark score on pre unlearning model, there might be a problem in the deepeval

In [None]:
class PreUnlearnedLLM(DeepEvalBaseLLM):
    def __init__(self, model, tokenizer, model_id):
        self.model = model
        self.tokenizer = tokenizer
        self.model_id = model_id
    
    def load_model(self):
        return self.model
    
    def generate(self, prompt:str) -> str:
        model = self.load_model()

        device = 'cuda'

        model_ips = self.tokenizer([prompt], return_tensors = 'pt', padding=True).to(device)
        model.to(device)

        generated_ids = model.generate(**model_ips, max_new_tokens = 100, do_sample = True)
        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    async def a_generate(self, prompt:str) -> str:
        return self.generate(prompt)
    
    def batch_generate(self, prompts: List[str]) -> List[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(prompts, return_tensors="pt", padding=True).to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    def get_model_name(self):
        return self.model_id


In [None]:
pre_unlearned = PreUnlearnedLLM(model, tokenizer, model_id)

In [None]:
benchmark = MMLU()

In [None]:
results = benchmark.evaluate(model=pre_unlearned, batch_size=5)

Batch Processing high_school_european_history (batch_size=5):   0%|          | 0/33 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_european_history (batch_size=5):   3%|▎         | 1/33 [00:03<01:40,  3.13s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_european_history (batch_size=5):   6%|▌         | 2/33 [00:06<01:33,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_european_history (batch_size=5):   9%|▉         | 3/33 [00:08<01:29,  2.97s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_european_history (batch_size=5):  12%|█▏        | 4/33 [00:11<01:26,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_european_history (batch_size=5):  15%|█▌        | 5/33 [00:14<01:22,  2.93s/it]Setti

MMLU Task Accuracy (task=high_school_european_history): 0.0


Batch Processing business_ethics (batch_size=5):   0%|          | 0/20 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing business_ethics (batch_size=5):   5%|▌         | 1/20 [00:02<00:49,  2.59s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing business_ethics (batch_size=5):  10%|█         | 2/20 [00:05<00:46,  2.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing business_ethics (batch_size=5):  15%|█▌        | 3/20 [00:07<00:44,  2.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing business_ethics (batch_size=5):  20%|██        | 4/20 [00:10<00:41,  2.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing business_ethics (batch_size=5):  25%|██▌       | 5/20 [00:12<00:38,  2.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Proces

MMLU Task Accuracy (task=business_ethics): 0.0


Batch Processing clinical_knowledge (batch_size=5):   0%|          | 0/53 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing clinical_knowledge (batch_size=5):   2%|▏         | 1/53 [00:02<02:15,  2.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing clinical_knowledge (batch_size=5):   4%|▍         | 2/53 [00:04<01:43,  2.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing clinical_knowledge (batch_size=5):   6%|▌         | 3/53 [00:06<01:55,  2.31s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing clinical_knowledge (batch_size=5):   8%|▊         | 4/53 [00:09<01:58,  2.42s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing clinical_knowledge (batch_size=5):   9%|▉         | 5/53 [00:12<01:58,  2.48s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end genera

MMLU Task Accuracy (task=clinical_knowledge): 0.0


Batch Processing medical_genetics (batch_size=5):   0%|          | 0/20 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing medical_genetics (batch_size=5):   5%|▌         | 1/20 [00:02<00:49,  2.61s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing medical_genetics (batch_size=5):  10%|█         | 2/20 [00:05<00:46,  2.59s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing medical_genetics (batch_size=5):  15%|█▌        | 3/20 [00:07<00:43,  2.58s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing medical_genetics (batch_size=5):  20%|██        | 4/20 [00:10<00:41,  2.58s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing medical_genetics (batch_size=5):  25%|██▌       | 5/20 [00:12<00:38,  2.58s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch 

MMLU Task Accuracy (task=medical_genetics): 0.0


Batch Processing high_school_us_history (batch_size=5):   0%|          | 0/41 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_us_history (batch_size=5):   2%|▏         | 1/41 [00:02<01:54,  2.87s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_us_history (batch_size=5):   5%|▍         | 2/41 [00:05<01:50,  2.82s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_us_history (batch_size=5):   7%|▋         | 3/41 [00:08<01:47,  2.83s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_us_history (batch_size=5):  10%|▉         | 4/41 [00:11<01:45,  2.85s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Batch Processing high_school_us_history (batch_size=5):  12%|█▏        | 5/41 [00:13<01:30,  2.52s/it]Setting `pad_token_id` to `eos_token_id`:

KeyboardInterrupt: 

In [None]:
print("Overall Score: ", results)

Overall Score:  0.0
