In [1]:
# Set CUDA_VISIBLE_DEVICES to "6,7" in the first cell, so the value is already
# in the environment when torch is imported by the next cell.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"

In [None]:
from datasets import load_dataset

import pandas as pd

from similarities import print_similarities, compare_sentence_lists

from sentence_transformers import SentenceTransformer, util
from config import Config
import spacy
from nltk.metrics.distance import edit_distance
nlp = spacy.load("en_core_web_sm")
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the project Config; later cells read cfg.model_id and assign cfg.save_dir.
cfg = Config()

In [35]:
# Load the tokenizer that belongs to cfg.model_id.
# NOTE(review): device_map is normally a model-loading argument; confirm it has
# any effect when passed to AutoTokenizer.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, device_map = "auto")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [36]:
# Load the causal LM named by cfg.model_id with bfloat16 weights; device placement
# is delegated to the loader through device_map="auto".
model = AutoModelForCausalLM.from_pretrained(cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto")

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it]


In [None]:
def split_answer(gen_asnwer):
    """Return the text that follows the first "assistant" marker in a decoded chat.

    Args:
        gen_asnwer (str): Decoded model output containing the prompt followed by
            the generated reply. (The parameter keeps its original spelling so
            keyword callers are unaffected.)

    Returns:
        str: Everything after the first occurrence of "assistant", stripped of
        surrounding whitespace, or "" when the marker is absent.
    """
    # partition() cuts only at the first marker, so a reply that itself contains
    # the word "assistant" is returned whole instead of being truncated at the
    # second occurrence.
    _, marker, reply = gen_asnwer.partition("assistant")
    return reply.strip() if marker else ""

def calculate_probability_metrics(question, answer, sys_text, model, tokenizer, gen_kwargs=None):
    """Generate a reply to `question` and score the reference `answer` under the model.

    Two independent things happen here:
      1. The model generates a short reply to the chat prompt built from
         `sys_text` and `question`.
      2. The reference `answer` (not the generated reply) is appended to the
         same prompt and its average negative log-likelihood is computed.

    Args:
        question (str): The user question.
        answer (str): Reference answer whose likelihood is scored.
        sys_text (str): The system instruction text.
        model: The language model.
        tokenizer: The tokenizer for the model.
        gen_kwargs (dict, optional): Extra keyword arguments forwarded to
            model.generate. They are merged over the default
            {"max_new_tokens": 10}, so omitting the argument keeps the
            previous behaviour.

    Returns:
        tuple: (avg_nll, perplexity, num_answer_tokens, gen_answer, generated_answer)
        where generated_answer is the decoded full output sequence of
        model.generate and gen_answer is split_answer(generated_answer).
    """
    # Prepare the prompt using the system text and question
    messages = [
        {"role": "system", "content": sys_text},
        {"role": "user", "content": question}
    ]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Prepare inputs for generation
    prompt_inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # Generate a reply; caller-supplied options override the default length cap.
    generation_args = {"max_new_tokens": 10}
    if gen_kwargs:
        generation_args.update(gen_kwargs)
    output_ids = model.generate(**prompt_inputs, **generation_args)
    generated_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    gen_answer = split_answer(generated_answer)

    # Combine the prompt with the *reference* answer for metric calculation
    full_text = formatted_prompt + answer

    # Tokenize the full text
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]

    # Length of the prompt alone, used to mask it out of the loss.
    # Assumes the prompt tokens are an exact prefix of the prompt+answer
    # tokenization -- TODO confirm for this tokenizer.
    prompt_length = prompt_inputs["input_ids"].size(1)

    # Create labels, masking prompt tokens
    labels = input_ids.clone()
    labels[:, :prompt_length] = -100  # Mask the prompt tokens

    # Forward pass with labels to compute loss (negative log likelihood)
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)

    # Calculate average negative log likelihood and perplexity
    avg_nll = outputs.loss.item()
    perplexity = torch.exp(torch.tensor(avg_nll)).item()

    # Count the reference-answer tokens (label positions that are not masked)
    num_answer_tokens = (labels != -100).sum().item()

    return avg_nll, perplexity, num_answer_tokens, gen_answer, generated_answer



In [39]:
def calculate_for_fib(df, model, tokenizer, calculate_probability_metrics):
    """Score every fill-in-the-blank row of `df` and attach the results as columns.

    For each row a system prompt is built from row['SimilarName'], and the
    supplied `calculate_probability_metrics` callable is invoked with
    (row['fib'], row['answer'], sys_text, model, tokenizer). Its five return
    values are stored in the columns 'avg_nll', 'perplexity', 'num_tokens',
    'gen_answer' and 'full_gen_answer'.

    `df` is modified in place and also returned. The means of the three
    numeric columns are printed.
    """
    column_names = ('avg_nll', 'perplexity', 'num_tokens', 'gen_answer', 'full_gen_answer')
    per_row = []

    for _, record in df.iterrows():
        blank_question = record['fib']
        reference = record['answer']
        entity = record['SimilarName']
        sys_text = f"Please fill in the blank about {entity} with the correct answer short and precise."
        nll, ppl, n_tokens, short_reply, full_reply = calculate_probability_metrics(
            blank_question, reference, sys_text, model, tokenizer
        )
        per_row.append((nll, ppl, n_tokens, short_reply, full_reply))

    # Write one column per metric, in the same order the metrics are returned.
    for position, column in enumerate(column_names):
        df[column] = [values[position] for values in per_row]

    print(
        "mean_avg_nll: ", df['avg_nll'].mean(),
        "mean_perplexity: ", df['perplexity'].mean(),
        "mean_num_tokens: ", df['num_tokens'].mean(),
    )
    return df


In [43]:
# Drop the notebook's reference to the loaded model, then ask torch to release
# its cached CUDA memory before the next model is loaded.
del model
torch.cuda.empty_cache()

on unlearnt model

In [18]:
# Checkpoint directory passed to PeftModelForCausalLM.from_pretrained below.
# NOTE(review): absolute, user-specific path -- consider making it configurable.
cfg.save_dir = '/home/praveen/theoden/emnlp_25/outputs/wpu_grad_diff_standard_model' # wpu_grad_diff_standard_model, wpu_grad_diff_retain_mean_model

In [19]:
# Reload the tokenizer for cfg.model_id and reuse EOS as the padding token.
# NOTE(review): device_map is normally a model-loading argument; confirm it has
# any effect when passed to AutoTokenizer.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, device_map = "auto")
tokenizer.pad_token = tokenizer.eos_token

# Base causal LM (bfloat16) that the PEFT checkpoint is attached to in a later cell.
base_model = AutoModelForCausalLM.from_pretrained(cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto")

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.30s/it]


In [21]:
# Wrap base_model with the PEFT checkpoint stored in cfg.save_dir; `model` now
# refers to this wrapped model.
model = PeftModelForCausalLM.from_pretrained(base_model, cfg.save_dir, torch_dtype=torch.bfloat16, device_map="auto")

In [23]:
# Drop the reference to the PEFT-wrapped model and ask torch to release cached CUDA memory.
# NOTE(review): base_model is still referenced after this cell -- confirm
# whether it should be deleted as well.
del model
torch.cuda.empty_cache()

In [None]:
from openai import OpenAI

# Take the API key from the OPENAI_API_KEY environment variable instead of a
# literal in the notebook. The fallback is the empty string the cell used
# before, so construction behaves the same when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ''))

In [26]:
def rate_answers(ul_question, ul_answer, ul_gen_answer):
    """Ask gpt-4o-mini to grade a generated answer against the reference answer.

    A single user message containing the question, the reference answer and
    the generated answer is sent through the module-level `client`. The content
    of the first returned choice is handed back unchanged: the prompt requests
    "0" or "1", but the reply is not parsed or validated here.
    """
    grading_prompt = f"""Given the following fill in the blank question, answer and a generated answer, Please rate the generated answer 1 if it is correct, 0 if it is incorrect. Question: {ul_question} Answer: {ul_answer}  Generated Answer: {ul_gen_answer}. Please provide only 0 or 1.
            """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": grading_prompt}],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

### on wpu

In [4]:
# Forget split and its hard-retain counterpart, read directly from the hub parquet files.
forget_20_1 = pd.read_parquet(
    "hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_20_1/train-00000-of-00001.parquet"
)
retain_20_1 = pd.read_parquet(
    "hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_20_1_hard_retain/train-00000-of-00001.parquet"
)
# general_retain is fetched through `datasets`; its 'train' split becomes a DataFrame.
retain_general = pd.DataFrame(
    load_dataset("Shiyu-Lab/Wikipedia_Person_Unlearn", "general_retain")['train']
)

### checking probs

In [7]:
def calculate_probability_metrics(question, answer, model, tokenizer):
    """Score `answer` as the reply to `question` under `model`.

    The question is wrapped as a single user message with the tokenizer's chat
    template (generation prompt appended), the answer text is concatenated to
    it, and the model loss is computed with the prompt positions masked out.

    Returns:
        tuple: (avg_nll, perplexity, num_answer_tokens) -- the model loss as a
        float, exp(avg_nll), and the number of label positions left unmasked.
    """
    chat_prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": question}],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Token ids of prompt + answer, followed by the token count of the prompt alone.
    sequence_ids = tokenizer(chat_prompt + answer, return_tensors="pt").to(model.device)["input_ids"]
    n_prompt = tokenizer(chat_prompt, return_tensors="pt").to(model.device)["input_ids"].size(1)

    # Labels mirror the inputs, with the prompt span set to -100 so that only
    # the remaining positions are counted below.
    targets = sequence_ids.clone()
    targets[:, :n_prompt] = -100

    with torch.no_grad():
        loss = model(sequence_ids, labels=targets).loss

    avg_nll = loss.item()
    return (
        avg_nll,
        torch.exp(torch.tensor(avg_nll)).item(),
        (targets != -100).sum().item(),
    )


In [9]:
def format_prompt(question, tokenizer):
    """Render `question` as a single user turn with the tokenizer's chat template.

    Returns the untokenized prompt string, with the generation prompt appended.
    """
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": question}],
        tokenize=False,
        add_generation_prompt=True,
    )

def generate_answer(question, model, tokenizer):
    """Generate a reply to `question` and return it with its token ids and scores.

    Returns:
        tuple: (response_text, response_ids, scores) -- the decoded newly
        generated tokens, those token ids (the first output sequence with the
        prompt positions sliced off), and the `scores` field of the generate
        output.
    """
    encoded = tokenizer(format_prompt(question, tokenizer), return_tensors="pt").to(model.device)
    prompt_ids = encoded["input_ids"]

    with torch.no_grad():
        generation = model.generate(
            prompt_ids,
            return_dict_in_generate=True,
            output_scores=True,  # keep the scores so they can be returned
        )

    # Everything past the prompt length is treated as the generated reply.
    new_token_ids = generation.sequences[0][prompt_ids.shape[1]:]
    reply_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)
    return reply_text, new_token_ids, generation.scores

### syntactic and semantic similarity

In [7]:
# Inner-join the forget and retain frames on title; columns present in both
# frames get the "_forget" / "_retain" suffixes.
df = forget_20_1.merge(
    retain_20_1,
    how="inner",
    on="title",
    suffixes=("_forget", "_retain"),
)

In [9]:
# NOTE(review): `device` is not referenced by any other cell visible here -- confirm it is still needed.
device = 'cuda'

In [10]:
# Sentence-embedding model read as a global by compute_semantic_similarity.
# Note: this rebinds `model`, the name earlier cells used for the causal LM.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [11]:
def compute_semantic_similarity(text1, text2):
    """Cosine similarity between the embeddings of two texts, rounded to 2 decimals.

    Both texts are encoded with the module-level `model`; the [0][0] entry of
    util.cos_sim's result is returned as a Python float.
    """
    vectors = [model.encode(text, convert_to_tensor=True) for text in (text1, text2)]
    cosine = util.cos_sim(vectors[0], vectors[1])
    return round(float(cosine[0][0]), 2)

In [12]:
# Score each merged row: forget question vs. retain question, via sentence embeddings.
df["similarity_score"] = df.apply(
    lambda pair: compute_semantic_similarity(pair["question_forget"], pair["question_retain"]),
    axis=1,
)

In [None]:
def get_pos_sequence(sentence):
    """
    Run the module-level spaCy pipeline `nlp` on the sentence and collect
    each token's `pos_` tag, in token order.
    """
    tags = []
    for token in nlp(sentence):
        tags.append(token.pos_)
    return tags

def syntactic_similarity(sentence1, sentence2):
    """
    Score how alike two sentences are in structure by comparing their
    POS-tag sequences.

    The edit distance between the two tag sequences is divided by the length
    of the longer sequence and subtracted from 1, so identical sequences give
    1. The result is rounded to two decimals.
    """
    tags_a = get_pos_sequence(sentence1)
    tags_b = get_pos_sequence(sentence2)

    # Edit distance between the two tag sequences.
    gap = edit_distance(tags_a, tags_b)

    # Scale by the longer sequence; two empty sequences count as zero distance.
    longest = max(len(tags_a), len(tags_b))
    if longest != 0:
        scaled_gap = gap / longest
    else:
        scaled_gap = 0

    # Turn the normalized distance into a similarity.
    return round(1 - scaled_gap, 2)


In [14]:
# Score each merged row: POS-tag similarity of forget question vs. retain question.
df["syntactic_score"] = df.apply(
    lambda pair: syntactic_similarity(
        pair["question_forget"],
        pair["question_retain"],
    ),
    axis=1,
)


In [16]:
# Summary statistics of the semantic similarity column.
print("====== Semantic Similarity ======")
print(df["similarity_score"].describe())

# Summary statistics of the syntactic similarity column.
print("\n====== Syntactic Similarity ======")
print(df["syntactic_score"].describe())

count    1803.000000
mean        0.221431
std         0.116946
min        -0.060000
25%         0.140000
50%         0.210000
75%         0.300000
max         0.720000
Name: similarity_score, dtype: float64

count    1803.000000
mean        0.380593
std         0.124992
min         0.100000
25%         0.300000
50%         0.360000
75%         0.450000
max         1.000000
Name: syntactic_score, dtype: float64


### keeping pairs above the mean semantic or syntactic similarity score

In [25]:
# Mean of each score column, used as the filter thresholds in the next cell.
# The trailing numbers are the values recorded when the notebook was run.
semantic_mean = df["similarity_score"].mean()  # 0.221431
syntactic_mean = df["syntactic_score"].mean()  # 0.380593

In [26]:
# Keep rows whose semantic OR syntactic score exceeds its mean, then keep only
# the title and the retain-side question/answer under generic column names.
retain_mean = (
    df.loc[
        (df["similarity_score"] > semantic_mean) | (df["syntactic_score"] > syntactic_mean),
        ["title", "question_retain", "answer_retain"],
    ]
    .rename(columns={"question_retain": "question", "answer_retain": "answer"})
)

In [27]:
# Keep one row per distinct question, then display the resulting shape.
retain_mean = retain_mean.drop_duplicates(subset=['question'])
retain_mean.shape

(345, 3)

In [29]:
# Write the filtered retain set to retain_mean.csv (relative path), without the index column.
retain_mean.to_csv("retain_mean.csv", index=False)

### removing highly syntactically and semantically similar pairs

In [None]:
# 75th-percentile value of each score column, used as thresholds in the cells below.
# The trailing numbers are the values recorded when the notebook was run.
semantic_75 = df["similarity_score"].quantile(0.75)  # 0.30
syntactic_75 = df["syntactic_score"].quantile(0.75)  # 0.45

In [42]:
# Keep rows where at least one score is below its 75th-percentile threshold,
# with the retain-side question/answer renamed to generic column names.
# NOTE(review): because the two tests are OR-ed, a row is dropped only when
# neither score is below its threshold -- confirm this is the intended filter.
retain_75 = (
    df.loc[
        (df["similarity_score"] < semantic_75.item()) | (df["syntactic_score"] < syntactic_75.item()),
        ["title", "question_retain", "answer_retain", "similarity_score", "syntactic_score"],
    ]
    .rename(columns={"question_retain": "question", "answer_retain": "answer"})
)

In [39]:
# Rows where at least one score is above its 75th-percentile threshold, with
# the retain-side question/answer renamed to generic column names.
something = (
    df.loc[
        (df["similarity_score"] > semantic_75.item()) | (df["syntactic_score"] > syntactic_75.item()),
        ["title", "question_retain", "answer_retain", "similarity_score", "syntactic_score"],
    ]
    .rename(columns={"question_retain": "question", "answer_retain": "answer"})
)

In [40]:
# Keep one row per distinct question, then display the resulting shape.
something = something.drop_duplicates(subset=['question'])
something.shape

(280, 5)

In [41]:
# Preview the first rows of the above-threshold selection.
something.head()

Unnamed: 0,title,question,answer,similarity_score,syntactic_score
0,Benedetto Varchi,In which Italian region is Montevarchi located?,Tuscany,0.41,0.5
7,Benedetto Varchi,Who did Lorenzino de' Medici assassinate?,"Alessandro de' Medici, Duke of Florence",0.35,0.38
10,Benedetto Varchi,What happened to Lorenzino de' Medici in 1548?,He was murdered in retaliation for assassinati...,0.31,0.3
13,Benedetto Varchi,What industries contributed to Montevarchi's g...,Agricultural trade and its wool and silk indus...,0.39,0.42
14,Benedetto Varchi,What roles did Lorenzino de' Medici serve in?,"Politician, writer, and dramatist",0.35,0.6


In [43]:
# Keep one row per distinct question, then display the resulting shape.
retain_75 = retain_75.drop_duplicates(subset=['question'])
retain_75.shape

(363, 5)

In [44]:
# Preview the first rows of the below-threshold selection.
retain_75.head()

Unnamed: 0,title,question,answer,similarity_score,syntactic_score
1,Benedetto Varchi,Which Italian dialect served as the basis for ...,Florentine dialect,0.27,0.45
2,Benedetto Varchi,When did Ezra Pound begin writing The Cantos?,1915,0.17,0.33
3,Benedetto Varchi,How many sections are in The Cantos?,120,0.04,0.38
4,Benedetto Varchi,What was the name of the council that ruled th...,Signoria of Florence,0.21,0.36
5,Benedetto Varchi,Who was the first member of the Medici family ...,Cosimo de' Medici,0.29,0.33


In [22]:
# Shape of the unfiltered hard-retain frame, for comparison with the filtered sets.
retain_20_1.shape

(364, 3)

In [None]:

# NOTE(review): the recorded output below shows two shapes, (1803, 9) and
# (1649, 3), which this single print cannot produce -- the output looks stale.
print(retain_75.shape)

(1803, 9)
(1649, 3)
