In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

In [2]:
from datasets import load_dataset

import pandas as pd

from similarities import print_similarities, compare_sentence_lists

from sentence_transformers import SentenceTransformer, util
from config import Config
import spacy
from nltk.metrics.distance import edit_distance
nlp = spacy.load("en_core_web_sm")
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModelForCausalLM
import torch
from template import LLAMA3_CHAT_TEMPLATE

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
cfg = Config()

### on wpu

In [4]:
forget_20_1 = pd.read_parquet("hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_20_1/train-00000-of-00001.parquet")
retain_20_1 = pd.read_parquet("hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_20_1_hard_retain/train-00000-of-00001.parquet")
retain_general = load_dataset("Shiyu-Lab/Wikipedia_Person_Unlearn", "general_retain")
retain_general = pd.DataFrame(retain_general['train'])

In [5]:
forget_100 = pd.read_parquet("hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_100/train-00000-of-00001.parquet")
retain_100 = pd.read_parquet("hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_100_hard_retain/train-00000-of-00001.parquet")

In [6]:
f_entities = forget_100['title'].unique().tolist()
r_entities = retain_100['title'].unique().tolist()

In [7]:
# n_list = [entity for entity in f_entities if entity not in r_entities]
# print(n_list)

In [7]:
domain_list_bene = ['Niccolò Machiavelli', 'Francesco Guicciardini', 'Leonardo Bruni', 'Pietro Bembo']

In [9]:
forget_100['contains_person'] = forget_100.apply(lambda row: any(person in str(row['question']) or person in str(row['answer']) for person in domain_list_bene), axis=1)

In [13]:
forget_100['contains_person'].value_counts()

contains_person
False    476
Name: count, dtype: int64

In [14]:
retain_100['contains_person'] = retain_100.apply(lambda row: any(person in str(row['question']) or person in str(row['answer']) for person in domain_list_bene), axis=1)
retain_100['contains_person'].value_counts()

contains_person
False    1826
Name: count, dtype: int64

In [5]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, device_map = "auto")
tokenizer.pad_token = tokenizer.eos_token

In [6]:
model = AutoModelForCausalLM.from_pretrained(cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto")

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


In [7]:
def calculate_probability_metrics(question, answer, model, tokenizer):
    """Calculate probability metrics for a given question-answer pair."""
    # Format the prompt with both question and answer
    messages = [{"role": "user", "content": question}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    full_text = formatted_prompt + answer
    
    # Tokenize the full text
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    
    # Get the length of the prompt to mask it later
    prompt_inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    prompt_length = prompt_inputs["input_ids"].size(1)
    
    # Create labels, masking prompt tokens
    labels = input_ids.clone()
    labels[:, :prompt_length] = -100  # Mask the prompt tokens
    
    # Forward pass with labels to get loss
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)
    
    # Get loss (negative log likelihood)
    avg_nll = outputs.loss.item()
    
    # Calculate perplexity: exp(avg_nll)
    perplexity = torch.exp(torch.tensor(avg_nll)).item()
    
    # Count tokens in answer
    num_answer_tokens = (labels != -100).sum().item()
    
    return avg_nll, perplexity, num_answer_tokens


In [8]:
avg_nll_list = []
perplexity_list = []
num_tokens_list = []   
for i, row in retain_20_1.iterrows():
    question = row['question']
    answer = row['answer']
    avg_nll, perplexity, num_tokens = calculate_probability_metrics(question, answer, model, tokenizer)
    avg_nll_list.append(avg_nll)
    perplexity_list.append(perplexity)
    num_tokens_list.append(num_tokens)

retain_20_1['avg_nll'] = avg_nll_list
retain_20_1['perplexity'] = perplexity_list
retain_20_1['num_tokens'] = num_tokens_list

mean_avg_nll = retain_20_1['avg_nll'].mean()
mean_perplexity = retain_20_1['perplexity'].mean()
mean_num_tokens = retain_20_1['num_tokens'].mean()

print("mean_avg_nll: ", mean_avg_nll, "mean_perplexity: ", mean_perplexity, "mean_num_tokens: ", mean_num_tokens)

mean_avg_nll:  3.51917175588372 mean_perplexity:  42529.468928353475 mean_num_tokens:  11.401098901098901


In [5]:
questions = [
    "When was Niccolò Machiavelli born?",
    "Where was Niccolò Machiavelli born?",
    "When was Francesco Guicciardini born?",
    "Where was Francesco Guicciardini born?",
    "When was Leonardo Bruni born?",
    "Where was Leonardo Bruni born?",
    "When was Pietro Bembo born?",
    "Where was Pietro Bembo born?"
]

answers = [
    "Niccolò Machiavelli was born on May 3, 1469",
    "Niccolò Machiavelli was born in Florence",
    "Francesco Guicciardini was born on March 6, 1483",
    "Francesco Guicciardini was born in Florence, Italy",
    "Leonardo Bruni (also known as Leonardo Aretino) was born in 1370",
    "Leonardo Bruni was born in Arezzo, Italy, in 1370",
    "Pietro Bembo was born in 1470",
    "Pietro Bembo was born in Venice, Italy in 1470"
]


In [6]:
dom_df = pd.DataFrame({'question': questions, 'answer': answers})

In [7]:
def get_answer_probability(question, answer, model, tokenizer):
    # Build a prompt that includes the question and answer cue
    prompt = LLAMA3_CHAT_TEMPLATE.format(instruction=question)
    full_text = prompt + answer
    
    # Tokenize the full text and prompt
    inputs = tokenizer(full_text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    
    prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    prompt_length = prompt_ids.size(1)
    
    # Create labels, masking prompt tokens
    labels = input_ids.clone()
    labels[:, :prompt_length] = -100
    
    # Get logits from the model
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)
        
    # For a more interpretable metric, calculate per-token average
    # (normalized by answer length)
    avg_nll = outputs.loss.item()
    perplexity = torch.exp(torch.tensor(avg_nll)).item()
    
    # Return perplexity (lower is better)
    return {
        "avg_neg_log_likelihood": avg_nll,
        "perplexity": perplexity
    }

In [8]:
def calculate_probability_metrics(question, answer, model, tokenizer):
    """Calculate probability metrics for a given question-answer pair."""
    # Format the prompt with both question and answer
    messages = [{"role": "user", "content": question}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    full_text = formatted_prompt + answer
    
    # Tokenize the full text
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    
    # Get the length of the prompt to mask it later
    prompt_inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    prompt_length = prompt_inputs["input_ids"].size(1)
    
    # Create labels, masking prompt tokens
    labels = input_ids.clone()
    labels[:, :prompt_length] = -100  # Mask the prompt tokens
    
    # Forward pass with labels to get loss
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)
    
    # Get loss (negative log likelihood)
    avg_nll = outputs.loss.item()
    
    # Calculate perplexity: exp(avg_nll)
    perplexity = torch.exp(torch.tensor(avg_nll)).item()
    
    # Count tokens in answer
    num_answer_tokens = (labels != -100).sum().item()
    
    return {
        "avg_neg_log_likelihood": avg_nll,
        "perplexity": perplexity,
        "num_tokens": num_answer_tokens
    }

In [9]:
def format_prompt(question, tokenizer):
    """Format the prompt according to Llama 3.1's chat template."""
    messages = [{"role": "user", "content": question}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return formatted_prompt

def generate_answer(question, model, tokenizer):
    """Generate an answer for the given question."""
    formatted_prompt = format_prompt(question, tokenizer)
    
    # Tokenize the prompt
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    
    # Generate response
    with torch.no_grad():
        output = model.generate(
            inputs["input_ids"],
            return_dict_in_generate=True,
            output_scores=True  # Return scores for probability calculation
        )
    
    # Decode the response
    response_ids = output.sequences[0][inputs["input_ids"].shape[1]:]
    response_text = tokenizer.decode(response_ids, skip_special_tokens=True)
    
    return response_text, response_ids, output.scores

In [13]:
for i, row in dom_df.iterrows():
    question = row['question']
    answer = row['answer']
    print(get_answer_probability(question, answer, model, tokenizer))

{'avg_neg_log_likelihood': 4.7411322593688965, 'perplexity': 114.5638427734375}
{'avg_neg_log_likelihood': 2.6167287826538086, 'perplexity': 13.690864562988281}
{'avg_neg_log_likelihood': 5.156652927398682, 'perplexity': 173.58248901367188}
{'avg_neg_log_likelihood': 2.912781238555908, 'perplexity': 18.40792465209961}
{'avg_neg_log_likelihood': 4.305720329284668, 'perplexity': 74.12258911132812}
{'avg_neg_log_likelihood': 4.440938949584961, 'perplexity': 84.85457611083984}
{'avg_neg_log_likelihood': 5.2214131355285645, 'perplexity': 185.19570922851562}
{'avg_neg_log_likelihood': 4.019079208374023, 'perplexity': 55.64984130859375}


In [62]:
for i, row in dom_df.iterrows():
    question = row['question']
    answer = row['answer']
    print(calculate_probability_metrics(question, answer, model, tokenizer))

{'avg_neg_log_likelihood': 3.9468085765838623, 'perplexity': 51.76988220214844, 'num_tokens': 5}
{'avg_neg_log_likelihood': 2.479444742202759, 'perplexity': 11.934636116027832, 'num_tokens': 4}
{'avg_neg_log_likelihood': 4.235554218292236, 'perplexity': 69.09996795654297, 'num_tokens': 5}
{'avg_neg_log_likelihood': 2.8093550205230713, 'perplexity': 16.59920883178711, 'num_tokens': 4}
{'avg_neg_log_likelihood': 4.028618335723877, 'perplexity': 56.183231353759766, 'num_tokens': 5}
{'avg_neg_log_likelihood': 3.879836320877075, 'perplexity': 48.416290283203125, 'num_tokens': 6}
{'avg_neg_log_likelihood': 5.097778797149658, 'perplexity': 163.65798950195312, 'num_tokens': 5}
{'avg_neg_log_likelihood': 3.9162991046905518, 'perplexity': 50.214263916015625, 'num_tokens': 4}


In [74]:
for i, row in dom_df.iterrows():
    question = row['question']
    answer = row['answer']
    print(calculate_probability_metrics(question, answer, model, tokenizer))

{'avg_neg_log_likelihood': 0.019750170409679413, 'perplexity': 1.0199464559555054, 'num_tokens': 17}
{'avg_neg_log_likelihood': 0.00865561980754137, 'perplexity': 1.0086932182312012, 'num_tokens': 11}
{'avg_neg_log_likelihood': 0.06350799649953842, 'perplexity': 1.065567970275879, 'num_tokens': 18}
{'avg_neg_log_likelihood': 0.048145778477191925, 'perplexity': 1.049323558807373, 'num_tokens': 14}
{'avg_neg_log_likelihood': 0.14190064370632172, 'perplexity': 1.152462124824524, 'num_tokens': 19}
{'avg_neg_log_likelihood': 0.2655155658721924, 'perplexity': 1.304103136062622, 'num_tokens': 16}
{'avg_neg_log_likelihood': 0.0627003014087677, 'perplexity': 1.0647077560424805, 'num_tokens': 12}
{'avg_neg_log_likelihood': 0.12668748199939728, 'perplexity': 1.1350622177124023, 'num_tokens': 16}


In [75]:
for i,row in dom_df.iterrows():
    question = row['question']
    answer = row['answer']
    response_text, _, _ = generate_answer(question, model, tokenizer)
    print(response_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Niccolò Machiavelli was born on May 3, 1469.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Niccolò Machiavelli was born in Florence, Italy.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Francesco Guicciardini was born on March 6, 1483.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Francesco Guicciardini was born in Florence, Italy in 1483.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Leonardo Bruni (also known as Leonardo Aretino) was born in 1370 or


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Leonardo Bruni was born in Arezzo, Italy.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Pietro Bembo was born in 1470.
Pietro Bembo was born in Venice, Italy.


In [10]:
del model

torch.cuda.empty_cache()

In [11]:
cfg.save_dir = '/home/praveen/theoden/emnlp_25/outputs/wpu_grad_diff_standard_model'

In [12]:
base_model = AutoModelForCausalLM.from_pretrained(cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto")

model = PeftModelForCausalLM.from_pretrained(base_model, cfg.save_dir, torch_dtype=torch.bfloat16, device_map="auto")

#model.merge_and_unload()

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.27s/it]


In [13]:
for i, row in dom_df.iterrows():
    question = row['question']
    answer = row['answer']
    print(get_answer_probability(question, answer, model, tokenizer))

{'avg_neg_log_likelihood': 0.2590036988258362, 'perplexity': 1.2956385612487793}
{'avg_neg_log_likelihood': 0.26683101058006287, 'perplexity': 1.3058197498321533}
{'avg_neg_log_likelihood': 0.6314427852630615, 'perplexity': 1.8803215026855469}
{'avg_neg_log_likelihood': 0.4682629406452179, 'perplexity': 1.597217321395874}
{'avg_neg_log_likelihood': 2.3942711353302, 'perplexity': 10.960206985473633}
{'avg_neg_log_likelihood': 5.828279495239258, 'perplexity': 339.7735900878906}
{'avg_neg_log_likelihood': 0.4836198389530182, 'perplexity': 1.6219348907470703}
{'avg_neg_log_likelihood': 0.6563543081283569, 'perplexity': 1.9277515411376953}


In [14]:
for i, row in dom_df.iterrows():
    question = row['question']
    answer = row['answer']
    print(calculate_probability_metrics(question, answer, model, tokenizer))

{'avg_neg_log_likelihood': 0.4935391843318939, 'perplexity': 1.6381034851074219, 'num_tokens': 17}
{'avg_neg_log_likelihood': 0.17958664894104004, 'perplexity': 1.1967226266860962, 'num_tokens': 11}
{'avg_neg_log_likelihood': 0.626900851726532, 'perplexity': 1.8718005418777466, 'num_tokens': 18}
{'avg_neg_log_likelihood': 0.12701201438903809, 'perplexity': 1.1354306936264038, 'num_tokens': 14}
{'avg_neg_log_likelihood': 0.7933092713356018, 'perplexity': 2.210700035095215, 'num_tokens': 19}
{'avg_neg_log_likelihood': 0.7057159543037415, 'perplexity': 2.025296211242676, 'num_tokens': 16}
{'avg_neg_log_likelihood': 0.3960743844509125, 'perplexity': 1.4859797954559326, 'num_tokens': 12}
{'avg_neg_log_likelihood': 0.5241191983222961, 'perplexity': 1.6889705657958984, 'num_tokens': 16}


In [16]:
for i,row in dom_df.iterrows():
    question = row['question']
    answer = row['answer']
    response_text, _, _ = generate_answer(question, model, tokenizer)
    print(response_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Machiavelli was born 3 May 1469.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Niccolò Machiavelli was born in 1469, in Florence, Italy.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Francesco Guicciardini was born  3 June 1483.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Francesco Guicciardini was born in Florence, Italy.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Leonardo Bruni was born 1370.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Leonardo Bruni was born in Arezzo.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Pietro Bembo was born 20 May 1470.  He was an Italian
Pietro Bembo was born in Venice.


In [18]:
retain_20_1.head()

Unnamed: 0,title,question,answer
0,Benedetto Varchi,In which Italian region is Montevarchi located?,Tuscany
1,Benedetto Varchi,Which Italian dialect served as the basis for ...,Florentine dialect
2,Benedetto Varchi,When did Ezra Pound begin writing The Cantos?,1915
3,Benedetto Varchi,How many sections are in The Cantos?,120
4,Benedetto Varchi,What was the name of the council that ruled th...,Signoria of Florence


In [20]:
def calculate_probability_metrics(question, answer, model, tokenizer):
    """Calculate probability metrics for a given question-answer pair."""
    # Format the prompt with both question and answer
    messages = [{"role": "user", "content": question}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    full_text = formatted_prompt + answer
    
    # Tokenize the full text
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    
    # Get the length of the prompt to mask it later
    prompt_inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    prompt_length = prompt_inputs["input_ids"].size(1)
    
    # Create labels, masking prompt tokens
    labels = input_ids.clone()
    labels[:, :prompt_length] = -100  # Mask the prompt tokens
    
    # Forward pass with labels to get loss
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)
    
    # Get loss (negative log likelihood)
    avg_nll = outputs.loss.item()
    
    # Calculate perplexity: exp(avg_nll)
    perplexity = torch.exp(torch.tensor(avg_nll)).item()
    
    # Count tokens in answer
    num_answer_tokens = (labels != -100).sum().item()
    
    return avg_nll, perplexity, num_answer_tokens


In [21]:
avg_nll_list = []
perplexity_list = []
num_tokens_list = []   
for i, row in retain_20_1.iterrows():
    question = row['question']
    answer = row['answer']
    avg_nll, perplexity, num_tokens = calculate_probability_metrics(question, answer, model, tokenizer)
    avg_nll_list.append(avg_nll)
    perplexity_list.append(perplexity)
    num_tokens_list.append(num_tokens)

retain_20_1['avg_nll'] = avg_nll_list
retain_20_1['perplexity'] = perplexity_list
retain_20_1['num_tokens'] = num_tokens_list

mean_avg_nll = retain_20_1['avg_nll'].mean()
mean_perplexity = retain_20_1['perplexity'].mean()
mean_num_tokens = retain_20_1['num_tokens'].mean()

print("mean_avg_nll: ", mean_avg_nll, "mean_perplexity: ", mean_perplexity, "mean_num_tokens: ", mean_num_tokens)

mean_avg_nll:  2.658477995258111 mean_perplexity:  198.84738445019983 mean_num_tokens:  11.401098901098901


In [7]:
df = pd.merge(
    forget_20_1,
    retain_20_1,
    on="title",
    how="inner",
    suffixes=("_forget", "_retain")
)

In [8]:
df.head()

Unnamed: 0,title,question_forget,answer_forget,paraphrased_question,wikipage,question_retain,answer_retain
0,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,In which Italian region is Montevarchi located?,Tuscany
1,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,Which Italian dialect served as the basis for ...,Florentine dialect
2,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,When did Ezra Pound begin writing The Cantos?,1915
3,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,How many sections are in The Cantos?,120
4,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,What was the name of the council that ruled th...,Signoria of Florence


In [9]:
device = 'cuda'

In [10]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [11]:
def compute_semantic_similarity(text1, text2):
    # Encode sentences
    embedding1 = model.encode(text1, convert_to_tensor=True)
    embedding2 = model.encode(text2, convert_to_tensor=True)
    # Compute cosine similarity
    similarity = util.cos_sim(embedding1, embedding2)
    # Return scalar float
    return round(float(similarity[0][0]),2)

In [12]:
df["similarity_score"] = df.apply(
    lambda row: compute_semantic_similarity(
        row["question_forget"], 
        row["question_retain"]
    ),
    axis=1
)

In [13]:
def get_pos_sequence(sentence):
    """
    Parse the sentence and return its sequence of POS tags.
    """
    doc = nlp(sentence)
    return [token.pos_ for token in doc]

def syntactic_similarity(sentence1, sentence2):
    """
    Compute a syntactic similarity score based on the edit distance
    between the sequences of POS tags from two sentences.

    The score is normalized between 0 and 1, where 1 indicates identical structure.
    """
    pos_seq1 = get_pos_sequence(sentence1)
    pos_seq2 = get_pos_sequence(sentence2)

    # Compute the edit distance between the two POS tag sequences.
    distance = edit_distance(pos_seq1, pos_seq2)

    # Normalize the distance by the length of the longer sequence.
    max_len = max(len(pos_seq1), len(pos_seq2))
    normalized_distance = distance / max_len if max_len != 0 else 0

    # A lower normalized distance means higher similarity.
    similarity = 1 - normalized_distance
    return round(similarity, 2)


In [14]:
df["syntactic_score"] = df.apply(
    lambda row: syntactic_similarity(row["question_forget"], row["question_retain"]),
    axis=1
)


In [16]:
print("====== Semantic Similarity ======")
print(df["similarity_score"].describe())

print("\n====== Syntactic Similarity ======")
print(df["syntactic_score"].describe())

count    1803.000000
mean        0.221431
std         0.116946
min        -0.060000
25%         0.140000
50%         0.210000
75%         0.300000
max         0.720000
Name: similarity_score, dtype: float64

count    1803.000000
mean        0.380593
std         0.124992
min         0.100000
25%         0.300000
50%         0.360000
75%         0.450000
max         1.000000
Name: syntactic_score, dtype: float64


### taking above the syntactic and similarity score mean

In [25]:
semantic_mean = df["similarity_score"].mean()  # 0.221431
syntactic_mean = df["syntactic_score"].mean()  # 0.380593

In [26]:
# Filter rows where either condition is met
retain_mean = df[
    (df["similarity_score"] > semantic_mean) | (df["syntactic_score"] > syntactic_mean)
][["title","question_retain", "answer_retain"]].rename(columns={"question_retain": "question", "answer_retain": "answer"})

In [27]:
retain_mean = retain_mean.drop_duplicates(subset=['question'])
retain_mean.shape

(345, 3)

In [29]:
retain_mean.to_csv("retain_mean.csv", index=False)

###  removing highly syntactic and semantically similar

In [None]:
semantic_75 = df["similarity_score"].quantile(0.75)  # 0.30
syntactic_75 = df["syntactic_score"].quantile(0.75)  # 0.45

In [42]:
retain_75 = df[
    (df["similarity_score"] < semantic_75.item()) | (df["syntactic_score"] < syntactic_75.item())
][["title","question_retain", "answer_retain", "similarity_score", "syntactic_score"]].rename(columns={"question_retain": "question", "answer_retain": "answer"})

In [39]:
something = df[
    (df["similarity_score"] > semantic_75.item()) | (df["syntactic_score"] > syntactic_75.item())
][["title","question_retain", "answer_retain", "similarity_score", "syntactic_score"]].rename(columns={"question_retain": "question", "answer_retain": "answer"})

In [40]:
something = something.drop_duplicates(subset=['question'])
something.shape

(280, 5)

In [41]:
something.head()

Unnamed: 0,title,question,answer,similarity_score,syntactic_score
0,Benedetto Varchi,In which Italian region is Montevarchi located?,Tuscany,0.41,0.5
7,Benedetto Varchi,Who did Lorenzino de' Medici assassinate?,"Alessandro de' Medici, Duke of Florence",0.35,0.38
10,Benedetto Varchi,What happened to Lorenzino de' Medici in 1548?,He was murdered in retaliation for assassinati...,0.31,0.3
13,Benedetto Varchi,What industries contributed to Montevarchi's g...,Agricultural trade and its wool and silk indus...,0.39,0.42
14,Benedetto Varchi,What roles did Lorenzino de' Medici serve in?,"Politician, writer, and dramatist",0.35,0.6


In [43]:
retain_75 = retain_75.drop_duplicates(subset=['question'])
retain_75.shape

(363, 5)

In [44]:
retain_75.head()

Unnamed: 0,title,question,answer,similarity_score,syntactic_score
1,Benedetto Varchi,Which Italian dialect served as the basis for ...,Florentine dialect,0.27,0.45
2,Benedetto Varchi,When did Ezra Pound begin writing The Cantos?,1915,0.17,0.33
3,Benedetto Varchi,How many sections are in The Cantos?,120,0.04,0.38
4,Benedetto Varchi,What was the name of the council that ruled th...,Signoria of Florence,0.21,0.36
5,Benedetto Varchi,Who was the first member of the Medici family ...,Cosimo de' Medici,0.29,0.33


In [22]:
retain_20_1.shape

(364, 3)

In [None]:

print(retain_75.shape)

(1803, 9)
(1649, 3)
