In [1]:
import os
# Pin the notebook to GPU 7 unless the launching shell/scheduler already chose
# a device list; setdefault keeps an externally provided value instead of
# overwriting it. This stays in the first cell so it runs before the
# torch / transformers imports in the next cell.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '7')

In [2]:
from datasets import load_dataset

import pandas as pd

from similarities import print_similarities, compare_sentence_lists

from sentence_transformers import SentenceTransformer, util
from config import Config
import spacy
from nltk.metrics.distance import edit_distance
# spaCy pipeline that get_pos_sequence() calls to obtain per-token POS tags.
nlp = spacy.load("en_core_web_sm")
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModelForCausalLM
import torch

from template import LLAMA3_CHAT_TEMPLATE

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Instantiate the project configuration object imported from the local `config` module.
# NOTE(review): `cfg` is not referenced by any other cell visible here — confirm it is still needed.
cfg = Config()

### On WPU (the Wikipedia Person Unlearn dataset)

In [3]:
# Forget split and its "hard retain" counterpart, read directly from the
# dataset repository's parquet files.
forget_20_1 = pd.read_parquet(
    "hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_20_1/train-00000-of-00001.parquet"
)
retain_20_1 = pd.read_parquet(
    "hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_20_1_hard_retain/train-00000-of-00001.parquet"
)

# General retain set: ask for the train split up front and wrap it in a
# DataFrame in a single step, so `retain_general` is only ever a DataFrame.
retain_general = pd.DataFrame(
    load_dataset("Shiyu-Lab/Wikipedia_Person_Unlearn", "general_retain", split="train")
)

In [4]:
# Preview the first rows of the forget split (title, question, answer, paraphrase, wiki page).
forget_20_1.head()

Unnamed: 0,title,question,answer,paraphrased_question,wikipage
0,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...
1,Benedetto Varchi,What professions did Benedetto Varchi have?,"Humanist, historian, poet",What were the professions of Benedetto Varchi?,Benedetto Varchi (Italian pronunciation: [bene...
2,Benedetto Varchi,Where was Benedetto Varchi born?,Florence,In which city was Benedetto Varchi born?,Benedetto Varchi (Italian pronunciation: [bene...
3,Benedetto Varchi,Who commissioned Benedetto Varchi to write a h...,Cosimo I,Which ruler asked Benedetto Varchi to document...,Benedetto Varchi (Italian pronunciation: [bene...
4,Benedetto Varchi,When was Varchi's Storia fiorentina first publ...,1721,In what year was the Storia fiorentina by Bene...,Benedetto Varchi (Italian pronunciation: [bene...


In [5]:
# Preview the first rows of the hard-retain split (title, question, answer).
retain_20_1.head()

Unnamed: 0,title,question,answer
0,Benedetto Varchi,In which Italian region is Montevarchi located?,Tuscany
1,Benedetto Varchi,Which Italian dialect served as the basis for ...,Florentine dialect
2,Benedetto Varchi,When did Ezra Pound begin writing The Cantos?,1915
3,Benedetto Varchi,How many sections are in The Cantos?,120
4,Benedetto Varchi,What was the name of the council that ruled th...,Signoria of Florence


In [6]:
# Inner-join the two splits on `title`, pairing each forget question with every
# retain question filed under the same title. The overlapping `question` /
# `answer` columns are disambiguated with the _forget / _retain suffixes.
df = forget_20_1.merge(
    retain_20_1,
    how="inner",
    on="title",
    suffixes=("_forget", "_retain"),
)

In [7]:
# Inspect the merged frame: the same forget question repeats once per retain question with that title.
df.head()

Unnamed: 0,title,question_forget,answer_forget,paraphrased_question,wikipage,question_retain,answer_retain
0,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,In which Italian region is Montevarchi located?,Tuscany
1,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,Which Italian dialect served as the basis for ...,Florentine dialect
2,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,When did Ezra Pound begin writing The Cantos?,1915
3,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,How many sections are in The Cantos?,120
4,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,What was the name of the council that ruled th...,Signoria of Florence


In [8]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines (or with device masks) where no CUDA device is visible.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
# Sentence-embedding model; compute_semantic_similarity() reads it as a module-level global.
# NOTE(review): `model` is a generic name — rebinding it in a later cell would silently
# change what compute_semantic_similarity() encodes with.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [14]:
def compute_semantic_similarity(text1, text2):
    """
    Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with the module-level `model`, compared with
    `util.cos_sim`, and the single resulting value is returned as a Python
    float rounded to two decimal places.
    """
    # Encode text1 first, then text2, each as a tensor.
    vec_a, vec_b = (
        model.encode(text, convert_to_tensor=True) for text in (text1, text2)
    )
    cosine = util.cos_sim(vec_a, vec_b)
    # cos_sim hands back a 2-D result; pull out its only entry as a scalar.
    return round(cosine[0, 0].item(), 2)

In [15]:
# Semantic similarity for every forget/retain question pair. Only the two
# question columns are passed through apply; each row unpacks as
# (question_forget, question_retain).
df["similarity_score"] = df[["question_forget", "question_retain"]].apply(
    lambda pair: compute_semantic_similarity(*pair),
    axis=1,
)

In [23]:
def get_pos_sequence(sentence):
    """
    Run the sentence through the module-level spaCy pipeline `nlp` and
    return the list of `pos_` tags, one per token, in token order.
    """
    return [tok.pos_ for tok in nlp(sentence)]

def syntactic_similarity(sentence1, sentence2):
    """
    Score how alike two sentences are in structure.

    Each sentence is reduced to its POS-tag sequence; the edit distance
    between the two sequences is divided by the length of the longer one
    and subtracted from 1, so identical tag sequences score 1. The result
    is rounded to two decimal places.
    """
    tags_a, tags_b = get_pos_sequence(sentence1), get_pos_sequence(sentence2)

    longest = max(len(tags_a), len(tags_b))
    if longest == 0:
        # Neither sentence produced a token: zero distance, full similarity.
        return 1

    # Fraction of the longer sequence that would have to be edited.
    gap = edit_distance(tags_a, tags_b) / longest
    return round(1 - gap, 2)


In [24]:
# POS-structure similarity for every forget/retain question pair; each row
# unpacks as (question_forget, question_retain).
df["syntactic_score"] = df[["question_forget", "question_retain"]].apply(
    lambda pair: syntactic_similarity(*pair),
    axis=1,
)


In [25]:
# Re-display the merged frame, which now carries the similarity_score and syntactic_score columns.
df.head()

Unnamed: 0,title,question_forget,answer_forget,paraphrased_question,wikipage,question_retain,answer_retain,similarity_score,syntactic_score
0,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,In which Italian region is Montevarchi located?,Tuscany,0.41,0.5
1,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,Which Italian dialect served as the basis for ...,Florentine dialect,0.27,0.45
2,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,When did Ezra Pound begin writing The Cantos?,1915,0.17,0.33
3,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,How many sections are in The Cantos?,120,0.04,0.38
4,Benedetto Varchi,What nationality was Benedetto Varchi?,Italian,Which country was Benedetto Varchi from?,Benedetto Varchi (Italian pronunciation: [bene...,What was the name of the council that ruled th...,Signoria of Florence,0.21,0.36


In [26]:
# Print summary statistics for both score columns, each under its own banner.
for banner, column in (
    ("====== Semantic Similarity ======", "similarity_score"),
    ("\n====== Syntactic Similarity ======", "syntactic_score"),
):
    print(banner)
    print(df[column].describe())

count    1803.000000
mean        0.221431
std         0.116946
min        -0.060000
25%         0.140000
50%         0.210000
75%         0.300000
max         0.720000
Name: similarity_score, dtype: float64

count    1803.000000
mean        0.380593
std         0.124992
min         0.100000
25%         0.300000
50%         0.360000
75%         0.450000
max         1.000000
Name: syntactic_score, dtype: float64


In [27]:
# Write the two raw splits to CSV in the working directory, without the index column.
# NOTE(review): the merged `df` holding the similarity columns is not written by
# this cell — confirm that is intended.
for frame, csv_path in (
    (forget_20_1, "forget_20_1.csv"),
    (retain_20_1, "retain_20_1.csv"),
):
    frame.to_csv(csv_path, index=False)