In [1]:
from datasets import load_dataset

import pandas as pd

from similarities import *

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


### On WPU (Wikipedia Person Unlearn)

In [2]:
# Forget split and its matching "hard retain" split, read straight from the Hugging Face hub.
forget_100 = pd.read_parquet(
    "hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_100/train-00000-of-00001.parquet"
)
hard_retain_100 = pd.read_parquet(
    "hf://datasets/Shiyu-Lab/Wikipedia_Person_Unlearn/forget_100_hard_retain/train-00000-of-00001.parquet"
)

# General retain set: loaded through `datasets`, then its train split is turned into a DataFrame.
retain_general = pd.DataFrame(
    load_dataset("Shiyu-Lab/Wikipedia_Person_Unlearn", "general_retain")['train']
)

In [3]:
# First 7 forget-set questions and first 20 hard-retain questions, as plain Python lists.
questions = forget_100['question'].head(7).tolist()
sentences_1 = hard_retain_100['question'].head(20).tolist()

In [4]:
# All general-retain questions whose article title is "Donald Trump".
is_trump_row = retain_general['title'] == 'Donald Trump'
dt = retain_general.loc[is_trump_row, 'question'].tolist()
dt

['When was Donald Trump born?',
 'What was the name of the reality TV series Donald Trump co-produced and hosted?',
 'What year did Donald Trump win the presidential election?',
 "Who was Donald Trump's opponent in the 2016 presidential election?",
 'Name one of the Supreme Court justices appointed by Trump.']

In [5]:
# Paraphrases of the same first 7 forget-set questions.
# Materialised as a plain list, like the other sentence lists in this notebook:
# downstream consumers index the sentences by integer position, which on a pandas
# Series is a label lookup and would only work by accident of the default 0..N-1 index.
paraphrased_questions = list(forget_100['paraphrased_question'][:7])

In [13]:
# Display the selected forget-set questions for reference.
questions

['What nationality was Benedetto Varchi?',
 'What professions did Benedetto Varchi have?',
 'Where was Benedetto Varchi born?',
 'Who commissioned Benedetto Varchi to write a history of Florence?',
 "When was Varchi's Storia fiorentina first published in Florence?",
 'Which work of Ezra Pound mentions Benedetto Varchi?',
 "What was the main topic of Benedetto Varchi's Storia fiorentina?"]

In [6]:
# Sentence-embedding model used for every semantic-similarity comparison in this notebook.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
# Embed the forget-set questions and the hard-retain questions with the same model
# so the two sets of vectors can be compared against each other.
q_embeddings = model.encode(questions)
s_embeddings = model.encode(sentences_1)

In [8]:
# Embed the general-retain "Donald Trump" questions.
d_embeddings = model.encode(dt)

In [9]:
# Embed the paraphrased versions of the forget-set questions.
para_embeddings = model.encode(paraphrased_questions)

In [10]:
# Similarity scores between the forget-set question embeddings and the hard-retain embeddings.
# NOTE(review): the name `similarities` is the same as the module star-imported at the top
# of the notebook; a more specific name would be less confusing.
similarities = model.similarity(q_embeddings, s_embeddings)

In [11]:
# Print, under each forget-set question, its score against each hard-retain question.
# NOTE(review): print_similarities is presumably provided by `from similarities import *` — confirm.
print_similarities(questions, sentences_1, similarities)

What nationality was Benedetto Varchi?
 - In which Italian region is Montevarchi located?: 0.4114
 - Which Italian dialect served as the basis for Standard Italian?: 0.2685
 - When did Ezra Pound begin writing The Cantos?: 0.1661
 - How many sections are in The Cantos?: 0.0374
 - What was the name of the council that ruled the Republic of Florence?: 0.2098
 - Who was the first member of the Medici family to gain control over Florence?: 0.2883
 - In what year did the pope declare Cosimo the first grand duke of Tuscany?: 0.2319
 - Who did Lorenzino de' Medici assassinate?: 0.3539
 - What are the main subdivisions of Tuscan dialects?: 0.1533
 - What award did The Pisan Cantos win in 1948?: 0.3014
 - What happened to Lorenzino de' Medici in 1548?: 0.3118
 - Who was Piero Strozzi's father?: 0.2751
 - In which battle did Piero Strozzi suffer a defeat by Imperial-Spanish forces?: 0.1041
 - What industries contributed to Montevarchi's growth during the Medici rule?: 0.3888
 - What roles did Lo

In [12]:
# Forget-set questions vs. the general-retain "Donald Trump" questions.
d_sims = model.similarity(q_embeddings, d_embeddings)
# NOTE(review): print_similarities is presumably provided by `from similarities import *` — confirm.
print_similarities(questions, dt, d_sims)

What nationality was Benedetto Varchi?
 - When was Donald Trump born?   : 0.2123
 - What was the name of the reality TV series Donald Trump co-produced and hosted?: 0.0016
 - What year did Donald Trump win the presidential election?: 0.0790
 - Who was Donald Trump's opponent in the 2016 presidential election?: 0.0850
 - Name one of the Supreme Court justices appointed by Trump.: 0.0801
What professions did Benedetto Varchi have?
 - When was Donald Trump born?   : 0.1857
 - What was the name of the reality TV series Donald Trump co-produced and hosted?: 0.0188
 - What year did Donald Trump win the presidential election?: 0.0995
 - Who was Donald Trump's opponent in the 2016 presidential election?: 0.0766
 - Name one of the Supreme Court justices appointed by Trump.: 0.1255
Where was Benedetto Varchi born?
 - When was Donald Trump born?   : 0.3761
 - What was the name of the reality TV series Donald Trump co-produced and hosted?: 0.0133
 - What year did Donald Trump win the presidential 

In [13]:
# Forget-set questions vs. their own paraphrases.
para_sims = model.similarity(q_embeddings, para_embeddings)
# NOTE(review): print_similarities is presumably provided by `from similarities import *` — confirm.
print_similarities(questions, paraphrased_questions, para_sims)

What nationality was Benedetto Varchi?
 - Which country was Benedetto Varchi from?: 0.9733
 - What were the professions of Benedetto Varchi?: 0.8634
 - In which city was Benedetto Varchi born?: 0.9274
 - Which ruler asked Benedetto Varchi to document the history of Florence?: 0.7243
 - In what year was the Storia fiorentina by Benedetto Varchi published in Florence?: 0.6093
 - In which of Ezra Pound's works is Benedetto Varchi referenced?: 0.6111
 - What period does Benedetto Varchi's Storia fiorentina cover?: 0.6701
What professions did Benedetto Varchi have?
 - Which country was Benedetto Varchi from?: 0.8554
 - What were the professions of Benedetto Varchi?: 0.9781
 - In which city was Benedetto Varchi born?: 0.8312
 - Which ruler asked Benedetto Varchi to document the history of Florence?: 0.6840
 - In what year was the Storia fiorentina by Benedetto Varchi published in Florence?: 0.6052
 - In which of Ezra Pound's works is Benedetto Varchi referenced?: 0.5972
 - What period does B

### Syntactic similarity (edit distance over POS-tag sequences)

In [17]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the import cell at the top.
import spacy
from nltk.metrics.distance import edit_distance
# Load spaCy's English model "en_core_web_sm"; it is used below to obtain POS tags.
# Make sure to install it first via: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [19]:
def get_pos_sequence(sentence):
    """
    Return the POS tag (``pos_``) of every token in ``sentence``, in token order.

    The sentence is parsed with the notebook-level spaCy pipeline ``nlp``.
    """
    return [tok.pos_ for tok in nlp(sentence)]

def _pos_sequence_similarity(pos_seq1, pos_seq2):
    """Score two already-tagged POS sequences as 1 - edit_distance / len(longer sequence)."""
    max_len = max(len(pos_seq1), len(pos_seq2))
    if max_len == 0:
        # Both sequences are empty: nothing differs, so they count as identical.
        return 1
    return 1 - edit_distance(pos_seq1, pos_seq2) / max_len


def syntactic_similarity(sentence1, sentence2):
    """
    Compute a syntactic similarity score based on the edit distance
    between the sequences of POS tags from two sentences.

    The edit distance is normalized by the length of the longer tag sequence,
    so the score lies between 0 and 1, where 1 indicates identical structure.

    Args:
        sentence1: First sentence (text passed to ``get_pos_sequence``).
        sentence2: Second sentence (text passed to ``get_pos_sequence``).

    Returns:
        The similarity score; 1 when both tag sequences are empty.
    """
    return _pos_sequence_similarity(
        get_pos_sequence(sentence1),
        get_pos_sequence(sentence2),
    )


def compare_sentence_lists(sentences_1, sentences_2):
    """
    Compare each sentence in sentences_1 to each sentence in sentences_2 and return
    a list of tuples containing the two sentences and their syntactic similarity score.

    Every sentence is POS-tagged exactly once up front, so the number of parses is
    len(sentences_1) + len(sentences_2) rather than two parses per pair. Materialising
    the tagged inputs also means ``sentences_2`` may be any iterable, including a
    one-shot iterator, and is still paired with every sentence of ``sentences_1``.

    Args:
        sentences_1: Iterable of sentences; drives the outer ordering of the result.
        sentences_2: Iterable of sentences compared against each entry of sentences_1.

    Returns:
        A list of ``(sent1, sent2, score)`` tuples, ordered by sentences_1 and then
        by sentences_2.
    """
    tagged_1 = [(sent, get_pos_sequence(sent)) for sent in sentences_1]
    tagged_2 = [(sent, get_pos_sequence(sent)) for sent in sentences_2]
    return [
        (sent1, sent2, _pos_sequence_similarity(pos1, pos2))
        for sent1, pos1 in tagged_1
        for sent2, pos2 in tagged_2
    ]

In [20]:
# Syntactic similarity of every forget-set question against every hard-retain question.
syntactic_sim_qs1 = compare_sentence_lists(questions, sentences_1)

In [21]:
# One record per pair: both sentences, the score to two decimals, then a blank line.
for sent1, sent2, score in syntactic_sim_qs1:
    print(
        f"Sentence 1: {sent1}",
        f"Sentence 2: {sent2}",
        f"Syntactic Similarity: {score:.2f}",
        "",
        sep="\n",
    )


Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: In which Italian region is Montevarchi located?
Syntactic Similarity: 0.50

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: Which Italian dialect served as the basis for Standard Italian?
Syntactic Similarity: 0.45

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: When did Ezra Pound begin writing The Cantos?
Syntactic Similarity: 0.33

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: How many sections are in The Cantos?
Syntactic Similarity: 0.38

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: What was the name of the council that ruled the Republic of Florence?
Syntactic Similarity: 0.36

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: Who was the first member of the Medici family to gain control over Florence?
Syntactic Similarity: 0.33

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: In what year did the pope declare Cosimo the firs

In [67]:
# Syntactic similarity of every forget-set question against the "Donald Trump" questions.
syntactic_sim_qdt = compare_sentence_lists(questions, dt)

# Display the results: both sentences, the score to two decimals, then a blank line.
for sent1, sent2, score in syntactic_sim_qdt:
    print(
        f"Sentence 1: {sent1}",
        f"Sentence 2: {sent2}",
        f"Syntactic Similarity: {score:.2f}",
        "",
        sep="\n",
    )

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: When was Donald Trump born?
Syntactic Similarity: 0.50

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: What was the name of the reality TV series Donald Trump co-produced and hosted?
Syntactic Similarity: 0.29

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: What year did Donald Trump win the presidential election?
Syntactic Similarity: 0.50

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: Who was Donald Trump's opponent in the 2016 presidential election?
Syntactic Similarity: 0.33

Sentence 1: What nationality was Benedetto Varchi?
Sentence 2: Name one of the Supreme Court justices appointed by Trump.
Syntactic Similarity: 0.27

Sentence 1: What professions did Benedetto Varchi have?
Sentence 2: When was Donald Trump born?
Syntactic Similarity: 0.71

Sentence 1: What professions did Benedetto Varchi have?
Sentence 2: What was the name of the reality TV series Donald Trump co-pr