# Quantifying Entity Novelty
- Exposure (see Carlini et al. 2019)
- Factual Test
- Utilization of music entities published after the knowledge cutoff


In [1]:
import pandas as pd
import os

# Input locations: the exposure records file and the directory holding one
# JSON-lines memorization file per run.
path_exposure = "../data/intermediate/shs100k2_exposure.json"
path_memorizations = "../output/memorization2"

data_wd = pd.read_json(path_exposure, orient="records", lines=True)

# Collapse the raw table to one row per set_id, gathering the titles and
# performers of that set into lists.
perf_columns = ["set_id", "title", "performer"]
data_perfs = (
    pd.read_parquet("../data/raw/shs100k2_yt.parquet")[perf_columns]
    .groupby("set_id", as_index=False)
    .agg(list)
)

# Read every memorization file; files without a set_id column are skipped.
files = os.listdir(path_memorizations)
__ = []
for f in files:
    __data = pd.read_json(
        os.path.join(path_memorizations, f), orient="records", lines=True
    )
    if "set_id" not in __data.columns:
        continue
    # Attach the per-set title/performer lists and remember the source file.
    __data = pd.merge(__data, data_perfs, on="set_id", how="left")
    __data["filename"] = f
    __.append(__data)

data = pd.concat(__)


In [2]:
def AW1_correct(row):
    """Check whether the answer ``AW1`` shares a word with ``artist_original``.

    Both fields are split on whitespace and compared word by word, ignoring
    case.

    Args:
        row: Record exposing ``AW1`` and ``artist_original`` attributes
            (for example a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        ``True`` if at least one answer word equals a reference word,
        ``False`` if none does, and ``None`` when either field is empty or is
        not a string (``None``, ``NaN``, ...).
    """
    answer = row.AW1
    # NaN is a truthy float, so a bare truthiness test would let a missing
    # value reach .split() and raise; require an actual non-empty string.
    if not isinstance(answer, str) or not answer:
        return None
    reference = row.artist_original
    if not isinstance(reference, str) or not reference:
        return None
    # Lower-case the reference words once instead of once per answer word.
    reference_words = {word.lower() for word in reference.split()}
    return any(word.lower() in reference_words for word in answer.split())

def artist_partly_correct(row, answer_col: str):
    """Check whether any word of an answer occurs inside any performer name.

    The answer in ``row[answer_col]`` is split on whitespace; each word is
    tested as a substring of every entry of ``row.performer``.

    NOTE(review): matching here is case-sensitive, unlike the ``AW*_correct``
    helpers, which lower-case both sides -- confirm this is intended.

    Args:
        row: Record supporting both ``row[answer_col]`` and ``row.performer``
            (for example a row passed by ``DataFrame.apply(..., axis=1)``).
        answer_col: Name of the column holding the answer text.

    Returns:
        ``True`` if some answer word is contained in some performer name,
        ``False`` if none is, and ``None`` when the answer is empty or not a
        string, or when ``performer`` is empty or not a list/tuple (e.g. the
        ``NaN`` left behind by an unmatched left merge).
    """
    answer = row[answer_col]
    # NaN is truthy, so check the type explicitly instead of relying on
    # truthiness to filter out missing values.
    if not isinstance(answer, str) or not answer:
        return None
    performers = row.performer
    if not isinstance(performers, (list, tuple)) or not performers:
        return None
    # Ignore missing entries inside the performer list; `in` would raise on them.
    names = [name for name in performers if isinstance(name, str)]
    return any(word in name for word in answer.split() for name in names)

def AW2_correct(row):
    """Check whether the answer ``AW2`` shares a word with ``artist_perf``.

    Both fields are split on whitespace and compared word by word, ignoring
    case.

    Args:
        row: Record exposing ``AW2`` and ``artist_perf`` attributes
            (for example a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        ``True`` if at least one answer word equals a reference word,
        ``False`` if none does, and ``None`` when either field is empty or is
        not a string (``None``, ``NaN``, ...).
    """
    answer = row.AW2
    # NaN is a truthy float, so a bare truthiness test would let a missing
    # value reach .split() and raise; require an actual non-empty string.
    if not isinstance(answer, str) or not answer:
        return None
    reference = row.artist_perf
    if not isinstance(reference, str) or not reference:
        return None
    # Lower-case the reference words once instead of once per answer word.
    reference_words = {word.lower() for word in reference.split()}
    return any(word.lower() in reference_words for word in answer.split())

def AW3_correct(row):
    """Check whether the answer ``AW3`` shares a word with ``composer``.

    Both fields are split on whitespace and compared word by word, ignoring
    case.

    Args:
        row: Record exposing ``AW3`` and ``composer`` attributes
            (for example a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        ``True`` if at least one answer word equals a reference word,
        ``False`` if none does, and ``None`` when either field is empty or is
        not a string (``None``, ``NaN``, ...).
    """
    answer = row.AW3
    # NaN is a truthy float, so a bare truthiness test would let a missing
    # value reach .split() and raise; require an actual non-empty string.
    if not isinstance(answer, str) or not answer:
        return None
    reference = row.composer
    if not isinstance(reference, str) or not reference:
        return None
    # Lower-case the reference words once instead of once per answer word.
    reference_words = {word.lower() for word in reference.split()}
    return any(word.lower() in reference_words for word in answer.split())


# Row-wise scoring: each entry pairs the column to create with the function
# that computes it from a single row. Columns are added in the listed order.
_scorers = (
    ("AW1_correct", AW1_correct),
    ("AW1_related", lambda row: artist_partly_correct(row, "AW1")),
    ("AW2_correct", AW2_correct),
    ("AW2_related", lambda row: artist_partly_correct(row, "AW2")),
    ("AW3_correct", AW3_correct),
)
for _column, _scorer in _scorers:
    data[_column] = data.apply(_scorer, axis=1)



In [7]:

# Identifier columns, and every derived score column (named "AW..." and
# containing an underscore, e.g. "AW1_correct").
id_cols = ['set_id', 'work_id', 'perf_id', 'filename']
aw_cols = [name for name in data.columns if name.startswith('AW') and "_" in name]

# Bare selection whose result is not used; it raises if a column is missing.
data[id_cols + aw_cols]

# Long format first (one row per identifier/score pair), then wide again with
# one column per (filename, score) combination.
row_keys = ['set_id', 'work_id', 'perf_id']
__data = (
    data.melt(id_vars=id_cols, value_vars=aw_cols)
    .pivot_table(
        index=row_keys,
        columns=['filename', 'variable'],
        values='value',
        # 'first' because we assume there is no aggregation needed if values are unique
        aggfunc='first',
    )
)

# NOTE(review): orient="records" appears to omit the index, so the
# set_id/work_id/perf_id keys would be missing from the written file --
# confirm against the pandas to_json documentation and downstream readers.
__data.to_json("../data/intermediate/shs100k2_memorization.json", lines=True, orient="records")
__data


Unnamed: 0_level_0,Unnamed: 1_level_0,filename,llama3.1-70b.jsonl,llama3.1-70b.jsonl,llama3.1-70b.jsonl,llama3.1-70b.jsonl,llama3.1-70b.jsonl,llama3.1-8b.jsonl,llama3.1-8b.jsonl,llama3.1-8b.jsonl,llama3.1-8b.jsonl,llama3.1-8b.jsonl
Unnamed: 0_level_1,Unnamed: 1_level_1,variable,AW1_correct,AW1_related,AW2_correct,AW2_related,AW3_correct,AW1_correct,AW1_related,AW2_correct,AW2_related,AW3_correct
set_id,work_id,perf_id,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,4681.0,132979.0,False,True,False,True,True,False,True,False,True,True
1,7895.0,7895.0,True,True,True,True,True,True,True,True,True,True
2,7896.0,86970.0,True,False,False,False,True,True,False,False,False,True
3,12959.0,507152.0,False,True,False,True,True,False,True,False,True,True
4,312438.0,51032.0,True,True,True,True,False,True,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
5202,131540.0,263949.0,,,,,,False,False,False,False,False
5203,133228.0,373461.0,,,,,,False,False,False,False,False
5204,134076.0,282922.0,,,,,,False,False,False,False,False
5204,134076.0,512656.0,,,,,,False,False,False,False,False


# Analysis