In [None]:
import wandb
import json
from pathlib import Path
import pandas as pd
import random
import numpy as np

import spacy
from spacy.tokens import Span, Doc
from spacy import displacy
from spacy.lang.en import English

from IPython.core.display import display, HTML

from utils import replace_ents_with_labels, \
                  replace_labels_with_ents, \
                  mark_ent_label_tokens, \
                  tokenize_df_with_spacy, \
                  visualize_ents, \
                  UNIQUE_LABELS 

# Load generated texts and entities

In [None]:
# Path to the JSON file holding the generated texts.
texts_load_path = Path("/archive/savkin/parsed_datasets/PII/generated_texts/rewriting/mixtral-8x7B-instruct-v0.1-GPTQ-rewrite-train-essays.json")

# JSON is UTF-8 by specification; pin the encoding rather than relying on the
# platform default, which differs between operating systems.
with open(texts_load_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# `from_records` is a classmethod, so no throwaway empty DataFrame is needed.
generated_texts_df = pd.DataFrame.from_records(data)
generated_texts_df.head(2)

In [None]:
# CSV with the generated fake entities; the COUNTRY column is dropped on load.
load_path = "/home/savkin/2024/PII_Data_Detection/pii/generated_datasets/faker_10k.csv"

generated_ents_df = pd.read_csv(load_path).drop(columns="COUNTRY")

# Turn every CSV row into a dict of the form {column name: [cell value]}.
# `DataFrame.applymap` is deprecated since pandas 2.1 (renamed to `DataFrame.map`);
# mapping each column with `Series.map` gives the same element-wise result and
# works on both older and newer pandas versions.
generated_ents_combs_df = (
    generated_ents_df
    .apply(lambda column: column.map(lambda value: [value]))
    .agg(lambda row: row.to_dict(), axis=1)
    .reset_index(drop=True)
)
generated_ents_combs_df.head(2)

# Replace placeholder entities with new entities

Create true_ents_dict if absent

In [None]:
# Each entry is a (short name, label, example entity) triple.
PII_ENTS = [
    # NOTE: check whether a full name counts as one entity or several!
    ("name", "NAME_STUDENT", "James Brown"),
    ("email", "EMAIL", "example@email.com"),
    ("personal_url", "URL_PERSONAL", "https://example.com"),
    ("username", "USERNAME", "john42"),
    ("address", "STREET_ADDRESS", "221B, Baker Street, London"),
    ("phone_num", "PHONE_NUM", "+1 212 555 0188"),
    ("userid", "ID_NUM", "123456789"),
]

# Lookup from label to its example entity (the short name is not needed here).
LABEL2ENT = dict((label, example) for _short_name, label, example in PII_ENTS)

def add_label_dict(row):
    """Attach a ``true_ents_dict`` entry to a row.

    For every label in ``UNIQUE_LABELS`` whose cell in ``row`` holds a value,
    the result maps that label to a one-element list containing the example
    entity from ``LABEL2ENT``.

    Args:
        row: a row (pandas Series) that has one entry per label in
            ``UNIQUE_LABELS``.

    Returns:
        The same ``row`` object with the ``"true_ents_dict"`` entry set.
    """
    def _has_value(value):
        # pandas stores missing cells as either None or float NaN (e.g. when a
        # record lacks the key); NaN is the only float not equal to itself.
        # The check is done by hand so list-valued cells are not broadcast.
        if value is None:
            return False
        return not (isinstance(value, float) and value != value)

    row["true_ents_dict"] = {
        label: [LABEL2ENT[label]]
        for label in UNIQUE_LABELS
        if _has_value(row[label])
    }
    return row

# Build `true_ents_dict` only when the loaded data does not already carry it,
# so an existing column is never overwritten with the example entities.
if "true_ents_dict" not in generated_texts_df.columns:
    generated_texts_df = generated_texts_df.agg(add_label_dict, axis=1)

Select random rows of fake data

In [None]:
# Fixed seed so that Restart & Run All reproduces the same pairing of texts and
# fake entities; set to None to draw a different sample on every run.
RANDOM_SEED = 42

n_unique_ent_combs = len(generated_ents_combs_df)
n_ent_combs = len(generated_texts_df)

# One random position in [0, n_unique_ent_combs) per text; positions may repeat.
rng = np.random.default_rng(RANDOM_SEED)
rand_comb_indexes = rng.integers(n_unique_ent_combs, size=n_ent_combs)

# Assign the raw array so values are matched by position. Wrapping it in a
# pd.Series would align on index labels and silently produce NaN for any row
# whose index label is not in 0..n-1.
generated_texts_df["label2ent"] = generated_ents_combs_df.to_numpy()[rand_comb_indexes]
generated_texts_df["label2ent"].head(2)

Replace labels with ents

In [None]:
# Choose the row-wise steps to run: when a `label2position` column is already
# present only `replace_labels_with_ents` is applied; otherwise the frame goes
# through all four helpers in the order listed.
has_label_positions = 'label2position' in generated_texts_df.columns
if has_label_positions:
    row_steps = (replace_labels_with_ents,)
else:
    row_steps = (
        replace_ents_with_labels,
        tokenize_df_with_spacy,
        mark_ent_label_tokens,
        replace_labels_with_ents,
    )

# Work on a separate name and rebind `generated_texts_df` only once every
# step has succeeded.
transformed_df = generated_texts_df
for row_step in row_steps:
    transformed_df = transformed_df.agg(row_step, axis=1)
generated_texts_df = transformed_df

generated_texts_df.head(2)

In [None]:
# Build the HTML visualisation for one example row (positional index 3) with
# `visualize_ents` and show it inline.
SAMPLE_ROW_POSITION = 3
row = generated_texts_df.iloc[SAMPLE_ROW_POSITION]
html = visualize_ents(
    row["tokens"],
    row["trailing_whitespace"],
    row["labels"],
)
display(HTML(html))

# Saving

In [None]:
# Write the processed frame as a JSON list of records, the same layout the
# input file is loaded from. The path is relative to the working directory.
# NOTE(review): the "_with_ents" suffix is an assumption — confirm the
# intended output file name and location.
save_path = f"{texts_load_path.stem}_with_ents.json"

# `to_records` only builds an in-memory array and writes nothing; `to_json`
# actually persists the data. `force_ascii=False` keeps non-ASCII text readable.
generated_texts_df.to_json(save_path, orient="records", force_ascii=False)