In [26]:
import re

import os
# Weights & Biases settings passed through environment variables.
# NOTE(review): presumably consumed by the transformers/W&B integration when a
# run is started - confirm which of these the training code actually reads.
os.environ["WANDB_PROJECT"] = "PII Data Detection"
os.environ["WANDB_ENTITY"] = "deeppavlov_team"
os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["WANDB_WATCH"] = "all"
# Set before `torch` is imported further down in this cell, so the process is
# restricted to GPU 0 from the start.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from collections import defaultdict
from typing import Dict
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import (
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification, 
    AutoTokenizer, 
    EarlyStoppingCallback, 
    set_seed
)
from seqeval.metrics import (
    recall_score, 
    precision_score, 
    classification_report, 
    f1_score
)
from scipy.special import softmax
import torch
from pathlib import Path
import wandb
import json
from tqdm import tqdm
import logging

# from utils import visualize_ents, apply_threshold, tokens2text
from utils import *

# Fix the random seed through transformers.set_seed so repeated runs are comparable.
random_seed = 42
set_seed(random_seed)

# Default value of the `epochs` argument of construct_rand_dataset below.
EPOCHS = 1
# In the visible part of this notebook MAX_LENGTH is only interpolated into the
# run name. NOTE(review): presumably also the tokenizer max length - confirm.
MAX_LENGTH = 4000

# Exactly one run name is active; the commented alternatives are earlier
# experiment names kept for reference.
# wandb_run_name = f"deberta-base-{MAX_LENGTH}_orig_train+mpware_mixtral8x7b_v1.1-fixed-tokenization"
wandb_run_name = f"deberta-base-{MAX_LENGTH}-orig-train+rewritten-train-rand-ents-len-250+-all-ents-present-fixed-tokenization"
# wandb_run_name = f"deberta-base-{MAX_LENGTH}-rand-ents"
# wandb_run_name = f"deberta-base-{MAX_LENGTH}-fixed-tokenization"
# wandb_run_name = "deleteme"
# Model output directory derived from the active run name.
model_save_path = f"/archive/savkin/models/ner/PII Data Detection/{wandb_run_name}"

# Load dataset

### Load original train texts

In [27]:
orig_data_path = "/archive/savkin/parsed_datasets/NER/PII_Data_Detection/orig_train_custom_split.json"
allowed_cols = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'valid']

# Read the custom split, keep only the needed columns, rename the text column,
# store document ids as strings and run the `add_ner_tags` helper (from utils)
# over every row.
df = (
    pd.read_json(orig_data_path)[allowed_cols]
    .rename(columns={"full_text": "text"})
    .astype({'document': 'str'})
    .agg(add_ner_tags, axis=1)
)

# A document has entities when its label sequence holds more than one distinct tag.
df["has_ents"] = df['labels'].map(lambda labels: len(set(labels)) > 1)

# The boolean `valid` column marks the validation part of the split; it is
# dropped once the two frames are separated.
orig_train_df = df.loc[df["valid"].eq(False)].drop(columns=["valid"]).reset_index(drop=True)
orig_valid_df = df.loc[df["valid"].eq(True)].drop(columns=["valid"]).reset_index(drop=True)
orig_valid_df.head(1)

Unnamed: 0,document,text,tokens,trailing_whitespace,labels,ner_tags,has_ents
0,8593,Tony Flores | Assignment on Reflection – Learn...,"[Tony, Flores, |, Assignment, on, Reflection, ...","[True, True, True, True, True, True, True, Tru...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[2, 9, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,...",True


In [28]:
# Merge token-level labels into whole entities: `gen_ent_dict` (from utils) is
# applied to every row and the resulting "true_ents_dict" column is used below.

orig_train_df = orig_train_df.agg(gen_ent_dict, axis=1)
orig_valid_df = orig_valid_df.agg(gen_ent_dict, axis=1)

# Sanity check 1: per validation document, the label set with the BIO prefix
# stripped must equal the entity-dict keys plus the "O" tag.
a = orig_valid_df["labels"].apply(lambda x: {remove_bio(l) for l in x})
b = orig_valid_df["true_ents_dict"].apply(lambda x: set(x.keys()) | {"O"})
assert (a == b).all()

# Sanity check 2: every reconstructed entity string must occur verbatim in the
# text of its training document.
rows_with_ents = orig_train_df[orig_train_df["has_ents"]]
for doc_text, ents_by_label in zip(rows_with_ents["text"], rows_with_ents["true_ents_dict"]):
    for ent_texts in ents_by_label.values():
        assert all(ent_text in doc_text for ent_text in ent_texts)

### Load generated texts

In [29]:
# texts_load_path = Path("/archive/savkin/parsed_datasets/NER/PII_Data_Detection/mpware_mixtral8x7b_v1.1.json")

# with open(texts_load_path, "r") as file:
#     data = json.load(file)
#     generated_texts_df = pd.DataFrame().from_records(data)
#     generated_texts_df = generated_texts_df.rename(columns={"generated_text": "text", "full_text": "text"})
#     generated_texts_df = generated_texts_df.agg(add_ner_tags, axis=1)

In [30]:
texts_load_path = Path("/archive/savkin/parsed_datasets/PII/generated_texts/rewriting/mixtral-8x7B-instruct-v0.1-GPTQ-rewrite-train-essays.json")

# Only the JSON parsing needs the open file handle.
with open(texts_load_path, "r") as file:
    data = json.load(file)

# Build the frame from the loaded records, expose the text under the common
# "text" column name and run the `add_ner_tags` helper (from utils) per row.
generated_texts_df = (
    pd.DataFrame.from_records(data)
    .rename(columns={"generated_text": "text", "full_text": "text"})
    .agg(add_ner_tags, axis=1)
)

Remove short texts and texts with missing entities

In [31]:
# Keep generated texts that are longer than 250 tokens.
long_texts_mask = generated_texts_df["tokens"].map(len) > 250

# A row passes when none of its per-label flag columns equals True.
# NOTE(review): presumably True marks an entity that is missing from the
# generated text - confirm against the code that fills these columns.
all_ents_present_mask = generated_texts_df[list(UNIQUE_CLASS_LABELS)].apply(
    lambda row: not any(v == True for v in row),
    axis=1,
)

generated_texts_df = generated_texts_df.loc[long_texts_mask & all_ents_present_mask].reset_index(drop=True)

In [32]:
if "true_ents_dict" not in generated_texts_df.columns:
    # Each entry is (short name, class label, placeholder entity text).
    # Only the label and the placeholder are used in this cell.
    PII_ENTS = [
        ("name", "NAME_STUDENT", "James Brown"),  # NOTE: check whether a full name is one entity or several!
        ("email", "EMAIL", "example@email.com"),
        ("personal_url", "URL_PERSONAL", "https://example.com"),
        ("username", "USERNAME", "john42"),
        ("address", "STREET_ADDRESS", "221B, Baker Street, London"),
        ("phone_num", "PHONE_NUM", "+1 212 555 0188"),
        ("userid", "ID_NUM", "123456789"),
    ]

    # Class label -> placeholder entity text.
    LABEL2ENT = {label: placeholder for _name, label, placeholder in PII_ENTS}

    def add_label_dict(row):
        """Attach a ``true_ents_dict`` entry to ``row``.

        For every class label whose column in ``row`` is not ``None`` the
        dict maps the label to a one-element list holding its placeholder
        entity text. The (mutated) row is returned.
        """
        ents_by_label = {}
        for label in UNIQUE_CLASS_LABELS:
            if row[label] is None:
                continue
            ents_by_label[label] = [LABEL2ENT[label]]
        row["true_ents_dict"] = ents_by_label
        return row

    generated_texts_df = generated_texts_df.agg(add_label_dict, axis=1)
    print(generated_texts_df.iloc[0])

prompt                            <s>[INST] Assignment: Mindmapping\n\nChallenge...
sampling_params                   {'max_tokens': 4000, 'n': 1, 'temperature': 1,...
text                               Assignment: Mindmapping\n\nChallenge\n\nThe c...
generated_text_with_ent_labels     Assignment: Mindmapping\n\nChallenge\n\nThe c...
vizualization                     {'_type': 'html-file', 'path': 'media/html/2c0...
tokens                            [ , Assignment, :, Mindmapping, \n\n, Challeng...
trailing_whitespace               [False, False, True, False, False, False, Fals...
labels                            [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
STREET_ADDRESS                                                                 None
PHONE_NUM                                                                      None
NAME_STUDENT                                                                  False
USERNAME                                                                    

Load generated entities

In [33]:
ents_load_path = "/archive/savkin/parsed_datasets/PII/generated_entities/faker_100k.csv"

# Load the generated entity table; the COUNTRY column is not used.
generated_ents_df = pd.read_csv(ents_load_path).drop(columns="COUNTRY")

# Wrap every cell in a one-element list, then collapse each row into a
# {column name: [value]} dict, giving one entity combination per row.
generated_ents_combs_df = (
    generated_ents_df
    .applymap(lambda value: [value])
    .agg(lambda row: row.to_dict(), axis=1)
    .reset_index(drop=True)
)
generated_ents_combs_df.iloc[0]

{'ID_NUM': ['140365767YDRHKXNlxoyrTOaI'],
 'NAME_STUDENT': ['Todd Hobbs'],
 'EMAIL': ['todd.hobbs@zavala.com'],
 'USERNAME': ['todd_hobbs204'],
 'PHONE_NUM': ['498-307-8349'],
 'URL_PERSONAL': ['https://youtube.com/c/todd_hobbs204'],
 'STREET_ADDRESS': ['1867 Hicks Points Paulmouth, IA 61051']}

## Dataset construction

Here we create a function that takes a set of texts and a set of generated entities and returns a dataset in which all entities are replaced by randomly generated ones.

In [34]:
def replace_ents_in_generated_text(texts_df, generated_ents_combs_df):
    """Return a copy of ``texts_df`` whose entities are swapped for random ones.

    One entity combination is drawn (with replacement) from
    ``generated_ents_combs_df`` for every row of ``texts_df`` and stored in a
    ``label2ent`` column. The rows then go through the utils pipeline
    ``replace_ents_with_labels`` -> ``tokenize_df_with_spacy`` ->
    ``mark_ent_label_tokens`` -> ``replace_labels_with_ents`` ->
    ``add_ner_tags``.

    Args:
        texts_df: DataFrame of texts to rewrite. It is not modified.
        generated_ents_combs_df: Series-like collection of entity
            combinations, one ``{label: [entity]}`` dict per element.

    Returns:
        A new DataFrame produced by the pipeline above.
    """
    n_unique_ent_combs = len(generated_ents_combs_df)
    n_ent_combs = len(texts_df)

    rand_comb_indexes = np.random.randint(n_unique_ent_combs, size=n_ent_combs)

    # Work on a copy so the caller's frame does not silently gain a column.
    texts_df = texts_df.copy()
    # Assign the raw array, not a pd.Series: a Series would be aligned on the
    # index and produce NaN / misplaced rows whenever texts_df does not carry
    # a default 0..n-1 index. An array is assigned positionally.
    texts_df["label2ent"] = generated_ents_combs_df.to_numpy()[rand_comb_indexes]

    texts_df = texts_df.agg(replace_ents_with_labels, axis=1) \
                       .agg(tokenize_df_with_spacy, axis=1) \
                       .agg(mark_ent_label_tokens, axis=1) \
                       .agg(replace_labels_with_ents, axis=1) \
                       .agg(add_ner_tags, axis=1)

    return texts_df

In [35]:
def construct_rand_dataset(orig_train_df, generated_texts_df, generated_ents_combs_df, epochs=EPOCHS):
    """Build a training frame of original essays plus re-entitied generated texts.

    For each epoch the generated texts get a fresh random set of entities via
    ``replace_ents_in_generated_text``; the original training essays are
    reused unchanged. All per-epoch frames are stacked in epoch order.

    Args:
        orig_train_df: original training documents.
        generated_texts_df: generated texts whose entities are replaced.
        generated_ents_combs_df: pool of entity combinations to sample from.
        epochs: number of passes (i.e. copies of the data) to generate.

    Returns:
        DataFrame with columns ``tokens``, ``trailing_whitespace``, ``labels``
        and ``ner_tags`` and a fresh 0..n-1 index; an empty DataFrame when
        ``epochs`` is 0.
    """
    cols = ["tokens", "trailing_whitespace", "labels", "ner_tags"]
    orig_part = orig_train_df[cols]

    # Collect the pieces and concatenate once at the end: growing a DataFrame
    # with pd.concat inside the loop copies all previous rows on every pass,
    # and concatenating onto an empty frame is deprecated in recent pandas.
    frames = []

    print("Replacing ents with random ones:")
    for _ in tqdm(range(epochs)):
        generated_part = replace_ents_in_generated_text(generated_texts_df, generated_ents_combs_df)[cols]
        frames.append(orig_part)
        frames.append(generated_part)

    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Replace non-pii ents with explicit labels

## Regexp ent detection

Email

In [36]:
# Single-token e-mail detector. Inside a character class `|` is a literal
# character, not an alternation, so the top-level-domain class is written
# `[A-Za-z]`; `[A-Z|a-z]` would also accept "|" as part of the domain.
pattern_emails = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'

# Compare the regex against the gold B-EMAIL labels, token by token.
# NOTE(review): the printed tags are kept as they were so the output stays
# comparable with earlier runs. "FP" marks a gold e-mail token the regex
# missed and "TN" marks a regex hit on a token that is not labelled B-EMAIL.
for tokens, labels in zip(df["tokens"], df["labels"]):
    for t, l in zip(tokens, labels):
        match = re.match(pattern_emails, t)

        a = (l == "B-EMAIL")
        b = (match is not None)

        # Gold e-mail token that the regex did not recognise.
        if a and not b:
            print("FP: ", t)

        # Regex hit on a token without the B-EMAIL label.
        if not a and b:
            print("TN: ", t)

TN:  vpi@mn.nl


Url

In [37]:
# Loose single-token URL detector (it also accepts truncated schemes).
pattern_url = r'h?t?tps?.\/\/.+'

# For every document build a label sequence in which URL-looking tokens that
# are labelled "O" get the explicit B-URL_PERSONAL_NON_PII tag; all other
# labels are kept. Disagreements between regex and gold labels are printed.
all_new_labels = []
for doc_tokens, doc_labels in zip(df["tokens"], df["labels"]):
    new_labels = []
    for token, gold in zip(doc_tokens, doc_labels):
        is_gold_url = gold == "B-URL_PERSONAL"
        is_regex_hit = re.match(pattern_url, token) is not None

        if gold == "O" and is_regex_hit:
            new_labels.append("B-URL_PERSONAL_NON_PII")
        else:
            new_labels.append(gold)

        if is_gold_url and not is_regex_hit:
            # gold personal URL the regex did not match
            print("FP: ", token)
        elif is_regex_hit and not is_gold_url:
            # regex hit on a token not labelled B-URL_PERSONAL
            print("TN: ", token)
    all_new_labels.append(new_labels)

# NOTE(review): the name-detection cell further down assigns
# df["non_pii_labels"] again - check that the URL tags written here survive it.
df["non_pii_labels"] = all_new_labels

TN:  https://marketingplatform.google.com/about/analytics/terms/us/
TN:  https://fujifilm.com/fbglobal/eng/common/privacy_policy
TN:  https://policies.google.com/technologies/partner-sites
TN:  https://youtu.be/fPAnKZjejmU
TN:  https://miro.com
TN:  https://www.coursera.org/learn/uva-darden-design-thinking-innovation/lecture/F04r6/learning-launch-tool
TN:  https://www.coursera.org/learn/uva-darden-design-thinking-innovation/lecture/F04r6/learning-launch-tool
TN:  https://www.coursera.org/learn/uva-darden-design-thinking-innovation/lecture/UtntA/what-wows-what-works
TN:  http://www.tomfanelli.com/how-to-whiteboard-5-simple-whiteboard-techniques/
TN:  https://www.youtube.com/watch?v=NuacmGHLDqw
TN:  http://designresearchtechniques.com/casestudies/5-whys/
TN:  https://dschool.stanford.edu/wp-content/themes/dschool/method-cards/empathy-map.pdf
TN:  https://www.designorate.com/the-role-of-storytelling-in-the-design-process/
TN:  https://www.coursera.org/learn/uva-darden-design-thinking-inno

Phone

In [38]:
# Single-token phone detector: "(ddd)ddd-dddd" or "ddd.ddd.dddd".
pattern_phone = r'(?:\(\d{3}\)\d{3}-\d{4})|(?:\d{3}\.\d{3}\.\d{4})'

# Print every token on which the regex and the gold B-PHONE_NUM label disagree.
for doc_tokens, doc_labels in zip(df["tokens"], df["labels"]):
    for token, gold in zip(doc_tokens, doc_labels):
        is_gold_phone = gold == "B-PHONE_NUM"
        is_regex_hit = re.match(pattern_phone, token) is not None

        if is_gold_phone and not is_regex_hit:
            # gold phone token the regex did not match
            print("FP: ", token)
        elif is_regex_hit and not is_gold_phone:
            # regex hit on a token not labelled B-PHONE_NUM
            print("TN: ", token)

FP:  (
FP:  (
FP:  (
FP:  (
FP:  (


Name

In [39]:
# External first-name lists (Spanish, Arabic, Indian) plus a surname list.
spanish_female = pd.read_csv("/home/korzanova/pii/data_analysis/spanish-names-surnames/female_names.csv")
spanish_male = pd.read_csv("/home/korzanova/pii/data_analysis/spanish-names-surnames/male_names.csv")
arabic_female = pd.read_csv("/home/korzanova/pii/data_analysis/ArabicNameGenderFinder/females_en.csv")
arabic_male = pd.read_csv("/home/korzanova/pii/data_analysis/ArabicNameGenderFinder/males_en.csv")
indian_female = pd.read_csv("/home/korzanova/pii/data_analysis/Dataset-indian-names/Indian-Female-Names.csv")
indian_male = pd.read_csv("/home/korzanova/pii/data_analysis/Dataset-indian-names/Indian-Male-Names.csv")
surnames = pd.read_csv("/home/korzanova/pii/data_analysis/surnames.csv")

# Name columns that feed the lookup set; the surname list is loaded but
# deliberately left out (surnames["name"] is not included).
name_columns = [
    spanish_female["name"],
    spanish_male["name"],
    arabic_female["Name"],
    arabic_male["Name"],
    indian_female["name"],
    indian_male["name"],
]

# Lower-cased lookup set; non-string entries (e.g. missing values) are skipped.
names = {
    n.lower()
    for column in name_columns
    for n in column.unique()
    if isinstance(n, str)
}
len(names)

66361

In [40]:
# Dictionary-based name detector.
#
# A token is treated as a name when it is longer than three characters, starts
# with an upper-case letter followed by a lower-case one, and its lower-cased
# form is in `names`. Consecutive hits are tagged B-/I-NAME_STUDENT.
#
# Tokens that the detector flags but that are not PII get an explicit
# "<tag>_NON_PII" label. These labels are layered on top of an existing
# df["non_pii_labels"] column when there is one, so tags written by earlier
# detector cells (e.g. B-URL_PERSONAL_NON_PII) are kept instead of being
# discarded when the column is reassigned at the end of this cell.
# NOTE(review): any later cell that maps these labels through LABEL_2_ID needs
# an id for every *_NON_PII tag - confirm in utils.
base_label_col = "non_pii_labels" if "non_pii_labels" in df.columns else "labels"

all_pred_labels = []
all_new_labels = []
for tokens, labels, base_labels in zip(df["tokens"], df["labels"], df[base_label_col]):
    pred_labels = []
    new_labels = []
    for t, l, base_l in zip(tokens, labels, base_labels):
        if len(t) > 3 and t[0].isupper() and t[1].islower() and t.lower() in names:
            # First hit after a non-name token opens an entity, later ones continue it.
            if len(pred_labels) == 0 or pred_labels[-1] == "O":
                pred_labels.append("B-NAME_STUDENT")
            else:
                pred_labels.append("I-NAME_STUDENT")
        else:
            pred_labels.append("O")

        pred_l = pred_labels[-1]

        # Only tokens still tagged "O" are re-labelled; gold PII labels and
        # NON_PII tags from earlier detectors are left untouched.
        new_labels.append(f"{pred_l}_NON_PII" if (base_l == "O" and pred_l != "O") else base_l)

        # Diagnostics against the gold labels, checked separately per tag, so
        # a B/I mismatch on the same token is reported twice.
        # "FP" marks a gold tag the detector did not reproduce; "TN" marks a
        # detector tag that the gold labels do not have.
        for tag in ["B-NAME_STUDENT", "I-NAME_STUDENT"]:

            a = (l == tag)
            b = (pred_l == tag)

            if a and not b:
                print("FP: ", t)

            if not a and b:
                print("TN: ", t)
    all_pred_labels.append(pred_labels)
    all_new_labels.append(new_labels)

df["non_pii_labels"] = all_new_labels

TN:  June
FP:  Elnemr
TN:  Milan
TN:  Milan
TN:  Dennis
TN:  India
TN:  Princess
TN:  Puerto
TN:  Rico
FP:  Panagiotopoulou
FP:  Panagiotopoulou
FP:  Jensen
TN:  George
FP:  Ana
TN:  Sofia
FP:  Sofia
FP:  Rayo
TN:  Andre
TN:  Martin
TN:  America
FP:  Mazzoleni
TN:  Christian
TN:  Friedrich
FP:  Mazzoleni
TN:  Nairobi
TN:  Nairobi
TN:  Nairobi
TN:  July
FP:  Edjanio
FP:  Sousa
TN:  Paris
TN:  Major
TN:  Tony
TN:  Ramon
TN:  Allan
TN:  Collins
TN:  Ross
TN:  Tony
TN:  Ramon
TN:  Allan
TN:  Collins
TN:  Ross
TN:  Argentina
TN:  America
TN:  France
TN:  Latino
TN:  Latino
TN:  Justin
TN:  Diego
TN:  Felipe
TN:  Justin
TN:  America
TN:  France
TN:  Paris
TN:  Jill
TN:  Charity
TN:  India
TN:  India
TN:  Victor
TN:  India
TN:  India
FP:  Gonzales
TN:  Virgil
TN:  Steven
TN:  Isabel
FP:  Gonzales
FP:  Gonzales
FP:  Gonzales
FP:  Nowak
TN:  Just
FP:  Nweze
TN:  Stanley
FP:  Stanley
TN:  Kenya
FP:  Nweze
TN:  Stanley
FP:  Stanley
FP:  Cordova
TN:  Just
TN:  Comfort
FP:  Figo
TN:  Landing
TN:  M

# Metrics

In [41]:
# Load the "seqeval" metric through the `evaluate` library; it is used by
# compute_metrics_from_labels below.
seqeval_metrics = evaluate.load("seqeval")

def f5_score(precision, recall):
    """Return the F-beta score for beta = 5.

    Computed as ``(1 + 25) * precision * recall / (25 * precision + recall)``.
    A tiny constant in the denominator keeps the result at 0 instead of
    raising when both inputs are 0.
    """
    beta_squared = 5 * 5
    numerator = (1 + beta_squared) * recall * precision
    denominator = beta_squared * precision + recall + 1e-100
    return numerator / denominator

def compute_metrics_from_labels(predictions, labels):
    """Score predicted label ids against reference label ids with seqeval.

    Positions whose reference id is ``-100`` are dropped from both sequences
    before ids are mapped to label names through ``ID_2_LABEL``. The result of
    ``seqeval_metrics.compute`` is extended with an ``f5_score`` entry in
    every non-"overall" score dict and with a top-level ``overall_f5_score``.

    Args:
        predictions: per-document sequences of predicted label ids.
        labels: per-document sequences of reference label ids.

    Returns:
        The seqeval result dict with the added F5 scores.
    """
    # Reference tag names, ignored positions removed.
    reference_tags = []
    for label_row in labels:
        reference_tags.append([ID_2_LABEL[tag_id] for tag_id in label_row if tag_id != -100])

    # Predicted tag names at the positions that are kept in the references.
    predicted_tags = []
    for pred_row, label_row in zip(predictions, labels):
        predicted_tags.append(
            [ID_2_LABEL[pred_id] for pred_id, tag_id in zip(pred_row, label_row) if tag_id != -100]
        )

    results = seqeval_metrics.compute(predictions=predicted_tags, references=reference_tags)

    # Entries whose key contains "overall" are handled after the loop; every
    # other entry is a score dict that receives its own F5 value.
    for key, scores in results.items():
        if "overall" in key:
            continue
        scores["f5_score"] = f5_score(scores["precision"], scores["recall"])

    results["overall_f5_score"] = f5_score(results["overall_precision"], results["overall_recall"])

    return results


def compute_metrics(eval_preds):
    """Turn a ``(logits, labels)`` pair into seqeval metrics.

    The predicted label id at every position is the arg-max over the last
    axis of ``logits``; scoring is delegated to ``compute_metrics_from_labels``.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)
    return compute_metrics_from_labels(predicted_ids, labels)

In [42]:
# Encode both label variants as per-document lists of ids via LABEL_2_ID:
# `x` from the NON_PII-augmented labels, `y` from the gold labels.
x = [[LABEL_2_ID[label_name] for label_name in doc_labels] for doc_labels in df["non_pii_labels"]]
y = [[LABEL_2_ID[label_name] for label_name in doc_labels] for doc_labels in df["labels"]]

# x = [LABEL_2_ID[i] for pred_labels in all_pred_labels for i in pred_labels ]
# y = [LABEL_2_ID[i] for i in df["labels"]]

# compute_metrics_from_labels(x, y)

In [43]:
# Print every NON_PII tag that ended up in df["non_pii_labels"].
# The loop variables are deliberately not called `x` / `y`: those names hold
# the encoded label lists built in the previous cell and must stay intact.
for doc_labels in df["non_pii_labels"]:
    for label_name in doc_labels:
        if "NON_PII" in label_name:
            print(label_name)

B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
I-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
I-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
I-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
I-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUDENT_NON_PII
B-NAME_STUD

In [44]:
# Persist the full frame (including the added "non_pii_labels" column) as JSON
# next to the original custom split.
save_path = "/archive/savkin/parsed_datasets/NER/PII_Data_Detection/orig_train_custom_split_non_pii.json"

df.to_json(save_path)

# Visualization

In [45]:
# Collect the documents that received at least one NON_PII tag.
# The tags live in the "non_pii_labels" column; the gold "labels" column never
# contains them, so scanning it would never select a row.
rows_with_non_pii = [
    row
    for _, row in df.iterrows()
    if any("NON_PII" in l for l in row["non_pii_labels"])
]

# Render only the first such document, without aborting the notebook run.
# NOTE(review): assumes visualize_ents can render *_NON_PII tags - confirm in utils.
if rows_with_non_pii:
    first_row = rows_with_non_pii[0]
    html = visualize_ents(first_row["tokens"], first_row["trailing_whitespace"], first_row["non_pii_labels"])
    display_html(html)

# Tokenize data

### Log metrics depending on the threshold

In [None]:
# Upload the error frame as a W&B table in the run summary.
# NOTE(review): neither `error_df` nor `run` is defined in the visible part of
# this notebook and this cell has no execution count - it presumably relies on
# cells that were removed; confirm before running top to bottom.
error_table = wandb.Table(dataframe=error_df)
run.summary["error_table"] = error_table

In [None]:
# Close the active W&B run.
wandb.finish()