In [None]:
import wandb
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Add some statistical features to the dataset

In [None]:
# Class id -> BIO-style tag; id 12 is the "O" tag.
id2label = {
    0: 'B-EMAIL',
    1: 'B-ID_NUM',
    2: 'B-NAME_STUDENT',
    3: 'B-PHONE_NUM',
    4: 'B-STREET_ADDRESS',
    5: 'B-URL_PERSONAL',
    6: 'B-USERNAME',
    7: 'I-ID_NUM',
    8: 'I-NAME_STUDENT',
    9: 'I-PHONE_NUM',
    10: 'I-STREET_ADDRESS',
    11: 'I-URL_PERSONAL',
    12: 'O',
}

df = pd.read_json("/archive/savkin/parsed_datasets/NER/PII_Data_Detection/orig_train.json")


def _count_per_document(tag_lists, wanted):
    """Return, for every list in `tag_lists`, how many entries equal `wanted`."""
    return tag_lists.map(lambda tags: sum(1 for tag in tags if tag == wanted))


# Tags with the two-character "B-"/"I-" prefix removed; "O" is renamed to "OTHER".
df["short_labels"] = df["labels"].map(
    lambda tags: ["OTHER" if tag == "O" else tag[2:] for tag in tags]
)
# True when a document's labels contain more than one distinct tag.
df["has_ents"] = df["labels"].map(lambda tags: len(set(tags)) > 1)
# Number of labels (one per token) in the document.
df["tokens_cnt"] = df["labels"].map(len)

unique_labels = set(id2label.values()) - {"O"}
unique_short_labels = df["short_labels"].explode().unique()

# One count column per full tag, then one per short tag.
for label in unique_labels:
    df[label] = _count_per_document(df["labels"], label)
for label in unique_short_labels:
    df[label] = _count_per_document(df["short_labels"], label)

In [None]:
# For every short label, report how many documents have a positive count for it.
for label in unique_short_labels:
    docs_with_label = int((df[label] > 0).sum())
    print(f"Documents count with label {label} -> {docs_with_label}")
# Alternative kept for reference (commented out):
# df.explode("short_labels").groupby("short_labels")["short_labels"].count()

In [None]:
# Number of documents per `has_ents` value. The column holds one scalar per
# row, so grouping the frame directly yields the same counts as exploding first.
df.groupby("has_ents")["has_ents"].count()

## Split the dataset into train and validation sets

In [None]:
# Shuffle all rows with a fixed seed; index labels are kept as they were.
df = df.sample(frac=1, random_state=42)

# Every document starts in the train split.
df["valid"] = False

# Number of documents to draw for the validation split per short label.
val_nums = {
    'PHONE_NUM': 4,
    'STREET_ADDRESS': 2,
    'USERNAME': 5,
    'EMAIL': 5,
    'ID_NUM': 10,
    'URL_PERSONAL': 20,
    'NAME_STUDENT': 200,
    'OTHER': 1500,
}

# For each label, draw `num` documents containing it (without replacement) and
# flag them as validation. Draws are made independently per label from all
# documents with that label, so a document may be picked more than once and the
# final validation size can be smaller than the sum of `val_nums`.
# NOTE: `sample` raises ValueError if fewer than `num` documents carry the label.
for label, num in val_nums.items():
    valid_indices = df[df[label] > 0].sample(n=num, replace=False, random_state=42).index
    df.loc[valid_indices, 'valid'] = True


# Double-check the number of documents per label in each split.
for col in val_nums:
    has_label = df[col] > 0
    print(f'VALID {col}: {int((df["valid"] & has_label).sum())}')
    print(f'TRAIN {col}: {int((~df["valid"] & has_label).sum())}')

## Render entities with spaCy

In [None]:
import spacy
from spacy.tokens import Span
from spacy import displacy

# Blank English pipeline; visualize() uses it to build the Doc that displacy renders.
nlp = spacy.blank("en")

# displacy render options. Each entity type gets a pair of colours:
# the first for its "B-" tag, the second for its "I-" tag.
_ENTITY_SHADES = {
    "NAME_STUDENT": ("aqua", "skyblue"),
    "EMAIL": ("limegreen", "lime"),
    "USERNAME": ("hotpink", "lightpink"),
    "ID_NUM": ("purple", "rebeccapurple"),
    "PHONE_NUM": ("red", "salmon"),
    "URL_PERSONAL": ("silver", "lightgray"),
    "STREET_ADDRESS": ("brown", "chocolate"),
}

options = {
    "colors": {
        f"{prefix}-{entity}": shade
        for entity, shades in _ENTITY_SHADES.items()
        for prefix, shade in zip("BI", shades)
    }
}

def visualize(row):
    """Render one dataset row as displacy entity markup.

    Args:
        row: record exposing `labels` (one tag per token) and either
            `tokens` plus `trailing_whitespace` (lists of equal length) or
            `full_text` as attributes.

    Returns:
        The markup returned by `displacy.render(..., jupyter=False)`.
    """
    from spacy.tokens import Doc

    tokens = getattr(row, "tokens", None)
    spaces = getattr(row, "trailing_whitespace", None)
    if isinstance(tokens, (list, tuple)) and isinstance(spaces, (list, tuple)):
        # Build the Doc from the dataset's own tokenization so that position i
        # in `labels` is exactly token i of the Doc. Re-tokenizing `full_text`
        # only lines up if the tokenizer reproduces the dataset tokens.
        doc = Doc(nlp.vocab, words=list(tokens), spaces=[bool(ws) for ws in spaces])
    else:
        # No usable token lists on this row: tokenize the raw text instead.
        doc = nlp(row.full_text)

    # One single-token span per labelled token; "O" tokens are left unmarked.
    doc.ents = [
        Span(doc, idx, idx + 1, label=label)
        for idx, label in enumerate(row.labels)
        if label != "O"
    ]
    return displacy.render(doc, style="ent", jupyter=False, options=options)

In [None]:
# `display` and `HTML` belong to the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Render the document stored under index label 0.
html = visualize(df.loc[0])
display(HTML(html))

## Add striding

In [None]:
# Window size in tokens: longer documents are cut into consecutive chunks of at most `stride` tokens.
stride = 512

def rebuild_text(tokens, trailing_whitespace):
    """Reassemble a text from tokens and their trailing-whitespace flags.

    Args:
        tokens: sequence of token strings.
        trailing_whitespace: flags aligned with `tokens`; a truthy flag means
            the token is followed by a single space.

    Returns:
        All tokens concatenated in order, each followed by " " when its flag
        is set. Entries beyond the shorter of the two inputs are ignored.
    """
    # A single join avoids re-allocating the growing string on every token.
    return "".join(
        token + (" " if ws else "")
        for token, ws in zip(tokens, trailing_whitespace)
    )

# Cut every document into consecutive, non-overlapping windows of at most
# `stride` tokens and collect the resulting rows in `split_df`.
new_df = []
for _, row in df.iterrows():
    n_tokens = len(row["tokens"])

    if n_tokens <= stride:
        # Documents that already fit pass through unchanged with all columns.
        new_df.append(row.to_dict())
        continue

    for start_tok_id in range(0, n_tokens, stride):
        end_tok_id = start_tok_id + stride  # slicing clamps the last window
        chunk_tokens = row["tokens"][start_tok_id:end_tok_id]
        chunk_spaces = row["trailing_whitespace"][start_tok_id:end_tok_id]

        # Chunk rows carry only these six fields; any other column of `df`
        # is left missing for them once the frame is built.
        new_df.append({
            "document": row["document"],
            "valid": row["valid"],
            "tokens": chunk_tokens,
            "trailing_whitespace": chunk_spaces,
            "labels": row["labels"][start_tok_id:end_tok_id],
            # Text of this window only, rebuilt from its own tokens.
            "full_text": rebuild_text(chunk_tokens, chunk_spaces),
        })

split_df = pd.DataFrame(new_df)
split_df.head(10)

In [None]:
# All chunks of document 9980, in the order they were produced; the previous
# `split_df` index is kept in the "index" column.
x = split_df[split_df["document"] == 9980].reset_index()

# Text of the first and second chunk, picked by position within the document
# rather than by hard-coded row numbers of `split_df`.
a = x["full_text"].iloc[0]
b = x["full_text"].iloc[1]

In [None]:
# Compare the chunk text lengths with the length of the unsplit text. The sum
# matches the original only if the document was cut into exactly these two chunks.
print(len(a))
print(len(b))
print(len(b)+len(a))
# Select the original row by document id instead of a hard-coded index label.
print(len(df.loc[df["document"] == 9980, "full_text"].iloc[0]))

In [None]:
# `a` is the chunk text itself, so slice it directly; `a[0]` would be a single character.
a[-100:]

In [None]:
# `b` is the chunk text itself, so slice it directly; `b[1]` would be a single character.
b[:100]

In [None]:
# Number of labels in the rows of `x` stored under index labels 3 and 4.
# NOTE(review): `a` and `b` are taken from rows 0 and 1 of `x` — confirm that 3 and 4 are the intended rows here.
print(len(x["labels"][3]))
print(len(x["labels"][4]))

In [None]:
# Unsplit text of document 9980, selected by document id instead of a hard-coded index label.
df.loc[df["document"] == 9980, "full_text"].iloc[0]

## Add visualization

In [None]:
def _visualization_column(frame):
    """Return one wandb.Html object per row of `frame`, built from visualize()."""
    return [wandb.Html(visualize(record)) for _, record in frame.iterrows()]


df["visualization"] = _visualization_column(df)
split_df["visualization"] = _visualization_column(split_df)

## Log Dataset

In [None]:
# Log the annotated frame as a "dataset" artifact. The run is used as a
# context manager so it is finished when the block exits, instead of staying
# open for the rest of the notebook session.
with wandb.init(project="PII Data Detection", job_type="load-data") as run:
    raw_data = wandb.Artifact(
        name="raw_dataset",
        type="dataset"
    )

    table = wandb.Table(dataframe=df)
    raw_data.add(table, "df_table")
    run.log_artifact(raw_data)

    # Not enabled: logging the strided dataset from a parquet file.
    # processed_data = wandb.Artifact(
    #     name=f'stride_{stride}_dataset',
    #     type="dataset")
    # processed_data.add_file(f'stride_{stride}_dataset.parquet')
    # run.log_artifact(processed_data)