# 目的
以下を計算する
- 外部データセットのラベル数

In [1]:
import polars as pl
import matplotlib.pyplot as plt
import ast

import itertools
from collections import Counter

In [2]:
# Load the training data (JSON) and three external datasets (CSV) from ../data.
# NOTE(review): the "_fixed" files are presumably corrected versions of the
# external datasets -- confirm against whatever produced them.
train = pl.read_json("../data/train.json")
pii_dataset = pl.read_csv("../data/external_pii_dataset.csv")
pii_dataset_fixed = pl.read_csv("../data/pii_dataset_fixed.csv")
moredata_dataset_fixed = pl.read_csv("../data/moredata_dataset_fixed.csv")

In [3]:
# read_csv yields these columns as strings holding Python literals; parse each
# cell back into a Python object (ast.literal_eval only accepts literals, so no
# arbitrary code is executed).
pii_dataset = pii_dataset.with_columns(
    [
        pl.col(column).map_elements(ast.literal_eval)
        for column in ("tokens", "trailing_whitespace", "labels")
    ]
)

In [4]:
# read_csv yields these columns as strings holding Python literals; parse each
# cell back into a Python object (ast.literal_eval only accepts literals, so no
# arbitrary code is executed).
pii_dataset_fixed = pii_dataset_fixed.with_columns(
    [
        pl.col(column).map_elements(ast.literal_eval)
        for column in ("tokens", "trailing_whitespace", "labels")
    ]
)

In [5]:
# read_csv yields these columns as strings holding Python literals; parse each
# cell back into a Python object (ast.literal_eval only accepts literals, so no
# arbitrary code is executed).
moredata_dataset_fixed = moredata_dataset_fixed.with_columns(
    [
        pl.col(column).map_elements(ast.literal_eval)
        for column in ("tokens", "trailing_whitespace", "labels")
    ]
)

# Visualization

In [6]:
import spacy
from spacy.tokens import Span, Doc
from spacy import displacy

# Highlight colours passed to displaCy, keyed by BIO label.
# Colour names: https://matplotlib.org/2.0.2/examples/color/named_colors.html
# Each entity type lists its (B- colour, I- colour) pair; the comprehension
# expands the pairs into "B-<ENTITY>" / "I-<ENTITY>" keys in table order.
options = {
    "colors": {
        f"{prefix}-{entity}": color
        for entity, shades in (
            ("NAME_STUDENT", ("aqua", "aliceblue")),
            ("EMAIL", ("forestgreen", "greenyellow")),
            ("USERNAME", ("hotpink", "lightpink")),
            ("ID_NUM", ("purple", "rebeccapurple")),
            ("PHONE_NUM", ("red", "salmon")),
            ("URL_PERSONAL", ("slategrey", "silver")),
            ("STREET_ADDRESS", ("brown", "chocolate")),
        )
        for prefix, color in zip(("B", "I"), shades)
    }
}


class PiiLabelRenderer:
    def __init__(
        self,
        documents: list[str],
        tokens: list[str],
        labels: list[str],
        options: dict[str, dict[str, str]],
    ) -> None:
        self.documents = documents
        self.tokens = tokens
        self.labels = labels
        self.options = options
        self.nlp = spacy.blank("en")

    def render(
        self, render_idx: int | None = None, documents_id: int | None = None
    ) -> None:
        if render_idx and documents_id:
            raise Exception("Don't Use Both render_idx and documents_id")
        if documents_id:
            render_idx = self.documents.index(documents_id)

        doc = Doc(self.nlp.vocab, words=self.tokens[render_idx])
        # エンティティをDocに追加
        ents = []
        label_seq: list[str] = self.labels[render_idx]
        for i, label in enumerate(label_seq):
            if label == "O":
                continue
            ents.append(Span(doc, start=i, end=i + 1, label=label))

        doc.ents = ents

        displacy.render(doc, style="ent", jupyter=True, options=options)

In [8]:
# Extract the columns needed for rendering from pii_dataset_fixed as plain
# Python lists (one entry per row).
documents, tokens, full_texts, labels = (
    pii_dataset_fixed.get_column(column).to_list()
    for column in ("document", "tokens", "text", "labels")
)

In [9]:
# Build the renderer over the lists extracted from pii_dataset_fixed, using the
# label-to-colour mapping defined in `options`.
PLR = PiiLabelRenderer(
    documents=documents, tokens=tokens, labels=labels, options=options
)

In [13]:
# Render the document whose id is stored at position 3 of `documents`
# (the id is resolved back to the position of its first occurrence).
PLR.render(documents_id=documents[3])