# 目的
以下を計算する
- 外部データセットのラベル数

In [1]:
import polars as pl
import matplotlib.pyplot as plt
import ast

In [31]:
# Load the external PII dataset from its CSV file into a polars DataFrame.
external_pii_dataset = pl.read_csv(source="../data/external_pii_dataset.csv")

In [32]:
# These columns hold Python-literal strings in the CSV; evaluate each cell
# with ast.literal_eval to turn it back into a Python object.
external_pii_dataset = external_pii_dataset.with_columns(
    [
        pl.col(column_name).map_elements(ast.literal_eval)
        for column_name in ("tokens", "trailing_whitespace", "labels")
    ]
)

In [4]:
# For use in a spreadsheet: export only the first 1000 rows.
external_pii_dataset.head(n=1000).write_csv(file="external_pii_dataset_1000.csv")

In [16]:
# Convert the "labels" column from its string form into a list.
# The multi-column parsing cell earlier in this notebook already converts
# "labels"; running ast.literal_eval a second time on the parsed column fails,
# so only parse while the column still has a string dtype.
if external_pii_dataset.get_column("labels").dtype == pl.Utf8:
    external_pii_dataset = external_pii_dataset.with_columns(
        pl.col("labels").map_elements(ast.literal_eval)
    )

In [17]:
# Flat list of every tag across all rows, and how often each tag occurs.
import itertools
from collections import Counter

all_tags = [
    *itertools.chain.from_iterable(
        external_pii_dataset.get_column("labels").to_list()
    )
]
all_tags_count = Counter(all_tags)
all_tags_count

Counter({'O': 1333514,
         'B-NAME_STUDENT': 11104,
         'I-STREET_ADDRESS': 8577,
         'I-NAME_STUDENT': 5667,
         'B-EMAIL': 3794,
         'B-STREET_ADDRESS': 3543,
         'I-PHONE_NUM': 3389,
         'B-PHONE_NUM': 2419,
         'B-USERNAME': 718,
         'B-URL_PERSONAL': 620})

In [78]:
import spacy
from spacy.tokens import Span, Doc
from spacy import displacy

# displaCy rendering options: one colour per label.
# Colour names: https://matplotlib.org/2.0.2/examples/color/named_colors.html
#
# The spans built in this notebook are labelled with the bare entity type
# (the tag with its "B-"/"I-" prefix stripped), and displaCy picks a colour by
# looking up the span label. The bare entity names are therefore listed too,
# using the colour of the matching "B-" tag; the prefixed keys are kept so any
# code that reads them keeps working.
options = {
    "colors": {
        "NAME_STUDENT": "aqua",
        "B-NAME_STUDENT": "aqua",
        "I-NAME_STUDENT": "aliceblue",
        "EMAIL": "forestgreen",
        "B-EMAIL": "forestgreen",
        "I-EMAIL": "greenyellow",
        "USERNAME": "hotpink",
        "B-USERNAME": "hotpink",
        "I-USERNAME": "lightpink",
        "ID_NUM": "purple",
        "B-ID_NUM": "purple",
        "I-ID_NUM": "rebeccapurple",
        "PHONE_NUM": "red",
        "B-PHONE_NUM": "red",
        "I-PHONE_NUM": "salmon",
        "URL_PERSONAL": "slategrey",
        "B-URL_PERSONAL": "slategrey",
        "I-URL_PERSONAL": "silver",
        "STREET_ADDRESS": "brown",
        "B-STREET_ADDRESS": "brown",
        "I-STREET_ADDRESS": "chocolate",
    }
}


class PllLabelRenderer:
    def __init__(
        self,
        documents: list[str],
        tokens: list[str],
        labels: list[str],
        options: dict[str, dict[str, str]],
    ) -> None:
        self.documents = documents
        self.tokens = tokens
        self.labels = labels
        self.options = options
        self.nlp = spacy.blank("en")

    def render(
        self, render_idx: int | None = None, documents_id: int | None = None
    ) -> None:
        if render_idx and documents_id:
            raise Exception("Don't Use Both render_idx and documents_id")
        if documents_id:
            render_idx = self.documents.index(documents_id)

        doc = Doc(self.nlp.vocab, words=self.tokens[render_idx])
        # エンティティをDocに追加
        ents = []
        start = 0
        label_seq: list[str] = self.labels[render_idx]
        for i, label in enumerate(label_seq):
            if label.startswith("B-"):
                end = i + 1
                while end < len(label_seq) and label_seq[end].startswith("I-"):
                    end += 1
                ents.append(Span(doc, start=i, end=end, label=label[2:]))
            elif label == "O":
                start = i + 1

        doc.ents = ents

        displacy.render(doc, style="ent", jupyter=True, options=options)

In [73]:
# Pull the columns needed for rendering out of the DataFrame as Python lists.
documents, tokens, full_texts, labels = (
    external_pii_dataset.get_column(column_name).to_list()
    for column_name in ("document", "tokens", "text", "labels")
)

In [79]:
# Build the renderer over the document ids, token lists and label lists.
PLR = PllLabelRenderer(
    documents=documents,
    tokens=tokens,
    labels=labels,
    options=options,
)

In [80]:
# Render the first document, selecting it by its document id.
PLR.render(documents_id=documents[0])

In [68]:
import spacy
from spacy.tokens import Span, Doc
from spacy import displacy

# Token list and entity-label list for the first document, e.g.
#   words    = ['My', 'name', 'is', 'Aaliyah', 'Popova,', 'and']
#   entities = ['O', 'O', 'O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'O']
words = tokens[0]
entities = labels[0]

# Create a spaCy Doc from the pre-split tokens using a blank pipeline.
nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=words)

# Add the entities to the Doc: one Span per "B-" tag, extended over the
# "I-" tags that directly follow it.
ents = []
# NOTE: `start` is updated on every "O" tag but is not read in this cell.
start = 0
for i, label in enumerate(entities):
    if label == "O":
        start = i + 1
        continue
    if not label.startswith("B-"):
        continue
    end = i + 1
    while end < len(entities) and entities[end].startswith("I-"):
        end += 1
    ents.append(Span(doc, start=i, end=end, label=label[2:]))

doc.ents = ents

# Visualize
displacy.render(doc, style="ent")

In [52]:
import spacy
from spacy.tokens import Span

# Create a blank English spaCy pipeline
nlp = spacy.blank("en")

# Hand-written example: a spaCy Doc built directly from a list of words
words = ["My", "name", "is", "Aaliyah", "Popova,", "and"]
doc = spacy.tokens.Doc(nlp.vocab, words=words)

# Entities as (start token, end token, label) triples; the end index is
# exclusive, so (3, 5) covers 'Aaliyah' and 'Popova,'.
entities = [(3, 5, "NAME_STUDENT")]
spans = [
    Span(doc, first, stop, label=entity_label)
    for first, stop, entity_label in entities
]
doc.ents = spans

# Show the named entities inline in the notebook
from spacy import displacy

displacy.render(doc, style="ent", jupyter=True)

In [36]:
# Build the renderer. The PllLabelRenderer constructor takes `tokens`; it has
# no `full_texts` parameter, so passing that keyword raises TypeError.
PLR = PllLabelRenderer(
    documents=documents, tokens=tokens, labels=labels, options=options
)