# 目的
spaCyのdisplaCyを用いて、文章とタグを可視化する

In [1]:
import polars as pl

import spacy
from spacy.tokens import Span
from spacy import displacy

In [2]:
# 一度だけ必要
#!poetry run python -m spacy download en_core_web_sm

In [3]:
# https://matplotlib.org/2.0.2/examples/color/named_colors.html
# Colour table for the entity labels: each entity type contributes two
# entries, one keyed "B-<ENTITY>" and one keyed "I-<ENTITY>", in that order.
options = {
    "colors": {
        f"{prefix}-{entity}": color
        for entity, color_pair in (
            ("NAME_STUDENT", ("aqua", "aliceblue")),
            ("EMAIL", ("forestgreen", "greenyellow")),
            ("USERNAME", ("hotpink", "lightpink")),
            ("ID_NUM", ("purple", "rebeccapurple")),
            ("PHONE_NUM", ("red", "salmon")),
            ("URL_PERSONAL", ("slategrey", "silver")),
            ("STREET_ADDRESS", ("brown", "chocolate")),
        )
        for prefix, color in zip("BI", color_pair)
    }
}

In [4]:
# Load generated_df.csv (path is relative to the notebook) into a polars DataFrame.
original_data = pl.read_csv(
    "../generate_text/make_base_essay_teacher_celebrity_100/generated_df.csv"
)

In [5]:
# Peek at the STREET_ADDRESS value stored in row 8.
original_data.item(8, "STREET_ADDRESS")

'Vicolo Raffaellino, 95 70024, Murgetta (BA)'

In [6]:
import ast

# These three columns are parsed cell by cell with ast.literal_eval, turning
# each stored Python-literal string back into the object it spells out.
data = pl.read_csv(
    "../generate_text/make_base_essay_teacher_celebrity_100/pii_data_make_base_essay_teacher_celebrity_100_2.csv"
).with_columns(
    [
        pl.col(column).map_elements(ast.literal_eval)
        for column in ("tokens", "trailing_whitespace", "labels")
    ]
)

In [7]:
# Pull four columns out of the frame as plain Python lists, in one unpacking.
documents, tokens, full_texts, labels = (
    data.get_column(column).to_list()
    for column in ("document", "tokens", "full_text", "labels")
)

In [8]:
# Show the raw text of the first document.
full_texts[0]

"In addressing the challenge of increasing the online visibility and user engagement on my personal website, https://mcclain.com/better/discover/donaldgarcia, I decided to apply the tool of Storytelling from Module 2 of our course, guided by our instructor, Kimberly Dudley. As someone deeply intrigued by the power of narrative to connect and communicate, the choice of Storytelling stood out for its ability to craft compelling messages that resonate with audiences. This tool, thoroughly explained in our video lectures, emphasizes the creation and sharing of stories to convey ideas in a more impactful manner.\n\nApplying Storytelling to my challenge involved initially understanding the story of my website and its purpose. I embarked on this by crafting a narrative that encapsulated the journey of its creation and the unique value it offers to visitors. This narrative was then integrated across various sections of the site, aiming to weave a coherent and engaging story that visitors could

In [9]:
import spacy
from spacy.tokens import Span, Doc
from spacy import displacy

# https://matplotlib.org/2.0.2/examples/color/named_colors.html
# Colour table for the entity labels: each entity type contributes two
# entries, one keyed "B-<ENTITY>" and one keyed "I-<ENTITY>", in that order.
options = {
    "colors": {
        f"{prefix}-{entity}": color
        for entity, color_pair in (
            ("NAME_STUDENT", ("aqua", "aliceblue")),
            ("EMAIL", ("forestgreen", "greenyellow")),
            ("USERNAME", ("hotpink", "lightpink")),
            ("ID_NUM", ("purple", "rebeccapurple")),
            ("PHONE_NUM", ("red", "salmon")),
            ("URL_PERSONAL", ("slategrey", "silver")),
            ("STREET_ADDRESS", ("brown", "chocolate")),
        )
        for prefix, color in zip("BI", color_pair)
    }
}


class PiiLabelRenderer:
    def __init__(
        self,
        documents: list[str],
        tokens: list[str],
        full_texts: list[str],
        labels: list[str],
        options: dict[str, dict[str, str]],
    ) -> None:
        self.documents = documents
        self.tokens = tokens
        self.full_texts = full_texts
        self.labels = labels
        self.options = options
        self.nlp = spacy.blank("en")

    def get_original_text(
        self, render_idx: int | None = None, documents_id: int | None = None
    ) -> str:
        if render_idx and documents_id:
            raise Exception("Don't Use Both render_idx and documents_id")
        if documents_id:
            render_idx = self.documents.index(documents_id)

        return self.full_texts[render_idx]

    def render(
        self, render_idx: int | None = None, documents_id: int | None = None
    ) -> None:
        if render_idx and documents_id:
            raise Exception("Don't Use Both render_idx and documents_id")
        if documents_id and render_idx is None:
            render_idx = self.documents.index(documents_id)
        if documents_id is None and render_idx:
            documents_id = self.documents[render_idx]
        if documents_id is None and render_idx is None:
            raise Exception("Please specify either render_idx or documents_id")

        print(f"render_idx: {render_idx}, documents_id: {documents_id}")

        doc = self.nlp(self.full_texts[render_idx])
        doc.ents = [
            Span(doc, idx, idx + 1, label=label)
            for idx, label in enumerate(self.labels[render_idx])
            if label != "O"
        ]

        displacy.render(doc, style="ent", jupyter=True, options=options)

In [10]:
# Build the renderer from the column lists and the colour options defined above.
PLR = PiiLabelRenderer(
    documents=documents,
    tokens=tokens,
    full_texts=full_texts,
    labels=labels,
    options=options,
)

In [11]:
# Render the document at list position 8.
PLR.render(render_idx=8)

render_idx: 8, documents_id: 1221555009


In [12]:
# Show row 8 of the parsed DataFrame.
data[8]

document,full_text,tokens,trailing_whitespace,labels,token_map,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS
i64,str,list[str],list[bool],list[str],str,str,str,str,str,str,str,str
1221555009,"""When faced wit…","[""When"", ""faced"", … "".""]","[true, true, … false]","[""O"", ""O"", … ""O""]","""[{'B-URL_PERSO…","""Jolanda Marino…","""jolanda_marino…","""jolanda.marino…","""HHdL.""","""0583868502""","""https://linked…","""Vicolo Raffael…"
