# 目的
- O以外のタグについて、B,Iで結合した文字列を取得し、データフレームにする

In [3]:
import polars as pl
import ast
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Columns that the CSV exports store as stringified Python literals.
_literal_columns = ("tokens", "trailing_whitespace", "labels")


def _load_csv_with_literals(path):
    """Read a CSV and parse its stringified literal columns via ``ast.literal_eval``."""
    frame = pl.read_csv(path)
    return frame.with_columns(
        [
            pl.col(name).map_elements(lambda x: ast.literal_eval(x))
            for name in _literal_columns
        ]
    )


# JSON sources are read as-is; CSV sources need their literal columns parsed.
train = pl.read_json("../data/train.json")
moredata = _load_csv_with_literals("../data/moredata_dataset_fixed.csv")
pii_dataset = _load_csv_with_literals("../data/pii_dataset_fixed.csv")
mixtral = pl.read_json("../data/mixtral-8x7b-v1.json")
mpware = pl.read_json("../data/mpware_mixtral8x7b_v1.1.json")
generate_text = _load_csv_with_literals(
    "../make_base_essay_100/generated_pii_data.csv"
)

In [11]:
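# If a label starts with "B-", start a new string with its token and keep the label suffix (the entity type).
# If the next token's label starts with "I-" and has the same suffix as the previous one, append the token to that string.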

In [5]:
def tokens_to_entities(tokens, labels):
    """Merge BIO-tagged tokens into ``(entity_text, entity_type)`` pairs.

    An entity starts at a ``B-<type>`` label and extends over the directly
    following ``I-<type>`` labels of the same type. Any other label (``O``,
    an ``I-`` tag of a different type, ...) closes the entity in progress, so
    tokens separated by non-entity text are never glued together.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of BIO label strings aligned with ``tokens``. If the
            two sequences differ in length, the extra items are ignored.

    Returns:
        A list of ``(text, type)`` tuples in order of appearance. ``text`` is
        the entity's tokens joined by single spaces and ``type`` is the label
        with its ``B-`` prefix removed.
    """
    entities = []
    entity = []
    current_label = None
    for token, label in zip(tokens, labels):
        # Continuation: only valid while an entity of the same type is open.
        if entity and label.startswith("I-") and label[2:] == current_label:
            entity.append(token)
            continue

        # Every other label ends the entity being built (if any).
        if entity:
            entities.append((" ".join(entity), current_label))
            entity = []
            current_label = None

        if label.startswith("B-"):
            # Start a new entity; keep the type without the 'B-' prefix.
            entity = [token]
            current_label = label[2:]
        # "O" labels and stray/mismatched "I-" labels are skipped.

    # Flush an entity that runs up to the end of the sequence.
    if entity:
        entities.append((" ".join(entity), current_label))
    return entities


def make_token_df(data: pl.DataFrame) -> pl.DataFrame:
    """Collect every labelled entity in ``data`` into one column per PII type.

    The ``tokens`` and ``labels`` columns of ``data`` are walked row by row;
    each row's entities are extracted with ``tokens_to_entities`` and appended
    to the list for their type. Entities whose type is not one of the seven
    known labels are ignored.

    Args:
        data: Frame with aligned ``tokens`` and ``labels`` columns.

    Returns:
        A frame with one column per known label. The columns are independent
        lists: shorter ones are padded with ``None`` up to the length of the
        longest, so values in the same row are unrelated to each other.
    """
    all_labels = [
        "NAME_STUDENT",
        "EMAIL",
        "USERNAME",
        "ID_NUM",
        "PHONE_NUM",
        "URL_PERSONAL",
        "STREET_ADDRESS",
    ]
    # One bucket of entity strings per known label.
    collected: dict[str, list] = {label: [] for label in all_labels}

    rows = zip(data["tokens"], data["labels"])
    for tokens, labels in tqdm(rows, total=len(data)):
        for text, entity_type in tokens_to_entities(tokens, labels):
            bucket = collected.get(entity_type)
            if bucket is not None:
                bucket.append(text)

    max_len = max(len(values) for values in collected.values())

    # Pad every bucket to the same length so they fit into one frame.
    columns = [
        collected[label] + [None] * (max_len - len(collected[label]))
        for label in all_labels
    ]
    return pl.DataFrame(columns, orient="col", schema=all_labels)


# Preview the entity table built from the training set.
make_token_df(train)

100%|██████████| 6807/6807 [00:01<00:00, 5714.26it/s]


NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS
str,str,str,str,str,str,str
"""Nathalie Sylla…","""djones@gmail.c…","""castanedagabri…","""860632713425""","""( 320)202 - 06…","""https://www.ja…","""591 Smith Cent…"
"""Nathalie Sylla…","""matthew72@hotm…","""fdixon""","""530670102508""","""( 223)392 - 27…","""https://www.li…","""743 Erika Bypa…"
"""Nathalie Sylla…","""belindarojas@y…","""fdixon""","""530670102508""","""( 820)913 - 32…","""https://youtu.…",
"""Diego Estrada""","""kennethevans@h…","""meyermichelle""","""875673967537""","""( 820)913 - 32…","""https://www.ha…",
"""Diego Estrada""","""agood@gmail.co…","""jacob59""","""860632713425""","""( 820)913 - 32…","""http://www.bur…",
"""Gilberto Gambo…","""agood@gmail.co…","""holmespatrick""","""557349702179""","""410.526.1667""","""http://jacobs-…",
"""Sindy Samaca""","""hwillis@gmail.…",,"""784372734211""",,"""https://www.yo…",
"""Nadine Born""","""kellyharrison@…",,"""054176622314""",,"""tps://www.face…",
"""Eladio Amaya""","""kellyharrison@…",,"""674915248960""",,"""https://www.yo…",
"""Silvia Villalo…","""lowetyler@hotm…",,"""932353568953""",,"""https://oconne…",


In [6]:
# Build the entity table for the training set.
train_token_df = make_token_df(train)

100%|██████████| 6807/6807 [00:01<00:00, 5727.64it/s]


In [7]:
# Entity table for the "moredata" set; every column name gets a "more_" prefix.
moredata_token_df = make_token_df(moredata)
moredata_token_df = moredata_token_df.rename(
    {name: "more_" + name for name in moredata_token_df.columns}
)

100%|██████████| 2000/2000 [00:00<00:00, 6380.10it/s]


In [8]:
# Entity table for the "pii_dataset" set; every column name gets a "pii_" prefix.
pii_dataset_token_df = make_token_df(pii_dataset)
pii_dataset_token_df = pii_dataset_token_df.rename(
    {name: "pii_" + name for name in pii_dataset_token_df.columns}
)

100%|██████████| 4434/4434 [00:00<00:00, 10645.55it/s]


In [9]:
# Entity table for the "mixtral" set; every column name gets a "mixtral_" prefix.
mixtral_token_df = make_token_df(mixtral)
mixtral_token_df = mixtral_token_df.rename(
    {name: "mixtral_" + name for name in mixtral_token_df.columns}
)

100%|██████████| 2355/2355 [00:00<00:00, 4544.54it/s]


In [10]:
# Entity table for the "mpware" set; every column name gets a "mpware_" prefix.
mpware_token_df = make_token_df(mpware)
mpware_token_df = mpware_token_df.rename(
    {name: "mpware_" + name for name in mpware_token_df.columns}
)

100%|██████████| 2692/2692 [00:00<00:00, 5085.37it/s]


In [11]:
# Entity table for the generated essays; every column name gets a
# "generate_text_" prefix.
generate_text_token_df = make_token_df(generate_text)
generate_text_token_df = generate_text_token_df.rename(
    {name: "generate_text_" + name for name in generate_text_token_df.columns}
)

100%|██████████| 100/100 [00:00<00:00, 6197.72it/s]


In [12]:
# Put all per-source entity tables side by side in a single wide frame.
per_source_frames = [
    train_token_df,
    moredata_token_df,
    pii_dataset_token_df,
    mixtral_token_df,
    mpware_token_df,
    generate_text_token_df,
]
token_df = pl.concat(per_source_frames, how="horizontal")

In [13]:
# Interleave the column names so that, for each position in the per-source
# column lists, the columns of all six sources end up next to each other
# (train first, then moredata, pii_dataset, mixtral, mpware, generate_text).
token_df_column = [
    column_name
    for same_position_columns in zip(
        train_token_df.columns,
        moredata_token_df.columns,
        pii_dataset_token_df.columns,
        mixtral_token_df.columns,
        mpware_token_df.columns,
        generate_text_token_df.columns,
    )
    for column_name in same_position_columns
]

In [14]:
# Reorder the columns to the interleaved order and save the table as CSV.
token_df[token_df_column].write_csv("output/concat_token_df_with_mpware_generate.csv")

In [24]:
# Length of every string in the unprefixed "ID_NUM" column, as a Python list.
# NOTE(review): null (padding) cells appear to come back as None - confirm
# against the polars version in use.
tst = token_df[token_df_column]["ID_NUM"].map_elements(lambda x: len(x)).to_list()

In [27]:
from collections import Counter

# Frequency of each ID_NUM string length collected in `tst`.
Counter(tst)

Counter({None: 22885,
         12: 55,
         15: 5,
         16: 3,
         13: 2,
         19: 2,
         6: 2,
         9: 2,
         7: 2,
         8: 1,
         11: 1,
         10: 1,
         5: 1,
         18: 1})