# 目的
- 全てのデータの文字列を、O以外のタグごとに並べる

In [31]:
import polars as pl
import ast

In [32]:
# Load the competition training set; it is read as JSON, so it needs no
# post-processing here.
train = pl.read_json("../data/train.json")

# The two external CSV datasets hold the list-valued columns as Python-literal
# strings, so each of those columns is parsed back with ast.literal_eval.
moredata = pl.read_csv("../data/moredata_dataset_fixed.csv").with_columns(
    [
        pl.col(name).map_elements(lambda raw: ast.literal_eval(raw))
        for name in ("tokens", "trailing_whitespace", "labels")
    ]
)
pii_dataset = pl.read_csv("../data/external_pii_dataset.csv").with_columns(
    [
        pl.col(name).map_elements(lambda raw: ast.literal_eval(raw))
        for name in ("tokens", "trailing_whitespace", "labels")
    ]
)

In [33]:
from tqdm.auto import tqdm


def make_token_df(data: pl.DataFrame) -> pl.DataFrame:
    """Collect the tokens of ``data`` into one column per non-"O" label.

    The ``tokens`` and ``labels`` columns of ``data`` are walked in lockstep,
    row by row and then element by element. Every token whose label is one of
    the 14 B-/I- labels listed in ``all_labels`` is appended to that label's
    bucket, in encounter order; tokens with any other label are ignored.

    Args:
        data: Frame with ``tokens`` and ``labels`` columns whose rows are
            parallel sequences of tokens and label strings.

    Returns:
        A frame with one column per entry of ``all_labels`` (in that order).
        Columns shorter than the longest one are padded at the end with
        ``None`` so that all columns have the same height.
    """
    all_labels = [
        "B-NAME_STUDENT",
        "I-NAME_STUDENT",
        "B-EMAIL",
        "I-EMAIL",
        "B-USERNAME",
        "I-USERNAME",
        "B-ID_NUM",
        "I-ID_NUM",
        "B-PHONE_NUM",
        "I-PHONE_NUM",
        "B-URL_PERSONAL",
        "I-URL_PERSONAL",
        "B-STREET_ADDRESS",
        "I-STREET_ADDRESS",
    ]

    # One bucket per label; dict insertion order keeps the column order
    # identical to ``all_labels``.
    tokens_by_label: dict[str, list] = {label: [] for label in all_labels}

    for tokens, labels in tqdm(zip(data["tokens"], data["labels"]), total=len(data)):
        for token, label in zip(tokens, labels):
            bucket = tokens_by_label.get(label)
            # Compare against None explicitly: an empty bucket is falsy but
            # is still a valid target.
            if bucket is not None:
                bucket.append(token)

    max_len = max(len(bucket) for bucket in tokens_by_label.values())

    def pad_token(token: list) -> list:
        """Right-pad ``token`` with ``None`` up to ``max_len`` entries."""
        return token + [None] * (max_len - len(token))

    # Every column is padded, including B-NAME_STUDENT: the frame can only be
    # built from equal-length columns, and B-NAME_STUDENT is not guaranteed to
    # be the longest bucket.
    return pl.DataFrame(
        [pad_token(tokens_by_label[label]) for label in all_labels],
        orient="col",
        schema=all_labels,
    )

In [34]:
# Build the per-label token table for the training set (column names are left
# unprefixed).
train_token_df = make_token_df(train)

100%|██████████| 6807/6807 [00:14<00:00, 482.71it/s]


In [35]:
# Per-label token table for the "moredata" dataset; the "more_" prefix keeps
# its column names distinct from those of the other token tables.
moredata_token_df = make_token_df(moredata)
moredata_token_df.columns = [f"more_{name}" for name in moredata_token_df.columns]

100%|██████████| 2000/2000 [00:04<00:00, 453.68it/s]


In [36]:
# Per-label token table for the external PII dataset; the "pii_" prefix keeps
# its column names distinct from those of the other token tables.
pii_dataset_token_df = make_token_df(pii_dataset)
pii_dataset_token_df.columns = [f"pii_{name}" for name in pii_dataset_token_df.columns]

100%|██████████| 4434/4434 [00:05<00:00, 818.58it/s]


In [37]:
# Place the three token tables side by side and write the combined table to
# "token_df.csv" in the current working directory.
# NOTE(review): the three tables generally differ in height; confirm that the
# installed polars version null-pads shorter frames for how="horizontal".
pl.concat(
    [train_token_df, moredata_token_df, pii_dataset_token_df], how="horizontal"
).write_csv("token_df.csv")

In [39]:
# Display the "moredata" token table as the cell output for a visual check.
moredata_token_df

more_B-NAME_STUDENT,more_I-NAME_STUDENT,more_B-EMAIL,more_I-EMAIL,more_B-USERNAME,more_I-USERNAME,more_B-ID_NUM,more_I-ID_NUM,more_B-PHONE_NUM,more_I-PHONE_NUM,more_B-URL_PERSONAL,more_I-URL_PERSONAL,more_B-STREET_ADDRESS,more_I-STREET_ADDRESS
null,null,null,null,null,null,null,null,null,null,null,null,null,null
