# 目的
- 全てのデータの文字列を、O以外のタグごとに並べる、mixtralを含める

In [1]:
import polars as pl
import ast

In [4]:
# CSV columns whose cells hold Python literals serialized as text; they are
# parsed back into Python objects with ``ast.literal_eval`` after loading.
_LITERAL_COLUMNS = ("tokens", "trailing_whitespace", "labels")


def _read_literal_csv(path: str) -> pl.DataFrame:
    """Read the CSV at *path* and parse every column in ``_LITERAL_COLUMNS``."""
    parsed_columns = [
        pl.col(column_name).map_elements(ast.literal_eval)
        for column_name in _LITERAL_COLUMNS
    ]
    return pl.read_csv(path).with_columns(parsed_columns)


# The JSON sources are read as-is; only the CSV sources need literal parsing.
train = pl.read_json("../data/train.json")
moredata = _read_literal_csv("../data/moredata_dataset_fixed.csv")
pii_dataset = _read_literal_csv("../data/pii_dataset_fixed.csv")
mixtral = pl.read_json("../data/mixtral-8x7b-v1.json")

In [10]:
from tqdm.auto import tqdm


def make_token_df(data: pl.DataFrame) -> pl.DataFrame:
    """Group the tokens of *data* by label, one output column per label.

    Each row's ``tokens`` and ``labels`` are walked in lockstep (``zip``, so a
    longer side is silently truncated to the shorter one). A token is appended
    to the bucket of its label; tokens whose label is not in ``all_labels``
    (for example ``"O"``) are ignored.

    Args:
        data: Frame with a ``tokens`` column and a ``labels`` column, each row
            holding two parallel sequences.

    Returns:
        A frame with one column per entry of ``all_labels``, in that order.
        Every column lists the collected tokens in encounter order and is
        padded with ``None`` up to the length of the longest column. Dtypes
        are left to polars' inference, as before.
    """
    # Single source of truth for the labels: it drives the buckets, the
    # padding and the output schema, so they cannot drift out of sync.
    all_labels = (
        "B-NAME_STUDENT",
        "I-NAME_STUDENT",
        "B-EMAIL",
        "I-EMAIL",
        "B-USERNAME",
        "I-USERNAME",
        "B-ID_NUM",
        "I-ID_NUM",
        "B-PHONE_NUM",
        "I-PHONE_NUM",
        "B-URL_PERSONAL",
        "I-URL_PERSONAL",
        "B-STREET_ADDRESS",
        "I-STREET_ADDRESS",
    )

    # dicts preserve insertion order, so iterating the buckets later yields
    # the columns in the same order as ``all_labels``.
    tokens_by_label: dict[str, list] = {label: [] for label in all_labels}

    for tokens, labels in tqdm(zip(data["tokens"], data["labels"]), total=len(data)):
        for token, label in zip(tokens, labels):
            bucket = tokens_by_label.get(label)
            # Labels outside ``all_labels`` have no bucket and are skipped.
            if bucket is not None:
                bucket.append(token)

    # Columns of one frame must share a length: pad the shorter ones with None.
    longest = max(len(bucket) for bucket in tokens_by_label.values())
    padded_columns = [
        bucket + [None] * (longest - len(bucket))
        for bucket in tokens_by_label.values()
    ]

    return pl.DataFrame(
        padded_columns,
        orient="col",
        schema=list(all_labels),
    )

In [11]:
# Build the per-label token table for the `train` frame; its column names are
# left unprefixed, unlike the tables of the other sources built below.
train_token_df = make_token_df(train)

100%|██████████| 6807/6807 [00:00<00:00, 7232.89it/s]


In [12]:
# Build the per-label token table for `moredata` and prefix every column name
# with "more_" so the columns stay distinguishable from the other sources.
moredata_token_df = make_token_df(moredata)
moredata_token_df = moredata_token_df.rename(
    {label: f"more_{label}" for label in moredata_token_df.columns}
)

100%|██████████| 2000/2000 [00:00<00:00, 9534.12it/s]


In [13]:
# Build the per-label token table for `pii_dataset` and prefix every column
# name with "pii_" so the columns stay distinguishable from the other sources.
pii_dataset_token_df = make_token_df(pii_dataset)
pii_dataset_token_df = pii_dataset_token_df.rename(
    {label: f"pii_{label}" for label in pii_dataset_token_df.columns}
)

100%|██████████| 4434/4434 [00:00<00:00, 13493.53it/s]


In [14]:
# Build the per-label token table for `mixtral` and prefix every column name
# with "mixtral_" so the columns stay distinguishable from the other sources.
mixtral_token_df = make_token_df(mixtral)
mixtral_token_df = mixtral_token_df.rename(
    {label: f"mixtral_{label}" for label in mixtral_token_df.columns}
)

  0%|          | 0/2355 [00:00<?, ?it/s]

100%|██████████| 2355/2355 [00:00<00:00, 5937.68it/s]


In [16]:
# Put the four per-source token tables side by side in a single frame.
per_source_token_dfs = [
    train_token_df,
    moredata_token_df,
    pii_dataset_token_df,
    mixtral_token_df,
]
token_df = pl.concat(per_source_token_dfs, how="horizontal")

In [18]:
# Interleave the column names label by label: for each label position, the
# train, moredata, pii_dataset and mixtral column names follow one another.
token_df_column = [
    column_name
    for names_for_one_label in zip(
        train_token_df.columns,
        moredata_token_df.columns,
        pii_dataset_token_df.columns,
        mixtral_token_df.columns,
    )
    for column_name in names_for_one_label
]

['B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-EMAIL',
 'I-EMAIL',
 'B-USERNAME',
 'I-USERNAME',
 'B-ID_NUM',
 'I-ID_NUM',
 'B-PHONE_NUM',
 'I-PHONE_NUM',
 'B-URL_PERSONAL',
 'I-URL_PERSONAL',
 'B-STREET_ADDRESS',
 'I-STREET_ADDRESS']

In [57]:
# Display the per-label token table built from `train`.
train_token_df

B-NAME_STUDENT,I-NAME_STUDENT,B-EMAIL,I-EMAIL,B-USERNAME,I-USERNAME,B-ID_NUM,I-ID_NUM,B-PHONE_NUM,I-PHONE_NUM,B-URL_PERSONAL,I-URL_PERSONAL,B-STREET_ADDRESS,I-STREET_ADDRESS
str,str,str,null,str,null,str,str,str,str,str,str,str,str
"""Nathalie""","""Sylla""","""djones@gmail.c…",,"""castanedagabri…",,"""860632713425""","""30407059""","""(""","""320)202""","""https://www.ja…","""nYZqnhEXw""","""591""","""Smith"""
"""Nathalie""","""Sylla""","""matthew72@hotm…",,"""fdixon""",,"""530670102508""",,"""(""","""-""","""https://www.li…",,"""743""","""Centers"""
"""Nathalie""","""Sylla""","""belindarojas@y…",,"""fdixon""",,"""530670102508""",,"""(""","""0688x95843""","""https://youtu.…",,,"""Apt"""
"""Diego""","""Estrada""","""kennethevans@h…",,"""meyermichelle""",,"""875673967537""",,"""(""","""223)392""","""https://www.ha…",,,"""."""
"""Diego""","""Estrada""","""agood@gmail.co…",,"""jacob59""",,"""860632713425""",,"""(""","""-""","""http://www.bur…",,,"""656"""
"""Gilberto""","""Gamboa""","""agood@gmail.co…",,"""holmespatrick""",,"""557349702179""",,"""410.526.1667""","""2765""","""http://jacobs-…",,,""" """
"""Sindy""","""Samaca""","""hwillis@gmail.…",,,,"""784372734211""",,,"""820)913""","""https://www.yo…",,,"""Joshuamouth"""
"""Nadine""","""Born""","""kellyharrison@…",,,,"""054176622314""",,,"""-""","""tps://www.face…",,,""","""
"""Eladio""","""Amaya""","""kellyharrison@…",,,,"""674915248960""",,,"""3241x894""","""https://www.yo…",,,"""RI"""
"""Silvia""","""Villalobos""","""lowetyler@hotm…",,,,"""932353568953""",,,"""820)913""","""https://oconne…",,,"""95963"""
