In [1]:
import polars as pl

In [2]:
# Load the training set from a path relative to the notebook's working directory.
# NOTE(review): the doubled slash in "..//data" looks unintentional (the JSON dump
# cell uses "../data/..."); it typically resolves to the same file — confirm and normalize.
train = pl.read_json("..//data/train.json")
# Preview the first rows; the bare last expression is what the cell renders.
train.head()

document,full_text,tokens,trailing_whitespace,labels
i64,str,list[str],list[bool],list[str]
7,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
10,"""Diego Estrada …","[""Diego"", ""Estrada"", … "" ""]","[true, false, … false]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]"
16,"""Reporting proc…","[""Reporting"", ""process"", … "" ""]","[true, false, … false]","[""O"", ""O"", … ""O""]"
20,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
56,"""Assignment: V…","[""Assignment"", "":"", … "" ""]","[false, false, … false]","[""O"", ""O"", … ""O""]"


In [3]:
def get_has_label_seq(label: list[str]) -> str:
    """Encode which PII tags occur in one document as a pattern string.

    The result consists of 14 ``0``/``1`` flags joined by ``_``, one flag per
    tag in the fixed order of ``tag_order`` below. A flag is ``1`` when the
    tag appears at least once in ``label``. A hard-coded set of rare patterns
    is collapsed into the single bucket ``"Other"``.

    Args:
        label: Token-level labels of one document. Only membership is tested,
            so order and duplicates are irrelevant; any other label value
            (e.g. ``"O"``) is ignored.

    Returns:
        The 14-flag pattern string, or ``"Other"`` when the pattern is one of
        the listed rare patterns.
    """
    # The position of each tag defines the pattern format; do not reorder.
    tag_order = (
        "B-NAME_STUDENT",
        "I-NAME_STUDENT",
        "B-EMAIL",
        "I-EMAIL",
        "B-USERNAME",
        "I-USERNAME",
        "B-ID_NUM",
        "I-ID_NUM",
        "B-PHONE_NUM",
        "I-PHONE_NUM",
        "B-URL_PERSONAL",
        "I-URL_PERSONAL",
        "B-STREET_ADDRESS",
        "I-STREET_ADDRESS",
    )

    # Patterns that occur for only a single document are treated as "Other".
    rare_patterns = {
        "1_1_0_0_0_0_0_0_0_0_1_1_0_0",
        "0_0_1_0_0_0_0_0_1_1_0_0_0_0",
        "1_1_0_0_1_0_0_0_0_0_0_0_0_0",
        "1_1_1_0_1_0_0_0_0_0_0_0_0_0",
        "1_1_1_0_0_0_1_0_0_0_0_0_0_0",
        "1_1_0_0_0_0_0_0_1_0_1_0_1_1",
        "0_0_1_0_0_0_0_0_0_0_1_0_0_0",
        "0_0_0_0_0_0_0_0_0_0_0_0_1_1",
        "0_0_0_0_0_0_1_1_0_0_0_0_0_0",
    }

    # Deduplicate the labels once (no sorting is needed) so every membership
    # test below is a constant-time set lookup.
    present = set(label)
    output_label = "_".join("1" if tag in present else "0" for tag in tag_order)

    return "Other" if output_label in rare_patterns else output_label

In [4]:
# Raise the display length limit for string values to 100 characters.
pl.Config.set_fmt_str_lengths(n=100)

polars.config.Config

In [5]:
# Derive one "pattern" string per document from its token-level labels.
# return_dtype is passed explicitly so polars does not have to infer the output
# dtype from the first value the function returns.
train = train.with_columns(
    pl.col("labels")
    .map_elements(get_has_label_seq, return_dtype=pl.Utf8)
    .alias("pattern")
)

In [6]:
# Show how many documents fall into each pattern, sorted by count.
train.get_column("pattern").value_counts(sort=True)

pattern,count
str,u32
"""0_0_0_0_0_0_0_0_0_0_0_0_0_0""",5862
"""1_1_0_0_0_0_0_0_0_0_0_0_0_0""",743
"""1_0_0_0_0_0_0_0_0_0_0_0_0_0""",74
"""0_0_0_0_0_0_0_0_0_0_1_0_0_0""",40
"""1_1_0_0_0_0_0_0_0_0_1_0_0_0""",25
"""1_1_0_0_0_0_1_0_0_0_0_0_0_0""",23
"""1_1_1_0_0_0_0_0_0_0_0_0_0_0""",12
"""Other""",9
"""0_0_0_0_0_0_1_0_0_0_0_0_0_0""",5
"""1_1_1_0_0_0_0_0_0_0_1_0_0_0""",4


In [13]:
# Inspect the full text of the fourth document (index 3) whose pattern is all
# zeros, i.e. none of the 14 tracked tags occur in its labels.
# NOTE(review): this cell's execution count (13) is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing.
train.filter(pl.col("pattern").eq("0_0_0_0_0_0_0_0_0_0_0_0_0_0"))["full_text"][3]

'The development of technology has changed the world, has given mankind new standards of living and  allowed it to work miracles that once seemed fantastic. However, the same technologies have led to a  new stage in the development of society, which is characterized by unreasonable expenditure of  resources and environmental pollution.\n\nTo solve these problems, technical innovations alone will not be enough. Society needs a new approach  to innovation, which would bring the needs of mankind into harmony with the natural world.\n\nDesign thinking can help in this, which is based on the techniques used by designers, but also applicable  in other areas. Design thinking allows each person to use their innate creative abilities.\n\nOnce upon a time, designers were engaged almost exclusively in the design of new products. Design  concepts did not go beyond this special area.\n\nRecently, however, the same principles have been applied not only in the design of specific objects, but  also in

In [7]:
# Map each document id to its pattern string; if an id were repeated, the last
# occurrence would win.
document_pattern_dict = dict(zip(train["document"], train["pattern"]))

In [8]:
import json

# Persist the document -> pattern mapping as JSON.
# JSON object keys are always strings, so the integer document ids are written
# as string keys; convert them back with int() when loading the file.
with open("../data/document_pattern_dict.json", "w") as f:
    json.dump(document_pattern_dict, f)

ClassLabel(names=['0_0_0_0_0_0_0_0_0_0_0_0_0_0', '0_0_0_0_0_0_0_0_0_0_1_0_0_0', '0_0_0_0_0_0_1_0_0_0_0_0_0_0', '0_0_0_0_1_0_0_0_0_0_0_0_0_0', '0_0_1_0_0_0_0_0_0_0_0_0_0_0', '1_0_0_0_0_0_0_0_0_0_0_0_0_0', '1_0_0_0_0_0_1_0_0_0_0_0_0_0', '1_1_0_0_0_0_0_0_0_0_0_0_0_0', '1_1_0_0_0_0_0_0_0_0_1_0_0_0', '1_1_0_0_0_0_1_0_0_0_0_0_0_0', '1_1_1_0_0_0_0_0_0_0_0_0_0_0', '1_1_1_0_0_0_0_0_0_0_1_0_0_0', '1_1_1_0_0_0_0_0_1_1_0_0_0_0', 'Other'], id=None)