# 目的
- ラベル「O」がLB(リーダーボード)のスコア計算に使われているかを判断する。
- exp020のLBは0.9110

# 詳細
- https://www.notion.so/O-sample_sub-0-or-1-6b853c5f23bc4de7a0ed7b6eb194bdb9?pvs=4

In [13]:
import polars as pl

In [14]:
# Load the training data (JSON) and the saved exp020 output (CSV).
# NOTE(review): the CSV is presumably the validation predictions of exp020,
# judging by its path and the *_pred columns used below -- confirm.
train = pl.read_json("../data/train.json")
data = pl.read_csv("../valid_df/exp020.csv")

In [15]:
# Show the (rows, columns) shape of the training data.
train.shape

(6807, 5)

In [16]:
# Limit DataFrame display to at most 10 rows.
pl.Config.set_tbl_rows(10)

polars.config.Config

# 出力からラベル列を生成する

In [17]:
# Build a tidy prediction table with one row per predicted token.
# The literal string "null" is converted to a real null first (presumably the
# CSV encodes missing predictions that way), so that incomplete rows can be
# dropped and the id columns can be cast to integers.
_document_expr = pl.col("document_pred").replace("null", None).cast(pl.Int64)
_token_expr = pl.col("token_pred").replace("null", None).cast(pl.Int64)
_label_expr = pl.col("label_pred").replace("null", None)

_selected_preds = data.select(_document_expr, _token_expr, _label_expr)
pred_df = _selected_preds.drop_nulls().sort("document_pred")

In [18]:
# train_only_valid_document = train.filter(
#     pl.col("document").is_in(
#         pred_df.get_column("document_pred").unique()
#     )
# )

In [19]:
# train_only_valid_document = train_only_valid_document.with_columns(
#     pl.col("tokens").map_elements(len).alias("tokens_len"),
# )

In [20]:
# Attach the number of tokens of each document as `tokens_len`.
# The native list-length expression replaces the per-row Python `len`
# callback (assumes `tokens` is a List column -- it is read from JSON arrays).
# The cast keeps an Int64 column, which is what the Python callback yields.
train_with_token_len = train.with_columns(
    pl.col("tokens").list.len().cast(pl.Int64).alias("tokens_len"),
)

In [21]:
# Collapse the predictions to one row per document: the predicted token
# indices and labels become list columns.
_preds_per_document = pred_df.group_by("document_pred").agg(
    pl.col("token_pred"),
    pl.col("label_pred"),
)

# Bring in each document's token count and ground-truth labels from the
# training data; a left join keeps every predicted document.
_train_columns = train_with_token_len.select(["document", "tokens_len", "labels"])
pred_df_agg_with_len = _preds_per_document.join(
    _train_columns,
    left_on="document_pred",
    right_on="document",
    how="left",
)

In [22]:
def _fill_labels(length, positions, labels):
    """Return a label list of *length* items, "O" except at *positions*."""
    filled = ["O"] * length
    for position, label in zip(positions, labels):
        filled[position] = label
    return filled


# Expand the sparse predictions into one full-length label sequence per
# document: every token defaults to "O" and predicted positions are overwritten.
label_pred_alls = [
    _fill_labels(n_tokens, positions, labels)
    for positions, labels, n_tokens in zip(
        pred_df_agg_with_len["token_pred"],
        pred_df_agg_with_len["label_pred"],
        pred_df_agg_with_len["tokens_len"],
    )
]

# Keep only the ground-truth and the reconstructed prediction sequences.
_with_full_predictions = pred_df_agg_with_len.with_columns(
    pl.Series("label_pred_all", label_pred_alls)
)
actual_pred_df = _with_full_predictions.select(["labels", "label_pred_all"])

In [23]:
from seqeval.metrics.sequence_labeling import precision_recall_fscore_support

# Micro-averaged F-beta on the original labels, for beta = 1 and beta = 5.
# Index 2 of the returned tuple is the F-score.
_true_sequences = actual_pred_df["labels"].to_list()
_pred_sequences = actual_pred_df["label_pred_all"].to_list()

calculated_f1_score, calculated_f5_score = [
    precision_recall_fscore_support(
        _true_sequences,
        _pred_sequences,
        beta=beta,
        average="micro",
    )[2]
    for beta in (1, 5)
]

calculated_f1_score, calculated_f5_score

(0.8816568047337279, 0.9407479358912092)

In [24]:
from seqeval.metrics import classification_report

# Per-class precision / recall / F1 on the original labels.
_report = classification_report(
    actual_pred_df["labels"].to_list(),
    actual_pred_df["label_pred_all"].to_list(),
)
print(_report)

                precision    recall  f1-score   support

         EMAIL       0.80      1.00      0.89         4
        ID_NUM       0.86      1.00      0.92        18
  NAME_STUDENT       0.83      0.97      0.89       268
     PHONE_NUM       1.00      1.00      1.00         1
STREET_ADDRESS       0.00      0.00      0.00         1
  URL_PERSONAL       0.82      0.67      0.74        21
      USERNAME       1.00      1.00      1.00         2

     micro avg       0.83      0.95      0.88       315
     macro avg       0.76      0.80      0.78       315
  weighted avg       0.83      0.95      0.88       315



# 仮にOが評価される場合

In [33]:
def _replace_outside_label(sequences, placeholder="Something"):
    """Return copies of *sequences* with every "O" label replaced.

    Args:
        sequences: list of label lists, one list per document.
        placeholder: label written in place of each "O".

    Returns:
        A new list of label lists; the input is not modified.
    """
    return [
        [placeholder if label == "O" else label for label in sequence]
        for sequence in sequences
    ]


# Rename "O" in both the ground truth and the predictions so that these
# tokens are scored like any other class.
# NOTE(review): the report lists this class as "omething", presumably because
# seqeval treats the first character of a label as the chunk tag -- confirm.
replaced_actual_labels = _replace_outside_label(actual_pred_df["labels"].to_list())
replaced_pred_labels = _replace_outside_label(
    actual_pred_df["label_pred_all"].to_list()
)

In [34]:
# Per-class report after "O" has been replaced by a scored label.
_report_with_outside = classification_report(
    replaced_actual_labels,
    replaced_pred_labels,
)
print(_report_with_outside)



                precision    recall  f1-score   support

         EMAIL       0.80      1.00      0.89         4
        ID_NUM       0.86      1.00      0.92        18
  NAME_STUDENT       0.83      0.97      0.89       268
     PHONE_NUM       1.00      1.00      1.00         1
STREET_ADDRESS       0.00      0.00      0.00         1
  URL_PERSONAL       0.82      0.67      0.74        21
      USERNAME       1.00      1.00      1.00         2
      omething       1.00      1.00      1.00    147806

     micro avg       1.00      1.00      1.00    148121
     macro avg       0.79      0.83      0.81    148121
  weighted avg       1.00      1.00      1.00    148121



In [35]:
from seqeval.metrics.sequence_labeling import precision_recall_fscore_support

# Micro-averaged F-beta (beta = 1 and beta = 5) with the "O" tokens scored as
# a class of their own. Index 2 of the returned tuple is the F-score.
calculated_f1_score, calculated_f5_score = [
    precision_recall_fscore_support(
        replaced_actual_labels,
        replaced_pred_labels,
        beta=beta,
        average="micro",
    )[2]
    for beta in (1, 5)
]

calculated_f1_score, calculated_f5_score

(0.9994666486632461, 0.9994604200725811)