# 目的
モデルの出力の漏れの確認

In [2]:
# Value of the "document" field of the single training example to inspect;
# the dataset is filtered down to this document in the Data Load section.
CHECK_DOCUMENT = 2804

# Constant

In [3]:
# Directory of the fine-tuned checkpoint; both the model and the tokenizer
# are loaded from here.
TRAINED_MODEL_PATH = "../trained_model/exp024"

# Directory that contains train.json.
DATA_PATH = "../data"
# NOTE(review): MODEL_NAME is not referenced anywhere else in the visible notebook.
MODEL_NAME = "microsoft/deberta-v3-base"
# max_length handed to the tokenizer (together with truncation=True) in tokenize().
INFERENCE_MAX_LENGTH = 2048

# Install
https://www.kaggle.com/competitions/benetech-making-graphs-accessible/discussion/410130#2258335

In [4]:
# %pip install polars==0.20.10
# %pip install transformers==4.37.2
# %pip install datasets==2.16.1
# %pip install evaluate==0.4.1
# %pip install seqeval==1.2.2
# %pip install accelerate
# %pip install python-dotenv
# %pip install kaggle
# %pip install wandb==0.16.3

# # formatter
# %pip install black isort

# Import

In [5]:
import evaluate
import numpy as np
import polars as pl
import torch
import datasets
from datasets import load_dataset
from tqdm.auto import tqdm
import transformers
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


# Check Environment

In [6]:
# Record the interpreter version this notebook was executed with.
!python --version

Python 3.12.1


In [7]:
# Show GPU status. In the recorded run the command was not found, i.e. the
# notebook was executed on a machine without nvidia-smi on the PATH.
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [8]:
# Fail fast when the environment drifts from the pinned library versions, and
# report the version that was actually found so the mismatch is obvious.
assert (
    transformers.__version__ == "4.37.2"
), f"transformers=={transformers.__version__} found, expected 4.37.2"
assert (
    datasets.__version__ == "2.16.1"
), f"datasets=={datasets.__version__} found, expected 2.16.1"
assert (
    evaluate.__version__ == "0.4.1"
), f"evaluate=={evaluate.__version__} found, expected 0.4.1"

# Data Load

In [9]:
# Load train.json with the JSON loader; the examples are exposed under the
# "train" split (accessed later as train_dataset["train"]).
train_dataset = load_dataset("json", data_files={"train": f"{DATA_PATH}/train.json"})

In [10]:
# Select the data to inspect here:
# keep only the example(s) whose "document" field equals CHECK_DOCUMENT.
train_dataset = train_dataset.filter(
    lambda example: example["document"] == CHECK_DOCUMENT, num_proc=3
)

Filter (num_proc=3): 100%|██████████| 6807/6807 [00:02<00:00, 2934.92 examples/s]


In [11]:
# Load the fine-tuned token-classification model and its tokenizer from the
# same checkpoint directory.
model = AutoModelForTokenClassification.from_pretrained(TRAINED_MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)

In [12]:
# Sanity check: tokenize a sample address string and display the resulting
# token strings. In the recorded output the double space and the newline do
# not produce tokens of their own.
tmp = tokenizer("Waseem Mabunda  591 Smith Centers Apt. 656\nJoshuamouth, R")[
    "input_ids"
]
tokenizer.convert_ids_to_tokens(tmp)

['[CLS]',
 '▁Was',
 'eem',
 '▁Mab',
 'unda',
 '▁591',
 '▁Smith',
 '▁Centers',
 '▁Apt',
 '.',
 '▁656',
 '▁Joshua',
 'mouth',
 ',',
 '▁R',
 '[SEP]']

In [13]:
# Look up the token string for one id taken from the sample's input ids.
tokenizer.convert_ids_to_tokens([73716])

['▁656']

In [14]:
# Display the raw input ids of the sample string tokenized above.
tmp

[1,
 4045,
 33829,
 42382,
 32141,
 86726,
 2430,
 13291,
 40572,
 260,
 73716,
 10917,
 15052,
 261,
 909,
 2]

# Prediction

In [15]:
def tokenize(example, tokenizer):
    """Rebuild the text of one example and tokenize it.

    The text is reconstructed by concatenating ``example["tokens"]`` and
    inserting a single space after every token whose
    ``example["trailing_whitespace"]`` flag is truthy.

    Args:
        example: Mapping with the keys "tokens" and "trailing_whitespace"
            (parallel sequences).
        tokenizer: Callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=True,
            max_length=INFERENCE_MAX_LENGTH)``; its result is unpacked with
            ``**`` into the returned dict.

    Returns:
        A dict with everything the tokenizer returned plus "token_map", a
        list with one entry per character of the rebuilt text: the index of
        the source token for token characters, ``-1`` for an inserted space.
    """
    pieces = []
    char_to_token = []

    pairs = zip(example["tokens"], example["trailing_whitespace"])
    for token_index, (token_text, has_trailing_space) in enumerate(pairs):
        pieces.append(token_text)
        # One map entry per character of the token.
        char_to_token += [token_index] * len(token_text)
        if has_trailing_space:
            pieces.append(" ")
            # The inserted space belongs to no token.
            char_to_token.append(-1)

    full_text = "".join(pieces)
    encoded = tokenizer(
        full_text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    return {**encoded, "token_map": char_to_token}

In [16]:
# Add the tokenizer outputs and "token_map" to every example.
# NOTE: per the recorded log, datasets reduces num_proc to 1 here because the
# filtered dataset holds a single example.
train_dataset = train_dataset.map(
    tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
)

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
Map: 100%|██████████| 1/1 [00:00<00:00, 48.89 examples/s]


In [17]:
# Batch collator handed to the Trainer below; configured with
# pad_to_multiple_of=16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [18]:
# The Trainer is only used for inference in this notebook (trainer.predict).
# "." is passed as the first positional argument of TrainingArguments
# (presumably the output directory — confirm against the transformers docs);
# evaluation runs one example per device and report_to="none" is set.
args = TrainingArguments(
    ".",
    per_device_eval_batch_size=1,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
)

In [19]:
# Label id -> label string, used to decode argmax predictions. The position of
# each entry in the list is its label id.
# NOTE(review): hard-coded here; presumably it must match the label order the
# checkpoint was trained with — verify against the model config.
id2label = dict(
    enumerate(
        [
            "O",
            "B-NAME_STUDENT",
            "I-NAME_STUDENT",
            "B-EMAIL",
            "I-EMAIL",
            "B-USERNAME",
            "I-USERNAME",
            "B-ID_NUM",
            "I-ID_NUM",
            "B-PHONE_NUM",
            "I-PHONE_NUM",
            "B-URL_PERSONAL",
            "I-URL_PERSONAL",
            "B-STREET_ADDRESS",
            "I-STREET_ADDRESS",
        ]
    )
)

In [20]:
# Keep the raw fields plus the tokenizer outputs needed for prediction and for
# mapping predictions back to tokens. The "labels" column is not selected.
data = train_dataset["train"].select_columns(
    [
        "trailing_whitespace",
        "full_text",
        "tokens",
        "document",
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "offset_mapping",
        "token_map",
    ]
)

In [23]:
# Alias for the selected columns; this is the dataset passed to
# trainer.predict in the next cell.
test_dataset = data

In [24]:
# Model outputs for every example; indexed below with three axes, the last
# (axis 2) being the one that is normalised and arg-maxed.
predictions = trainer.predict(test_dataset).predictions

# Numerically stable softmax over axis 2: subtracting the per-position maximum
# before exponentiating keeps np.exp from overflowing (inf / inf -> nan) on
# large values and does not change the resulting probabilities.
shifted_predictions = predictions - predictions.max(axis=2, keepdims=True)
exp_predictions = np.exp(shifted_predictions)
pred_softmax = exp_predictions / exp_predictions.sum(axis=2, keepdims=True)

# Predicted label id per position.
preds_final = predictions.argmax(-1)

100%|██████████| 1/1 [00:00<00:00, 1349.52it/s]


# Make CV DataFrame

In [26]:
def get_valid_preds(trainer, valid_dataset):
    """Run prediction on ``valid_dataset`` with ``trainer``.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes a
            ``predictions`` attribute supporting ``argmax``.
        valid_dataset: Dataset handed to ``trainer.predict`` unchanged.

    Returns:
        ``predictions.argmax(-1)``, i.e. the index of the largest value along
        the last axis of the predictions.
    """
    prediction_output = trainer.predict(valid_dataset)
    return prediction_output.predictions.argmax(-1)


def tokenize(example, tokenizer):
    """Rebuild the text of one example and tokenize it.

    NOTE(review): this cell re-defines ``tokenize``, which is already defined
    with the same logic in the Prediction section; the later definition
    shadows the earlier one. Consider keeping a single definition.

    Args:
        example: Mapping with the keys "tokens" and "trailing_whitespace"
            (parallel sequences).
        tokenizer: Callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=True,
            max_length=INFERENCE_MAX_LENGTH)``.

    Returns:
        A dict with everything the tokenizer returned plus "token_map", a
        list with one entry per character of the rebuilt text: the index of
        the source token for token characters, ``-1`` for an inserted space.
    """
    pieces = []
    char_to_token = []

    pairs = zip(example["tokens"], example["trailing_whitespace"])
    for token_index, (token_text, has_trailing_space) in enumerate(pairs):
        pieces.append(token_text)
        # One map entry per character of the token.
        char_to_token += [token_index] * len(token_text)
        if has_trailing_space:
            pieces.append(" ")
            # The inserted space belongs to no token.
            char_to_token.append(-1)

    encoded = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    return {**encoded, "token_map": char_to_token}


def get_output_part(preds_final, valid_dataset, label_lookup=None):
    """Turn per-position label ids into per-token prediction columns.

    Every tokenizer position is mapped back, through its character offset and
    ``token_map``, to the index of the original token it starts in. Positions
    predicted as "O" and positions that resolve to no token are dropped.

    Args:
        preds_final: One sequence of predicted label ids per example, aligned
            position by position with that example's "offset_mapping". The
            two are zipped, so surplus positions on either side are ignored.
        valid_dataset: Object that yields the columns "token_map",
            "offset_mapping", "tokens" and "document" when indexed by name.
        label_lookup: Optional mapping from label id to label string. When
            omitted, the notebook-level ``id2label`` is used.

    Returns:
        Four parallel lists ``(document, token, label, token_str)`` with one
        entry per distinct (document, label, token index) prediction, in
        order of first appearance.
    """
    if label_lookup is None:
        label_lookup = id2label

    document, token, label, token_str = [], [], [], []
    # token_map links each character index of the rebuilt text to the index of
    # the token that character belongs to. For example:
    #   start_idx = 0 -> tokens[token_map[start_idx]] == 'Design'
    #   start_idx = 1 -> tokens[token_map[start_idx]] == 'Design'
    #   ...
    #   start_idx = 5 -> tokens[token_map[start_idx]] == 'Design'
    #   start_idx = 6 -> tokens[token_map[start_idx]] == '\n\n'

    # Known issue: one token index can end up with two different predictions.
    # The tokenizer may split a single token into several pieces, and pieces
    # predicted with different labels are recorded as separate entries.
    # Example for 'kellyharrison@gmail.com':
    #   '\nkelly'  -> 'B-EMAIL'
    #   '##harris' -> 'B-EMAIL'
    #   '##on'     -> 'I-NAME_STUDENT'
    #   '@'        -> 'B-EMAIL'
    #   'gmail'    -> 'B-EMAIL'
    #   '.'        -> 'B-EMAIL'
    #   'com'      -> 'B-EMAIL'
    # Possible simple workaround (not implemented): skip an entry when its
    # token index was already recorded.
    # Possible proper fix (open question): ignore predictions of sub-word
    # pieces? This may not really be a sub-word problem; needs more thought.

    for p, token_map, offsets, tokens, doc in zip(
        preds_final,
        valid_dataset["token_map"],
        valid_dataset["offset_mapping"],
        valid_dataset["tokens"],
        valid_dataset["document"],
    ):
        # De-duplication is scoped to the current document: token indices
        # restart at 0 in every document, so a (label, token index) pair seen
        # in one document must not suppress the same pair in another one.
        seen = set()

        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = label_lookup[token_pred]

            # A (0, 0) offset carries no text span; skip it.
            if start_idx + end_idx == 0:
                continue

            # The span starts on an inserted space: step onto the next character.
            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n" (skip characters that belong to whitespace-only tokens)
            # NOTE(review): if token_map[start_idx] is -1 at this point,
            # tokens[-1] (the last token) is what gets inspected.
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            # Ran past the end of the text: nothing left to map in this example.
            if start_idx >= len(token_map):
                break

            token_id = token_map[start_idx]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                key = (label_pred, token_id)

                if key not in seen:
                    seen.add(key)
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])

    return document, token, label, token_str


def make_correct_df(train: pl.DataFrame):
    """Build a ground-truth frame shaped like the prediction output.

    Args:
        train: Frame with the columns "document", "tokens" and
            "provided_labels"; "tokens" and "provided_labels" hold parallel
            sequences for each row.

    Returns:
        A DataFrame with the columns "document", "token", "label" and
        "token_str", holding one row per token whose label is not "O".
        "token" is the position of the token within its document.
    """
    per_document = zip(train["document"], train["tokens"], train["provided_labels"])
    rows = [
        (document_id, position, gold_label, text)
        for document_id, doc_tokens, doc_labels in per_document
        for position, (text, gold_label) in enumerate(zip(doc_tokens, doc_labels))
        if gold_label != "O"
    ]
    return pl.DataFrame(rows, schema=["document", "token", "label", "token_str"])


def make_correct_pred_join_df(
    train_correct_df: pl.DataFrame, valid_pred_df: pl.DataFrame
) -> pl.DataFrame:
    """
    Join ground truth and predictions so they can be compared per document.

    Ground-truth rows are first restricted to documents that occur in
    ``valid_pred_df`` and then outer-joined with the predictions on
    ("document", "token"); prediction-side columns get the suffix "_pred".
    The joined rows are returned grouped by document id (ascending).

    Note that a ground-truth document with no prediction row at all is
    removed by the initial filter and therefore does not appear in the result.

    Args:
        train_correct_df: Ground-truth rows with at least the columns
            "document" and "token".
        valid_pred_df: Prediction rows with at least the columns "document"
            and "token".

    Returns:
        The joined frame. It is empty, but keeps the joined columns, when no
        document id is present on either side.
    """
    out = train_correct_df.filter(
        pl.col("document").is_in(valid_pred_df["document"])
    ).join(valid_pred_df, on=["document", "token"], how="outer", suffix="_pred")

    # After the outer join a row carries its document id in "document"
    # (ground-truth side), in "document_pred" (prediction side) or in both.
    # Coalescing the two gives one grouping key per row, so rows that exist
    # only on the prediction side are kept as well.
    doc_key = pl.coalesce(pl.col("document"), pl.col("document_pred"))
    documents = (
        out.select(doc_key.alias("document_key"))
        .to_series()
        .drop_nulls()
        .unique()
        .sort()
        .to_list()
    )

    # pl.concat raises on an empty list; return an empty frame instead.
    if not documents:
        return out.head(0)

    joined_dfs = [out.filter(doc_key == document) for document in documents]

    return pl.concat(joined_dfs)

In [31]:
# main
# Bind the renamed split to its own name. Overwriting `train_dataset` here
# would make this cell fail when re-run, because `train_dataset["train"]` and
# the "labels" column would no longer exist.
train_labeled = train_dataset["train"].rename_column("labels", "provided_labels")
valid_dataset = data

valid_dataset = valid_dataset.map(
    tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
)

valid_preds = get_valid_preds(trainer, valid_dataset)

document, token, label, token_str = get_output_part(valid_preds, valid_dataset)

# Pass the columns by name. With a list of four lists plus a four-name schema
# polars has to infer whether the lists are rows or columns, which is
# ambiguous when there happen to be exactly four predictions.
valid_pred_df = pl.DataFrame(
    {
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str,
    }
)

train_correct_df = make_correct_df(pl.from_pandas(train_labeled.to_pandas()))

valid_correct_pred_df = make_correct_pred_join_df(train_correct_df, valid_pred_df)

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


100%|██████████| 1/1 [00:00<00:00, 1270.62it/s]


In [32]:
# Display ground truth (left columns) next to predictions ("_pred" columns)
# for the inspected document.
valid_correct_pred_df

document,token,label,token_str,document_pred,token_pred,label_pred,token_str_pred
i64,i64,str,str,i64,i64,str,str
2804,0,"""B-NAME_STUDENT…","""Maicol""",2804,0,"""B-NAME_STUDENT…","""Maicol"""
2804,1,"""I-NAME_STUDENT…","""Toro""",2804,1,"""I-NAME_STUDENT…","""Toro"""
