# 目的
モデルの出力の確認

In [244]:
# Document id to inspect; a later cell filters the training data on this value.
CHECK_DOCUMENT = 2804

# Constant

In [245]:
# Directory passed to from_pretrained() for both the model and the tokenizer.
TRAINED_MODEL_PATH = "../trained_model/exp024"

# Directory that contains train.json.
DATA_PATH = "../data"
# NOTE(review): not referenced in the visible cells; presumably the base
# checkpoint that exp024 was fine-tuned from - confirm or remove.
MODEL_NAME = "microsoft/deberta-v3-base"
# Truncation length (max_length) used when tokenizing documents for inference.
INFERENCE_MAX_LENGTH = 2048

# Install
https://www.kaggle.com/competitions/benetech-making-graphs-accessible/discussion/410130#2258335

In [246]:
# %pip install polars==0.20.10
# %pip install transformers==4.37.2
# %pip install datasets==2.16.1
# %pip install evaluate==0.4.1
# %pip install seqeval==1.2.2
# %pip install accelerate
# %pip install python-dotenv
# %pip install kaggle
# %pip install wandb==0.16.3

# # formatter
# %pip install black isort

# Import

In [247]:
import evaluate
import numpy as np
import polars as pl
import torch
import datasets
from datasets import load_dataset
from tqdm.auto import tqdm
import transformers
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
)

# Check Environment

In [248]:
# Record the version of the `python` executable found on the shell PATH.
# NOTE(review): this may differ from the kernel's interpreter - confirm if it matters.
!python --version

Python 3.12.1


In [249]:
# Show GPU status. The shell reports "command not found" on machines where
# the nvidia-smi tool is not installed (as in the saved output below).
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [250]:
# Fail fast when the installed libraries differ from the versions this
# notebook was written against, and say which package/version is wrong.
assert transformers.__version__ == "4.37.2", (
    f"transformers=={transformers.__version__}, expected 4.37.2"
)
assert datasets.__version__ == "2.16.1", (
    f"datasets=={datasets.__version__}, expected 2.16.1"
)
assert evaluate.__version__ == "0.4.1", (
    f"evaluate=={evaluate.__version__}, expected 0.4.1"
)

# Data Load

In [251]:
# Load train.json as a DatasetDict with a single "train" split.
train_files = {"train": f"{DATA_PATH}/train.json"}
train_dataset = load_dataset("json", data_files=train_files)

In [252]:
# 確認したいデータをここで選択する
train_dataset = train_dataset.filter(
    lambda example: example["document"] == CHECK_DOCUMENT, num_proc=3
)

In [253]:
# Load the tokenizer and the token-classification model from the same
# trained-model directory so that they stay in sync.
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(TRAINED_MODEL_PATH)

In [254]:
# Sanity check: tokenize a short sample and display the resulting sub-word
# tokens. `tmp` keeps the raw input ids so a later cell can display them.
sample_text = "Waseem Mabunda  591 Smith Centers Apt. 656\nJoshuamouth, R"
tmp = tokenizer(sample_text)["input_ids"]
tokenizer.convert_ids_to_tokens(tmp)

['[CLS]',
 '▁Was',
 'eem',
 '▁Mab',
 'unda',
 '▁591',
 '▁Smith',
 '▁Centers',
 '▁Apt',
 '.',
 '▁656',
 '▁Joshua',
 'mouth',
 ',',
 '▁R',
 '[SEP]']

In [255]:
# Decode one id taken from the sample encoding back to its token string.
tokenizer.convert_ids_to_tokens([73716])

['▁656']

In [256]:
# Display the raw input ids of the sample text.
tmp

[1,
 4045,
 33829,
 42382,
 32141,
 86726,
 2430,
 13291,
 40572,
 260,
 73716,
 10917,
 15052,
 261,
 909,
 2]

# Prediction

In [257]:
def tokenize(example, tokenizer, max_length=None):
    """Rebuild a document's text from its word tokens and tokenize it.

    Args:
        example: Mapping with ``"tokens"`` (the token strings) and
            ``"trailing_whitespace"`` (one flag per token; a truthy flag
            means a single space is inserted after that token).
        tokenizer: Callable tokenizer. It is invoked with the rebuilt text
            and ``return_offsets_mapping=True``, ``truncation=True`` and
            ``max_length``.
        max_length: Truncation length handed to the tokenizer. ``None``
            (the default) falls back to the notebook-level
            ``INFERENCE_MAX_LENGTH``, which keeps existing callers working.

    Returns:
        dict: every key produced by the tokenizer plus ``"token_map"``, a
        list with one entry per character of the rebuilt text holding the
        index of the source token, or ``-1`` for an inserted space.
    """
    if max_length is None:
        max_length = INFERENCE_MAX_LENGTH

    text = []
    token_map = []
    for idx, (t, ws) in enumerate(
        zip(example["tokens"], example["trailing_whitespace"])
    ):
        text.append(t)
        # One map entry per character so tokenizer offsets can be used as
        # direct indices into token_map.
        token_map.extend([idx] * len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)

    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )

    return {
        **tokenized,
        "token_map": token_map,
    }

In [258]:
# Add the tokenizer outputs and the character-level token_map to every row.
train_dataset = train_dataset.map(
    tokenize,
    fn_kwargs=dict(tokenizer=tokenizer),
    num_proc=3,
)

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


In [259]:
# Collator for batching; sequence lengths are padded up to a multiple of 16.
collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    pad_to_multiple_of=16,
)

In [260]:
# Trainer used only for prediction: batch size 1 per device, no experiment
# reporting, output directory set to the current directory.
args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=1,
    report_to="none",
)
trainer = Trainer(
    tokenizer=tokenizer,
    data_collator=collator,
    args=args,
    model=model,
)

In [261]:
# Class index -> BIO label name, in index order.
# NOTE(review): presumably this mirrors the label order used at training
# time - verify against model.config.id2label.
label_names = (
    "O",
    "B-NAME_STUDENT",
    "I-NAME_STUDENT",
    "B-EMAIL",
    "I-EMAIL",
    "B-USERNAME",
    "I-USERNAME",
    "B-ID_NUM",
    "I-ID_NUM",
    "B-PHONE_NUM",
    "I-PHONE_NUM",
    "B-URL_PERSONAL",
    "I-URL_PERSONAL",
    "B-STREET_ADDRESS",
    "I-STREET_ADDRESS",
)
id2label = dict(enumerate(label_names))

In [262]:
# Keep the model inputs plus the columns needed to map predictions back to
# the original tokens.
kept_columns = [
    "trailing_whitespace",
    "full_text",
    "tokens",
    "document",
    "input_ids",
    "token_type_ids",
    "attention_mask",
    "offset_mapping",
    "token_map",
]
data = train_dataset["train"].select_columns(kept_columns)

In [263]:
# Run the model over the selected rows and keep the raw `.predictions`;
# the next cell indexes it per example to build the per-token table.
out = trainer.predict(data).predictions

100%|██████████| 1/1 [00:00<00:00, 419.98it/s]


In [264]:
# Side-by-side table for the first example: one row per sub-word token with
# the model's raw output for every label.
first_doc_tokens = tokenizer.convert_ids_to_tokens(data["input_ids"][0])
token_column = pl.DataFrame(first_doc_tokens, schema=["token"])
label_columns = pl.DataFrame(out[0], schema=id2label.values())
logit_df = pl.concat([token_column, label_columns], how="horizontal")

logit_df

token,O,B-NAME_STUDENT,I-NAME_STUDENT,B-EMAIL,I-EMAIL,B-USERNAME,I-USERNAME,B-ID_NUM,I-ID_NUM,B-PHONE_NUM,I-PHONE_NUM,B-URL_PERSONAL,I-URL_PERSONAL,B-STREET_ADDRESS,I-STREET_ADDRESS
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""[CLS]""",9.436209,-0.100948,-1.020993,-2.01943,-1.73673,-2.230707,-1.54927,-1.93825,-0.465267,-1.192132,-0.539012,-1.623414,-1.103013,-0.627131,-0.784668
"""▁Mai""",1.159454,7.568308,-0.312837,-0.647191,-1.243123,-0.197258,-0.902603,-1.755704,-1.706048,-0.974412,-2.088173,-1.941469,-1.706681,0.916169,-1.287308
"""col""",1.124249,7.813073,0.161078,-1.033913,-1.593857,-0.108133,-1.001023,-2.145751,-1.622348,-0.38215,-2.040435,-1.789871,-1.318204,0.69517,-1.331388
"""▁Toro""",1.476706,0.501315,7.963993,-1.452245,-1.236041,-0.751044,-1.262135,-0.390905,-1.455048,-0.854642,-0.808904,-1.440665,-1.596767,-0.700916,-1.009426
"""▁Reflection""",10.714262,0.127162,0.151683,-2.917443,-2.397041,-2.512652,-1.946224,-2.039197,-1.424484,-1.595968,-1.83955,-1.58248,-2.217337,0.820619,-1.151044
"""▁-""",11.057509,0.269149,-0.186961,-2.667807,-2.388384,-2.512439,-1.699139,-1.761003,-1.495107,-1.04516,-1.229051,-1.353144,-2.137986,0.246086,-0.698996
"""▁Storytelling""",11.226655,1.516809,-0.673215,-2.86155,-2.566422,-2.526584,-2.031832,-2.518571,-1.507164,-1.303853,-1.928729,-1.723491,-2.132847,1.045223,-1.245594
"""▁Challenge""",11.906203,0.701263,-0.534744,-2.932057,-2.603152,-2.458041,-1.99006,-2.591805,-1.649032,-1.587592,-1.638152,-1.668367,-2.095124,0.7547,-1.080184
"""▁&""",11.978357,0.014843,-0.987001,-2.626772,-2.20437,-2.472258,-1.84455,-2.316189,-1.093384,-1.519852,-1.212774,-1.69384,-2.248641,0.506151,-0.736924
"""▁Selection""",11.902819,0.321159,-0.806204,-2.810894,-2.324125,-2.525705,-1.868189,-2.424431,-1.260944,-1.635601,-1.693804,-1.818183,-2.221089,0.536773,-0.751121


In [265]:
# logit_df.write_csv(f"output/exp024_document{CHECK_DOCUMENT}_logit.csv")

In [267]:
predictions = trainer.predict(data).predictions
# Numerically stable softmax over the last (label) axis: subtracting the
# per-token maximum before exponentiating prevents overflow to inf/NaN for
# large logits and leaves the resulting probabilities unchanged.
shifted_logits = predictions - predictions.max(axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
pred_softmax = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
# Predicted class index per sub-word token.
preds_final = predictions.argmax(-1)
preds_final

100%|██████████| 1/1 [00:00<00:00, 1442.83it/s]


array([[0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [268]:
# Map sub-word predictions back to the original word tokens and collect one
# row per (document, token index, label).
triplets = []
# Dedup key includes the document: the same (label, token index) found in a
# different document is a distinct prediction and must not be dropped.
seen = set()
document, token, label, token_str = [], [], [], []
for p, token_map, offsets, tokens, doc in zip(
    preds_final,
    data["token_map"],
    data["offset_mapping"],
    data["tokens"],
    data["document"],
):
    # zip() stops at the shorter sequence, so any predictions beyond the
    # last offset entry are ignored.
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[int(token_pred)]

        # Skip entries whose offsets are (0, 0) - presumably special tokens
        # such as [CLS]/[SEP].
        if start_idx + end_idx == 0:
            continue

        # The sub-word starts on an inserted space: step onto the next character.
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            key = (doc, label_pred, token_id)

            if key not in seen:
                seen.add(key)
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append((label_pred, token_id, tokens[token_id]))

In [269]:
# Build the result table from explicit named columns. A list of lists plus
# `schema` leaves row/column orientation to polars' inference, which can
# misread the data (for example when no predictions were collected).
output_df = pl.DataFrame(
    {
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str,
    }
)

output_df

document,token,label,token_str
i64,i64,str,str
2804,0,"""B-NAME_STUDENT…","""Maicol"""
2804,1,"""I-NAME_STUDENT…","""Toro"""


In [266]:
# NOTE(review): `test_dataset` is not defined in any visible cell, and this
# cell's execution count (266) is out of order with its neighbours - it looks
# like a leftover from an earlier session. Define `test_dataset` before
# running it, or remove the cell.
predictions = trainer.predict(test_dataset).predictions
# Numerically stable softmax over the last (label) axis: subtracting the
# per-token maximum before exponentiating prevents overflow to inf/NaN.
shifted_logits = predictions - predictions.max(axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
pred_softmax = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
preds_final = predictions.argmax(-1)

100%|██████████| 10/10 [00:08<00:00,  1.19it/s]
