# 目的
モデルの出力の確認

In [101]:
# Experiment identifier; used below to name the exported logit CSV.
EXP_NAME = "exp031"
# Value of the `document` field used to pick the training example(s) to inspect.
CHECK_DOCUMENT = 5263

# Constant

In [102]:
# Directory the trained model and tokenizer are loaded from. Derived from EXP_NAME so
# the loaded checkpoint and the exported file names cannot drift apart.
TRAINED_MODEL_PATH = f"../trained_model/{EXP_NAME}"

# Directory containing train.json.
DATA_PATH = "../data"
# Presumably the base checkpoint of the experiment; not referenced by the cells
# visible in this notebook section (the model is loaded from TRAINED_MODEL_PATH).
MODEL_NAME = "microsoft/deberta-v3-base"
# Maximum number of tokens kept when tokenizing a document for inference.
INFERENCE_MAX_LENGTH = 2048

# Install
https://www.kaggle.com/competitions/benetech-making-graphs-accessible/discussion/410130#2258335

In [103]:
# %pip install polars==0.20.10
# %pip install transformers==4.37.2
# %pip install datasets==2.16.1
# %pip install evaluate==0.4.1
# %pip install seqeval==1.2.2
# %pip install accelerate
# %pip install python-dotenv
# %pip install kaggle
# %pip install wandb==0.16.3

# # formatter
# %pip install black isort

# Import

In [104]:
import evaluate
import numpy as np
import polars as pl
import torch
import datasets
from datasets import load_dataset
from tqdm.auto import tqdm
import transformers
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
)

# Check Environment

In [105]:
# Print the version of the `python` executable that the shell resolves.
# NOTE(review): this may not be the interpreter the kernel runs on — confirm if it matters.
!python --version

Python 3.12.1


In [106]:
# Show GPU status. The recorded run printed "command not found", i.e. no
# nvidia-smi binary was available on that machine.
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [107]:
# Fail fast when an installed library differs from the version this notebook was
# written against; the message reports which package is off and what was found.
assert transformers.__version__ == "4.37.2", (
    f"transformers=={transformers.__version__} found, expected 4.37.2"
)
assert datasets.__version__ == "2.16.1", (
    f"datasets=={datasets.__version__} found, expected 2.16.1"
)
assert evaluate.__version__ == "0.4.1", (
    f"evaluate=={evaluate.__version__} found, expected 0.4.1"
)

# Data Load

In [108]:
# Read the training JSON file and expose it under a "train" split.
train_json_path = f"{DATA_PATH}/train.json"
train_dataset = load_dataset("json", data_files={"train": train_json_path})

In [109]:
# Select the data to inspect: keep only rows whose `document` equals CHECK_DOCUMENT.
train_dataset = train_dataset.filter(
    lambda example: example["document"] == CHECK_DOCUMENT, num_proc=3
)
# Later cells read the first row of the filtered split, so stop here with a clear
# message if nothing matched instead of failing with an opaque error further down.
assert len(train_dataset["train"]) > 0, (
    f"document {CHECK_DOCUMENT} was not found in {DATA_PATH}/train.json"
)

Filter (num_proc=3): 100%|██████████| 6807/6807 [00:02<00:00, 2887.66 examples/s]


In [110]:
# Restore the tokenizer and the token-classification model from the same
# trained-model directory (the two loads are independent of each other).
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(TRAINED_MODEL_PATH)

In [111]:
# Sanity check of the tokenizer on a short sample containing a double space and a
# newline: encode it, then display the token strings behind the ids.
sample_text = "Waseem Mabunda  591 Smith Centers Apt. 656\nJoshuamouth, R"
tmp = tokenizer(sample_text)["input_ids"]
tokenizer.convert_ids_to_tokens(tmp)

['[CLS]',
 '▁Was',
 'eem',
 '▁Mab',
 'unda',
 '▁591',
 '▁Smith',
 '▁Centers',
 '▁Apt',
 '.',
 '▁656',
 '▁Joshua',
 'mouth',
 ',',
 '▁R',
 '[SEP]']

In [112]:
# Look up a single vocabulary id; the recorded output shows 73716 maps to '▁656'.
tokenizer.convert_ids_to_tokens([73716])

['▁656']

In [113]:
# Display the raw input ids of the sample sentence encoded earlier.
tmp

[1,
 4045,
 33829,
 42382,
 32141,
 86726,
 2430,
 13291,
 40572,
 260,
 73716,
 10917,
 15052,
 261,
 909,
 2]

# Prediction

In [114]:
def tokenize(example, tokenizer, max_length=None):
    """Rebuild a document's text from its tokens and tokenize it for inference.

    The text is reconstructed by concatenating ``example["tokens"]`` and adding a
    single space after every token whose ``example["trailing_whitespace"]`` flag is
    truthy. Alongside the text a character-level ``token_map`` is built: one entry
    per character of the reconstructed text, holding the index of the source token
    the character came from, or ``-1`` for an inserted space.

    Args:
        example: Mapping with ``"tokens"`` (sequence of str) and
            ``"trailing_whitespace"`` (sequence of flags of the same length).
        tokenizer: Callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=True,
            max_length=...)`` that returns a mapping.
        max_length: Truncation limit passed to the tokenizer. ``None`` (default)
            falls back to the module-level ``INFERENCE_MAX_LENGTH``, which is the
            value that was previously hard-coded here.

    Returns:
        dict: every field returned by the tokenizer plus ``"token_map"``.
    """
    if max_length is None:
        # Resolved at call time so the notebook-level constant stays the default.
        max_length = INFERENCE_MAX_LENGTH

    pieces = []
    token_map = []

    for token_idx, (token, has_trailing_ws) in enumerate(
        zip(example["tokens"], example["trailing_whitespace"])
    ):
        pieces.append(token)
        # One map entry per character so tokenizer character offsets can be
        # translated back to source-token indices.
        token_map.extend([token_idx] * len(token))
        if has_trailing_ws:
            pieces.append(" ")
            token_map.append(-1)  # the inserted space belongs to no source token

    tokenized = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )

    return {
        **tokenized,
        "token_map": token_map,
    }

In [115]:
# Tokenize the selected document in-process. The recorded run shows the filtered split
# holds a single row, where requesting num_proc=3 only made `datasets` emit a warning
# and fall back to one process.
train_dataset = train_dataset.map(tokenize, fn_kwargs={"tokenizer": tokenizer})

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
Map: 100%|██████████| 1/1 [00:00<00:00, 150.55 examples/s]


In [116]:
# Batch collator for token classification; sequence lengths are padded up to a
# multiple of 16.
collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    pad_to_multiple_of=16,
)

In [117]:
# The Trainer is only used for prediction in this notebook: evaluation batch size 1,
# no experiment-tracker reporting, current directory as the output directory.
args = TrainingArguments(
    output_dir=".",
    report_to="none",
    per_device_eval_batch_size=1,
)
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
)

In [118]:
# Class index -> BIO label. The position in the list is the class index, so the
# order must match the column order of the model's logits.
id2label = dict(
    enumerate(
        [
            "O",
            "B-NAME_STUDENT",
            "I-NAME_STUDENT",
            "B-EMAIL",
            "I-EMAIL",
            "B-USERNAME",
            "I-USERNAME",
            "B-ID_NUM",
            "I-ID_NUM",
            "B-PHONE_NUM",
            "I-PHONE_NUM",
            "B-URL_PERSONAL",
            "I-URL_PERSONAL",
            "B-STREET_ADDRESS",
            "I-STREET_ADDRESS",
        ]
    )
)

In [119]:
# Columns carried into prediction and inspection: the original document fields
# plus the fields added by tokenization.
inspect_columns = [
    "trailing_whitespace",
    "full_text",
    "tokens",
    "document",
    "input_ids",
    "token_type_ids",
    "attention_mask",
    "offset_mapping",
    "token_map",
]
data = train_dataset["train"].select_columns(inspect_columns)

In [120]:
# Run the model over the selected data and keep only the raw prediction array.
prediction_output = trainer.predict(data)
out = prediction_output.predictions

100%|██████████| 1/1 [00:00<00:00, 1607.01it/s]


In [121]:
# Build a per-token table of logits for the first (selected) document.
#
# The collator pads every batch to a multiple of 16, so the prediction array can have
# more rows than the document has tokens. Trim it to the real token count so that the
# two frames have the same height and no rows for padding positions end up in the table.
# NOTE(review): assumes padding is added on the right (after the real tokens) — confirm
# if the tokenizer's padding side is ever changed.
doc_tokens = tokenizer.convert_ids_to_tokens(data["input_ids"][0])
doc_logits = out[0][: len(doc_tokens)]

logit_df = pl.concat(
    [
        pl.DataFrame(doc_tokens, schema=["token"]),
        pl.DataFrame(
            doc_logits,
            # Materialize the label names as a list; a dict view is not a sequence.
            schema=list(id2label.values()),
        ),
    ],
    how="horizontal",
)

logit_df

token,O,B-NAME_STUDENT,I-NAME_STUDENT,B-EMAIL,I-EMAIL,B-USERNAME,I-USERNAME,B-ID_NUM,I-ID_NUM,B-PHONE_NUM,I-PHONE_NUM,B-URL_PERSONAL,I-URL_PERSONAL,B-STREET_ADDRESS,I-STREET_ADDRESS
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""[CLS]""",7.837938,-0.548286,-1.55466,-1.504504,-1.404879,-0.939341,-1.89493,-1.320228,-1.883841,-1.998119,-1.724666,-1.158218,-2.188859,-0.823721,-1.319778
"""▁AD""",9.941874,0.641566,-1.864723,-1.694778,-1.967698,-0.763252,-2.600089,-0.862268,-1.449782,-1.782086,-2.484938,-1.074541,-2.209324,0.174751,-2.244348
"""NOC""",9.614044,1.030278,-1.800134,-2.142652,-1.849262,-1.246743,-2.337889,-1.328322,-1.899218,-2.147902,-2.294767,-1.006342,-2.645596,-0.186768,-2.070365
"""▁Classificatio…",11.137218,-1.494807,-2.738708,-1.697794,-1.853534,-1.048887,-3.028638,-1.523829,-1.52644,-2.08411,-2.402944,-1.338592,-2.121798,-0.302402,-1.479711
""":""",11.536344,-1.562932,-2.585722,-1.710817,-1.570119,-1.288658,-2.792442,-1.272335,-1.574869,-2.271307,-2.077088,-1.273511,-2.597064,-0.687606,-1.494393
"""▁Internal""",11.280988,-1.180382,-2.693618,-2.143269,-1.755197,-1.065441,-2.647567,-1.63004,-1.807333,-2.250513,-2.298958,-1.23116,-2.513963,-0.394689,-1.443802
"""▁Design""",11.37915,-0.961222,-2.488499,-1.863,-1.603097,-0.880337,-2.734669,-1.751169,-1.672943,-2.291797,-2.518503,-0.773505,-2.67154,-0.132656,-1.456066
"""▁Thinking""",11.348434,-1.141789,-2.739595,-1.766582,-1.69374,-0.916246,-2.842485,-1.704886,-1.838226,-2.117887,-2.604051,-1.065134,-2.536335,-0.315047,-1.52093
"""▁for""",11.449734,-1.57689,-2.974465,-1.893133,-1.759711,-0.921841,-2.668398,-1.439265,-1.512708,-2.2962,-2.185898,-1.150857,-2.850394,-0.826706,-1.243927
"""▁Innovation""",11.19507,-1.166742,-2.877737,-1.815944,-1.615722,-1.00072,-3.014704,-1.643574,-1.732364,-2.086534,-2.320774,-1.159421,-2.436386,-0.185145,-1.533602


In [122]:
# Persist the logit table; the file name encodes the experiment and the document id.
logit_csv_path = f"output/{EXP_NAME}_document{CHECK_DOCUMENT}_logit.csv"
logit_df.write_csv(logit_csv_path)

In [123]:
# predictions = trainer.predict(data).predictions
# pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(
#     predictions.shape[0], predictions.shape[1], 1
# )
# preds_final = predictions.argmax(-1)
# preds_final

In [124]:
# triplets = []
# document, token, label, token_str = [], [], [], []
# for p, token_map, offsets, tokens, doc in zip(
#     preds_final,
#     data["token_map"],
#     data["offset_mapping"],
#     data["tokens"],
#     data["document"],
# ):
#     for token_pred, (start_idx, end_idx) in zip(p, offsets):
#         label_pred = id2label[token_pred]

#         if start_idx + end_idx == 0:
#             continue

#         if token_map[start_idx] == -1:
#             start_idx += 1

#         # ignore "\n\n"
#         while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
#             start_idx += 1

#         if start_idx >= len(token_map):
#             break

#         token_id = token_map[start_idx]

#         # ignore "O" predictions and whitespace preds
#         if label_pred != "O" and token_id != -1:
#             triplet = (label_pred, token_id, tokens[token_id])

#             if triplet not in triplets:
#                 document.append(doc)
#                 token.append(token_id)
#                 label.append(label_pred)
#                 token_str.append(tokens[token_id])
#                 triplets.append(triplet)

In [125]:
# output_df = pl.DataFrame(
#     [document, token, label, token_str],
#     schema=["document", "token", "label", "token_str"],
# )

# output_df

In [126]:
# predictions = trainer.predict(test_dataset).predictions
# pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(
#     predictions.shape[0], predictions.shape[1], 1
# )
# preds_final = predictions.argmax(-1)