# 目的
モデルの出力の確認

In [69]:
# Experiment identifier; used as the prefix of the exported logit CSV file name.
EXP_NAME = "exp051"
# Value of the "document" field used to filter the training data down to the
# example(s) whose model output is inspected in this notebook.
CHECK_DOCUMENT = 9760

# Constant

In [70]:
# Directory from which both the trained model and its tokenizer are loaded.
TRAINED_MODEL_PATH = "pll_data_detection/trained_models/e050-add-epoch"

# Directory that contains train.json.
DATA_PATH = "pll_data_detection/data"
# NOTE(review): not referenced by any visible cell of this notebook -- the
# model and tokenizer are loaded from TRAINED_MODEL_PATH instead.
MODEL_NAME = "microsoft/deberta-v3-large"

# Install
https://www.kaggle.com/competitions/benetech-making-graphs-accessible/discussion/410130#2258335

In [71]:
# %pip install polars==0.20.10
# %pip install transformers==4.37.2
# %pip install datasets==2.16.1
# %pip install evaluate==0.4.1
# %pip install seqeval==1.2.2
# %pip install accelerate
# %pip install python-dotenv
# %pip install kaggle
# %pip install wandb==0.16.3

# # formatter
# %pip install black isort

# Import

In [72]:
import evaluate
import numpy as np
import polars as pl
import torch
import datasets
from datasets import load_dataset
from tqdm.auto import tqdm
import transformers
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
)

In [73]:
# Seed the same seed to all
import random
import os

# Single seed shared by every random number generator seeded below.
SEED = 42


def seed_everything(seed: int) -> None:
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value applied to ``random``, ``numpy.random``, ``torch`` (CPU and
            every CUDA device) and exported as ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so exporting it here only
    # influences processes spawned after this call, not the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes convolution kernels and may pick different
    # algorithms between runs, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

# Check Environment

In [74]:
!python --version

Python 3.9.16


In [75]:
!nvidia-smi

Wed Mar 20 05:16:33 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:00:05.0 Off |                  Off |
| 30%   39C    P8    23W / 300W |   4565MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [76]:
# Fail fast when an installed library differs from the version pinned in the
# Install section; each message reports the version that was actually found.
assert (
    transformers.__version__ == "4.37.2"
), f"transformers==4.37.2 required, found {transformers.__version__}"
assert (
    datasets.__version__ == "2.16.1"
), f"datasets==2.16.1 required, found {datasets.__version__}"
assert (
    evaluate.__version__ == "0.4.1"
), f"evaluate==0.4.1 required, found {evaluate.__version__}"

# Data Load

In [77]:
# Read train.json from DATA_PATH with the "json" loader and register it under
# the "train" split name (later cells access it as train_dataset["train"]).
train_dataset = load_dataset("json", data_files={"train": f"{DATA_PATH}/train.json"})

In [78]:
# Select the data to inspect here: keep only the examples whose "document"
# field equals CHECK_DOCUMENT.
train_dataset = train_dataset.filter(
    lambda example: example["document"] == CHECK_DOCUMENT, num_proc=3
)

Filter (num_proc=3):   0%|          | 0/6807 [00:00<?, ? examples/s]

In [79]:
# Load the trained token-classification model and its tokenizer from the same
# directory (TRAINED_MODEL_PATH).
model = AutoModelForTokenClassification.from_pretrained(TRAINED_MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Prediction

In [80]:
def tokenize(example, tokenizer):
    """Rebuild the document text from its tokens and tokenize it.

    The text is reconstructed by concatenating ``example["tokens"]`` and
    inserting a single space after every token whose matching
    ``example["trailing_whitespace"]`` flag is truthy.

    Args:
        example: Mapping with parallel ``"tokens"`` and
            ``"trailing_whitespace"`` sequences.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=False)``
            whose result can be unpacked with ``**``.

    Returns:
        A dict holding everything the tokenizer returned plus ``"token_map"``:
        one entry per character of the rebuilt text, giving the index of the
        source token that character came from, or ``-1`` for an inserted space.
    """
    pieces = []
    char_to_token = []

    pairs = zip(example["tokens"], example["trailing_whitespace"])
    for token_index, (token_text, has_trailing_space) in enumerate(pairs):
        pieces.append(token_text)
        # Every character of this token points back to the token's index.
        char_to_token += [token_index] * len(token_text)
        if has_trailing_space:
            pieces.append(" ")
            # Inserted spaces belong to no source token.
            char_to_token.append(-1)

    rebuilt_text = "".join(pieces)
    encoded = tokenizer(rebuilt_text, return_offsets_mapping=True, truncation=False)

    return {**encoded, "token_map": char_to_token}

In [81]:
# Apply tokenize() to every remaining example, adding the tokenizer outputs and
# the "token_map" column. The recorded cell output shows num_proc being reduced
# to 1 because the filtered dataset held a single example.
train_dataset = train_dataset.map(
    tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
)

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [82]:
# Batch collator for token classification; pad_to_multiple_of=16 requests
# padded sequence lengths that are a multiple of 16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [83]:
# The Trainer is used purely as an inference wrapper in this notebook: no
# train/eval dataset is passed and only trainer.predict() is called later.
# "." is the output directory, prediction runs with one example per device
# batch, and report_to="none" turns off experiment-tracker reporting.
args = TrainingArguments(
    ".",
    per_device_eval_batch_size=1,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
)

In [84]:
# Class index -> BIO label name. The position of each label in the sequence
# below is its class index (0 = "O", 1 = "B-NAME_STUDENT", ...).
# NOTE(review): presumably this has to match the label order the model was
# trained with -- confirm against the training notebook / model config.
id2label = dict(
    enumerate(
        (
            "O",
            "B-NAME_STUDENT",
            "I-NAME_STUDENT",
            "B-EMAIL",
            "I-EMAIL",
            "B-USERNAME",
            "I-USERNAME",
            "B-ID_NUM",
            "I-ID_NUM",
            "B-PHONE_NUM",
            "I-PHONE_NUM",
            "B-URL_PERSONAL",
            "I-URL_PERSONAL",
            "B-STREET_ADDRESS",
            "I-STREET_ADDRESS",
        )
    )
)

In [85]:
# Restrict the "train" split to the columns listed below; this is the dataset
# handed to trainer.predict() in the next cell.
data = train_dataset["train"].select_columns(
    [
        "trailing_whitespace",
        "full_text",
        "tokens",
        "document",
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "offset_mapping",
        "token_map",
    ]
)

In [86]:
# Run inference on the selected data and keep only the raw predictions; they
# are indexed per example (out[0]) when building the logit table below.
out = trainer.predict(data).predictions

In [87]:
# Build one row per input token of the first example: the token string followed
# by the model's raw logit for every label in id2label.
doc_input_ids = data["input_ids"][0]
n_tokens = len(doc_input_ids)

# The collator pads each batch to a multiple of 16, so the prediction array can
# hold more positions than there are real tokens. Keep only the positions that
# correspond to real tokens so both frames have the same height.
if tokenizer.padding_side == "right":
    doc_logits = out[0][:n_tokens]
else:
    doc_logits = out[0][-n_tokens:]

logit_df = pl.concat(
    [
        pl.DataFrame(tokenizer.convert_ids_to_tokens(doc_input_ids), schema=["token"]),
        pl.DataFrame(
            doc_logits,
            # Pass a concrete list of column names rather than a dict view.
            schema=list(id2label.values()),
        ),
    ],
    how="horizontal",
)

logit_df

token,O,B-NAME_STUDENT,I-NAME_STUDENT,B-EMAIL,I-EMAIL,B-USERNAME,I-USERNAME,B-ID_NUM,I-ID_NUM,B-PHONE_NUM,I-PHONE_NUM,B-URL_PERSONAL,I-URL_PERSONAL,B-STREET_ADDRESS,I-STREET_ADDRESS
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""[CLS]""",11.04885,0.44001,-1.186121,-2.170916,-1.243217,-0.651233,-1.235379,-0.202882,-2.279014,-1.469696,-1.892465,-1.019939,-1.869305,0.30078,-0.61287
"""▁Name""",12.910772,1.136533,0.160565,-2.020278,-2.014395,-0.641895,-2.440061,-0.790824,-1.62418,-1.148059,-2.24225,-1.149885,-1.48836,0.211409,-0.698627
""":""",12.595451,0.43406,0.208871,-2.10352,-2.406795,-0.854782,-2.367032,-0.579656,-1.830994,-0.894282,-2.427516,-0.793023,-1.752013,-0.199894,-1.316524
"""▁Bk""",3.793081,6.185917,1.826345,-2.511793,-1.729593,-0.944009,-1.917012,0.137435,-1.39509,-1.234843,-2.060781,-1.602226,-1.492442,0.683547,-1.916264
"""▁Bk""",4.530054,5.017063,4.221374,-1.729772,-2.122084,-0.287324,-2.03901,0.354104,-0.750184,-1.782859,-2.639889,-1.965319,-1.582808,1.232731,-1.147424
"""▁Roll""",13.236468,0.568029,-0.225706,-2.328736,-1.927059,-0.16618,-2.818129,-0.421243,-1.623899,-1.235324,-1.725881,-1.484768,-0.855491,0.977858,-0.991793
"""▁No""",13.744381,-0.716081,-0.260859,-2.035801,-2.255582,0.093179,-2.427076,-0.408077,-1.756567,-0.903704,-2.327629,-1.593311,-1.252056,0.441623,-1.170448
""":""",12.443534,-0.425047,-0.384406,-1.870055,-2.519083,-0.855654,-2.010065,0.250098,-1.255611,-0.114158,-2.719435,-1.286022,-1.647909,-0.020091,-1.364368
"""▁172""",1.961098,0.883388,-0.881172,-0.607576,-0.751658,0.45317,-1.637036,9.831906,-0.402555,-0.265344,-2.197389,-1.599271,-1.228016,-0.617258,-1.866018
"""801""",1.959107,0.755942,-0.316336,-0.648564,-0.852651,0.548146,-1.889917,9.638918,0.151647,-0.188268,-1.765705,-1.545599,-1.772728,-0.43244,-1.26797


In [88]:
# Export the per-token logit table to CSV, named after the experiment and the
# inspected document. The commented-out variant writes under output/ (marked
# for VS Code); the active line writes to the current working directory
# (marked for JupyterLab).
# logit_df.write_csv(f"output/{EXP_NAME}_document{CHECK_DOCUMENT}_logit.csv") # vscode
logit_df.write_csv(f"{EXP_NAME}_document{CHECK_DOCUMENT}_logit.csv")  # jupyterlab

In [89]:
# predictions = trainer.predict(data).predictions
# pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(
#     predictions.shape[0], predictions.shape[1], 1
# )
# preds_final = predictions.argmax(-1)
# preds_final

In [90]:
# triplets = []
# document, token, label, token_str = [], [], [], []
# for p, token_map, offsets, tokens, doc in zip(
#     preds_final,
#     data["token_map"],
#     data["offset_mapping"],
#     data["tokens"],
#     data["document"],
# ):
#     for token_pred, (start_idx, end_idx) in zip(p, offsets):
#         label_pred = id2label[token_pred]

#         if start_idx + end_idx == 0:
#             continue

#         if token_map[start_idx] == -1:
#             start_idx += 1

#         # ignore "\n\n"
#         while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
#             start_idx += 1

#         if start_idx >= len(token_map):
#             break

#         token_id = token_map[start_idx]

#         # ignore "O" predictions and whitespace preds
#         if label_pred != "O" and token_id != -1:
#             triplet = (label_pred, token_id, tokens[token_id])

#             if triplet not in triplets:
#                 document.append(doc)
#                 token.append(token_id)
#                 label.append(label_pred)
#                 token_str.append(tokens[token_id])
#                 triplets.append(triplet)

In [91]:
# output_df = pl.DataFrame(
#     [document, token, label, token_str],
#     schema=["document", "token", "label", "token_str"],
# )

# output_df

In [92]:
# predictions = trainer.predict(test_dataset).predictions
# pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(
#     predictions.shape[0], predictions.shape[1], 1
# )
# preds_final = predictions.argmax(-1)