# 目的
- 推論時のtokenizeと学習時のtokenizeのやり方を合わせる

In [1]:
EXP_NAME = "e04-match-tokenize"
MODEL_NAME = "microsoft/deberta-v3-base"
DATA_PATH = "pll_data_detection/data"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME}"
LOG_PATH = f"pll_data_detection/log/{EXP_NAME}"
MODEL_OUTPUT_PATH = f"pll_data_detection/trained_models/{EXP_NAME}"
DEBUG = True
UPLOAD_DATA = False
TRAINING_MAX_LENGTH = 1024  # NOTE: ほとんどOなのに、後半のtokenに正解があると、truncationで落ちてしまって勿体無い

# Install

In [2]:
%pip install polars
%pip install transformers==4.37.2
%pip install datasets==2.16.1
%pip install evaluate==0.4.1
%pip install seqeval==1.2.2
%pip install accelerate
%pip install python-dotenv
%pip install kaggle
%pip iinstall wandb==0.16.3

# formatter
%pip install black isort

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
ERROR: unknown command "iinstall" - maybe you meant "install"
Note: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


# Import

In [3]:
import os

import evaluate
import numpy as np
import polars as pl
import torch
import wandb
from datasets import DatasetDict, load_dataset
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm.auto import tqdm
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import transformers

assert transformers.__version__ == "4.37.2"

In [5]:
import datasets

assert datasets.__version__ == "2.16.1"

In [6]:
import evaluate

assert evaluate.__version__ == "0.4.1"

# Wandb

In [7]:
from dotenv import load_dotenv

if not DEBUG:
    load_dotenv("pll_data_detection/.env")
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project="pll", name=EXP_NAME)
    REPORT_TO = "wandb"
else:
    REPORT_TO = "none"

REPORT_TO

'none'

# Check Environment

In [8]:
!python --version

Python 3.9.16


In [9]:
!nvidia-smi

Sun Feb 25 02:00:05 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:00:05.0 Off |                  Off |
| 30%   40C    P8    33W / 300W |      3MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Data Load

In [10]:
# データ確認用
train = pl.read_json(f"{DATA_PATH}/train.json")
train.head()

document,full_text,tokens,trailing_whitespace,labels
i64,str,list[str],list[bool],list[str]
7,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
10,"""Diego Estrada …","[""Diego"", ""Estrada"", … "" ""]","[true, false, … false]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]"
16,"""Reporting proc…","[""Reporting"", ""process"", … "" ""]","[true, false, … false]","[""O"", ""O"", … ""O""]"
20,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
56,"""Assignment: V…","[""Assignment"", "":"", … "" ""]","[false, false, … false]","[""O"", ""O"", … ""O""]"


In [11]:
# データ確認用
test = pl.read_json(f"{DATA_PATH}/test.json")

In [12]:
train_dataset = load_dataset(
    "json", data_files={"train": f"{DATA_PATH}/train.json"}, split="train"
).rename_column("labels", "provided_labels")

test_dataset = load_dataset(
    "json", data_files={"test": f"{DATA_PATH}/test.json"}, split="test"
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [13]:
# debug
if DEBUG:
    train_dataset = train_dataset.select(range(100))

In [14]:
train_dataset

Dataset({
    features: ['full_text', 'provided_labels', 'document', 'tokens', 'trailing_whitespace'],
    num_rows: 100
})

In [15]:
test_dataset

Dataset({
    features: ['tokens', 'full_text', 'document', 'trailing_whitespace'],
    num_rows: 10
})

In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [17]:
# TODO: 512?が上限であるのに対し、NER対象のテキストが512よりも長い場合が多いため、適したモデルにする
# DeBERTaあたりは、512を超えるテキストに対応していたような
# example = train_dataset[30]
example = train_dataset[0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 '▁Design',
 '▁Thinking',
 '▁for',
 '▁innovation',
 '▁reflex',
 'ion',
 '▁-',
 '▁Avril',
 '▁2021',
 '▁-',
 '▁Nathalie',
 '▁S',
 'ylla',
 '▁Challenge',
 '▁&',
 '▁selection',
 '▁The',
 '▁tool',
 '▁I',
 '▁use',
 '▁to',
 '▁help',
 '▁all',
 '▁stakeholders',
 '▁finding',
 '▁their',
 '▁way',
 '▁through',
 '▁the',
 '▁complexity',
 '▁of',
 '▁a',
 '▁project',
 '▁is',
 '▁the',
 '▁mind',
 '▁map',
 '▁.',
 '▁What',
 '▁exactly',
 '▁is',
 '▁a',
 '▁mind',
 '▁map',
 '▁?',
 '▁According',
 '▁to',
 '▁the',
 '▁definition',
 '▁of',
 '▁Buz',
 'an',
 '▁T',
 '.',
 '▁and',
 '▁Buz',
 'an',
 '▁B',
 '.',
 '▁(',
 '▁1999',
 '▁,',
 '▁Des',
 's',
 'ine',
 '▁-',
 '▁moi',
 '▁l',
 "'",
 'intelligence',
 '▁.',
 '▁Paris',
 '▁:',
 '▁Les',
 '▁É',
 'dition',
 's',
 '▁d',
 "'",
 'Organ',
 'isation',
 '▁.',
 '▁)',
 '▁,',
 '▁the',
 '▁mind',
 '▁map',
 '▁(',
 '▁or',
 '▁heuristic',
 '▁diagram',
 '▁)',
 '▁is',
 '▁a',
 '▁graphic',
 '▁representation',
 '▁technique',
 '▁that',
 '▁follows',
 '▁the',
 '▁natural',
 '▁functioning'

In [21]:
text = "this is kinlearn@gmail.com."
tokenized_input = tokenizer(text)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 '▁this',
 '▁is',
 '▁kin',
 'learn',
 '@',
 'gmail',
 '.',
 'com',
 '.',
 '[SEP]']

In [18]:
# labelを変換する
id2label = {
    0: "O",
    1: "B-NAME_STUDENT",
    2: "I-NAME_STUDENT",
    3: "B-EMAIL",
    4: "I-EMAIL",
    5: "B-USERNAME",
    6: "I-USERNAME",
    7: "B-ID_NUM",
    8: "I-ID_NUM",
    9: "B-PHONE_NUM",
    10: "I-PHONE_NUM",
    11: "B-URL_PERSONAL",
    12: "I-URL_PERSONAL",
    13: "B-STREET_ADDRESS",
    14: "I-STREET_ADDRESS",
}

label2id = {v: k for k, v in id2label.items()}


# def label2id_func(example):
#     example["labels"] = [label2id[tag] for tag in example["labels"]]
#     return example


# labele2id_train_dataset = train_dataset.map(label2id_func)

In [19]:
def tokenize(example, tokenizer, label2id, max_length):
    """
    与えられたtokenとlabelから、
    今回のtokenizerで区切った場合のtokenとlabelを作成する。
    """
    # rebuild text from tokens
    text = []
    labels = []

    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        text.append(t)
        # 文字数分だけ、該当のラベルを追加する
        labels.extend([l] * len(t))

        if ws:
            text.append(" ")
            labels.append("O")
        # text -> ['Design', ' ']
        # labels -> ['O', 'O', 'O', 'O', 'O', 'O', 'O'] (6文字分 + 空白1文字分)

    # actual tokenization
    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        max_length=max_length,
        truncation=True,
    )

    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # offset_mappingの各エントリは、トークンが元のテキストのどの範囲（開始位置と終了位置）にマッピングされるかを示す
        # タプルまたはリストで構成されます。

        # CLS tokenの対応
        # CLSやSEPには必ず(start_idx, end_idx) = (0, 0)が割り当てられる
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # 空白が存在する時は、offset_mappingのstart_idxを+1する
        # DeBERTaV2Tokenizerは、空白を文字の先頭に▁としてくっつけるため。
        # NOTE: もし空白を▁として文字の先頭にくっつけるないtokenizerの場合は、不要
        if text[start_idx].isspace():
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    # Q: token_labelsは何の長さ？
    # A: 今回のtokenizerで区切った時の、tokenに該当するlabel
    # 例:
    # 与えられたtoken example["tokens"][:10]
    # -> ['Design', 'Thinking', 'for', 'innovation', 'reflexion', '-', 'Avril', '2021', '-', 'Nathalie']
    # 今回のtokenizerで区切ったtoken　tokenizer.convert_ids_to_tokens(tokenized.input_ids[:10])
    # -> ['[CLS]', '▁Design', '▁Thinking', '▁for', '▁innovation', '▁reflex', 'ion', '-', 'Av', 'ril']
    # 文字数が違う！！！
    # 最初に与えられたtokenとそのlabelだと、今回のtokenizerで区切った場合のラベルが分からない。
    # 今回のtokenizerで区切った場合のtokenとラベルを作成した。

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}


train_dataset = train_dataset.map(
    tokenize,
    fn_kwargs={
        "tokenizer": tokenizer,
        "label2id": label2id,
        "max_length": TRAINING_MAX_LENGTH,
    },
    num_proc=3,
)

Map (num_proc=3):   0%|          | 0/6807 [00:00<?, ? examples/s]

In [20]:
x = train_dataset[2]

for t, l in zip(x["tokens"], x["provided_labels"]):
    if l != "O":
        print((t, l))

print("*" * 100)

for t, l in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    if id2label[l] != "O":
        print((t, id2label[l]))

('Gilberto', 'B-NAME_STUDENT')
('Gamboa', 'I-NAME_STUDENT')
****************************************************************************************************
('▁Gilberto', 'B-NAME_STUDENT')
('▁Gamb', 'I-NAME_STUDENT')
('oa', 'I-NAME_STUDENT')


In [21]:
label_list = list(label2id.keys())
label_list

['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-EMAIL',
 'I-EMAIL',
 'B-USERNAME',
 'I-USERNAME',
 'B-ID_NUM',
 'I-ID_NUM',
 'B-PHONE_NUM',
 'I-PHONE_NUM',
 'B-URL_PERSONAL',
 'I-URL_PERSONAL',
 'B-STREET_ADDRESS',
 'I-STREET_ADDRESS']

In [22]:
len(label_list)

15

In [23]:
# def tokenize_and_align_labels(examples):
#     """
#     tokenizeした文字列をinput_idsとして追加する
#     labelsには、special tokenが-100として扱われたデータが入る
#     """
#     tokenized_inputs = tokenizer(
#         examples["tokens"],
#         truncation=True,
#         is_split_into_words=True,
#         max_length=TRAINING_MAX_LENGTH,
#     )

#     labels = []
#     for i, label in enumerate(examples["labels"]):
#         # トークン化されたシーケンス内の各トークンが元のテキスト内のどの単語に対応するかを示すIDを提供します。
#         # 例えば、元のテキストが "Hello, world!" で、トークン化されたシーケンスが ["Hello", ",", "world", "!"] の場合
#         # `word_ids`メソッドは [0, None, 1, None] を返します。
#         # これは、"Hello" が最初の単語（インデックス0）、"," が単語に属さない（None）、"world" が2番目の単語（インデックス1）、"!" が単語に属さない（None）ことを示します。
#         word_ids = tokenized_inputs.word_ids(batch_index=i)
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif (
#                 word_idx != previous_word_idx
#             ):  # subwordの場合、同じword_idxが連続している。最初のtoken以外は-100にする
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs


# tokenized_train_dataset = labele2id_train_dataset.map(
#     tokenize_and_align_labels, batched=True
# )

In [24]:
# tokenized_train_dataset

In [25]:
# pad_to_multiple_of
# paddingの際に、指定した数の倍数になるように、各サンプルの長さを揃える
# ハードウェアの要件に合致することで、計算効率が良くなる可能性がある
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, pad_to_multiple_of=16
)

In [26]:
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [27]:
def f5_score(precision: float, recall: float, beta: int = 5):
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # seqevalのmetrics関数を使用して、精度、再現率、F1スコア、正解率を計算
    precision = results["overall_precision"]
    recall = results["overall_recall"]

    return {
        "precision": precision,
        "recall": recall,
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
        "f5score": f5_score(precision, recall),
    }

In [28]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# evalデータの用意
split_dataset = train_dataset.train_test_split(test_size=0.2, seed=42)
train_valid_dataset = DatasetDict(
    {"train": split_dataset["train"], "valid": split_dataset["test"]}
)

In [30]:
train_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'tokens', 'full_text', 'trailing_whitespace', 'provided_labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'],
        num_rows: 5445
    })
    valid: Dataset({
        features: ['document', 'tokens', 'full_text', 'trailing_whitespace', 'provided_labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'],
        num_rows: 1362
    })
})

In [31]:
training_args = TrainingArguments(
    output_dir=LOG_PATH,
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # 32はだめ、性能がepoch0の時、precision0.31→0.00、Recall0.25→0.00に落ちる
    per_device_eval_batch_size=16,  # 32,↑同様、バッチサイズが非常に重要なパラメーターであるとも言える
    # num_train_epochs=2,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    metric_for_best_model="f5score",  # add
    greater_is_better=True,  # add
    warmup_ratio=0.1,  # add
    lr_scheduler_type="cosine",  # add
    report_to=REPORT_TO,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_valid_dataset["train"],
    eval_dataset=train_valid_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [33]:
# # モデルの学習
# trainer.train()
# cv_score = trainer.evaluate()["eval_f5score"]
# # モデルの保存
# trainer.save_model(MODEL_OUTPUT_PATH)

In [34]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_OUTPUT_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_PATH)

args = TrainingArguments(
    ".",
    per_device_eval_batch_size=1,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Make CV DataFrame

In [50]:
def get_valid_preds(trainer, valid_dataset):
    """
    trainerを用いてvalid_datasetに対する予測を行う
    """
    predictions = trainer.predict(valid_dataset).predictions
    preds_final = predictions.argmax(-1)

    return preds_final

In [51]:
def tokenize(example, tokenizer):
    text = []
    token_map = []

    idx = 0

    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        text.append(t)
        token_map.extend([idx] * len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)

        idx += 1

    # tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True)

    return {
        **tokenized,
        "token_map": token_map,
    }

In [75]:
def get_output_part(preds_final, valid_dataset):
    triplets = []
    document, token, label, token_str = [], [], [], []
    # token_mapはoffsetsの文字列indexが、何番目のtokenの紐付け
    # start_idx = 0の時、tokens[token_map[start_idx]] → 'Design'
    # start_idx = 1の時、tokens[token_map[start_idx]] → 'Design'
    # ・・・
    # start_idx = 5の時、tokens[token_map[start_idx]] → 'Design'
    # start_idx = 6の時、tokens[token_map[start_idx]] → '\n\n'

    # 同じtoken_idに、二つの予測結果が入ってしまう問題
    # 原因は、「文字列をtokenizerで区切り、別のlabelと予測した場合を、別のtripletとして扱うから」
    # 例えば、'kellyharrison@gmail.com'のlabelの予測結果
    # '\nkelly' -> 'B-EMAIL'
    # '##harris' -> 'B-EMAIL'
    # '##on' -> 'I-NAME_STUDENT'
    # '@' -> 'B-EMAIL'
    # 'gmail' -> 'B-EMAIL'
    # '.' -> 'B-EMAIL'
    # 'com' -> 'B-EMAIL'
    # 簡易的な解決策: 同じtoken_idの場合は、tripletに追加しない
    # 根本的な解決策: サブワードの推論結果は無視する？（サブワードの話ではないかも。もうちょっと考えたほうがよさそう。）

    for p, token_map, offsets, tokens, doc in zip(
        preds_final,
        valid_dataset["token_map"],
        valid_dataset["offset_mapping"],
        valid_dataset["tokens"],
        valid_dataset["document"],
    ):
        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[token_pred]

            if start_idx + end_idx == 0:
                continue

            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map):
                break

            token_id = token_map[start_idx]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                if triplet not in triplets:
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.append(triplet)

    return document, token, label, token_str

In [53]:
def make_correct_df(train: pl.DataFrame):
    # 学習データから、outputと同様のデータフレームを作成する
    outputs = []
    for document_id, token, label in zip(
        train["document"], train["tokens"], train["labels"]
    ):
        for token, (token_str, label_one) in enumerate(zip(token, label)):
            if label_one != "O":
                outputs.append((document_id, token, token_str, label_one))
    return pl.DataFrame(outputs, schema=["document", "token", "label", "token_str"])

In [54]:
def make_correct_pred_join_df(
    train_correct_df: pl.DataFrame, valid_pred_df: pl.DataFrame
) -> pl.DataFrame:
    """
    validで利用したdocumentのみを抽出し、train_correct_dfとvalid_pred_dfを結合して、documentごとに比較できるようにする
    """
    out = train_correct_df.filter(
        pl.col("document").is_in(valid_pred_df["document"])
    ).join(valid_pred_df, on=["document", "token"], how="outer", suffix="_pred")

    joined_dfs = []
    for document in out["document"].unique().to_list():
        if document is None:
            continue
        joined_df_per_document = out.filter(
            (pl.col("document") == document) | (pl.col("document_pred") == document)
        )
        joined_dfs.append(joined_df_per_document)

    return pl.concat(joined_dfs)

In [40]:
# main
valid_dataset = train_dataset.filter(
    lambda example: example["document"] in train_valid_dataset["valid"]["document"]
)

valid_dataset = valid_dataset.map(
    tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
)

# valid_dataset = valid_dataset.map(
#     tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": TRAINING_MAX_LENGTH}, num_proc=3
# )

valid_preds = get_valid_preds(trainer, valid_dataset)

document, token, label, token_str = get_output_part(valid_preds, valid_dataset)

valid_pred_df = pl.DataFrame(
    [document, token, label, token_str],
    schema=["document", "token", "label", "token_str"],
)

train_correct_df = make_correct_df(train)

valid_correct_pred_df = make_correct_pred_join_df(train_correct_df, valid_pred_df)

# wandbにuploadする
if not DEBUG:
    tbl = wandb.Table(data=valid_correct_pred_df.to_pandas())
    wandb.log({"valid_correct_pred_df": tbl})

Filter:   0%|          | 0/6807 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/1362 [00:00<?, ? examples/s]

In [41]:
valid_correct_pred_df

document,token,label,token_str,document_pred,token_pred,label_pred,token_str_pred
i64,i64,str,str,i64,i64,str,str
16,4,"""Gilberto""","""B-NAME_STUDENT…",16,4,"""B-NAME_STUDENT…","""Gilberto"""
16,5,"""Gamboa""","""I-NAME_STUDENT…",16,5,"""I-NAME_STUDENT…","""Gamboa"""
166,0,"""Pepa""","""B-NAME_STUDENT…",166,0,"""B-NAME_STUDENT…","""Pepa"""
166,1,"""Medrano""","""I-NAME_STUDENT…",166,1,"""I-NAME_STUDENT…","""Medrano"""
288,0,"""Rajinder""","""B-NAME_STUDENT…",288,0,"""B-NAME_STUDENT…","""Rajinder"""
288,1,"""Santos""","""I-NAME_STUDENT…",288,1,"""I-NAME_STUDENT…","""Santos"""
330,18,"""Davide""","""B-NAME_STUDENT…",330,18,"""B-NAME_STUDENT…","""Davide"""
330,19,"""Carletti""","""I-NAME_STUDENT…",330,19,"""I-NAME_STUDENT…","""Carletti"""
,,,,330,24,"""B-NAME_STUDENT…","""Marias"""
,,,,330,25,"""I-NAME_STUDENT…","""Gamesa"""


In [76]:
valid_dataset = train_valid_dataset["valid"]
double_token_data = valid_dataset.filter(lambda x: x["document"] == 4438)

double_token_data = double_token_data.map(
    tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
)

double_token_preds = get_valid_preds(trainer, double_token_data)
document, token, label, token_str = get_output_part(
    double_token_preds, double_token_data
)
double_token_pred_df = pl.DataFrame(
    [document, token, label, token_str],
    schema=["document", "token", "label", "token_str"],
)
double_token_pred_df

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Token: [CLS], Offset: (0, 0)
Token: ▁kelly, Offset: (0, 5)
Token: harris, Offset: (5, 11)
Token: on, Offset: (11, 13)
Token: @, Offset: (13, 14)
Token: gmail, Offset: (14, 19)
Token: ., Offset: (19, 20)
Token: com, Offset: (20, 23)
Token: [SEP], Offset: (0, 0)


: 

# Data Upload

In [42]:
import os

os.system("mkdir -p ~/.kaggle/")
os.system("cp /notebooks/pll_data_detection/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [43]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json


def dataset_create_new(dataset_name: str, upload_dir: str):
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")
    dataset_metadata = {}
    dataset_metadata["id"] = f"sinchir0/{dataset_name}"
    dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
    dataset_metadata["title"] = dataset_name
    with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")


if (not DEBUG) and UPLOAD_DATA:
    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

In [44]:
MODEL_OUTPUT_PATH

'pll_data_detection/trained_models/e04-match-tokenize'

In [45]:
if not DEBUG:
    wandb.finish()

0,1
eval/accuracy,0.99962
eval/f1,0.81061
eval/f5score,0.84132
eval/loss,0.00101
eval/precision,0.77978
eval/recall,0.84399
eval/runtime,63.6501
eval/samples_per_second,21.398
eval/steps_per_second,1.351
train/epoch,3.0
