# 目的
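- tokenizerを `truncation=False` にする（長い文書の後半にある正解ラベルが truncation で切り捨てられるのを防ぐため） / Set `truncation=False` in the tokenizer so that labels located late in long documents are not lost to truncation.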

In [1]:
# Experiment identifier; reused below for the dataset name, log directory and model output directory.
EXP_NAME = "e037-fix-truncation-false"
# Hugging Face model id of the pretrained checkpoint to fine-tune.
MODEL_NAME = "microsoft/deberta-v3-base"
DATA_PATH = "pll_data_detection/data"  # path when running from vscode / nohup
# DATA_PATH = "../../pll_data_detection/data" # path when running from jupyterlab
# Dataset name used at upload time: experiment name + last path component of the model id.
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
# Fail fast if the dataset name is outside the 6-50 character range
# (the message below says: "dataset name must be 6-50 characters; currently N characters").
if len(DATASET_NAME) < 6 or len(DATASET_NAME) > 50:
    raise Exception(f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字")
LOG_PATH = f"pll_data_detection/log/{EXP_NAME}"
MODEL_OUTPUT_PATH = f"pll_data_detection/trained_models/{EXP_NAME}"
# DEBUG=True shrinks the training data to 100 rows and disables wandb / the dataset upload.
DEBUG = False
# Whether to upload the trained model directory as a Kaggle dataset at the end.
UPLOAD_DATA = True
# Max-length settings from earlier experiments, disabled here because truncation is turned off.
# NOTE: most tokens are "O"; when the positive labels sit in the latter part of a document,
#       truncation drops them, which wastes training signal.
# TRAINING_MAX_LENGTH = 1024
# TRAINING_MAX_LENGTH = 2048
# INFERENCE_MAX_LENGTH = 2048
# Fraction of the original competition data held out for validation (used as `test_size`).
VALID_DATA_SIZE = 0.5
SEED = 42

In [2]:
print(MODEL_OUTPUT_PATH)

# Install

In [3]:
%pip install polars==0.20.10
%pip install transformers==4.37.2
%pip install datasets==2.16.1
%pip install evaluate==0.4.1
%pip install seqeval==1.2.2
%pip install accelerate
%pip install python-dotenv
%pip install kaggle
%pip install wandb==0.16.3

# formatter
%pip install black isort

# Import

In [4]:
import os

import json
import ast
import evaluate
import numpy as np
import polars as pl
import torch
import wandb
import random
from datasets import DatasetDict, load_dataset, concatenate_datasets, Value, ClassLabel
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from seqeval.metrics.sequence_labeling import precision_recall_fscore_support
from tqdm.auto import tqdm
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
import transformers

assert transformers.__version__ == "4.37.2"

In [6]:
import datasets

assert datasets.__version__ == "2.16.1"

In [7]:
import evaluate

assert evaluate.__version__ == "0.4.1"

In [8]:
def seed_everything(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with one shared value.

    Args:
        seed: Seed passed unchanged to each library's seeding function.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

seed_everything(SEED)

# Wandb

In [9]:
from dotenv import load_dotenv

if not DEBUG:
    load_dotenv(f"{DATA_PATH}/.env")
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project="pll", name=EXP_NAME)
    REPORT_TO = "wandb"
else:
    REPORT_TO = "none"

REPORT_TO

'wandb'

# Check Environment

In [10]:
!python --version

In [11]:
!nvidia-smi

# Data Load

In [12]:
train_dataset = (
    load_dataset(
        "json", data_files=f"{DATA_PATH}/train.json", split="train"
    ).rename_column("labels", "provided_labels")
    # Cast `document` to string
    .cast_column("document", Value(dtype="string", id=None))
    # Add a `flag` column so the data source can be identified after concatenation
    .map(lambda example: {"flag": "original"})
)

test_dataset = load_dataset(
    "json", data_files={"test": f"{DATA_PATH}/test.json"}, split="test"
)

Generating train split: 0 examples [00:00, ? examples/s]

Casting the dataset:   0%|          | 0/6807 [00:00<?, ? examples/s]

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [13]:
train = pl.read_json(f"{DATA_PATH}/train.json")
train.head()

document,full_text,tokens,trailing_whitespace,labels
i64,str,list[str],list[bool],list[str]
7,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
10,"""Diego Estrada …","[""Diego"", ""Estrada"", … "" ""]","[true, false, … false]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]"
16,"""Reporting proc…","[""Reporting"", ""process"", … "" ""]","[true, false, … false]","[""O"", ""O"", … ""O""]"
20,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
56,"""Assignment: V…","[""Assignment"", "":"", … "" ""]","[false, false, … false]","[""O"", ""O"", … ""O""]"


In [14]:
# # データ確認用
# test = pl.read_json(f"{DATA_PATH}/test.json")

## 外部データの読み込み

In [15]:
def convert_string_to_list(example, col):
    """Parse the string stored in ``example[col]`` into the Python literal it spells.

    The CSV-backed datasets store list columns as their string repr
    (e.g. ``"['a', 'b']"`` or ``"[True, False]"``); this turns them back into
    real lists so they can be used as sequence features.

    ``ast.literal_eval`` is used instead of ``eval`` so that only literals
    (lists, strings, numbers, booleans, ...) are accepted and arbitrary code
    embedded in a data file is never executed.

    Args:
        example: A mapping for one dataset row; modified in place.
        col: Name of the column whose string value should be parsed.

    Returns:
        The same ``example`` object with ``example[col]`` replaced by the parsed value.

    Raises:
        ValueError / SyntaxError: If the cell content is not a valid Python literal.
    """
    example[col] = ast.literal_eval(example[col])
    return example


# external_pii_dataset = (
#     load_dataset(
#         "csv",
#         data_files={"train": f"{DATA_PATH}/pii_dataset_fixed.csv"},
#         split="train",
#     )
#     .select_columns(["document", "labels", "text", "trailing_whitespace", "tokens"])
#     .rename_columns({"labels": "provided_labels", "text": "full_text"})
#     .map(convert_string_to_list, fn_kwargs={"col": "provided_labels"}, num_proc=3)
#     .map(convert_string_to_list, fn_kwargs={"col": "trailing_whitespace"}, num_proc=3)
#     .map(convert_string_to_list, fn_kwargs={"col": "tokens"}, num_proc=3)
#     .map(lambda example: {"flag": "external"}, num_proc=3)
# )

In [16]:
moredata_pii_dataset = (
    load_dataset(
        "csv",
        data_files={"train": f"{DATA_PATH}/moredata_dataset_fixed.csv"},
        split="train",
    )
    .rename_columns({"labels": "provided_labels", "text": "full_text"})
    .map(convert_string_to_list, fn_kwargs={"col": "provided_labels"}, num_proc=3)
    .map(convert_string_to_list, fn_kwargs={"col": "trailing_whitespace"}, num_proc=3)
    .map(convert_string_to_list, fn_kwargs={"col": "tokens"}, num_proc=3)
    .map(lambda example: {"flag": "moredata"}, num_proc=3)
)

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=3):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [17]:
moredata_pii_dataset.features

{'document': Value(dtype='string', id=None),
 'full_text': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'trailing_whitespace': Sequence(feature=Value(dtype='bool', id=None), length=-1, id=None),
 'provided_labels': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'flag': Value(dtype='string', id=None)}

In [18]:
# mixtral
mixtral = (
    load_dataset(
        "json", data_files=f"{DATA_PATH}/mixtral-8x7b-v1.json", split="train"
    ).rename_column("labels", "provided_labels")
    # Add a `flag` column so the data source can be identified after concatenation
    .map(lambda example: {"flag": "mixtral"})
)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2355 [00:00<?, ? examples/s]

In [19]:
train_dataset

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'flag'],
    num_rows: 6807
})

In [20]:
mixtral

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'flag'],
    num_rows: 2355
})

In [21]:
# 外部データと結合
# train_dataset = concatenate_datasets(
#     [train_dataset, external_pii_dataset, moredata_pii_dataset]
# )

# train_dataset = concatenate_datasets([train_dataset, moredata_pii_dataset])
# train_dataset = concatenate_datasets([train_dataset, mixtral])
train_dataset = concatenate_datasets([train_dataset, mixtral, moredata_pii_dataset])

In [22]:
train_dataset

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'flag'],
    num_rows: 11162
})

In [23]:
# shuffle
train_dataset = train_dataset.shuffle(seed=42)

In [24]:
# debug
if DEBUG:
    train_dataset = train_dataset.select(range(100))

In [25]:
test_dataset

Dataset({
    features: ['tokens', 'document', 'full_text', 'trailing_whitespace'],
    num_rows: 10
})

In [26]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [27]:
# labelを変換する
id2label = {
    0: "O",
    1: "B-NAME_STUDENT",
    2: "I-NAME_STUDENT",
    3: "B-EMAIL",
    4: "I-EMAIL",
    5: "B-USERNAME",
    6: "I-USERNAME",
    7: "B-ID_NUM",
    8: "I-ID_NUM",
    9: "B-PHONE_NUM",
    10: "I-PHONE_NUM",
    11: "B-URL_PERSONAL",
    12: "I-URL_PERSONAL",
    13: "B-STREET_ADDRESS",
    14: "I-STREET_ADDRESS",
}

label2id = {v: k for k, v in id2label.items()}

In [28]:
# def tokenize(example, tokenizer, label2id, max_length):
def tokenize(example, tokenizer, label2id):
    """
    Re-tokenize one document with `tokenizer` and align the provided labels to the new tokens.

    The provided `tokens` / `provided_labels` use a different segmentation than
    the model tokenizer, e.g.
        provided : ['Design', 'Thinking', 'for', 'innovation', 'reflexion', '-', 'Avril', ...]
        tokenizer: ['[CLS]', '▁Design', '▁Thinking', '▁for', '▁innovation', '▁reflex', 'ion', '-', 'Av', 'ril']
    so the labels cannot be reused position by position. The text is rebuilt,
    one label is recorded per character, and each tokenizer token takes the
    label of the first character of its span.

    NOTE: a later cell defines another function named `tokenize` (inference
    variant without labels), which shadows this one once that cell has run.

    Args:
        example: Row with "tokens", "provided_labels" and "trailing_whitespace".
        tokenizer: Callable tokenizer; called with return_offsets_mapping=True and truncation=False.
        label2id: Mapping from label string to integer id.

    Returns:
        All tokenizer outputs plus "labels" (one id per tokenizer token) and
        "length" (number of input ids).
    """
    # Rebuild the text while recording one label per character.
    # e.g. token 'Design' followed by whitespace -> pieces ['Design', ' '],
    #      char_labels ['O'] * 7 (6 letters + 1 space).
    pieces = []
    char_labels = []
    for word, word_label, space_after in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        pieces.append(word)
        char_labels += [word_label] * len(word)
        if space_after:
            pieces.append(" ")
            char_labels.append("O")

    # No truncation: the whole document is encoded regardless of its length.
    tokenized = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=False,
    )

    char_labels = np.array(char_labels)
    full_text = "".join(pieces)

    def label_id_for(span):
        """Return the label id for a tokenizer token covering characters [begin, finish)."""
        begin, finish = span
        # Special tokens such as CLS / SEP are reported with the offset (0, 0).
        if begin == 0 and finish == 0:
            return label2id["O"]
        # Per the original author's note, the DeBERTa-v2 tokenizer glues a
        # preceding space onto the token as "▁", so the span may start on
        # whitespace; step onto the first real character. Not needed for
        # tokenizers that do not do this.
        if full_text[begin].isspace():
            begin += 1
        return label2id[char_labels[begin]]

    token_labels = [label_id_for(span) for span in tokenized.offset_mapping]

    return {
        **tokenized,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
    }


train_dataset = train_dataset.map(
    tokenize,
    fn_kwargs={
        "tokenizer": tokenizer,
        "label2id": label2id
        # "max_length": TRAINING_MAX_LENGTH,
    },
    num_proc=3,
)

Map (num_proc=3):   0%|          | 0/11162 [00:00<?, ? examples/s]

In [29]:
label_list = list(label2id.keys())
label_list

['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-EMAIL',
 'I-EMAIL',
 'B-USERNAME',
 'I-USERNAME',
 'B-ID_NUM',
 'I-ID_NUM',
 'B-PHONE_NUM',
 'I-PHONE_NUM',
 'B-URL_PERSONAL',
 'I-URL_PERSONAL',
 'B-STREET_ADDRESS',
 'I-STREET_ADDRESS']

In [30]:
len(label_list)

15

In [31]:
# pad_to_multiple_of:
# when padding, every sample is padded so its length becomes a multiple of the given number.
# Matching hardware-friendly sizes may improve computational efficiency.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, pad_to_multiple_of=16
)

In [32]:
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [33]:
def f5_score(precision: float, recall: float, beta: int = 5):
    """Compute the F-beta score (default beta=5) from precision and recall.

    F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)

    Args:
        precision: Precision value.
        recall: Recall value.
        beta: Weight of recall relative to precision.

    Returns:
        The F-beta score. When the denominator is zero (precision and recall
        both 0, i.e. nothing was predicted correctly) 0.0 is returned instead
        of raising ZeroDivisionError or producing NaN.
    """
    beta_sq = beta**2
    denominator = (beta_sq * precision) + recall
    if denominator == 0:
        return 0.0
    return (1 + beta_sq) * (precision * recall) / denominator


def compute_metrics(p):
    """Compute seqeval metrics plus the F5 score for a Trainer evaluation step.

    Args:
        p: A (predictions, labels) pair. Predictions are arg-maxed over axis 2
           to get one label id per position; positions whose label is -100
           are excluded from scoring.

    Returns:
        Dict with "precision", "recall", "f1", "accuracy" and "f5score".
    """
    logits, label_ids = p
    pred_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, label_ids):
        # Keep only positions that carry a real label (-100 marks ignored positions).
        scored = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in scored])
        true_labels.append([label_list[gold] for _, gold in scored])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # Overall precision / recall from seqeval are reused to derive the F5 score.
    precision = results["overall_precision"]
    recall = results["overall_recall"]

    metrics = {
        "precision": precision,
        "recall": recall,
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    metrics["f5score"] = f5_score(precision, recall)
    return metrics

In [34]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [35]:
train_dataset

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'flag', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'],
    num_rows: 11162
})

In [36]:
with open(f"{DATA_PATH}/document_pattern_dict.json") as f:
    document_pattern_dict = json.load(f)


def document_id_to_pattern(example) -> str:
    """Look up the pattern of an example via its "document" id in `document_pattern_dict`.

    `document_pattern_dict` is the module-level object loaded from
    document_pattern_dict.json; a missing id propagates the lookup error.
    """
    return document_pattern_dict[example["document"]]


def add_pattern_column(dataset):
    """Add a "pattern" column to `dataset` and cast it to a ClassLabel feature.

    Returns the dataset with the new column; the class names are the sorted
    unique pattern values found in the data.
    """
    # Attach the pattern looked up from each example's document id
    dataset = dataset.map(
        lambda example: {"pattern": document_id_to_pattern(example)}, num_proc=3
    )
    unique_labels = np.unique(dataset["pattern"])

    # Convert to ClassLabel so the column can be used for `stratify_by_column` in train_test_split
    class_label_feature = ClassLabel(names=unique_labels.tolist())
    return dataset.cast_column("pattern", class_label_feature)


def train_valid_split(train_dataset):
    """Split the combined dataset into train / valid sets.

    Only rows whose "flag" is "original" can end up in the validation set.
    Those rows are split with stratification on the "pattern" column; rows
    with any other flag (external data) always go to the training set.

    Args:
        train_dataset: Dataset with a "flag" column marking the data source.

    Returns:
        DatasetDict with "train" (filtered original train part + external
        rows) and "valid" (held-out original rows).
    """
    # Separate the competition data from the external data.
    competition_rows = train_dataset.filter(
        lambda row: row["flag"] == "original", num_proc=3
    )
    external_rows = train_dataset.filter(
        lambda row: row["flag"] != "original", num_proc=3
    )

    # Add the pattern column, then split so that pattern proportions are preserved.
    stratified = add_pattern_column(competition_rows).train_test_split(
        test_size=VALID_DATA_SIZE, seed=42, stratify_by_column="pattern"
    )

    # Training part only: drop pattern class 0 (per the original author's
    # comment, the documents whose labels are all "O").
    train_part = stratified["train"].filter(
        lambda row: row["pattern"] != 0, num_proc=3
    )

    # Re-attach the external data to the training part.
    return DatasetDict(
        {
            "train": concatenate_datasets([train_part, external_rows]),
            "valid": stratified["test"],
        }
    )


if not DEBUG:
    train_valid_dataset = train_valid_split(train_dataset)
else:
    # Debug path: same original / external separation as train_valid_split, but with a
    # plain (non-stratified) split and without the pattern column or the pattern filter.
    original_dataset = train_dataset.filter(lambda x: x["flag"] == "original")
    extrenal_dataset = train_dataset.filter(lambda x: x["flag"] != "original")

    train_split_dataset = original_dataset.train_test_split(test_size=VALID_DATA_SIZE, seed=42)

    # External rows are only ever used for training.
    concat_train_dataset = concatenate_datasets(
        [train_split_dataset["train"], extrenal_dataset]
    )
    train_valid_dataset = DatasetDict(
        {"train": concat_train_dataset, "valid": train_split_dataset["test"]}
    )

Filter (num_proc=3):   0%|          | 0/11162 [00:00<?, ? examples/s]

Filter (num_proc=3):   0%|          | 0/11162 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/6807 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6807 [00:00<?, ? examples/s]

Filter (num_proc=3):   0%|          | 0/3403 [00:00<?, ? examples/s]

In [37]:
print(train_valid_dataset)

In [38]:
training_args = TrainingArguments(
    output_dir=LOG_PATH,
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # was 16; 32 does not work: at epoch 0 precision fell 0.31 -> 0.00 and recall 0.25 -> 0.00
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=4,  # was 16; 32 fails as above, so batch size appears to be a very important parameter
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    metric_for_best_model="eval_loss",  # add
    # greater_is_better=True,  # add
    warmup_ratio=0.1,  # add
    lr_scheduler_type="cosine",  # add
    report_to=REPORT_TO,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_valid_dataset["train"],
    eval_dataset=train_valid_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [39]:
# モデルの学習
trainer.train()
cv_score = trainer.evaluate()["eval_f5score"]
# モデルの保存
trainer.save_model(MODEL_OUTPUT_PATH)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,F5score
0,0.2169,0.004508,0.409595,0.941613,0.570867,0.998666,0.89681
2,0.0023,0.00329,0.581894,0.905468,0.708484,0.999267,0.886508


In [40]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_OUTPUT_PATH)

tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_PATH)

args = TrainingArguments(
    ".",
    per_device_eval_batch_size=1,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Make CV DataFrame

In [41]:
def get_valid_preds(trainer, valid_dataset):
    """
    Run `trainer.predict` on `valid_dataset` and return the arg-max over the last axis
    of its `predictions` (one predicted class index per position).
    """
    prediction_output = trainer.predict(valid_dataset)
    return prediction_output.predictions.argmax(-1)

In [42]:
def tokenize(example, tokenizer):
    """Inference-time tokenization: encode the rebuilt text and record a char -> token map.

    NOTE: this reuses the name `tokenize` from the earlier training cell and
    therefore replaces that function once this cell has run.

    Args:
        example: Row with "tokens" and "trailing_whitespace".
        tokenizer: Callable tokenizer; called with return_offsets_mapping=True and truncation=False.

    Returns:
        All tokenizer outputs plus "token_map": for every character of the
        rebuilt text, the index of the provided token it belongs to, or -1
        for a whitespace character inserted after a token.
    """
    pieces = []
    token_map = []
    word_pairs = zip(example["tokens"], example["trailing_whitespace"])
    for word_idx, (word, space_after) in enumerate(word_pairs):
        pieces.append(word)
        token_map += [word_idx] * len(word)
        if space_after:
            pieces.append(" ")
            token_map.append(-1)

    # No truncation: the whole document is encoded regardless of its length.
    encoding = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=False,
    )

    return {**encoding, "token_map": token_map}

In [43]:
def get_output_part(preds_final, valid_dataset):
    """Convert per-token predictions into (document, token index, label, token string) rows.

    `token_map` links each character index of the rebuilt text to the index
    of the provided token it belongs to, e.g. for a first token 'Design':
        start_idx = 0..5 -> tokens[token_map[start_idx]] == 'Design'
        start_idx = 6    -> the next token (e.g. '\\n\\n')

    One provided token can be covered by several tokenizer tokens, and those
    may receive different labels, e.g. for 'kellyharrison@gmail.com':
        '\\nkelly' -> 'B-EMAIL', '##harris' -> 'B-EMAIL', '##on' -> 'I-NAME_STUDENT', ...
    Every distinct (label, token id, token string) triplet is emitted once per
    document, so the same token id can appear with more than one label.
    The original author notes this as a simple workaround; a more fundamental
    fix (e.g. ignoring some sub-word predictions) is left open.

    Args:
        preds_final: Per-document sequences of predicted label ids.
        valid_dataset: Dataset exposing the "token_map", "offset_mapping",
            "tokens" and "document" columns.

    Returns:
        Four parallel lists: document ids, token indices, predicted labels and
        token strings, containing only non-"O" predictions.
    """
    document, token, label, token_str = [], [], [], []

    for p, token_map, offsets, tokens, doc in zip(
        preds_final,
        valid_dataset["token_map"],
        valid_dataset["offset_mapping"],
        valid_dataset["tokens"],
        valid_dataset["document"],
    ):
        # A set gives constant-time duplicate checks; the original used a list,
        # which made this step quadratic in the number of predictions per document.
        seen_triplets = set()
        n_chars = len(token_map)

        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[token_pred]

            # Special tokens (offset (0, 0)) carry no text.
            if start_idx + end_idx == 0:
                continue

            # The span starts on an inserted whitespace character: step past it.
            if token_map[start_idx] == -1:
                start_idx += 1

            # Ignore whitespace-only tokens such as "\n\n".
            # TODO (from the original author): the answer may contain "\n";
            # is skipping these really safe?
            # NOTE(review): if token_map[start_idx] is -1 here, tokens[-1] (the
            # last token) is what gets tested — confirm this is intended.
            while start_idx < n_chars and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            # Ran past the end of the text: nothing left to map for this document.
            if start_idx >= n_chars:
                break

            token_id = token_map[start_idx]

            # Ignore "O" predictions and predictions landing on whitespace.
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                if triplet not in seen_triplets:
                    seen_triplets.add(triplet)
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])

    return document, token, label, token_str

In [44]:
def make_correct_df(train: pl.DataFrame):
    """Build a ground-truth frame with the same layout as the prediction output.

    One row is emitted for every token whose provided label is not "O":
    (document id, token index within the document, label, token string).
    """
    rows = [
        (document_id, token_idx, token_label, token_text)
        for document_id, doc_tokens, doc_labels in zip(
            train["document"], train["tokens"], train["provided_labels"]
        )
        for token_idx, (token_text, token_label) in enumerate(
            zip(doc_tokens, doc_labels)
        )
        if token_label != "O"
    ]
    return pl.DataFrame(rows, schema=["document", "token", "label", "token_str"])

In [45]:
def make_correct_pred_join_df(
    train_correct_df: pl.DataFrame, valid_pred_df: pl.DataFrame
) -> pl.DataFrame:
    """
    Keep only the documents present in `valid_pred_df`, outer-join the ground truth with
    the predictions on (document, token), and return the rows grouped document by document
    so truth and prediction can be compared side by side.
    Prediction-side columns carry the "_pred" suffix.
    """
    predicted_documents = valid_pred_df["document"]
    truth_for_valid = train_correct_df.filter(
        pl.col("document").is_in(predicted_documents)
    )
    joined = truth_for_valid.join(
        valid_pred_df, on=["document", "token"], how="outer", suffix="_pred"
    )

    # One slice per document id seen on the ground-truth side (nulls skipped);
    # a slice also includes prediction-only rows for that document.
    per_document_frames = [
        joined.filter(
            (pl.col("document") == doc_id) | (pl.col("document_pred") == doc_id)
        )
        for doc_id in joined["document"].unique().to_list()
        if doc_id is not None
    ]

    return pl.concat(per_document_frames)

In [46]:
# main
# Collect the validation document ids once, as a set. The original lambda
# re-read train_valid_dataset["valid"]["document"] and searched it linearly
# for every one of the rows being filtered.
valid_document_ids = set(train_valid_dataset["valid"]["document"])

valid_dataset = train_dataset.filter(
    lambda example: example["document"] in valid_document_ids,
    num_proc=3,
)

valid_dataset = valid_dataset.map(
    tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
)

valid_preds = get_valid_preds(trainer, valid_dataset)

document, token, label, token_str = get_output_part(valid_preds, valid_dataset)

valid_pred_df = pl.DataFrame(
    [document, token, label, token_str],
    schema=["document", "token", "label", "token_str"],
)

train_correct_df = make_correct_df(pl.from_pandas(train_dataset.to_pandas()))
# train_correct_df = make_correct_df(train)

valid_correct_pred_df = make_correct_pred_join_df(train_correct_df, valid_pred_df)

# wandbにuploadする
if not DEBUG:
    tbl = wandb.Table(data=valid_correct_pred_df.to_pandas())
    wandb.log({"valid_correct_pred_df": tbl})

Filter (num_proc=3):   0%|          | 0/11162 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/3404 [00:00<?, ? examples/s]

# Calc CV Score From Valid Df

In [47]:
def calc_f5_score_from_valid_df(valid_df: pl.DataFrame, train: pl.DataFrame) -> float:
    """Compute the micro-averaged F5 score from the joined truth/prediction frame.

    Args:
        valid_df: Frame with "document_pred", "token_pred" and "label_pred"
            columns (the output of make_correct_pred_join_df).
        train: Frame with "document", "tokens" and "labels" columns holding
            the full token list and the full label sequence per document.

    Returns:
        The F-beta score (beta=5, micro average) from seqeval's
        precision_recall_fscore_support.

    NOTE(review): only documents with at least one prediction row reach the
    score; documents without any predicted entity are left out, so their
    missed labels are not counted. Confirm this is intended.
    """
    # Aggregate predictions per document and attach the document's token count
    # and true label sequence.
    # Fix: read from the `valid_df` argument; the original ignored it and used
    # the global `valid_correct_pred_df` instead.
    pred_df_agg_with_len = (
        valid_df.select(
            pl.col("document_pred").cast(pl.Int64),
            pl.col("token_pred").cast(pl.Int64),
            pl.col("label_pred"),
        )
        .drop_nulls()
        .sort("document_pred")
        .group_by("document_pred")
        .agg(
            pl.col("token_pred"),
            pl.col("label_pred"),
        )
        .join(
            train.with_columns(
                pl.col("tokens").map_elements(len).alias("tokens_len"),
            ).select(["document", "tokens_len", "labels"]),
            left_on="document_pred",
            right_on="document",
            how="left",
        )
    )

    # Expand the sparse predictions into a full-length label sequence,
    # filling every unpredicted position with "O".
    label_pred_alls = []
    for token_pred, label_pred, tokens_len in zip(
        pred_df_agg_with_len["token_pred"],
        pred_df_agg_with_len["label_pred"],
        pred_df_agg_with_len["tokens_len"],
    ):
        label_pred_all = ["O" for _ in range(tokens_len)]
        for token, label in zip(token_pred, label_pred):
            label_pred_all[token] = label
        label_pred_alls.append(label_pred_all)

    actual_pred_df = pred_df_agg_with_len.with_columns(
        pl.Series("label_pred_all", label_pred_alls)
    ).select(["labels", "label_pred_all"])

    # Index 2 of the returned tuple is the F-beta score.
    return precision_recall_fscore_support(
        actual_pred_df["labels"].to_list(),
        actual_pred_df["label_pred_all"].to_list(),
        beta=5,
        average="micro",
    )[2]


f5score_from_valid_df = calc_f5_score_from_valid_df(valid_correct_pred_df, train)

if not DEBUG:
    wandb.run.summary["f5score_from_valid_df"] = f5score_from_valid_df

f5score_from_valid_df

0.9075898030127463

# Data Upload

In [48]:
import os

os.system("mkdir -p ~/.kaggle/")
os.system("cp /notebooks/pll_data_detection/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [49]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json


def dataset_create_new(dataset_name: str, upload_dir: str):
    """Write dataset-metadata.json into `upload_dir` and create a new Kaggle dataset from it.

    Args:
        dataset_name: Dataset slug and title; must not contain "_".
        upload_dir: Directory to upload; the metadata file is written inside it.

    Raises:
        ValueError: If `dataset_name` contains an underscore.
    """
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")

    dataset_metadata = {
        "id": f"sinchir0/{dataset_name}",
        "licenses": [{"name": "CC0-1.0"}],
        "title": dataset_name,
    }
    metadata_path = os.path.join(upload_dir, "dataset-metadata.json")
    with open(metadata_path, "w") as f:
        json.dump(dataset_metadata, f, indent=4)

    # Authenticate with the Kaggle API and upload the directory as a tar archive.
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")


if (not DEBUG) and UPLOAD_DATA:
    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

In [50]:
print(MODEL_OUTPUT_PATH)

In [51]:
if not DEBUG:
    wandb.finish()