# 目的
- オリジナルデータを全て学習に使う
- 全てOのデータは学習・評価に使わない
- lrを2e-05→1e-05に変更する

In [1]:
# Experiment identifier; reused for the W&B run name, the log directory and
# the trained-model output directory.
EXP_NAME = "e056-fine-tuning-all-wo-all-O-1e"
# Hugging Face model id of the pretrained checkpoint that is fine-tuned.
MODEL_NAME = "microsoft/deberta-v3-large"

# Data directory, relative to the project root.
DATA_PATH = "pll_data_detection/data"
# Name of the Kaggle dataset the trained model is uploaded to:
# "<experiment name>-<last path component of the model id>".
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
# Fail fast when the dataset name is outside the 6-50 character range.
if len(DATASET_NAME) < 6 or len(DATASET_NAME) > 50:
    raise Exception(f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字")
LOG_PATH = f"pll_data_detection/log/{EXP_NAME}"
MODEL_OUTPUT_PATH = f"pll_data_detection/trained_models/{EXP_NAME}"

# DEBUG=True skips W&B and the Kaggle upload and trains on a 300-row subset.
DEBUG = False
# Upload the saved model directory to Kaggle (only when DEBUG is False).
UPLOAD_DATA = True
# Run trainer.train() and save the model.
TRAINING = True

# VALID_DATA_SIZE = 0.5
# Seed passed to seed_everything().
SEED = 42
# Number of training epochs (overridden to 1 in DEBUG mode).
EPOCH = 3

In [2]:
import os


def resolve_path(base_path: str) -> str:
    """Return ``base_path`` adjusted for the directory the kernel runs in.

    Two working directories are recognised:

    * ``/notebooks`` -- ``base_path`` is returned unchanged.
    * ``/notebooks/pll_data_detection/exp`` -- ``base_path`` is prefixed
      with ``../../``.

    A short message naming the detected environment is printed in both cases.

    Raises:
        Exception: if the current working directory is neither of the above.
    """
    # working directory -> (message to print, prefix to prepend or None)
    environments = {
        "/notebooks": ("Jupyter Kernel By VSCode or nohup!", None),
        "/notebooks/pll_data_detection/exp": ("Jupyter Lab!", "../../"),
    }
    detected = environments.get(os.getcwd())
    if detected is None:
        raise Exception("Unknown environment")

    message, prefix = detected
    print(message)
    if prefix is None:
        return base_path
    return f"{prefix}{base_path}"


# Resolve every project-relative path against the current working directory.
# LOG_PATH is included so that Trainer checkpoints/logs land in the same
# project tree as the data and the saved model, whichever environment the
# notebook is launched from.
DATA_PATH = resolve_path(DATA_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
LOG_PATH = resolve_path(LOG_PATH)

In [3]:
print(MODEL_OUTPUT_PATH)

# Install

In [4]:
%pip install -q polars==0.20.10
%pip install -q transformers==4.37.2
%pip install -q datasets==2.16.1
%pip install -q evaluate==0.4.1
%pip install -q seqeval==1.2.2
%pip install -q accelerate
%pip install -q python-dotenv
%pip install -q wandb==0.16.3

# formatter
%pip install -q black isort

# Import

In [5]:
import json
import numpy as np
import polars as pl
import torch
import wandb
import random
from datasets import DatasetDict, load_dataset, concatenate_datasets, Value, ClassLabel
from seqeval.metrics.sequence_labeling import precision_recall_fscore_support
from tqdm.auto import tqdm
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)
from tokenizers import AddedToken

os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [6]:
import transformers
import datasets
import evaluate

assert transformers.__version__ == "4.37.2"
assert datasets.__version__ == "2.16.1"
assert evaluate.__version__ == "0.4.1"

In [7]:
# Seed the same seed to all
def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: The seed applied to all generators.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN autotune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

# Wandb

In [8]:
from dotenv import load_dotenv

# Outside DEBUG mode: load environment variables from the .env file in the
# data directory, log in to Weights & Biases with WANDB_API_KEY and start a
# run named after the experiment. In DEBUG mode W&B is not touched.
# REPORT_TO is later passed to TrainingArguments(report_to=...).
if not DEBUG:
    load_dotenv(f"{DATA_PATH}/.env")
    # Raises KeyError if WANDB_API_KEY is defined neither in .env nor in the
    # process environment.
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project="pll", name=EXP_NAME)
    REPORT_TO = "wandb"
else:
    REPORT_TO = "none"

# Bare expression: displayed as the notebook cell's output.
REPORT_TO

'wandb'

# Check Environment

In [9]:
!python --version

In [10]:
!nvidia-smi

# Data Load

In [11]:
# Competition training data; the provided labels are renamed so that the
# name "labels" is free for the token-level label ids created later.
train_dataset = (
    load_dataset(
        "json", data_files=f"{DATA_PATH}/train.json", split="train"
    ).rename_column("labels", "provided_labels")
    # Cast the document id to a string
    .cast_column("document", Value(dtype="string", id=None))
    # Add a flag identifying where each example came from
    .map(lambda example: {"flag": "original"})
)

test_dataset = load_dataset(
    "json", data_files={"test": f"{DATA_PATH}/test.json"}, split="test"
)

Generating train split: 0 examples [00:00, ? examples/s]

Casting the dataset:   0%|          | 0/6807 [00:00<?, ? examples/s]

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [12]:
train = pl.read_json(f"{DATA_PATH}/train.json")
train.head()

document,full_text,tokens,trailing_whitespace,labels
i64,str,list[str],list[bool],list[str]
7,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
10,"""Diego Estrada …","[""Diego"", ""Estrada"", … "" ""]","[true, false, … false]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]"
16,"""Reporting proc…","[""Reporting"", ""process"", … "" ""]","[true, false, … false]","[""O"", ""O"", … ""O""]"
20,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
56,"""Assignment: V…","[""Assignment"", "":"", … "" ""]","[false, false, … false]","[""O"", ""O"", … ""O""]"


## 外部データの読み込み

In [13]:
def convert_string_to_list(example, col):
    """Replace the string stored in ``example[col]`` with the evaluated object.

    Used on columns that were loaded from CSV as text
    (presumably the ``repr`` of a Python list -- TODO confirm against the CSV).

    Args:
        example: A single dataset row (mapping from column name to value).
        col: Name of the column to convert.

    Returns:
        The same ``example`` object, modified in place.
    """
    # NOTE(review): eval() executes arbitrary Python found in the CSV cell.
    # If the data files are not fully trusted, consider ast.literal_eval()
    # after confirming that every cell is a plain literal.
    example[col] = eval(example[col])
    return example


# external_pii_dataset = (
#     load_dataset(
#         "csv",
#         data_files={"train": f"{DATA_PATH}/pii_dataset_fixed.csv"},
#         split="train",
#     )
#     .select_columns(["document", "labels", "text", "trailing_whitespace", "tokens"])
#     .rename_columns({"labels": "provided_labels", "text": "full_text"})
#     .map(convert_string_to_list, fn_kwargs={"col": "provided_labels"}, num_proc=3)
#     .map(convert_string_to_list, fn_kwargs={"col": "trailing_whitespace"}, num_proc=3)
#     .map(convert_string_to_list, fn_kwargs={"col": "tokens"}, num_proc=3)
#     .map(lambda example: {"flag": "external"}, num_proc=3)
# )

# Additional CSV dataset: rename columns to match the training data, turn the
# string-encoded columns back into Python objects and tag the rows "moredata".
moredata_pii_dataset = (
    load_dataset(
        "csv",
        data_files={"train": f"{DATA_PATH}/moredata_dataset_fixed.csv"},
        split="train",
    )
    .rename_columns({"labels": "provided_labels", "text": "full_text"})
    .map(convert_string_to_list, fn_kwargs={"col": "provided_labels"}, num_proc=3)
    .map(convert_string_to_list, fn_kwargs={"col": "trailing_whitespace"}, num_proc=3)
    .map(convert_string_to_list, fn_kwargs={"col": "tokens"}, num_proc=3)
    .map(lambda example: {"flag": "moredata"}, num_proc=3)
)

# mixtral
mixtral = (
    load_dataset(
        "json", data_files=f"{DATA_PATH}/mixtral-8x7b-v1.json", split="train"
    ).rename_column("labels", "provided_labels")
    # Add a flag identifying where each example came from
    .map(lambda example: {"flag": "mixtral"})
)

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=3):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2355 [00:00<?, ? examples/s]

In [14]:
print("train_dataset")
print(train_dataset)

# print("external_pii_dataset")
# print(external_pii_dataset)

print("moredata_pii_dataset")
print(moredata_pii_dataset)

print("mixtral")
print(mixtral)

In [15]:
# 外部データと結合
train_dataset = concatenate_datasets([train_dataset, mixtral, moredata_pii_dataset])

In [16]:
print("all train")
print(train_dataset)

In [17]:
# Shuffle with the experiment-wide seed instead of a second hard-coded 42,
# so that changing SEED changes the data order as well.
train_dataset = train_dataset.shuffle(seed=SEED)

In [18]:
# debug
if DEBUG:
    train_dataset = train_dataset.select(range(300))
    EPOCH = 1
    print(train_dataset)

In [19]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_tokens(AddedToken("\n", normalized=False))  # treat "\n" as a token of its own
# (Without this, "\n" is absorbed into the leading "_" of the following piece.)
# before: \nSaito → "_Saito"
# after:  \nSaito → "\n", "Saito"

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

1

In [20]:
# labelを変換する
# Label names listed in id order: the position in the list is the class id.
id2label = dict(
    enumerate(
        [
            "O",
            "B-NAME_STUDENT",
            "I-NAME_STUDENT",
            "B-EMAIL",
            "I-EMAIL",
            "B-USERNAME",
            "I-USERNAME",
            "B-ID_NUM",
            "I-ID_NUM",
            "B-PHONE_NUM",
            "I-PHONE_NUM",
            "B-URL_PERSONAL",
            "I-URL_PERSONAL",
            "B-STREET_ADDRESS",
            "I-STREET_ADDRESS",
        ]
    )
)

# Inverse mapping: label name -> class id.
label2id = {name: class_id for class_id, name in id2label.items()}

In [21]:
def tokenize(example, tokenizer, label2id):
    """
    Re-tokenize one document and align the provided labels with the new tokens.

    The dataset ships its own tokens (``example["tokens"]``) with one label
    each, but the model tokenizer splits the text differently, e.g.
    ``['Design', 'Thinking', ..., 'reflexion']`` becomes
    ``['[CLS]', '▁Design', '▁Thinking', ..., '▁reflex', 'ion']``.
    To bridge the two, the text is rebuilt with one label per *character*,
    and every tokenizer token takes the label of the character it starts on.

    Args:
        example: Row with "tokens", "provided_labels" and
            "trailing_whitespace" (parallel sequences).
        tokenizer: Callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=False)``;
            its result must expose ``offset_mapping`` and ``input_ids``.
        label2id: Mapping from label name to class id; must contain "O".

    Returns:
        dict: every field of the tokenizer output, plus "labels" (one class id
        per tokenizer token) and "length" (number of input ids).
    """
    # Rebuild the raw text piece by piece while recording a label for each
    # character, e.g. 'Design' + ' ' -> six word labels followed by one "O".
    pieces = []
    char_labels = []
    for word, word_label, followed_by_space in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        pieces.append(word)
        char_labels += [word_label] * len(word)
        if followed_by_space:
            pieces.append(" ")
            char_labels.append("O")
    text = "".join(pieces)

    # No truncation: the whole document is tokenized.
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=False)

    token_labels = []
    for start, end in tokenized.offset_mapping:
        # Special tokens such as CLS/SEP are reported with offsets (0, 0).
        if (start, end) == (0, 0):
            token_labels.append(label2id["O"])
            continue

        # The DeBERTa tokenizer glues a preceding space onto the token as "▁",
        # so the token's own first character is one position further on.
        # Only a literal space is skipped: "\n" is a token of its own and
        # must keep its offset.
        char_pos = start + 1 if text[start] == " " else start
        token_labels.append(label2id[char_labels[char_pos]])

    return {
        **tokenized,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
    }


train_dataset = train_dataset.map(
    tokenize,
    fn_kwargs={"tokenizer": tokenizer, "label2id": label2id},
    num_proc=3,
)

Map (num_proc=3):   0%|          | 0/11162 [00:00<?, ? examples/s]

In [22]:
# Check aligned token labels
# for input_id, label in zip(train_dataset["input_ids"][0], train_dataset["labels"][0]):
#     print(str(tokenizer.convert_ids_to_tokens(input_id)), id2label[label])

In [23]:
label_list = list(label2id.keys())
label_list

['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-EMAIL',
 'I-EMAIL',
 'B-USERNAME',
 'I-USERNAME',
 'B-ID_NUM',
 'I-ID_NUM',
 'B-PHONE_NUM',
 'I-PHONE_NUM',
 'B-URL_PERSONAL',
 'I-URL_PERSONAL',
 'B-STREET_ADDRESS',
 'I-STREET_ADDRESS']

In [24]:
len(label_list)

15

In [25]:
# pad_to_multiple_of
# When padding, make every sequence length a multiple of the given number.
# Matching hardware requirements this way may improve computational efficiency.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, pad_to_multiple_of=16
)

In [26]:
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [27]:
def f5_score(precision: float, recall: float, beta: float = 5) -> float:
    """Return the F-beta score (beta=5 by default) for a precision/recall pair.

    Computes ``(1 + beta^2) * P * R / (beta^2 * P + R)``.

    Args:
        precision: Precision value.
        recall: Recall value.
        beta: Weight of recall relative to precision.

    Returns:
        The F-beta score, or 0.0 when the denominator is zero (for example
        when both precision and recall are 0), instead of dividing by zero.
    """
    beta_squared = beta**2
    denominator = beta_squared * precision + recall
    if denominator == 0:
        # No true positives at all: define the score as 0 rather than raising
        # ZeroDivisionError (plain floats) or producing NaN (numpy scalars).
        return 0.0
    return (1 + beta_squared) * (precision * recall) / denominator


def compute_metrics(p):
    """
    Score predictions with seqeval, at the granularity of the model tokenizer.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` is reduced with an
            argmax over axis 2; ``labels`` holds class ids, where positions
            equal to -100 are excluded from scoring.

    Returns:
        dict: the seqeval results with nested dicts flattened to
        ``"<key>_<subkey>"`` entries, plus an ``"f5score"`` entry computed
        from the overall precision and recall.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold label is not -100.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # Flatten nested result dicts into "<key>_<subkey>" entries.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update(
                {f"{key}_{sub_key}": sub_value for sub_key, sub_value in value.items()}
            )
        else:
            final_results[key] = value

    # Add the F5 score derived from the overall precision and recall.
    final_results["f5score"] = f5_score(
        results["overall_precision"], results["overall_recall"]
    )

    return final_results

    # # seqevalのmetrics関数を使用して、精度、再現率、F1スコア、正解率を計算
    # precision = results["overall_precision"]
    # recall = results["overall_recall"]

    # return {
    #     "precision": precision,
    #     "recall": recall,
    #     "f1": results["overall_f1"],
    #     "accuracy": results["overall_accuracy"],
    #     "f5score": f5_score(precision, recall),
    # }

In [28]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=16)

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Embedding(128016, 1024)

In [29]:
train_dataset

Dataset({
    features: ['full_text', 'tokens', 'trailing_whitespace', 'document', 'provided_labels', 'flag', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'],
    num_rows: 11162
})

In [30]:
with open(f"{DATA_PATH}/document_pattern_dict.json") as f:
    document_pattern_dict = json.load(f)


def document_id_to_pattern(example) -> str:
    """Return the ``document_pattern_dict`` entry for the example's document id.

    Raises:
        KeyError: if the document id is not present in ``document_pattern_dict``.
    """
    document_id = example["document"]
    return document_pattern_dict[document_id]


def add_pattern_column(dataset):
    """Attach a ``pattern`` column to ``dataset`` and encode it as a ClassLabel.

    The pattern of each row is looked up with ``document_id_to_pattern``.
    The column is then cast to a ``ClassLabel`` whose class names are the
    sorted unique pattern values, so that it can be used for stratified
    splitting (``train_test_split(stratify_by_column=...)``).

    Returns:
        The dataset with the additional ``pattern`` ClassLabel column.
    """
    with_pattern = dataset.map(
        lambda example: {"pattern": document_id_to_pattern(example)}, num_proc=3
    )

    # np.unique returns the distinct values in sorted order; that order
    # determines the integer id assigned to each pattern.
    pattern_names = np.unique(with_pattern["pattern"]).tolist()
    return with_pattern.cast_column("pattern", ClassLabel(names=pattern_names))

def remove_all_o(train_dataset):
    """Drop the all-"O" documents of the original data, keeping everything else.

    Rows are split on the ``flag`` column: rows flagged "original" receive a
    ``pattern`` column and those with pattern id 0 are removed; all other rows
    are kept untouched. Both parts are concatenated again, original rows first.

    NOTE(review): pattern id 0 is taken to be the "every label is O" pattern.
    That relies on that pattern sorting first among the pattern names used by
    ``add_pattern_column`` -- confirm against document_pattern_dict.json.

    Returns:
        The filtered, re-concatenated dataset.
    """
    originals = train_dataset.filter(
        lambda row: row["flag"] == "original", num_proc=3
    )
    externals = train_dataset.filter(
        lambda row: row["flag"] != "original", num_proc=3
    )

    # Only the original rows have an entry to look up, so the pattern column
    # is added to (and filtered on) that part alone.
    originals_with_pii = add_pattern_column(originals).filter(
        lambda row: row["pattern"] != 0, num_proc=3
    )

    return concatenate_datasets([originals_with_pii, externals])

train_dataset = remove_all_o(train_dataset)

# def train_valid_split(train_dataset):
#     # 'flag'列が'original'のもののみをテストデータとする
#     original_dataset = train_dataset.filter(
#         lambda x: x["flag"] == "original", num_proc=3
#     )
#     extrenal_dataset = train_dataset.filter(
#         lambda x: x["flag"] != "original", num_proc=3
#     )

#     # pattern列を付与する
#     original_dataset = add_pattern_column(original_dataset)

#     # pattern列に対してstratifyになるよう分割する
#     train_split_dataset = original_dataset.train_test_split(
#         test_size=VALID_DATA_SIZE, seed=42, stratify_by_column="pattern"
#     )

#     # trainについて、全てOのデータを除外する
#     # train_split_dataset["train"] = train_split_dataset["train"].filter(
#     #     lambda x: x["pattern"] != 0, num_proc=3
#     # )

#     # validについて、全てOのデータを除外する
#     # train_split_dataset["test"] = train_split_dataset["test"].filter(
#     #     lambda x: x["pattern"] != 0, num_proc=3
#     # )

#     # 'flag'列が'original'でないものを訓練データと検証データに分割する
#     # train_split_dataset = original_dataset.train_test_split(test_size=0.2, seed=42)

#     # 再結合する
#     concat_train_dataset = concatenate_datasets(
#         [train_split_dataset["train"], extrenal_dataset]
#     )

#     train_valid_dataset = DatasetDict(
#         {"train": concat_train_dataset, "valid": train_split_dataset["test"]}
#     )
#     return train_valid_dataset


# if not DEBUG:
#     train_valid_dataset = train_valid_split(train_dataset)
# else:
#     original_dataset = train_dataset.filter(lambda x: x["flag"] == "original")
#     extrenal_dataset = train_dataset.filter(lambda x: x["flag"] != "original")

#     train_split_dataset = original_dataset.train_test_split(
#         test_size=VALID_DATA_SIZE, seed=42
#     )

#     concat_train_dataset = concatenate_datasets(
#         [train_split_dataset["train"], extrenal_dataset]
#     )
#     train_valid_dataset = DatasetDict(
#         {"train": concat_train_dataset, "valid": train_split_dataset["test"]}
#     )

Filter (num_proc=3):   0%|          | 0/11162 [00:00<?, ? examples/s]

Filter (num_proc=3):   0%|          | 0/11162 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/6807 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6807 [00:00<?, ? examples/s]

Filter (num_proc=3):   0%|          | 0/6807 [00:00<?, ? examples/s]

In [31]:
print(train_dataset)

In [32]:
# Training configuration. Evaluation is disabled for this experiment (all data
# is used for training); a checkpoint is saved after every epoch.
training_args = TrainingArguments(
    output_dir=LOG_PATH,
    learning_rate=1e-5,
    # Was 16. 32 does not work: at epoch 0 precision dropped 0.31 -> 0.00 and
    # recall 0.25 -> 0.00.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    # Was 16. 32 fails the same way as above; batch size appears to be a very
    # sensitive hyperparameter here.
    per_device_eval_batch_size=2,
    # Use the EPOCH constant so that the DEBUG override (EPOCH = 1) actually
    # takes effect instead of being shadowed by a hard-coded 3.
    num_train_epochs=EPOCH,
    # Keep the Trainer's own seeding in sync with the experiment-wide seed.
    seed=SEED,
    weight_decay=0.01,
    # evaluation_strategy="epoch",
    evaluation_strategy="no",
    do_eval=False,
    save_strategy="epoch",
    # load_best_model_at_end=True,
    push_to_hub=False,
    metric_for_best_model="f5score",  # add
    greater_is_better=True,  # add
    warmup_ratio=0.1,  # add
    lr_scheduler_type="cosine",  # add
    report_to=REPORT_TO,
    fp16=True,
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # train_dataset=train_valid_dataset["train"],
    # eval_dataset=train_valid_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [33]:
if TRAINING:
    # Train the model
    trainer.train()
    # cv_score = trainer.evaluate()["eval_f5score"]
    # Save the model
    trainer.save_model(MODEL_OUTPUT_PATH)

Step,Training Loss
500,0.3825
1000,0.0052
1500,0.0061
2000,0.0038
2500,0.0036
3000,0.0023
3500,0.0032


In [34]:
# model = AutoModelForTokenClassification.from_pretrained(MODEL_OUTPUT_PATH)

# tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_PATH)

# args = TrainingArguments(
#     ".",
#     per_device_eval_batch_size=1,
#     report_to="none",
# )

# trainer = Trainer(
#     model=model,
#     args=args,
#     data_collator=data_collator,
#     tokenizer=tokenizer,
# )

# Make CV DataFrame

In [35]:
# def get_valid_preds(trainer, valid_dataset):
#     """
#     trainerを用いてvalid_datasetに対する予測を行う
#     """
#     predictions = trainer.predict(valid_dataset).predictions
#     preds_final = predictions.argmax(-1)

#     return preds_final


# # def get_valid_preds_with_pp(trainer: Trainer, valid_dataset, threhosld: float):
# #     """
# #     trainerを用いてvalid_datasetに対する予測を行う
# #     """
# #     predictions = trainer.predict(valid_dataset).predictions
# #     pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(
# #         predictions.shape[0], predictions.shape[1], 1
# #     )

# #     preds = predictions.argmax(-1)
# #     preds_without_O = pred_softmax[:, :, 1:].argmax(-1) + 1
# #     O_preds = pred_softmax[:, :, 0]

# #     preds_final = np.where(O_preds < threhosld, preds_without_O, preds)

# #     return preds_final


# # def get_valid_preds_with_pp_for_opt(trainer: Trainer, valid_dataset):
# #     """
# #     trainerを用いてvalid_datasetに対する予測を行う
# #     """
# #     predictions = trainer.predict(valid_dataset).predictions
# #     pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(
# #         predictions.shape[0], predictions.shape[1], 1
# #     )

# #     preds = predictions.argmax(-1)
# #     preds_without_O = pred_softmax[:, :, 1:].argmax(-1) + 1
# #     O_preds = pred_softmax[:, :, 0]

# #     return O_preds, preds_without_O, preds

In [36]:
# def tokenize(example, tokenizer):
#     text = []
#     token_map = []

#     idx = 0

#     for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
#         text.append(t)
#         token_map.extend([idx] * len(t))
#         if ws:
#             text.append(" ")
#             token_map.append(-1)

#         idx += 1

#     # tokenized = tokenizer(
#     #     "".join(text),
#     #     return_offsets_mapping=True,
#     #     truncation=True,
#     #     max_length=INFERENCE_MAX_LENGTH,
#     # )

#     tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=False)

#     return {
#         **tokenized,
#         "token_map": token_map,
#     }

In [37]:
# def get_output_part(preds_final, valid_dataset):
#     # triplets = []
#     document, token, label, token_str = [], [], [], []
#     # token_mapはoffsetsの文字列indexが、何番目のtokenの紐付け
#     # start_idx = 0の時、tokens[token_map[start_idx]] → 'Design'
#     # start_idx = 1の時、tokens[token_map[start_idx]] → 'Design'
#     # ・・・
#     # start_idx = 5の時、tokens[token_map[start_idx]] → 'Design'
#     # start_idx = 6の時、tokens[token_map[start_idx]] → '\n\n'

#     # 同じtoken_idに、二つの予測結果が入ってしまう問題
#     # 原因は、「文字列をtokenizerで区切り、別のlabelと予測した場合を、別のtripletとして扱うから」
#     # 例えば、'kellyharrison@gmail.com'のlabelの予測結果
#     # '\nkelly' -> 'B-EMAIL'
#     # '##harris' -> 'B-EMAIL'
#     # '##on' -> 'I-NAME_STUDENT'
#     # '@' -> 'B-EMAIL'
#     # 'gmail' -> 'B-EMAIL'
#     # '.' -> 'B-EMAIL'
#     # 'com' -> 'B-EMAIL'
#     # 解決策案: 同じtoken_idの場合は、tripletに追加しない

#     for p, token_map, offsets, tokens, doc in zip(
#         preds_final,
#         valid_dataset["token_map"],
#         valid_dataset["offset_mapping"],
#         valid_dataset["tokens"],
#         valid_dataset["document"],
#     ):
#         triplets = []
#         for token_pred, (start_idx, end_idx) in zip(p, offsets):
#             label_pred = id2label[token_pred]

#             if start_idx + end_idx == 0:
#                 continue

#             if token_map[start_idx] == -1:
#                 start_idx += 1

#             # ignore "\n\n"
#             # TODO: 答えに\nが入っている場合がありそうだけど、この処理は本当に問題ない？
#             # while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
#             #     start_idx += 1

#             # special tokenに\nを追加した場合は、\nのタイミングでstart_idxを=1したくないため
#             # while start_idx < len(token_map) and tokens[token_map[start_idx]] == " ":
#             #     start_idx += 1

#             # special tokenに\nを追加した場合は、\nのタイミングでstart_idxを=1したくないため
#             while (
#                 start_idx < len(token_map)
#                 and tokens[token_map[start_idx]].isspace()
#                 and tokens[token_map[start_idx]] != "\n"
#             ):
#                 start_idx += 1

#             if start_idx >= len(token_map):
#                 break

#             token_id = token_map[start_idx]

#             # ignore "O" predictions and whitespace preds
#             # if label_pred != "O" and token_id != -1:
#             # "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM"についてはpostprocessを行う
#             if (
#                 label_pred
#                 not in (
#                     "O",
#                     "B-EMAIL",
#                     "B-PHONE_NUM",
#                     "I-PHONE_NUM",
#                     "B-URL_PERSONAL",
#                     "I-URL_PERSONAL",
#                 )
#                 and token_id != -1
#             ):
#                 triplet = (label_pred, token_id, tokens[token_id])

#                 if triplet not in triplets:
#                     if (
#                         len(triplets) >= 1
#                         and document[-1] == doc
#                         and token[-1] == token_id
#                     ):
#                         continue
#                     document.append(doc)
#                     token.append(token_id)
#                     label.append(label_pred)
#                     token_str.append(tokens[token_id])
#                     triplets.append(triplet)

#     return document, token, label, token_str

In [38]:
# from spacy.lang.en import English

# nlp = English()


# def find_span(target: list[str], document: list[str]) -> list[list[int]]:
#     idx = 0
#     spans = []
#     span = []

#     for i, token in enumerate(document):
#         if token != target[idx]:
#             idx = 0
#             span = []
#             continue
#         span.append(i)
#         idx += 1
#         if idx == len(target):
#             spans.append(span)
#             span = []
#             idx = 0
#             continue

#     return spans

In [39]:
# valid_dataset = train_dataset.filter(
#     lambda example: example["document"] in train_valid_dataset["valid"]["document"],
#     num_proc=3,
# )

In [40]:
# import re
# from typing import Optional


# def get_rulebase(regex: re.Pattern, data, name: str) -> Optional[list[dict]]:
#     output = []
#     matches = regex.findall(data["full_text"])

#     # NOTE: find_spanにおいて、同じ単語を全て見つけてしまうため、重複をなくす
#     matches = list(dict.fromkeys(matches))

#     if not matches:
#         return None

#     matched_spans = []
#     for match in matches:
#         target = [t.text for t in nlp.tokenizer(match)]
#         matched_spans.append(find_span(target, data["tokens"]))

#     for matched_span in matched_spans:
#         for one_token_span in matched_span:
#             for intermediate, token_idx in enumerate(one_token_span):
#                 if intermediate == 0:
#                     prefix = "B"
#                 else:
#                     prefix = "I"

#                 output.append(
#                     {
#                         "document": data["document"],
#                         "token": token_idx,
#                         "label": f"{prefix}-{name}",
#                         "token_str": data["tokens"][token_idx],
#                     }
#                 )

#     return output


# email_regex = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
# phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
# url_regex = re.compile(
#     r"https?://(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+){1,2}/(?:[a-zA-Z0-9-]+/)*(?:[a-zA-Z0-9-]+\.html|.*\.php|.*\.asp|.*\.jsp|\?v=[a-zA-Z0-9-_]+|.*\.htm|user/[a-zA-Z0-9-_]+|watch\?v=[a-zA-Z0-9-_]+|[a-zA-Z0-9-_]+|\?v=[a-zA-Z0-9-_]+|[a-zA-Z0-9-_]+|)?"
# )

# emails_tst = []
# emails = []
# phone_nums = []
# urls = []

# for _data in valid_dataset:
#     # email
#     if match := get_rulebase(email_regex, _data, "EMAIL"):
#         emails.extend(match)

#     # phone number
#     if match := get_rulebase(phone_num_regex, _data, "PHONE_NUM"):
#         phone_nums.extend(match)

#     # URL
#     if match := get_rulebase(url_regex, _data, "URL_PERSONAL"):
#         urls.extend(match)

# pp_data = [emails, phone_nums, urls]

In [41]:
# def make_correct_df(train: pl.DataFrame):
#     # 学習データから、outputと同様のデータフレームを作成する
#     outputs = []
#     for document_id, token, label in zip(
#         train["document"], train["tokens"], train["provided_labels"]
#     ):
#         for token, (token_str, label_one) in enumerate(zip(token, label)):
#             if label_one != "O":
#                 outputs.append((document_id, token, label_one, token_str))
#     return pl.DataFrame(outputs, schema=["document", "token", "label", "token_str"])

In [42]:
# def make_correct_pred_join_df(
#     train_correct_df: pl.DataFrame, valid_pred_df: pl.DataFrame
# ) -> pl.DataFrame:
#     """
#     validで利用したdocumentのみを抽出し、train_correct_dfとvalid_pred_dfを結合して、documentごとに比較できるようにする
#     """
#     out = train_correct_df.filter(
#         pl.col("document").is_in(valid_pred_df["document"])
#     ).join(valid_pred_df, on=["document", "token"], how="outer", suffix="_pred")

#     joined_dfs = []
#     for document in out["document"].unique().to_list():
#         if document is None:
#             continue
#         joined_df_per_document = out.filter(
#             (pl.col("document") == document) | (pl.col("document_pred") == document)
#         )
#         joined_dfs.append(joined_df_per_document)

#     return pl.concat(joined_dfs)

In [43]:
# # main
# valid_dataset = train_dataset.filter(
#     lambda example: example["document"] in train_valid_dataset["valid"]["document"],
#     num_proc=3,
# )

# valid_dataset = valid_dataset.map(
#     tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
# )

# # 閾値を緩める後処理を使わない場合
# valid_preds = get_valid_preds(trainer, valid_dataset)
# # 閾値を緩める後処理を使う場合
# # valid_preds = get_valid_preds_with_pp(trainer, valid_dataset, threhosld=0.90)

# document, token, label, token_str = get_output_part(valid_preds, valid_dataset)

# valid_pred_df = pl.DataFrame(
#     [document, token, label, token_str],
#     schema=["document", "token", "label", "token_str"],
# )

# postprocess_df = pl.concat([pl.DataFrame(val) for val in pp_data if val != []])
# valid_pred_df = pl.concat([valid_pred_df, postprocess_df])

# train_correct_df = make_correct_df(pl.from_pandas(train_dataset.to_pandas()))
# # train_correct_df = make_correct_df(train)

# valid_correct_pred_df = make_correct_pred_join_df(train_correct_df, valid_pred_df)

In [44]:
# from seqeval.metrics import classification_report


# def calc_f5_score_from_valid_df(
#     valid_correct_pred_df: pl.DataFrame, train: pl.DataFrame
# ) -> tuple[dict, float]:
#     # trainのtokenの長さを追加する
#     pred_df_agg_with_len = (
#         valid_correct_pred_df.select(
#             pl.col("document_pred").cast(pl.Int64),
#             pl.col("token_pred").cast(pl.Int64),
#             pl.col("label_pred"),
#         )
#         .drop_nulls()
#         .sort("document_pred")
#         .group_by("document_pred")
#         .agg(
#             pl.col("token_pred"),
#             pl.col("label_pred"),
#         )
#         .join(
#             train.with_columns(
#                 pl.col("tokens").map_elements(len).alias("tokens_len"),
#             ).select(["document", "tokens_len", "labels"]),
#             left_on="document_pred",
#             right_on="document",
#             how="left",
#         )
#     )

#     # 推論したlabel列をOを含むtoken列へと変換する
#     label_pred_alls = []
#     for token_pred, label_pred, tokens_len in zip(
#         pred_df_agg_with_len["token_pred"],
#         pred_df_agg_with_len["label_pred"],
#         pred_df_agg_with_len["tokens_len"],
#     ):
#         label_pred_all = ["O" for _ in range(tokens_len)]
#         for token, label in zip(token_pred, label_pred):
#             label_pred_all[token] = label
#         label_pred_alls.append(label_pred_all)

#     actual_pred_df = pred_df_agg_with_len.with_columns(
#         pl.Series("label_pred_all", label_pred_alls)
#     ).select(["labels", "label_pred_all"])

#     # ログ用
#     print("classification report")
#     print(
#         classification_report(
#             actual_pred_df["labels"].to_list(),
#             actual_pred_df["label_pred_all"].to_list(),
#         )
#     )

#     cls_rep_dict = classification_report(
#         actual_pred_df["labels"].to_list(),
#         actual_pred_df["label_pred_all"].to_list(),
#         output_dict=True,
#     )

#     f5score_from_valid_df = precision_recall_fscore_support(
#         actual_pred_df["labels"].to_list(),
#         actual_pred_df["label_pred_all"].to_list(),
#         beta=5,
#         average="micro",
#     )[2]

#     print("classification_report")
#     print(cls_rep_dict)
#     print("f5score_from_valid_df")
#     print(f5score_from_valid_df)

#     return cls_rep_dict, f5score_from_valid_df


# cls_rep_dict, f5score_from_valid_df = calc_f5_score_from_valid_df(
#     valid_correct_pred_df, train
# )

# postprocessの閾値の最適化

In [45]:
# # 先に推論結果を取得する
# valid_dataset = train_dataset.filter(
#     lambda example: example["document"] in train_valid_dataset["valid"]["document"],
#     num_proc=3,
# )

# valid_dataset = valid_dataset.map(
#     tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
# )

# train_correct_df = make_correct_df(pl.from_pandas(train_dataset.to_pandas()))

# O_preds, preds_without_O, preds = get_valid_preds_with_pp_for_opt(
#     trainer, valid_dataset
# )

In [46]:
# # Sweep the threshold over 20 evenly spaced values from 0.80 to 1.00 and compute valid_preds for each
# # TODO: 遅すぎる(10分ぐらいかかりそう)ので、実行を早くする
# best_f5score_from_valid_df = 0
# for thr in np.linspace(0.80, 1.00, 20):
#     # 推論
#     valid_preds = np.where(O_preds < thr, preds_without_O, preds)

#     # valid_correct_pred_dfを作成
#     document, token, label, token_str = get_output_part(
#         valid_preds, valid_dataset
#     )  # 遅い10秒くらい

#     valid_pred_df = pl.DataFrame(
#         [document, token, label, token_str],
#         schema=["document", "token", "label", "token_str"],
#     )

#     postprocess_df = pl.concat(
#         [pl.DataFrame(val) for val in [emails, phone_nums] if val != []]
#     )
#     valid_pred_df = pl.concat([valid_pred_df, postprocess_df])

#     valid_correct_pred_df = make_correct_pred_join_df(train_correct_df, valid_pred_df)

#     cls_rep_dict, f5score_from_valid_df = calc_f5_score_from_valid_df(
#         valid_correct_pred_df, train
#     )  # 遅い10秒くらい
#     print(thr, f5score_from_valid_df)

#     if best_f5score_from_valid_df < f5score_from_valid_df:
#         best_cls_rep_dict = cls_rep_dict
#         best_f5score_from_valid_df = f5score_from_valid_df

# print(best_cls_rep_dict, best_f5score_from_valid_df)

# CVをwandbにuploadする

In [47]:
# if not DEBUG:
#     # valid_df
#     tbl = wandb.Table(data=valid_correct_pred_df.to_pandas())
#     wandb.log({"valid_correct_pred_df": tbl})

#     # valid score
#     wandb.run.summary["f5score_from_valid_df"] = f5score_from_valid_df

#     # classification report
#     table = wandb.Table(columns=["Tag", "Precision", "Recall", "F1Score", "Support"])
#     for name in cls_rep_dict.keys():
#         table.add_data(
#             name,
#             cls_rep_dict[name]["precision"],
#             cls_rep_dict[name]["recall"],
#             cls_rep_dict[name]["f1-score"],
#             cls_rep_dict[name]["support"],
#         )
#     wandb.log({"classification_report": table})

# Data Upload

In [48]:
%pip install kaggle

In [49]:
import os
import shutil

# Install the Kaggle API credentials into ~/.kaggle with owner-only
# permissions. Library calls are used instead of shell commands so that a
# missing kaggle.json raises immediately rather than being ignored.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
# shutil.copy returns the path of the copied file inside kaggle_dir.
kaggle_json = shutil.copy("/notebooks/pll_data_detection/kaggle.json", kaggle_dir)
os.chmod(kaggle_json, 0o600)

0

In [50]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json


def dataset_create_new(dataset_name: str, upload_dir: str):
    """Create a new Kaggle dataset from the contents of ``upload_dir``.

    Writes ``dataset-metadata.json`` (id, licence and title) into
    ``upload_dir``, authenticates with the Kaggle API and uploads the folder
    in "tar" directory mode without CSV conversion.

    Args:
        dataset_name: Dataset slug and title; must not contain "_".
        upload_dir: Folder to upload; the metadata file is written into it.

    Raises:
        ValueError: if ``dataset_name`` contains an underscore. The check runs
            before anything is written or uploaded.
    """
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")

    metadata = {
        "id": f"sinchir0/{dataset_name}",
        "licenses": [{"name": "CC0-1.0"}],
        "title": dataset_name,
    }
    metadata_path = os.path.join(upload_dir, "dataset-metadata.json")
    with open(metadata_path, "w") as metadata_file:
        metadata_file.write(json.dumps(metadata, indent=4))

    kaggle_api = KaggleApi()
    kaggle_api.authenticate()
    kaggle_api.dataset_create_new(
        folder=upload_dir, convert_to_csv=False, dir_mode="tar"
    )


if (not DEBUG) and UPLOAD_DATA:
    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

In [51]:
print(MODEL_OUTPUT_PATH)

In [52]:
if not DEBUG:
    wandb.finish()