# 目的
- CVスコアを計算する
- Validデータに対する現在のモデルの出力を作成する(=1)
- 1をwandbに記録する https://docs.wandb.ai/ja/guides/track/log/working-with-csv

In [53]:
EXP_NAME = "e02-make-val-out"
MODEL_NAME = "distilbert-base-uncased"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME}"
LOG_PATH = f"pll_data_detection/log/{EXP_NAME}"
MODEL_OUTPUT_PATH = f"pll_data_detection/trained_models/{EXP_NAME}"
DEBUG = False
UPLOAD_DATA = True
REPORT_TO = None

# Install

In [2]:
%pip install polars
%pip install transformers==4.37.2
%pip install datasets==2.16.1
%pip install evaluate==0.4.1
%pip install seqeval==1.2.2
%pip install accelerate
%pip install python-dotenv
%pip install kaggle
%pip iinstall wandb==0.16.3

# formatter
%pip install black isort

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
ERROR: unknown command "iinstall" - maybe you meant "install"
Note: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


# Import

In [3]:
import os

import evaluate
import numpy as np
import polars as pl
import torch
import wandb
from datasets import DatasetDict, load_dataset
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm.auto import tqdm
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import transformers

assert transformers.__version__ == "4.37.2"

In [5]:
import datasets

assert datasets.__version__ == "2.16.1"

In [6]:
import evaluate

assert evaluate.__version__ == "0.4.1"

# Wandb

In [7]:
from dotenv import load_dotenv

if not DEBUG:
    load_dotenv("pll_data_detection/.env")
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project="pll", name=EXP_NAME)
    REPORT_TO = "wandb"

[34m[1mwandb[0m: Currently logged in as: [33msinchir0[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Check Environment

In [8]:
!python --version

Python 3.9.16


In [9]:
!nvidia-smi

Mon Feb 12 04:49:01 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:00:05.0 Off |                  Off |
| 30%   33C    P8    26W / 300W |   7571MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Data Load

In [10]:
# データ確認用
train = pl.read_json("pll_data_detection/train.json")
train.head()

document,full_text,tokens,trailing_whitespace,labels
i64,str,list[str],list[bool],list[str]
7,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
10,"""Diego Estrada …","[""Diego"", ""Estrada"", … "" ""]","[true, false, … false]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]"
16,"""Reporting proc…","[""Reporting"", ""process"", … "" ""]","[true, false, … false]","[""O"", ""O"", … ""O""]"
20,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]","[""O"", ""O"", … ""O""]"
56,"""Assignment: V…","[""Assignment"", "":"", … "" ""]","[false, false, … false]","[""O"", ""O"", … ""O""]"


In [11]:
# データ確認用
test = pl.read_json("pll_data_detection/test.json")
test

document,full_text,tokens,trailing_whitespace
i64,str,list[str],list[bool]
7,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]"
10,"""Diego Estrada …","[""Diego"", ""Estrada"", … "" ""]","[true, false, … false]"
16,"""Reporting proc…","[""Reporting"", ""process"", … "" ""]","[true, false, … false]"
20,"""Design Thinkin…","[""Design"", ""Thinking"", … "" ""]","[true, true, … false]"
56,"""Assignment: V…","[""Assignment"", "":"", … "" ""]","[false, false, … false]"
86,"""Cheese Startup…","[""Cheese"", ""Startup"", … "" ""]","[true, true, … false]"
93,"""Silvia Villalo…","[""Silvia"", ""Villalobos"", … "" ""]","[true, false, … false]"
104,"""Storytelling …","[""Storytelling"", "" "", … "" ""]","[true, false, … false]"
112,"""Reflection – L…","[""Reflection"", ""–"", … "" ""]","[true, true, … false]"
123,"""Gandhi Institu…","[""Gandhi"", ""Institute"", … "" ""]","[true, true, … false]"


In [12]:
train_dataset = load_dataset(
    "json", data_files={"train": "pll_data_detection/train.json"}, split="train"
)
test_dataset = load_dataset(
    "json", data_files={"test": "pll_data_detection/test.json"}, split="test"
)

In [13]:
# debug
if DEBUG:
    train_dataset = train_dataset.select(range(100))

In [14]:
train_dataset

Dataset({
    features: ['tokens', 'full_text', 'trailing_whitespace', 'document', 'labels'],
    num_rows: 6807
})

In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [16]:
# TODO: 512?が上限であるのに対し、NER対象のテキストが512よりも長い場合が多いため、適したモデルにする
# DeBERTaあたりは、512を超えるテキストに対応していたような
example = train_dataset[0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens[:10]

Token indices sequence length is longer than the specified maximum sequence length for this model (748 > 512). Running this sequence through the model will result in indexing errors


['[CLS]',
 'design',
 'thinking',
 'for',
 'innovation',
 'reflex',
 '##ion',
 '-',
 'av',
 '##ril']

In [17]:
# labelを変換する
id2label = {
    0: "O",
    1: "B-NAME_STUDENT",
    2: "I-NAME_STUDENT",
    3: "B-EMAIL",
    4: "I-EMAIL",
    5: "B-USERNAME",
    6: "I-USERNAME",
    7: "B-ID_NUM",
    8: "I-ID_NUM",
    9: "B-PHONE_NUM",
    10: "I-PHONE_NUM",
    11: "B-URL_PERSONAL",
    12: "I-URL_PERSONAL",
    13: "B-STREET_ADDRESS",
    14: "I-STREET_ADDRESS",
}

label2id = {v: k for k, v in id2label.items()}


def label2id_func(example):
    example["labels"] = [label2id[tag] for tag in example["labels"]]
    return example


labele2id_train_dataset = train_dataset.map(label2id_func)

In [18]:
label_list = list(label2id.keys())
label_list

['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-EMAIL',
 'I-EMAIL',
 'B-USERNAME',
 'I-USERNAME',
 'B-ID_NUM',
 'I-ID_NUM',
 'B-PHONE_NUM',
 'I-PHONE_NUM',
 'B-URL_PERSONAL',
 'I-URL_PERSONAL',
 'B-STREET_ADDRESS',
 'I-STREET_ADDRESS']

In [19]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        # トークン化されたシーケンス内の各トークンが元のテキスト内のどの単語に対応するかを示すIDを提供します。
        # 例えば、元のテキストが "Hello, world!" で、トークン化されたシーケンスが ["Hello", ",", "world", "!"] の場合
        # `word_ids`メソッドは [0, None, 1, None] を返します。
        # これは、"Hello" が最初の単語（インデックス0）、"," が単語に属さない（None）、"world" が2番目の単語（インデックス1）、"!" が単語に属さない（None）ことを示します。
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif (
                word_idx != previous_word_idx
            ):  # subwordの場合、同じword_idxが連続している。最初のtoken以外は-100にする
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_train_dataset = labele2id_train_dataset.map(
    tokenize_and_align_labels, batched=True
)

In [20]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [21]:
seqeval = evaluate.load("seqeval")

In [22]:
def f5_score(precision: float, recall: float, beta: int = 5):
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # seqevalのmetrics関数を使用して、精度、再現率、F1スコア、正解率を計算
    precision = results["overall_precision"]
    recall = results["overall_recall"]

    return {
        "precision": precision,
        "recall": recall,
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
        "f5score": f5_score(precision, recall),
    }

    # f1 = f1_score(true_labels, true_predictions)
    # accuracy = accuracy_score(true_labels, true_predictions)

    # return {
    #     "precision": precision,
    #     "recall": recall,
    #     "f1": f1,
    #     "accuracy": accuracy,
    #     "f5score": f5_score(precision, recall),  # F5スコアを追加
    # }

In [23]:
# seqeval = evaluate.load("seqeval")
# predictions = [
#     ["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
#     ["B-PER", "I-PER", "O"],
# ]
# references = [
#     ["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
#     ["B-PER", "I-PER", "O"],
# ]
# results = seqeval.compute(predictions=predictions, references=references)
# results

In [24]:
# results["recall"]

In [25]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# evalデータの用意
split_dataset = tokenized_train_dataset.train_test_split(test_size=0.2)
tokenized_train_valid_dataset = DatasetDict(
    {"train": split_dataset["train"], "valid": split_dataset["test"]}
)

In [27]:
training_args = TrainingArguments(
    output_dir=LOG_PATH,
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # 32はだめ、性能がepoch0の時、precision0.31→0.00、Recall0.25→0.00に落ちる
    per_device_eval_batch_size=16,  # 32,↑同様、バッチサイズが非常に重要なパラメーターであるとも言える
    # num_train_epochs=2,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    metric_for_best_model="f5score",  # add
    greater_is_better=True,  # add
    warmup_ratio=0.1,  # add
    lr_scheduler_type="cosine",  # add
    report_to=REPORT_TO,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_valid_dataset["train"],
    eval_dataset=tokenized_train_valid_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [28]:
# モデルの保存
trainer.train()
cv_score = trainer.evaluate()["eval_f5score"]
trainer.save_model(MODEL_OUTPUT_PATH)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,F5score
1,No log,0.002866,0.09434,0.041494,0.057637,0.999427,0.042407
2,0.179500,0.001808,0.650794,0.510373,0.572093,0.999725,0.514644
3,0.001800,0.001673,0.665072,0.576763,0.617778,0.999746,0.579724


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


# Make CV DF

In [29]:
def get_valid_preds(trainer, valid_dataset):
    """
    trainerを用いてvalid_datasetに対する予測を行う
    """
    predictions = trainer.predict(valid_dataset).predictions
    preds_final = predictions.argmax(-1)

    return preds_final

In [30]:
def tokenize(example, tokenizer):
    text = []
    token_map = []

    idx = 0

    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        text.append(t)
        token_map.extend([idx] * len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)

        idx += 1

    # tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True)

    return {
        **tokenized,
        "token_map": token_map,
    }

In [31]:
def get_output_part(preds_final, valid_dataset):
    triplets = []
    document, token, label, token_str = [], [], [], []
    for p, token_map, offsets, tokens, doc in zip(
        preds_final,
        valid_dataset["token_map"],
        valid_dataset["offset_mapping"],
        valid_dataset["tokens"],
        valid_dataset["document"],
    ):
        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[token_pred]

            if start_idx + end_idx == 0:
                continue

            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map):
                break

            token_id = token_map[start_idx]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                if triplet not in triplets:
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.append(triplet)

    return document, token, label, token_str

In [32]:
def make_correct_df(train: pl.DataFrame):
    # 学習データから、outputと同様のデータフレームを作成する
    outputs = []
    for document_id, token, label in zip(
        train["document"], train["tokens"], train["labels"]
    ):
        for token, (token_str, label_one) in enumerate(zip(token, label)):
            if label_one != "O":
                outputs.append((document_id, token, token_str, label_one))
    return pl.DataFrame(outputs, schema=["document", "token", "label", "token_str"])

In [33]:
def make_correct_pred_join_df(
    train_correct_df: pl.DataFrame, valid_pred_df: pl.DataFrame
) -> pl.DataFrame:
    """
    validで利用したdocumentのみを抽出し、train_correct_dfとvalid_pred_dfを結合して、documentごとに比較できるようにする
    """
    out = train_correct_df.filter(
        pl.col("document").is_in(valid_pred_df["document"])
    ).join(valid_pred_df, on=["document", "token"], how="outer", suffix="_pred")

    joined_dfs = []
    for document in out["document"].unique().to_list():
        if document is None:
            continue
        joined_df_per_document = out.filter(
            (pl.col("document") == document) | (pl.col("document_pred") == document)
        )
        joined_dfs.append(joined_df_per_document)

    return pl.concat(joined_dfs)

In [34]:
# main
valid_dataset = tokenized_train_valid_dataset["valid"]

valid_dataset = valid_dataset.map(
    tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2
)

valid_preds = get_valid_preds(trainer, valid_dataset)

document, token, label, token_str = get_output_part(valid_preds, valid_dataset)

valid_pred_df = pl.DataFrame(
    [document, token, label, token_str],
    schema=["document", "token", "label", "token_str"],
)

train_correct_df = make_correct_df(train)

# wandbにuploadする
valid_correct_pred_df = make_correct_pred_join_df(train_correct_df, valid_pred_df)
tbl = wandb.Table(data=valid_correct_pred_df.to_pandas())
wandb.log({"valid_correct_pred_df": tbl})

Map (num_proc=2):   0%|          | 0/1362 [00:00<?, ? examples/s]

In [35]:
valid_correct_pred_df

document,token,label,token_str,document_pred,token_pred,label_pred,token_str_pred
i64,i64,str,str,i64,i64,str,str
123,32,"""Stefano""","""B-NAME_STUDENT…",123,32,"""B-NAME_STUDENT…","""Stefano"""
123,33,"""Lovato""","""I-NAME_STUDENT…",123,33,"""I-NAME_STUDENT…","""Lovato"""
,,,,123,35,"""I-NAME_STUDENT…","""MDI-191"""
330,18,"""Davide""","""B-NAME_STUDENT…",330,18,"""B-NAME_STUDENT…","""Davide"""
330,19,"""Carletti""","""I-NAME_STUDENT…",330,19,"""B-NAME_STUDENT…","""Carletti"""
,,,,330,24,"""B-NAME_STUDENT…","""Marias"""
,,,,330,316,"""B-NAME_STUDENT…","""Marias"""
333,20,"""Karan""","""B-NAME_STUDENT…",333,20,"""B-NAME_STUDENT…","""Karan"""
333,21,"""Patel""","""I-NAME_STUDENT…",333,21,"""I-NAME_STUDENT…","""Patel"""
344,7,"""Milton""","""B-NAME_STUDENT…",344,7,"""B-NAME_STUDENT…","""Milton"""


# Data Upload

In [57]:
import os

os.system("mkdir -p ~/.kaggle/")
os.system("cp /notebooks/pll_data_detection/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [58]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json


def dataset_create_new(dataset_name: str, upload_dir: str):
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")
    dataset_metadata = {}
    dataset_metadata["id"] = f"sinchir0/{dataset_name}"
    dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
    dataset_metadata["title"] = dataset_name
    with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")


if (not DEBUG) and UPLOAD_DATA:
    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

Create Dataset name:e02-make-val-out-distilbert-base-uncased, output_dir:pll_data_detection/trained_models/e02-make-val-out
Starting upload for file training_args.bin
Upload successful: training_args.bin (4KB)
Starting upload for file vocab.txt
Upload successful: vocab.txt (226KB)
Starting upload for file tokenizer.json
Upload successful: tokenizer.json (695KB)
Starting upload for file config.json
Upload successful: config.json (1KB)
Starting upload for file tokenizer_config.json
Upload successful: tokenizer_config.json (1KB)
Starting upload for file model.safetensors
Upload successful: model.safetensors (253MB)
Starting upload for file special_tokens_map.json
Upload successful: special_tokens_map.json (125B)
