In [1]:
import os
# Switch the working directory to the parent of the notebook's folder
# (presumably so the `src` package imported below resolves — confirm).
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up one more level each time.
os.chdir("../")

In [2]:
from itertools import chain
from functools import partial
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    set_seed,
)
import pandas as pd
from types import SimpleNamespace
import torch
import wandb

In [3]:
from src.metric import (
    compute_metrics,
    get_f5_at_different_thresholds,
)
from src.data import create_dataset
from src.utils import (
    get_reference_df_parquet,
    parse_predictions,
    filter_errors,
    generate_htmls_concurrently,
    visualize,
    convert_for_upload,
    CustomTrainer,
    parse_args,
)

In [4]:
# Checkpoint size suffix; interpolated into the model path, output directory and run name.
MODEL_SIZE = "base"

In [5]:
# Experiment-wide constants for the W&B sweep.
MAX_LENGTH = 1024  # sequence length, used in the run name and output directory
WANDB_PROJECT = "Kaggle-PII"
USER_NAME = "shakleenishfar"
# "<entity>/<project>" prefix used to address W&B artifacts.
PROJECT_PATH = f"laplacesdemon43/{WANDB_PROJECT}"
# Plain string: there is nothing to interpolate, so no f-prefix.
EXPERIMENT = "pii-sweep-001"
WANDB_NAME = f"DeBERTA-v3-{MODEL_SIZE}-{MAX_LENGTH}-Sweep"

In [6]:
# Never commit the API key: read it from the WANDB_API_KEY environment variable.
# When the variable is unset, key=None lets wandb fall back to its stored
# credentials (e.g. ~/.netrc) or prompt interactively.
# NOTE(review): the key previously hard-coded here is exposed and should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshakleenishfar[0m ([33mlaplacesdemon43[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ishfar/.netrc


True

In [7]:
sweep_config = {
    # Search strategy. W&B documents "grid", "random" and "bayes";
    # "Bayesian" is not one of the documented method names.
    "method": "bayes",
    # Metric the sweep optimises; it is logged per run by calculate_valid_f5
    "metric": {
        "name": "ents_f5",
        "goal": "maximize",
    },
    "parameters": {
        # Hyperparameters searched by the sweep
        "o_weight": {"distribution": "uniform", "min": 0.2, "max": 0.35},
        "learning_rate": {"distribution": "uniform", "min": 2e-5, "max": 1e-4},
        "weight_decay": {"distribution": "uniform", "min": 0.02, "max": 0.08},
        # Hyperparameters held fixed for every run
        "num_train_epochs": {"value": 3},
        "warmup_ratio": {"value": 0.1},
        "threshold": {"value": 0.95},
        "stride_artifact": {"value": f"{PROJECT_PATH}/processed_data:v0"},
        "raw_artifact": {"value": f"{PROJECT_PATH}/raw_data:v0"},
        "output_dir": {"value": f"model_dir/DeBERTA-V3-{MODEL_SIZE}-{MAX_LENGTH}"},
        # Reuse MAX_LENGTH so the lengths cannot drift from the output_dir name
        "inference_max_length": {"value": MAX_LENGTH},
        "training_max_length": {"value": MAX_LENGTH},
        "training_model_path": {"value": f"microsoft/deberta-v3-{MODEL_SIZE}"},
        "fp16": {"value": True},
        "per_device_train_batch_size": {"value": 8},
        "per_device_eval_batch_size": {"value": 8},
        "evaluation_strategy": {"value": "no"},
        "do_eval": {"value": False},
        "save_total_limit": {"value": 1},
        "logging_steps": {"value": 10},
        "lr_scheduler_type": {"value": "cosine"},
        "random_state": {"value": 29},
        "gradient_accumulation_steps": {"value": 2},
    },
    # Early stopping (disabled)
    # "early_terminate": {
    #     "type": "hyperband",
    #     "max_iter": 27,
    # }
}

In [8]:
# Register the sweep in the WANDB_PROJECT project and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT)

Create sweep with ID: pjbq0w1i
Sweep URL: https://wandb.ai/laplacesdemon43/Kaggle-PII/sweeps/pjbq0w1i


In [9]:
def get_data(config):
    """Load the strided dataset artifact and build the splits and label maps.

    Args:
        config: Run configuration exposing ``stride_artifact``, ``raw_artifact``
            and ``random_state``.

    Returns:
        A tuple ``(train_df, eval_df, reference_df, all_labels, label2id, id2label)``:

        * ``train_df`` - rows with ``valid == False``: every row that has at
          least one non-"O" label plus the first third of the all-"O" rows,
          shuffled with ``config.random_state``.
        * ``eval_df`` - rows with ``valid == True``, index reset.
        * ``reference_df`` - result of ``get_reference_df_parquet(config.raw_artifact)``.
        * ``all_labels`` - sorted unique labels over the whole dataset.
        * ``label2id`` / ``id2label`` - mappings derived from ``all_labels`` order.
    """
    stride_artifact = wandb.use_artifact(config.stride_artifact)
    stride_artifact_dir = stride_artifact.download()
    df = pd.read_parquet(os.path.join(stride_artifact_dir, "stride_data.parquet"))

    train_df = df[df.valid == False].reset_index(drop=True)  # noqa: E712
    eval_df = df[df.valid == True].reset_index(drop=True)  # noqa: E712

    # Boolean mask instead of rebuilding frames row by row with iterrows():
    # same rows, same order, same index, but column dtypes are preserved.
    # `labels` holds one array of tags per row, so `!=` compares elementwise.
    has_entity = train_df.labels.apply(lambda tags: any(tags != "O")).astype(bool)
    positives = train_df[has_entity]
    negatives = train_df[~has_entity]

    # Down-sample rows without any entity: keep only the first third of them.
    negatives = negatives.iloc[: len(negatives) // 3]
    train_df = pd.concat([positives, negatives]).sample(
        frac=1, random_state=config.random_state
    )

    reference_df = get_reference_df_parquet(config.raw_artifact)

    # Label ids follow the sorted label order over train and eval rows alike.
    all_labels = sorted(set(chain.from_iterable(x.tolist() for x in df.labels.values)))
    label2id = {label: idx for idx, label in enumerate(all_labels)}
    id2label = {idx: label for label, idx in label2id.items()}

    return train_df, eval_df, reference_df, all_labels, label2id, id2label

In [10]:
def get_tokenized_dataset(config, train_df, eval_df, label2id):
    """Tokenize the train and eval frames with the configured model's tokenizer.

    The training frame is built with ``config.training_max_length`` and the
    evaluation frame with ``config.inference_max_length``.

    Returns:
        ``(tokenizer, train_ds, valid_ds)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.training_model_path)
    splits = (
        (train_df, config.training_max_length),
        (eval_df, config.inference_max_length),
    )
    train_ds, valid_ds = (
        create_dataset(frame, tokenizer, max_length, label2id)
        for frame, max_length in splits
    )
    return tokenizer, train_ds, valid_ds

In [11]:
def train(
    config,
    all_labels,
    id2label,
    label2id,
    tokenizer,
    train_ds,
    valid_ds,
    reference_df,
):
    """Fine-tune a token-classification model with a down-weighted "O" class.

    Args:
        config: Run configuration (model path, optimiser/scheduler settings,
            ``o_weight``, ``threshold``, ``random_state``, ...).
        all_labels: Label names ordered by label id.
        id2label: Mapping from label id to label name.
        label2id: Mapping from label name to label id.
        tokenizer: Tokenizer matching ``config.training_model_path``.
        train_ds: Tokenized training dataset.
        valid_ds: Tokenized validation dataset; only bound into ``compute_metrics``.
        reference_df: Reference frame bound into ``compute_metrics``.

    Returns:
        The ``CustomTrainer`` instance after ``train()`` has completed.
    """
    model = AutoModelForTokenClassification.from_pretrained(
        config.training_model_path,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
    args = TrainingArguments(
        output_dir=config.output_dir,
        fp16=config.fp16,
        learning_rate=config.learning_rate,
        num_train_epochs=config.num_train_epochs,
        per_device_train_batch_size=config.per_device_train_batch_size,
        # Honour the configured eval batch size (used by trainer.predict).
        per_device_eval_batch_size=config.per_device_eval_batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        evaluation_strategy=config.evaluation_strategy,
        do_eval=config.do_eval,
        save_total_limit=config.save_total_limit,
        logging_steps=config.logging_steps,
        lr_scheduler_type=config.lr_scheduler_type,
        warmup_ratio=config.warmup_ratio,
        weight_decay=config.weight_decay,
        # The Trainer reseeds with args.seed on construction; without this the
        # configured random_state would be replaced by the library default.
        seed=config.random_state,
    )

    # One weight per label id: "O" gets config.o_weight, every other label 1.0.
    # Derived from all_labels so it does not assume 13 labels with "O" last.
    class_weights = torch.tensor(
        [config.o_weight if label == "O" else 1.0 for label in all_labels]
    ).to(args.device)  # the device the Trainer itself uses, not a fixed "cuda"

    trainer = CustomTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=None,  # no in-training evaluation
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=partial(
            compute_metrics,
            id2label=id2label,
            valid_ds=valid_ds,
            valid_df=reference_df,
            threshold=config.threshold,
        ),
        class_weights=class_weights,
    )

    trainer.train()

    return trainer

In [12]:
def calculate_valid_f5(config, trainer, id2label, valid_ds, reference_df):
    """Score the trained model on the validation set and log the result to W&B.

    Runs ``trainer.predict`` on ``valid_ds``, passes the predictions to
    ``compute_metrics`` with ``config.threshold``, and logs the ``ents_f5``,
    ``ents_r`` and ``ents_p`` entries. Returns nothing.
    """
    prediction_output = trainer.predict(valid_ds)
    # compute_metrics takes a (predictions, labels) pair; no labels are supplied here.
    scores = compute_metrics(
        (prediction_output.predictions, None),
        id2label,
        valid_ds,
        reference_df,
        config.threshold,
    )
    logged_keys = ("ents_f5", "ents_r", "ents_p")
    wandb.log({key: scores[key] for key in logged_keys})

In [13]:
def main(config=None):
    """Run one sweep trial: load data, tokenize, train, then log validation scores.

    Args:
        config: Optional configuration forwarded to ``wandb.init``; the values
            actually used are read back from ``wandb.config``.
    """
    with wandb.init(config=config):
        run_config = wandb.config

        # Seed every RNG before any data shuffling or model initialisation.
        seed = run_config.random_state
        set_seed(seed)
        torch.manual_seed(seed)

        data = get_data(run_config)
        train_df, eval_df, reference_df, all_labels, label2id, id2label = data

        tokenizer, train_ds, valid_ds = get_tokenized_dataset(
            config=run_config,
            train_df=train_df,
            eval_df=eval_df,
            label2id=label2id,
        )

        trainer = train(
            config=run_config,
            all_labels=all_labels,
            id2label=id2label,
            label2id=label2id,
            tokenizer=tokenizer,
            train_ds=train_ds,
            valid_ds=valid_ds,
            reference_df=reference_df,
        )

        calculate_valid_f5(
            config=run_config,
            trainer=trainer,
            id2label=id2label,
            valid_ds=valid_ds,
            reference_df=reference_df,
        )

In [14]:
# Start a sweep agent that calls `main` once per trial, limited to 3 runs (count=3).
wandb.agent(sweep_id, main, count=3)

[34m[1mwandb[0m: Agent Starting Run: vf3visoy with config:
[34m[1mwandb[0m: 	do_eval: False
[34m[1mwandb[0m: 	evaluation_strategy: no
[34m[1mwandb[0m: 	fp16: True
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	inference_max_length: 1024
[34m[1mwandb[0m: 	learning_rate: 0.0001949620073745712
[34m[1mwandb[0m: 	logging_steps: 10
[34m[1mwandb[0m: 	lr_scheduler_type: cosine
[34m[1mwandb[0m: 	num_train_epochs: 0.01
[34m[1mwandb[0m: 	o_weight: 0.5304613681790084
[34m[1mwandb[0m: 	output_dir: model_dir/DeBERTA-V3-base-1024
[34m[1mwandb[0m: 	per_device_eval_batch_size: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	random_state: 29
[34m[1mwandb[0m: 	raw_artifact: laplacesdemon43/Kaggle-PII/raw_data:v0
[34m[1mwandb[0m: 	save_total_limit: 1
[34m[1mwandb[0m: 	stride_artifact: laplacesdemon43/Kaggle-PII/processed_data:v0
[34m[1mwandb[0m: 	threshold: 0.95
[34m[1mwandb[0m: 	training_max_length: 1024


Traceback (most recent call last):
  File "/tmp/ipykernel_9806/4283551679.py", line 8, in main
    if train_df is None:
       ^^^^^^^^
UnboundLocalError: cannot access local variable 'train_df' where it is not associated with a value


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run vf3visoy errored:
Traceback (most recent call last):
  File "/media/ishfar/New Volume/Studies/Projects/Kaggle/PII_Detection/venv/lib/python3.11/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/tmp/ipykernel_9806/4283551679.py", line 8, in main
    if train_df is None:
       ^^^^^^^^
UnboundLocalError: cannot access local variable 'train_df' where it is not associated with a value

[34m[1mwandb[0m: [32m[41mERROR[0m Run vf3visoy errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/media/ishfar/New Volume/Studies/Projects/Kaggle/PII_Detection/venv/lib/python3.11/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_9806/4283551679.py", line 8, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     if train_df is None:
[34m[1mwan

Traceback (most recent call last):
  File "/tmp/ipykernel_9806/4283551679.py", line 8, in main
    if train_df is None:
       ^^^^^^^^
UnboundLocalError: cannot access local variable 'train_df' where it is not associated with a value


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run mclp7a6c errored:
Traceback (most recent call last):
  File "/media/ishfar/New Volume/Studies/Projects/Kaggle/PII_Detection/venv/lib/python3.11/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "/tmp/ipykernel_9806/4283551679.py", line 8, in main
    if train_df is None:
       ^^^^^^^^
UnboundLocalError: cannot access local variable 'train_df' where it is not associated with a value

[34m[1mwandb[0m: [32m[41mERROR[0m Run mclp7a6c errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/media/ishfar/New Volume/Studies/Projects/Kaggle/PII_Detection/venv/lib/python3.11/site-packages/wandb/agents/pyagent.py", line 308, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_9806/4283551679.py", line 8, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     if train_df is None:
[34m[1mwan

In [None]:
# Explicitly finish any W&B run that is still open in this kernel
# (`main` already wraps each trial's run in a `with wandb.init(...)` block).
wandb.finish()