In [1]:
import os
os.chdir("../")

# 🏋️ PII Model Training Notebook

## 📦 Imports

From Packages

In [2]:
from itertools import chain
from functools import partial
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
)
import pandas as pd
from types import SimpleNamespace
import torch
import wandb
import spacy
from concurrent.futures import as_completed, ProcessPoolExecutor

From utility scripts

In [3]:
from src.metric import compute_metrics
from src.data import create_dataset
from src.utils import (
    get_reference_df_parquet,
    parse_predictions,
    filter_errors,
    generate_htmls_concurrently,
    visualize,
    convert_for_upload,
    CustomTrainer,
    parse_args,
)

## 🆕 Initialization

In [4]:
FIRST_PART = "first"
LAST_PART = "last"
MIDDLE_PART = "middle"
PART = LAST_PART

In [5]:
MODEL_SIZE = "base"

In [6]:
MAX_LENGTH = 512
WANDB_PROJECT = "Kaggle-PII"
USER_NAME = "shakleenishfar"
PROJECT_PATH = f"laplacesdemon43/{WANDB_PROJECT}"
EXPERIMENT = f"pii014_{PART}"
WANDB_NOTEBOOK_NAME = "pii-model-training.ipynb"
WANDB_NAME=f"DeBERTA-v3-{MODEL_SIZE}-{MAX_LENGTH}-{PART}"
WANDB_NOTES=f"""Training using DeBERTA-v3-{MODEL_SIZE}-{MAX_LENGTH} {PART} one-third negative samples. 
Included data from Valentin, Moth, NBroad, MPWare, Dileep, Newton, PJ Mathematician, and No fit just luck."""

In [7]:
config = SimpleNamespace(
    experiment=EXPERIMENT,
    threshold=0.95,
    o_weight=0.2,
    stride_artifact=f"{PROJECT_PATH}/processed_data:latest",
    raw_artifact=f"{PROJECT_PATH}/raw_data:latest",
    external_data_1="none",
    external_data_2="none",
    external_data_3="none",
    external_data_4="none",
    external_data_5="none",
    output_dir=f"model_dir/DeBERTA-V3-{MODEL_SIZE}-{MAX_LENGTH}-{PART}",
    inference_max_length=MAX_LENGTH,
    training_max_length=MAX_LENGTH,
    training_model_path=f"microsoft/deberta-v3-{MODEL_SIZE}",
    fp16=True,
    learning_rate=4e-5,
    num_train_epochs=5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=2,
    report_to="wandb",
    evaluation_strategy="epoch",
    do_eval=True,
    save_total_limit=1,
    logging_steps=10,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    random_state=29,
)

In [8]:
wandb.login(key="0bf204609ea345c7c595565d736a9d62ca69f838")
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_NAME,
    notes=WANDB_NOTES,
    save_code=True,
    job_type="train",
    config=config,
)
config = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshakleenishfar[0m ([33mlaplacesdemon43[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ishfar/.netrc


In [9]:
torch.manual_seed(config.random_state)

<torch._C.Generator at 0x7f30c4ff0b10>

## 💾 Data Preparation

### Fetching Data

Getting data from Weights and Biases

In [10]:
stride_artifact = wandb.use_artifact(config.stride_artifact)
stride_artifact_dir = stride_artifact.download()
df = pd.read_parquet(stride_artifact_dir + "/stride_data.parquet")

[34m[1mwandb[0m:   1 of 1 files downloaded.  


### Splitting Data

Into train and evaluation splits.

In [11]:
train_df = df[df.valid == False].reset_index(drop=True)
eval_df = df[df.valid == True].reset_index(drop=True)

print("Size of training dataset:", train_df.shape[0])
print("Size of validation dataset:", eval_df.shape[0])

Size of training dataset: 29462
Size of validation dataset: 3763


### Negative Sampling

Used to handle the extreme class imbalance in the data. Suggested by Valentin Warner.

* positive samples (contain relevant labels)

* negative samples (presumably contain entities that are possibly wrongly classified as entity)

In [12]:
negatives, positives = [], []

for _, row in train_df.iterrows():
    if any(row.labels != "O"):
        positives.append(row)
    else:
        negatives.append(row)
        
positives, negatives = pd.DataFrame(positives), pd.DataFrame(negatives)
print("Negative samples:", len(negatives))
print("Positive samples:", len(positives))

Negative samples: 20514
Positive samples: 8948


Take one third of the negative samples for downsampling.

In [13]:
if PART == FIRST_PART:
    negatives = negatives.iloc[: negatives.shape[0] // 3]
elif PART == MIDDLE_PART:
    negatives = negatives.iloc[negatives.shape[0] // 3 : 2 * negatives.shape[0] // 3]
elif PART == LAST_PART:
    negatives = negatives.iloc[2 * negatives.shape[0] // 3 :]
else:
    raise Exception(f"Undefined part: {PART}")

train_df = pd.concat([positives, negatives])
train_df = train_df.sample(frac=1, random_state=config.random_state)
print(f"Down sampled training: {len(train_df)}")
del positives, negatives

Down sampled training: 15786


### 🪙 Data Tokenization

In [14]:
reference_df = get_reference_df_parquet(config.raw_artifact)
all_labels = sorted(list(set(chain(*[x.tolist() for x in df.labels.values]))))
label2id = {l: i for i, l in enumerate(all_labels)}
id2label = {v: k for k, v in label2id.items()}
id2label

[34m[1mwandb[0m:   1 of 1 files downloaded.  


{0: 'B-EMAIL',
 1: 'B-ID_NUM',
 2: 'B-NAME_STUDENT',
 3: 'B-PHONE_NUM',
 4: 'B-STREET_ADDRESS',
 5: 'B-URL_PERSONAL',
 6: 'B-USERNAME',
 7: 'I-ID_NUM',
 8: 'I-NAME_STUDENT',
 9: 'I-PHONE_NUM',
 10: 'I-STREET_ADDRESS',
 11: 'I-URL_PERSONAL',
 12: 'O'}

In [15]:
tokenizer = AutoTokenizer.from_pretrained(config.training_model_path)
train_ds = create_dataset(train_df, tokenizer, config.training_max_length, label2id)
valid_ds = create_dataset(eval_df, tokenizer, config.inference_max_length, label2id)



Map (num_proc=6):   0%|          | 0/15786 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/3763 [00:00<?, ? examples/s]

## 🏋️ Training

In [16]:
model = AutoModelForTokenClassification.from_pretrained(
    config.training_model_path,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### PEFT (Parameter Efficient Finetuning)

In [17]:
# import peft
# from peft import (
#     get_peft_config,
#     PeftModel,
#     PeftConfig,
#     get_peft_model,
#     LoraConfig,
#     TaskType,
# )

In [18]:
# peft_config = LoraConfig(
#     r=128,  # Use larger 'r' value increase more parameters during training
#     bias='none',
#     inference_mode=False,
#     task_type=TaskType.SEQ_CLS,
#     # Only Use Output and Values Projection
#     target_modules=['query_proj', 'value_proj'],
# )

# # Load the PEFT model
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

In [19]:
args = TrainingArguments(
    output_dir=config.output_dir,
    fp16=config.fp16,
    learning_rate=config.learning_rate,
    num_train_epochs=config.num_train_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    report_to=config.report_to,
    evaluation_strategy=config.evaluation_strategy,
    do_eval=config.do_eval,
    save_total_limit=config.save_total_limit,
    logging_steps=config.logging_steps,
    lr_scheduler_type=config.lr_scheduler_type,
    warmup_ratio=config.warmup_ratio,
    weight_decay=config.weight_decay,
)

Set "O" tokens to have a very small weight.

In [20]:
class_weights = torch.tensor([1.0] * 12 + [config.o_weight]).to("cuda")

In [21]:
trainer = CustomTrainer(
    model=model, 
    args=args, 
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    data_collator=collator, 
    tokenizer=tokenizer,
    compute_metrics=partial(
        compute_metrics,
        id2label=id2label,
        valid_ds=valid_ds,
        valid_df=reference_df,
        threshold=config.threshold,
    ),
    class_weights=class_weights
)

In [22]:
trainer.train()

  0%|          | 0/1645 [00:00<?, ?it/s]

{'loss': 2.8179, 'grad_norm': 23.862031936645508, 'learning_rate': 1.6969696969696973e-06, 'epoch': 0.03}
{'loss': 2.6269, 'grad_norm': 22.375364303588867, 'learning_rate': 4.1212121212121215e-06, 'epoch': 0.06}
{'loss': 1.8583, 'grad_norm': 18.716312408447266, 'learning_rate': 6.545454545454546e-06, 'epoch': 0.09}
{'loss': 0.8768, 'grad_norm': 3.011992931365967, 'learning_rate': 8.969696969696971e-06, 'epoch': 0.12}
{'loss': 0.3988, 'grad_norm': 0.7554126977920532, 'learning_rate': 1.1393939393939395e-05, 'epoch': 0.15}
{'loss': 0.2485, 'grad_norm': 0.8025091886520386, 'learning_rate': 1.381818181818182e-05, 'epoch': 0.18}
{'loss': 0.1463, 'grad_norm': 0.40720948576927185, 'learning_rate': 1.6242424242424243e-05, 'epoch': 0.21}
{'loss': 0.0927, 'grad_norm': 0.3295033276081085, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.24}
{'loss': 0.0566, 'grad_norm': 0.28712305426597595, 'learning_rate': 2.109090909090909e-05, 'epoch': 0.27}
{'loss': 0.0372, 'grad_norm': 0.37031373381614685,

  0%|          | 0/471 [00:00<?, ?it/s]

{'eval_loss': 0.006895515602082014, 'eval_ents_p': 0.2882069795427196, 'eval_ents_r': 0.958, 'eval_ents_f5': 0.8793955655980793, 'eval_ents_per_type_EMAIL_p': 0.39655172413793105, 'eval_ents_per_type_EMAIL_r': 1.0, 'eval_ents_per_type_EMAIL_f5': 0.9447077409162716, 'eval_ents_per_type_ID_NUM_p': 0.13071895424836602, 'eval_ents_per_type_ID_NUM_r': 0.6666666666666666, 'eval_ents_per_type_ID_NUM_f5': 0.575858250276855, 'eval_ents_per_type_NAME_STUDENT_p': 0.33493282149712095, 'eval_ents_per_type_NAME_STUDENT_r': 0.9775910364145658, 'eval_ents_per_type_NAME_STUDENT_f5': 0.9104043343032006, 'eval_ents_per_type_PHONE_NUM_p': 0.25925925925925924, 'eval_ents_per_type_PHONE_NUM_r': 1.0, 'eval_ents_per_type_PHONE_NUM_f5': 0.900990099009901, 'eval_ents_per_type_STREET_ADDRESS_p': 0.18181818181818182, 'eval_ents_per_type_STREET_ADDRESS_r': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_f5': 0.7878787878787878, 'eval_ents_per_type_URL_PERSONAL_p': 0.2, 'eval_ents_per_type_URL_PERSONAL_r': 0

  0%|          | 0/471 [00:00<?, ?it/s]

{'eval_loss': 0.0026919199153780937, 'eval_ents_p': 0.5366922234392114, 'eval_ents_r': 0.98, 'eval_ents_f5': 0.9498247968388875, 'eval_ents_per_type_EMAIL_p': 0.8846153846153846, 'eval_ents_per_type_EMAIL_r': 1.0, 'eval_ents_per_type_EMAIL_f5': 0.9950083194675542, 'eval_ents_per_type_ID_NUM_p': 0.6666666666666666, 'eval_ents_per_type_ID_NUM_r': 0.9333333333333333, 'eval_ents_per_type_ID_NUM_f5': 0.9191919191919192, 'eval_ents_per_type_NAME_STUDENT_p': 0.5007112375533428, 'eval_ents_per_type_NAME_STUDENT_r': 0.9859943977591037, 'eval_ents_per_type_NAME_STUDENT_f5': 0.9505608641462402, 'eval_ents_per_type_PHONE_NUM_p': 0.6363636363636364, 'eval_ents_per_type_PHONE_NUM_r': 1.0, 'eval_ents_per_type_PHONE_NUM_f5': 0.9784946236559142, 'eval_ents_per_type_STREET_ADDRESS_p': 0.5555555555555556, 'eval_ents_per_type_STREET_ADDRESS_r': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_f5': 0.8873720136518771, 'eval_ents_per_type_URL_PERSONAL_p': 0.625, 'eval_ents_per_type_URL_PERSONAL_r': 0.

  0%|          | 0/471 [00:00<?, ?it/s]

{'eval_loss': 0.00299506401643157, 'eval_ents_p': 0.48841893252769386, 'eval_ents_r': 0.97, 'eval_ents_f5': 0.9345586600459497, 'eval_ents_per_type_EMAIL_p': 0.7666666666666667, 'eval_ents_per_type_EMAIL_r': 1.0, 'eval_ents_per_type_EMAIL_f5': 0.9884297520661156, 'eval_ents_per_type_ID_NUM_p': 0.4393939393939394, 'eval_ents_per_type_ID_NUM_r': 0.9666666666666667, 'eval_ents_per_type_ID_NUM_f5': 0.9240196078431373, 'eval_ents_per_type_NAME_STUDENT_p': 0.502177068214804, 'eval_ents_per_type_NAME_STUDENT_r': 0.969187675070028, 'eval_ents_per_type_NAME_STUDENT_f5': 0.9357187434990638, 'eval_ents_per_type_PHONE_NUM_p': 0.6363636363636364, 'eval_ents_per_type_PHONE_NUM_r': 1.0, 'eval_ents_per_type_PHONE_NUM_f5': 0.9784946236559142, 'eval_ents_per_type_STREET_ADDRESS_p': 0.273972602739726, 'eval_ents_per_type_STREET_ADDRESS_r': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_f5': 0.8346709470304976, 'eval_ents_per_type_URL_PERSONAL_p': 0.49382716049382713, 'eval_ents_per_type_URL_PERSO

  0%|          | 0/471 [00:00<?, ?it/s]

{'eval_loss': 0.0019299519481137395, 'eval_ents_p': 0.6342105263157894, 'eval_ents_r': 0.964, 'eval_ents_f5': 0.9450980392156864, 'eval_ents_per_type_EMAIL_p': 0.8846153846153846, 'eval_ents_per_type_EMAIL_r': 1.0, 'eval_ents_per_type_EMAIL_f5': 0.9950083194675542, 'eval_ents_per_type_ID_NUM_p': 0.7777777777777778, 'eval_ents_per_type_ID_NUM_r': 0.9333333333333333, 'eval_ents_per_type_ID_NUM_f5': 0.9262086513994909, 'eval_ents_per_type_NAME_STUDENT_p': 0.6265938069216758, 'eval_ents_per_type_NAME_STUDENT_r': 0.9635854341736695, 'eval_ents_per_type_NAME_STUDENT_f5': 0.9440574203082119, 'eval_ents_per_type_PHONE_NUM_p': 0.6363636363636364, 'eval_ents_per_type_PHONE_NUM_r': 1.0, 'eval_ents_per_type_PHONE_NUM_f5': 0.9784946236559142, 'eval_ents_per_type_STREET_ADDRESS_p': 0.46511627906976744, 'eval_ents_per_type_STREET_ADDRESS_r': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_f5': 0.8768971332209108, 'eval_ents_per_type_URL_PERSONAL_p': 0.6666666666666666, 'eval_ents_per_type_URL_

  0%|          | 0/471 [00:00<?, ?it/s]

{'eval_loss': 0.0018289568834006786, 'eval_ents_p': 0.6616438356164384, 'eval_ents_r': 0.966, 'eval_ents_f5': 0.9492063492063492, 'eval_ents_per_type_EMAIL_p': 0.8846153846153846, 'eval_ents_per_type_EMAIL_r': 1.0, 'eval_ents_per_type_EMAIL_f5': 0.9950083194675542, 'eval_ents_per_type_ID_NUM_p': 0.8, 'eval_ents_per_type_ID_NUM_r': 0.9333333333333333, 'eval_ents_per_type_ID_NUM_f5': 0.9273885350318471, 'eval_ents_per_type_NAME_STUDENT_p': 0.6448598130841121, 'eval_ents_per_type_NAME_STUDENT_r': 0.9663865546218487, 'eval_ents_per_type_NAME_STUDENT_f5': 0.9482029598308667, 'eval_ents_per_type_PHONE_NUM_p': 0.7, 'eval_ents_per_type_PHONE_NUM_r': 1.0, 'eval_ents_per_type_PHONE_NUM_f5': 0.9837837837837837, 'eval_ents_per_type_STREET_ADDRESS_p': 0.5405405405405406, 'eval_ents_per_type_STREET_ADDRESS_r': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_f5': 0.8858603066439523, 'eval_ents_per_type_URL_PERSONAL_p': 0.7142857142857143, 'eval_ents_per_type_URL_PERSONAL_r': 0.975609756097561,

TrainOutput(global_step=1645, training_loss=0.05786551906229039, metrics={'train_runtime': 1867.8396, 'train_samples_per_second': 42.257, 'train_steps_per_second': 0.881, 'train_loss': 0.05786551906229039, 'epoch': 5.0})

### Saving Model and Metrics locally

In [23]:
trainer.save_model(config.output_dir)
tokenizer.save_pretrained(config.output_dir)

('model_dir/DeBERTA-V3-base-512-last/tokenizer_config.json',
 'model_dir/DeBERTA-V3-base-512-last/special_tokens_map.json',
 'model_dir/DeBERTA-V3-base-512-last/spm.model',
 'model_dir/DeBERTA-V3-base-512-last/added_tokens.json',
 'model_dir/DeBERTA-V3-base-512-last/tokenizer.json')

## Determine Best Threshold

In [24]:
del tokenizer, model, collator, args, trainer

In [25]:
tokenizer = AutoTokenizer.from_pretrained(config.output_dir)
model = AutoModelForTokenClassification.from_pretrained(config.output_dir)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
args = TrainingArguments(
    ".",
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    report_to="none",
)
trainer = CustomTrainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
)
preds = trainer.predict(valid_ds)

  0%|          | 0/471 [00:00<?, ?it/s]

In [26]:
print("Computing final metrics...")

thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.97, 0.99]
final_metrics = {}

with ProcessPoolExecutor() as executor:
    futures = [
        executor.submit(
            compute_metrics,
            (preds.predictions, None),
            id2label,
            valid_ds,
            reference_df,
            threshold=threshold,
        )
        for threshold in thresholds
    ]

    for future in as_completed(futures):
        threshold, metric = future.result()
        final_metrics[f"final_f5_at_{threshold}"] = metric["ents_f5"]

# final_metrics = {
#     f"final_f5_at_{threshold}": eval_compute_metrics(
#         (preds.predictions, None),
#         id2label,
#         valid_ds,
#         reference_df,
#         threshold=threshold,
#     )["ents_f5"]
#     for threshold in [0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 0.99]
# }
wandb.log(final_metrics)
print(final_metrics)

Computing final metrics...


ValueError: too many values to unpack (expected 2)

In [None]:
# pick the best threshold from the final metrics and use it to generate preds_df
best_threshold = float(max(final_metrics, key=final_metrics.get).split("_")[-1])
print("best_threshold:", best_threshold)
wandb.config.best_threshold = best_threshold
preds_df = parse_predictions(
    preds.predictions, id2label, valid_ds, threshold=best_threshold
)

best_threshold: 0.9


## 📊 Data Visualization

In [None]:
# Prepare data to visualize errors and log them as a Weights & Biases table
print("Visualizing errors...")
grouped_preds = preds_df.groupby("eval_row")[
    ["document", "token", "label", "token_str"]
].agg(list)
viz_df = pd.merge(
    eval_df.reset_index(),
    grouped_preds,
    how="left",
    left_on="index",
    right_on="eval_row",
)
viz_df = filter_errors(viz_df, preds_df)
viz_df["pred_viz"] = generate_htmls_concurrently(
    viz_df,
    tokenizer,
    preds.predictions,
    id2label,
    valid_ds,
    threshold=best_threshold,
)
nlp = spacy.blank("en")
htmls = [visualize(row, nlp) for _, row in viz_df.iterrows()]
wandb_htmls = [wandb.Html(html) for html in htmls]
viz_df["gt_viz"] = wandb_htmls
viz_df.fillna("", inplace=True)
viz_df = convert_for_upload(viz_df)
errors_table = wandb.Table(dataframe=viz_df)
wandb.log({"errors_table": errors_table})

print("Experiment finished, test it out on the inference notebook!")

Visualizing errors...


  0%|          | 0/75 [00:00<?, ?it/s]



Experiment finished, test it out on the inference notebook!


In [None]:
wandb.finish()

VBox(children=(Label(value='1.085 MB of 7.357 MB uploaded\r'), FloatProgress(value=0.14746467611540717, max=1.…



0,1
eval/ents_f5,▁▃▆██
eval/ents_p,▃▁▃▇█
eval/ents_per_type_EMAIL_f5,▄█▁▂▅
eval/ents_per_type_EMAIL_p,▃█▁▂▅
eval/ents_per_type_EMAIL_r,▁▁▁▁▁
eval/ents_per_type_ID_NUM_f5,▁▄▆█▆
eval/ents_per_type_ID_NUM_p,▁▄▆▇█
eval/ents_per_type_ID_NUM_r,▅▁▅█▅
eval/ents_per_type_NAME_STUDENT_f5,▁▃▅██
eval/ents_per_type_NAME_STUDENT_p,▄▁▂██

0,1
eval/ents_f5,0.9634
eval/ents_p,0.6768
eval/ents_per_type_EMAIL_f5,0.99501
eval/ents_per_type_EMAIL_p,0.88462
eval/ents_per_type_EMAIL_r,1.0
eval/ents_per_type_ID_NUM_f5,0.93333
eval/ents_per_type_ID_NUM_p,0.93333
eval/ents_per_type_ID_NUM_r,0.93333
eval/ents_per_type_NAME_STUDENT_f5,0.96418
eval/ents_per_type_NAME_STUDENT_p,0.65
