In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install seqeval evaluate -q

In [None]:
import warnings
warnings.simplefilter('ignore')

import gc
import json
from functools import partial
from itertools import chain
from pathlib import Path

import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features

from sklearn.model_selection import StratifiedKFold


In [None]:
# Central configuration for the run.
# NOTE(review): in the cells visible here, only TRAINING_MODEL_PATH, TRAINING_MAX_LENGTH
# and OUTPUT_DIR are read by live code, and all three are re-assigned in a later cell
# (TRAINING_MAX_LENGTH becomes 3096 there). LR, BATCH_SIZE, EVAL_BATCH_SIZE,
# GRAD_ACCUMULATION_STEPS, LR_SCHEDULER_TYPE, WARMUP_RATIO, WEIGHT_DECAY and AMP appear
# only in a commented-out TrainingArguments block; the remaining constants are not
# referenced by any visible cell.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
TRAINING_MAX_LENGTH = 2048
EVAL_MAX_LENGTH = 3072
CONF_THRESH = 0.9
LR = 5e-4  # Note: the learning rate for LoRA should be an order of magnitude larger than for usual fine-tuning
LR_SCHEDULER_TYPE = "linear"
NUM_EPOCHS = 4
BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
GRAD_ACCUMULATION_STEPS = 16 // BATCH_SIZE  # evaluates to 2 with BATCH_SIZE = 8
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
FREEZE_EMBEDDING = False
FREEZE_LAYERS = 6
LORA_R = 16  # rank of the A and B matrices; the lower, the more efficient but the more approximate
LORA_ALPHA = LORA_R * 2  # alpha/r is the factor multiplied onto BA
AMP = True
N_SPLITS = 4  # NOTE(review): the cross-validation cell hard-codes n_splits=5 instead of reading this
NEGATIVE_RATIO = 0.3  # downsample ratio of negative samples in the training set
OUTPUT_DIR = "output"
# Create the output directory up front; exist_ok makes re-running this cell harmless.
Path(OUTPUT_DIR).mkdir(exist_ok=True)

In [None]:
import warnings
warnings.simplefilter('ignore')

from itertools import chain
from pathlib import Path
import json
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features

from sklearn.model_selection import StratifiedKFold

In [None]:
from functools import partial

# ****DATA SELECTION and LABEL MAPPING****

In [None]:
def load_json(path):
    """Read and parse one JSON file, closing the file handle afterwards."""
    # JSON is UTF-8 by specification; being explicit avoids locale-dependent decoding.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Competition training set plus two external datasets with fixed punctuation tokenization.
data = load_json("/kaggle/input/pii-detection-removal-from-educational-data/train.json")
ext_data = load_json("/kaggle/input/fix-punctuation-tokenization-external-dataset/pii_dataset_fixed.json")
ext_more = load_json("/kaggle/input/fix-punctuation-tokenization-external-dataset/moredata_dataset_fixed.json")
# data[3]

In [None]:
def split_by_pii(documents):
    """Partition documents by whether they carry any label other than 'O'.

    Args:
        documents: Iterable of dicts, each with a "labels" sequence of strings.

    Returns:
        tuple: ``(positives, negatives)`` lists, each preserving input order.
        A document with an empty label list counts as negative.
    """
    positives, negatives = [], []
    for doc in documents:
        # Plain-Python check: no per-document NumPy array, and well defined for empty label lists.
        has_pii = any(label != 'O' for label in doc["labels"])
        (positives if has_pii else negatives).append(doc)
    return positives, negatives


def report_split(documents):
    """Split ``documents`` with ``split_by_pii``, print the three counts, return the split."""
    positives, negatives = split_by_pii(documents)
    print("The length of data: ", len(documents))
    print("The number of positive samples is: ", len(positives))
    print("The number of negative samples is: ", len(negatives))
    return positives, negatives


# Same report for the competition data and for both external datasets.
p, n = report_split(data)
pex, nex = report_split(ext_data)
pexm, nexm = report_split(ext_more)

In [None]:
# Combined counts over the competition data and both external datasets.
total_docs = len(data) + len(ext_more) + len(ext_data)
total_negative = len(n) + len(nex) + len(nexm)
total_positive = len(p) + len(pex) + len(pexm)

print("Total size of the data: ", total_docs)
print("Out of which data with no positive label: ", total_negative)
print("Paragraphs with personal info: ", total_positive)

In [None]:
# Rebind `data` to the competition documents followed by both external datasets.
# Note: this cell reads and rebinds `data`, so running it twice appends the external data again.
data = [*data, *ext_data, *ext_more]
len(data)

In [None]:
# Sorted vocabulary of every label string that occurs in the merged data,
# plus the two lookup tables between label strings and their integer ids.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))
id2label = dict(enumerate(all_labels))
label2id = {label: idx for idx, label in id2label.items()}

# ****TOKENIZER FUNCTION****



In [None]:
def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one document and project its word-level labels onto the tokenizer's tokens.

    The text is rebuilt from ``example["tokens"]`` and ``example["trailing_whitespace"]``
    while a parallel per-character label list is built from ``example["provided_labels"]``
    (inserted spaces are labelled "O"). Every token returned by ``tokenizer`` then receives
    the label of the first non-whitespace character at the start of its character span.

    Args:
        example: Mapping with parallel "tokens", "provided_labels" and
            "trailing_whitespace" sequences.
        tokenizer: Callable invoked as ``tokenizer(text, return_offsets_mapping=True,
            truncation=True, max_length=max_length)``. Its result must expose
            ``offset_mapping`` and ``input_ids`` and support ``**`` unpacking.
        label2id: Mapping from label string to integer id; must contain "O".
        max_length: Maximum sequence length forwarded to the tokenizer.

    Returns:
        dict: Everything the tokenizer returned, plus "labels" (one id per token)
        and "length" (number of input ids).
    """
    # Rebuild the text and, in lock-step, one label per character.
    pieces = []
    char_labels = []
    for token, label, has_space in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        pieces.append(token)
        char_labels.extend([label] * len(token))

        if has_space:
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)

    # Truncation is requested explicitly rather than relying on the tokenizer's
    # implicit fallback when only max_length is supplied.
    tokenized = tokenizer(
        text, return_offsets_mapping=True, truncation=True, max_length=max_length
    )

    n_chars = len(char_labels)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # Zero-width (0, 0) offsets (e.g. the CLS token) carry no text: label them "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # A token may start with whitespace; use the label of the following character.
        # The bounds check keeps a trailing whitespace-only token from indexing past
        # the end of the text, in which case the whitespace's own label is used.
        if text[start_idx].isspace() and start_idx + 1 < n_chars:
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}

In [None]:
# NOTE(review): this cell re-declares three settings that the configuration cell near the top
# already defines. TRAINING_MODEL_PATH and OUTPUT_DIR keep their earlier values, but
# TRAINING_MAX_LENGTH changes from 2048 to 3096, and this later value is the one the
# tokenization cell uses. Confirm 3096 is intended (EVAL_MAX_LENGTH above is 3072).
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
TRAINING_MAX_LENGTH = 3096
OUTPUT_DIR = "output"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# Column-wise view of the raw documents. "len_label" buckets each document by its
# token count in steps of 200.
token_counts = [len(doc["tokens"]) for doc in data]
raw_columns = {
    "full_text": [doc["full_text"] for doc in data],
    "document": [str(doc["document"]) for doc in data],
    "tokens": [doc["tokens"] for doc in data],
    "trailing_whitespace": [doc["trailing_whitespace"] for doc in data],
    "provided_labels": [doc["labels"] for doc in data],
    "num_tokens": token_counts,
    "len_label": [count // 200 for count in token_counts],
}

# Tokenize every document and attach the token-level label ids.
tokenize_kwargs = {
    "tokenizer": tokenizer,
    "label2id": label2id,
    "max_length": TRAINING_MAX_LENGTH,
}
ds = Dataset.from_dict(raw_columns).map(tokenize, fn_kwargs=tokenize_kwargs, num_proc=3)


In [None]:
# Report how many documents fall in a length bucket above 8, then drop them from the dataset.
count = sum(1 for bucket in ds["len_label"] if bucket > 8)
print(count)

ds = ds.filter(lambda row: row["len_label"] <= 8)

# ****COMPUTE METRICS****

In [None]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, all_labels):
    """Compute entity-level recall, precision and an F-beta score (beta = 5) with seqeval.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class scores
            and is arg-maxed over axis 2; ``labels`` holds the reference label ids, with
            -100 marking positions to ignore.
        all_labels: Sequence mapping a label id to its label string.

    Returns:
        dict: ``{"recall", "precision", "f1"}``. The "f1" entry is the F-beta score with
        beta = 5 (the key name is kept because the training arguments select the best
        model by "f1"). It is 0.0 when precision and recall are both zero instead of
        dividing by zero.
    """
    logits, label_ids = p
    pred_ids = np.argmax(logits, axis=2)

    # Drop ignored positions (-100) and map the remaining ids to label strings.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, label_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([all_labels[pred] for pred, _ in kept])
        true_labels.append([all_labels[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta with beta = 5 weights recall far more heavily than precision.
    beta_sq = 5 * 5
    denominator = beta_sq * precision + recall
    if denominator > 0:
        fbeta = (1 + beta_sq) * recall * precision / denominator
    else:
        # No true positives at all: define the score as 0 rather than raising / returning NaN.
        fbeta = 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f1': fbeta,
    }

In [None]:
# Read only by the TrainingArguments cell, to derive gradient_accumulation_steps (8 // batch_size).
# NOTE(review): per_device_train_batch_size in that cell is a separate hard-coded value (2),
# so this variable does not control the actual batch size — confirm that is intended.
batch_size = 8

In [None]:
# Trainer configuration shared by every cross-validation fold.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    # NOTE(review): with batch_size = 8 this evaluates to 1, i.e. no accumulation on top of
    # the per-device batch of 2 — confirm the intended effective batch size.
    gradient_accumulation_steps=8 // batch_size,
    report_to="none",
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint on the same 50-step cadence so each checkpoint has a metric.
    evaluation_strategy="steps",
    eval_steps=50,
    eval_delay=100,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,
    logging_steps=10,
    do_eval=True,
    lr_scheduler_type='cosine',
    # Model selection uses the "f1" entry returned by compute_metrics; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True,
    # Without this the Trainer finishes with the last-step weights, so the model saved
    # after training would not be the best-scoring checkpoint.
    load_best_model_at_end=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
)

In [None]:
# args = TrainingArguments(
#     output_dir=OUTPUT_DIR,
#     fp16=AMP,
#     learning_rate=LR,
#     num_train_epochs=4,
#     per_device_train_batch_size=BATCH_SIZE,
#     per_device_eval_batch_size=EVAL_BATCH_SIZE,
#     gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
#     report_to="none",
#     evaluation_strategy="steps",
#     eval_steps=50,
#     eval_delay=100,
#     save_strategy="steps",
#     save_steps=50,
#     save_total_limit=1,
#     logging_steps=10,
#     metric_for_best_model="f1",
#     greater_is_better=True,
#     load_best_model_at_end=True,
#     overwrite_output_dir=True,
#     lr_scheduler_type=LR_SCHEDULER_TYPE,
#     warmup_ratio=WARMUP_RATIO,
#     weight_decay=WEIGHT_DECAY,
# )


In [None]:
# 5-fold cross-validation, stratified on the document-length bucket ("len_label").
# One model is trained per fold; its evaluation metrics and weights are written to
# OUTPUT_DIR/fold_<k>/.
skf = StratifiedKFold(n_splits=5)

# Neither the collator nor the metric function depends on the fold, so build them once.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
metrics_fn = partial(compute_metrics, all_labels=all_labels)

for fold_idx, (train_index, test_index) in enumerate(skf.split(ds, ds["len_label"]), start=1):
    # Train / held-out subsets for this fold.
    train_ds = ds.select(train_index)
    test_ds = ds.select(test_index)

    # Fresh pretrained backbone with a new token-classification head for every fold.
    model = AutoModelForTokenClassification.from_pretrained(
        TRAINING_MODEL_PATH,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=metrics_fn,
    )
    trainer.train()

    fold_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_idx}")
    os.makedirs(fold_dir, exist_ok=True)

    # Keep one metrics file per fold; a single shared file would be overwritten by each fold.
    eval_res = trainer.evaluate(eval_dataset=test_ds)
    with open(os.path.join(fold_dir, "eval_result.json"), "w") as f:
        json.dump(eval_res, f)

    # No PEFT/LoRA adapter is attached in this notebook, so there is nothing to merge:
    # the model is saved as-is, classification head included. The directory name "best"
    # is kept; which weights it holds depends on `load_best_model_at_end` in `args`.
    trainer.save_model(os.path.join(fold_dir, "best"))

    # Drop every reference to this fold's model before the next fold allocates its own.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# trainer.save_model("deberta3base")
# tokenizer.save_pretrained("deberta3base")