In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [2]:
from dotenv import load_dotenv
import os
from pathlib import Path
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForTokenClassification
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from loguru import logger

In [3]:
def get_devices() -> list:
    """
    Returns a list of available torch devices.
    Prioritizes CUDA (GPU) if available, followed by MPS (Apple Silicon), 
    and defaults to CPU if neither are available.
    """
    devices = []
    
    if torch.cuda.is_available():
        # Add all available CUDA devices
        for i in range(torch.cuda.device_count()):
            device = torch.device(f"cuda:{i}")
            devices.append(device)
            logger.info(f"Using CUDA device: {torch.cuda.get_device_name(i)} (cuda:{i})")
    
    elif torch.backends.mps.is_available():
        # If CUDA is not available, add MPS device (Apple Silicon)
        device = torch.device("mps")
        devices.append(device)
        logger.info("Using MPS (Apple Silicon) device.")
    
    else:
        # If neither CUDA nor MPS are available, default to CPU
        device = torch.device("cpu")
        devices.append(device)
        logger.info("Using CPU device.")

    return devices

In [4]:
class Config:
    # Model Config
    model_id = "microsoft/deberta-v3-base" # "microsoft/deberta-v3-large"
    model_architecture_config = AutoConfig.from_pretrained(
        model_id, output_hidden_states=True
    )

    # Training Config
    batch_size = 4
    max_length = 512
    num_workers = 2
    learning_rate = 5e-3
    epochs = 25
    
    # Hardware Config
    torch_device = get_devices()

    # Dataset
    dataset_file_path = (
        "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    )
    split_config = {
        "test_size": 0.2,
        "shuffle": True,
        "random_state": 10,
    }
    sample_only = False
    sample_size = 64

    # Labels:
    label2id = {
        "O": 0,
        "B-EMAIL": 1,
        "B-ID_NUM": 2,
        "B-NAME_STUDENT": 3,
        "B-PHONE_NUM": 4,
        "B-STREET_ADDRESS": 5,
        "B-URL_PERSONAL": 6,
        "B-USERNAME": 7,
        "I-ID_NUM": 8,
        "I-NAME_STUDENT": 9,
        "I-PHONE_NUM": 10,
        "I-STREET_ADDRESS": 11,
        "I-URL_PERSONAL": 12,
    }
    id2label = {
        "0": "O",
        "1": "B-EMAIL",
        "2": "B-ID_NUM",
        "3": "B-NAME_STUDENT",
        "4": "B-PHONE_NUM",
        "5": "B-STREET_ADDRESS",
        "6": "B-URL_PERSONAL",
        "7": "B-USERNAME",
        "8": "I-ID_NUM",
        "9": "I-NAME_STUDENT",
        "10": "I-PHONE_NUM",
        "11": "I-STREET_ADDRESS",
        "12": "I-URL_PERSONAL",
    }
    num_labels = len(label2id)


print("torch_device: ", Config.torch_device)

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

[32m2024-09-22 10:54:04.899[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_devices[0m:[36m14[0m - [1mUsing CUDA device: Tesla P100-PCIE-16GB (cuda:0)[0m


torch_device:  [device(type='cuda', index=0)]


In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    Config.model_id,
    use_fast=True,  # to avoid warnings
    clean_up_tokenization_spaces=False,  # to avoid warnings
    max_length=Config.max_length,
)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [6]:
df = pd.read_json(Config.dataset_file_path)

if Config.sample_only:
    df = df[0 : Config.sample_size]

df.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [7]:
def replace_space(tokens_list):
    return ["[SPACE]" if x.isspace() else x for x in tokens_list]


def get_tokenized_tokens_length(text):
    return len(
        tokenizer(text, return_attention_mask=False, return_token_type_ids=False)[
            "input_ids"
        ]
    )


def data_preprocessing(df):
    df["tokens"] = df["tokens"].apply(replace_space)

    df["tokenized_tokens_length"] = df["full_text"].apply(
        lambda text: get_tokenized_tokens_length(text)
    )
    df = df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(
        drop=True
    )

    return df


df = data_preprocessing(df=df)
print(f"Original Dataset Shape: {df.shape}")
df = df[df["tokenized_tokens_length"]<Config.max_length]
print(f"Original Dataset Shape: {df.shape}")

import plotly.express as px
fig = px.histogram(x=df["tokenized_tokens_length"])
fig.show()

Original Dataset Shape: (6807, 6)
Original Dataset Shape: (1884, 6)


In [8]:
if not Config.sample_only:
    Config.split_config["stratify"] = pd.cut(
        df["tokenized_tokens_length"], bins=10, labels=False
    )

train_df, test_df = train_test_split(df, **Config.split_config)
train_df.reset_index(inplace=True)
train_df = train_df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(drop=True)
test_df.reset_index(inplace=True)
train_df.shape, test_df.shape

((1507, 7), (377, 7))

In [9]:
def prepare_input(row, tokenizer):
    processed_text_tokens_list = []
    char_map = []
    label_char_map = {}

    for index in range(len(row["tokens"])):
        token = unidecode(row["tokens"][index])

        whitespace = row["trailing_whitespace"][index]
        label = row["labels"][index]

        processed_text_tokens_list.append(token)
        char_map.extend([index] * len(token))

        label_char_map[index] = label

        if whitespace:
            processed_text_tokens_list.append(" ")
            char_map.append(-1)

    # Now, we tokenize the concatenated 'text' and return offsets mappings along with 'char_map'.
    processed_text = "".join(processed_text_tokens_list)
    tokenized = tokenizer(
        processed_text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=Config.max_length,
    )

    length = len(tokenized.input_ids)

    tokenized_info = {
        **tokenized,
        "processed_text": processed_text,
        "length": length,
        "char_map": char_map,  # Now includes mapping to original tokens
        "label_char_map": label_char_map,
    }
    return tokenized_info


def get_labels(tokenized_info):
    label_list = []
    offset_map = tokenized_info["offset_mapping"]
    for index, offset_map_item in enumerate(offset_map):
        if offset_map_item == (0, 0):
            label_list.extend(["Start_End"])
            continue

        char_map_item = tokenized_info["char_map"][
            offset_map_item[0] : offset_map_item[1]
        ]
        char_map_item_filtered = [element for element in char_map_item if element != -1]

        label_item = set(
            [
                tokenized_info["label_char_map"][element]
                for element in char_map_item_filtered
            ]
        )

        if len(label_item) != 1:
            if tokenized_info["input_ids"][index] in [507]:
                label_item = "O"

            else:
                raise Exception(
                    "\n"
                    f"Token ID: {tokenized_info['input_ids'][index]}\n"
                    f"Token: {tokenizer.decode(tokenized_info['input_ids'][index])}\n"
                    f"Offset: {offset_map_item}\n"
                    f"Text: {tokenized_info['processed_text'][ offset_map_item[0] : offset_map_item[1] ]}\n"
                    f"Character Map: {char_map_item}\n"
                    f"Filtered Character Map {char_map_item_filtered}\n"
                    f"Labels: {label_item}"
                )

        label_list.extend(list(label_item))

    if len(label_list) != len(tokenized_info["input_ids"]):
        raise Exception("Error: Size of label_list and input_ids are not same.")
    return label_list

In [10]:
# Check - Test - Dataset
for index in tqdm(train_df.index):
    tokenized_info = prepare_input(train_df.iloc[index], tokenizer)
    label_item = get_labels(tokenized_info)

print("Awesome - Everything is fine")

100%|██████████| 1507/1507 [00:13<00:00, 112.23it/s]

Awesome - Everything is fine





In [11]:
class PII_Dataset(Dataset):
    def __init__(self, tokenizer, df):
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index].to_dict()
        row.pop("tokenized_tokens_length")

        tokenized_info = prepare_input(self.df.iloc[index], tokenizer)
        label_item = get_labels(tokenized_info)
        tokenized_info["document"] = row.pop("document")
        tokenized_info["labels_tokens"] = label_item
        tokenized_info["labels"] = [
            0 if item == "Start_End" else Config.label2id[item] for item in label_item
        ]

        if len(tokenized_info["labels"]) != len(tokenized_info["input_ids"]):
            raise Exception(
                f"Error in tokenized_info - length of lavel_ids and input_ids are not same: {tokenized_info}"
            )
            
        return tokenized_info

In [12]:
class Collate:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        # List of keys to extract from each sample
        keys = [
#            "document",
            "input_ids",
            "token_type_ids",
            "attention_mask",
#             "offset_mapping",
#             "processed_text",
#             "length",
#             "char_map",
#             "label_char_map",
#             "labels_tokens",
            "labels",
        ]

        # Populate the output dictionary using a loop
        output = {key: [sample[key] for sample in batch] for key in keys}

        batch_max = max([len(ids) for ids in output["input_ids"]])

        # Add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [
                list(s) + (batch_max - len(s)) * [self.tokenizer.pad_token_id]
                for s in output["input_ids"]
            ]
            output["attention_mask"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["attention_mask"]
            ]
            output["token_type_ids"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["token_type_ids"]
            ]
#             output["offset_mapping"] = [
#                 list(s) + (batch_max - len(s)) * [(0, 0)]
#                 for s in output["offset_mapping"]
#             ]
            output["labels"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["labels"]
            ]

        # Convert to tensors and move to the specified device
        keys = ["input_ids", "attention_mask", "token_type_ids", "labels"]
        for key in keys:
            output[key] = torch.tensor(output[key], dtype=torch.long) # .to(Config.torch_device[0])

        return output

In [13]:
train_dataset = PII_Dataset(tokenizer, df=train_df)
test_dataset = PII_Dataset(tokenizer, df=test_df)

In [14]:
Config.model_architecture_config.num_labels = Config.num_labels

# Source Code:- https://github.dev/huggingface/transformers/blob/main/src/transformers/models/deberta_v2/modeling_deberta_v2.py
model = AutoModelForTokenClassification.from_config(Config.model_architecture_config)
model.gradient_checkpointing_enable()
model

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=Tr

In [15]:
# # First GPU
# model.deberta.embeddings.to(Config.torch_device[0])
# model.deberta.encoder.layer[:5].to(Config.torch_device[0])

# # Second GPU
# model.deberta.encoder.layer[5:].to(Config.torch_device[1])
# model.dropout.to(Config.torch_device[1])
# model.classifier.to(Config.torch_device[1])
# model.deberta.encoder.rel_embeddings.to(Config.torch_device[1])
# model.deberta.encoder.LayerNorm.to(Config.torch_device[1])

In [16]:
def count_trainable_parameters(model: torch.nn.Module) -> int:
    """
    Count the number of trainable parameters in a PyTorch model.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def estimate_parameter_memory(model: torch.nn.Module, dtype=torch.float32) -> float:
    """
    Estimate the memory required to store the parameters of a PyTorch model. The estimated memory in megabytes (MB).
    """
    num_params = count_trainable_parameters(model)
    bytes_per_element = torch.finfo(dtype).bits // 8
    total_memory_bytes = num_params * bytes_per_element
    total_memory_mb = total_memory_bytes / (1024 ** 2)
    return num_params, total_memory_mb


num_params, total_memory_mb = estimate_parameter_memory(model)
print(f"Number of trainable parameters in Model: {num_params}")
print(f"Estimated memory for Model parameters: {total_memory_mb:.3f} MB\n")

Number of trainable parameters in Model: 183841549
Estimated memory for Model parameters: 701.300 MB



In [17]:
# item = train_dataset.__getitem__(0)

# model.eval()

# # Checking models forward pass
# model_output = model.forward(
#     input_ids=item["input_ids"].to(Config.torch_device[0]),
#     attention_mask=item["attention_mask"].to(Config.torch_device[0]),
#     token_type_ids=item["token_type_ids"].to(Config.torch_device[0]),
#     labels=item["label_ids"].to(Config.torch_device[0])
# )

# model_output

In [18]:
# def evaluate_model(model, dataloader, device):
#     model.eval()
#     total_eval_loss = 0
#     all_preds = []
#     all_labels = []
    
#     progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)

#     with torch.no_grad():
#         for batch in progress_bar:
#             input_ids = batch["input_ids"].to(device[0])
#             attention_mask = batch["attention_mask"].to(device[0])
#             token_type_ids = batch["token_type_ids"].to(device[0])
#             labels = batch["labels"].to(device[0])

#             output = model(input_ids, attention_mask, token_type_ids, labels)
#             loss = output.loss
#             total_eval_loss += loss.item()
            
#             predicted_labels = convert_logits_to_labels(logits=logits)
#             all_preds.extend(predicted_labels.cpu().numpy().flatten())
#             all_labels.extend(labels.cpu().numpy().flatten())

#     avg_eval_loss = total_eval_loss / len(dataloader)
#     print(f"Validation Loss: {avg_eval_loss:.4f}")

#     print("Classification Report:")
#     print(classification_report(all_labels, all_preds, zero_division=0))

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import fbeta_score
import numpy as np

# Training Arguments
## Docs: https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments.overwrite_output_dir

output_dir = "./deberta_token_classification_model"
logging_dir = './logs'


training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=Config.learning_rate,
    per_device_train_batch_size=Config.batch_size,
    per_device_eval_batch_size=Config.batch_size,
    num_train_epochs=Config.epochs,
    gradient_accumulation_steps=1,
    torch_empty_cache_steps=1,
    # weight_decay=0.01, # check other AdamW's hpyerparameters
    # lr_scheduler_type
    logging_dir=logging_dir,
    logging_steps=4,
    load_best_model_at_end=True,
    fp16 = True,
    fp16_opt_level = "O1",
    fp16_full_eval=True,
    # label_names 
    optim = "adamw_torch",
    metric_for_best_model="eval_loss",
    report_to="none",  # Disable reporting to avoid external integrations
    # activation_checkpointing = True # (Default is False)
    eval_do_concat_batches = False # (Default is True)
)


# Define compute_metrics function for evaluation
def compute_metrics(p):
    predictions, labels = p
    labels = labels[0]
    predictions = predictions[0][0].argmax(axis=-1)
    fbeta_score_pred_filter_list = []
    fbeta_score_true_filter_list = []
    fbeta_score_no_filter_list = [] 
        
    for index in range(len(predictions)):
        predicted_single = predictions[index]
        labels_single = labels[index]
        
        # Inlcuding 0
        fbeta_score_single = fbeta_score(
            y_true=labels_single, y_pred=predicted_single, beta=5, average="micro", zero_division=0.0
        )
        fbeta_score_no_filter_list.extend([fbeta_score_single])
        
        
        # Remove 0 based on predicted labels
        sub_predicted = []
        sub_labels = []
        for sub_index in range(len(predicted_single)):
            if predicted_single[sub_index] != 0:
                sub_predicted.extend([predicted_single[sub_index]])
                sub_labels.extend([labels_single[sub_index]])
                
        fbeta_score_single = fbeta_score(
            y_true=sub_labels, y_pred=sub_predicted, beta=5, average="micro", zero_division=0.0
        )
        
        fbeta_score_pred_filter_list.extend([fbeta_score_single])
        
        
        # Remove 0 based on true labels
        sub_predicted = []
        sub_labels = []
        for sub_index in range(len(predicted_single)):
            if labels_single[sub_index] != 0:
                sub_predicted.extend([predicted_single[sub_index]])
                sub_labels.extend([labels_single[sub_index]])
                
        fbeta_score_true_filter_list = fbeta_score(
            y_true=sub_labels, y_pred=sub_predicted, beta=5, average="micro", zero_division=0.0
        )
        
        fbeta_score_pred_filter_list.extend([fbeta_score_single])    
        
    return {
        "fbeta_score_pred_filter": np.mean(fbeta_score_pred_filter_list),
        "fbeta_score_true_filter": np.mean(fbeta_score_true_filter_list),
        "fbeta_score_no_filter": np.mean(fbeta_score_no_filter_list)
    }

In [20]:
# Initialize Trainer
logger.info("Initializing Trainer")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=Collate(tokenizer),
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Start training
logger.info("Starting training")
trainer.train()

[32m2024-09-22 10:54:57.359[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mInitializing Trainer[0m
[32m2024-09-22 10:54:57.724[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mStarting training[0m


Epoch,Training Loss,Validation Loss,Fbeta Score Pred Filter,Fbeta Score True Filter,Fbeta Score No Filter
1,0.0407,0.014221,0.0,0.0,1.0
2,0.0,0.024372,0.0,0.0,1.0
3,0.0301,0.023845,0.0,0.0,1.0
4,0.0075,0.034432,0.0,0.0,1.0
5,0.0073,0.019162,0.0,0.0,1.0
6,0.0535,0.020148,0.0,0.0,1.0
7,0.0044,0.024447,0.0,0.0,1.0
8,0.0243,0.032072,0.0,0.0,1.0
9,0.0009,0.030492,0.0,0.0,1.0
10,0.0169,0.035911,0.0,0.0,1.0


TrainOutput(global_step=9425, training_loss=0.037913934395197066, metrics={'train_runtime': 5742.9801, 'train_samples_per_second': 6.56, 'train_steps_per_second': 1.641, 'total_flos': 9781320487209132.0, 'train_loss': 0.037913934395197066, 'epoch': 25.0})

In [21]:
torch.save(model.state_dict(), "model_saved_state_dict")