In [None]:
!pip install unidecode

In [None]:
from dotenv import load_dotenv
import os
from pathlib import Path
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoConfig
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from loguru import logger

# Start recording CUDA allocator history so a snapshot can be dumped at the end
# of the notebook. The recorder is a CUDA-only facility, while get_devices()
# below also supports MPS and CPU, so only enable it when CUDA is present.
if torch.cuda.is_available():
    torch.cuda.memory._record_memory_history()

In [None]:
def get_devices() -> list:
    """
    Enumerate the torch devices this notebook can run on.

    Selection order:
      1. every visible CUDA device (one entry per GPU),
      2. otherwise the single MPS (Apple Silicon) device,
      3. otherwise the CPU.

    Returns:
        list: ``torch.device`` objects; callers use element 0 as the primary device.
    """
    if torch.cuda.is_available():
        cuda_devices = []
        for gpu_index in range(torch.cuda.device_count()):
            cuda_devices.append(torch.device(f"cuda:{gpu_index}"))
            logger.info(f"Using CUDA device: {torch.cuda.get_device_name(gpu_index)} (cuda:{gpu_index})")
        return cuda_devices

    if torch.backends.mps.is_available():
        # No CUDA: fall back to the Apple Silicon backend.
        mps_device = torch.device("mps")
        logger.info("Using MPS (Apple Silicon) device.")
        return [mps_device]

    # Neither accelerator is available.
    cpu_device = torch.device("cpu")
    logger.info("Using CPU device.")
    return [cpu_device]

In [None]:
class Config:
    """Class-level settings shared by the cells of this notebook."""

    # --- Model ---
    model_id = "microsoft/deberta-v3-base"
    # Loaded once, when the class body executes.
    model_architecture_config = AutoConfig.from_pretrained(
        model_id, output_hidden_states=True
    )

    # --- Training ---
    batch_size = 1
    max_length = 2 * 1024 + 256
    num_workers = 2

    # --- Hardware ---
    # A list of devices; consumers index it with [0].
    torch_device = get_devices()

    # --- Dataset ---
    dataset_file_path = (
        "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    )
    # Keyword arguments forwarded to sklearn's train_test_split.
    split_config = dict(test_size=0.2, shuffle=True, random_state=10)
    sample_only = False
    sample_size = 32

    # --- Labels ---
    # The position in the tuple is the class id.
    label2id = {
        label: class_id
        for class_id, label in enumerate(
            (
                "O",
                "B-EMAIL",
                "B-ID_NUM",
                "B-NAME_STUDENT",
                "B-PHONE_NUM",
                "B-STREET_ADDRESS",
                "B-URL_PERSONAL",
                "B-USERNAME",
                "I-ID_NUM",
                "I-NAME_STUDENT",
                "I-PHONE_NUM",
                "I-STREET_ADDRESS",
                "I-URL_PERSONAL",
            )
        )
    }
    # Inverse mapping; keys are the class ids rendered as strings.
    id2label = {str(class_id): label for label, class_id in label2id.items()}
    num_labels = len(label2id)


print("torch_device: ", Config.torch_device)

In [None]:
# Module-level tokenizer for Config.model_id. Later cells read this global
# directly (get_tokenized_tokens_length, the error path of get_labels, the model's
# resize_token_embeddings call), so it must be created before they run.
# NOTE(review): max_length is passed here as a load-time kwarg; prepare_input also
# passes truncation/max_length explicitly on every call — confirm which one is
# actually relied upon.
tokenizer = AutoTokenizer.from_pretrained(
    Config.model_id,
    use_fast=True,  # to avoid warnings
    clean_up_tokenization_spaces=False,  # to avoid warnings
    max_length=Config.max_length,
)

In [None]:
# Load the raw competition JSON into a DataFrame.
df = pd.read_json(Config.dataset_file_path)

# Optional fast-iteration mode: keep only the first Config.sample_size rows.
if Config.sample_only:
    df = df[0 : Config.sample_size]

df.head(5)

In [None]:
def replace_space(tokens_list):
    """
    Return a copy of ``tokens_list`` in which every whitespace-only token is
    replaced by the literal placeholder "[SPACE]"; other tokens are kept as-is.
    """
    replaced = []
    for token in tokens_list:
        if token.isspace():
            replaced.append("[SPACE]")
        else:
            replaced.append(token)
    return replaced


def get_tokenized_tokens_length(text):
    """
    Tokenize ``text`` with the module-level ``tokenizer`` and return how many
    input ids it produces.
    """
    encoding = tokenizer(
        text, return_attention_mask=False, return_token_type_ids=False
    )
    input_ids = encoding["input_ids"]
    return len(input_ids)


def data_preprocessing(df):
    """
    Prepare the raw frame for training.

    Steps:
      * replace whitespace-only entries of the "tokens" column with "[SPACE]"
        (via ``replace_space``);
      * add "tokenized_tokens_length": the tokenizer length of "full_text"
        (via ``get_tokenized_tokens_length``);
      * sort ascending by that length and reset the index.

    The input frame is not modified; a new DataFrame is returned, so re-running
    the calling cell always starts from the same input.

    Args:
        df: DataFrame with at least the "tokens" and "full_text" columns.

    Returns:
        A new, length-sorted DataFrame with a fresh 0..n-1 index.
    """
    # assign() builds a new frame instead of writing columns into the caller's
    # object, which avoids hidden state and SettingWithCopy surprises on slices.
    processed = df.assign(
        tokens=df["tokens"].apply(replace_space),
        tokenized_tokens_length=df["full_text"].apply(get_tokenized_tokens_length),
    )
    return processed.sort_values(
        by="tokenized_tokens_length", ascending=True
    ).reset_index(drop=True)


# Rebind df to the preprocessed, length-sorted frame; the bare `df` below
# displays it as the cell output.
df = data_preprocessing(df=df)
df

In [None]:
# Keep only documents that tokenize to fewer than 256 tokens.
# NOTE(review): 256 is hard-coded and far below Config.max_length
# (1024 * 2 + 256); presumably a deliberate cut to limit memory use — confirm,
# and consider moving the threshold into Config.
df = df[df["tokenized_tokens_length"]<256]
df.shape

In [None]:
# Build the split arguments on a local copy so the class-level
# Config.split_config is never mutated. Otherwise a stale "stratify" array from
# an earlier run (computed on a differently sized df) stays in Config and breaks
# the next split.
split_kwargs = dict(Config.split_config)
if not Config.sample_only:
    # Stratify on 10 equal-width length buckets so train and test see a similar
    # length distribution.
    split_kwargs["stratify"] = pd.cut(
        df["tokenized_tokens_length"], bins=10, labels=False
    )

train_df, test_df = train_test_split(df, **split_kwargs)

# Both frames end up sorted by token length with a fresh 0..n-1 index, which the
# later cells rely on for positional access. The resulting columns match the
# previous in-place version: train_df keeps the pre-split index in an "index"
# column, test_df gets an "index" column holding its post-sort position.
train_df = (
    train_df.reset_index()
    .sort_values(by="tokenized_tokens_length", ascending=True)
    .reset_index(drop=True)
)
test_df = (
    test_df.sort_values(by="tokenized_tokens_length", ascending=True)
    .reset_index(drop=True)
    .reset_index()
)
train_df.shape, test_df.shape

In [None]:
def prepare_input(row, tokenizer):
    """
    Rebuild a document's text from its word-level tokens and tokenize it.

    Every token is passed through ``unidecode``; a single space is appended after
    tokens whose ``trailing_whitespace`` flag is truthy. While the text is built,
    two lookup structures are recorded:

    * ``char_map``: one entry per character of the rebuilt text, holding the index
      of the word token the character came from, or -1 for an inserted space;
    * ``label_char_map``: word-token index -> that token's label.

    Args:
        row: mapping with the "tokens", "trailing_whitespace" and "labels" sequences.
        tokenizer: callable tokenizer; invoked with offset mapping enabled and
            truncation at ``Config.max_length``.

    Returns:
        dict: the tokenizer output plus "processed_text", "length" (number of
        input ids), "char_map" and "label_char_map".
    """
    text_pieces = []
    char_map = []
    label_char_map = {}

    for word_index, raw_token in enumerate(row["tokens"]):
        token = unidecode(raw_token)
        has_trailing_space = row["trailing_whitespace"][word_index]
        label_char_map[word_index] = row["labels"][word_index]

        text_pieces.append(token)
        # The length is taken after unidecode so the map matches the rebuilt text.
        char_map += [word_index] * len(token)

        if has_trailing_space:
            text_pieces.append(" ")
            char_map.append(-1)

    processed_text = "".join(text_pieces)
    tokenized = tokenizer(
        processed_text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=Config.max_length,
    )

    return {
        **tokenized,
        "processed_text": processed_text,
        "length": len(tokenized.input_ids),
        "char_map": char_map,
        "label_char_map": label_char_map,
    }


def get_labels(tokenized_info):
    """
    Produce one label string per tokenizer token.

    For each offset span:
      * a span equal to ``(0, 0)`` is labelled "Start_End";
      * otherwise the characters it covers are looked up in ``char_map``, inserted
        spaces (-1) are dropped, and the labels of the remaining word tokens are
        collected. Exactly one distinct label is required;
      * if that is not the case and the token id is 507, the token is labelled "O";
      * any other token raises with a diagnostic message.

    Args:
        tokenized_info: dict as returned by ``prepare_input``.

    Returns:
        list of label strings, the same length as ``tokenized_info["input_ids"]``.

    Raises:
        Exception: on a token with zero or several labels (other than id 507),
            or if the label count differs from the number of input ids.
    """
    label_list = []

    for position, span in enumerate(tokenized_info["offset_mapping"]):
        if span == (0, 0):
            label_list.append("Start_End")
            continue

        covered = tokenized_info["char_map"][span[0] : span[1]]
        word_indices = [word_index for word_index in covered if word_index != -1]
        candidates = {
            tokenized_info["label_char_map"][word_index] for word_index in word_indices
        }

        if len(candidates) == 1:
            label_list.extend(candidates)
            continue

        token_id = tokenized_info["input_ids"][position]
        if token_id in [507]:
            # Token id 507 is the one tolerated exception and counts as "O".
            label_list.append("O")
            continue

        raise Exception(
            "\n"
            f"Token ID: {token_id}\n"
            f"Token: {tokenizer.decode(token_id)}\n"
            f"Offset: {span}\n"
            f"Text: {tokenized_info['processed_text'][ span[0] : span[1] ]}\n"
            f"Character Map: {covered}\n"
            f"Filtered Character Map {word_indices}\n"
            f"Labels: {candidates}"
        )

    if len(label_list) != len(tokenized_info["input_ids"]):
        raise Exception("Error: Size of label_list and input_ids are not same.")
    return label_list

In [None]:
# Check - Test - Dataset
for index in tqdm(train_df.index):
    tokenized_info = prepare_input(train_df.iloc[index], tokenizer)
    label_item = get_labels(tokenized_info)

print("Awesome - Everything is fine")

In [None]:
class PII_Dataset(Dataset):
    """
    Map-style dataset that tokenizes one document per item and aligns its labels.

    Args:
        tokenizer: tokenizer forwarded to ``prepare_input`` for every item.
        df: DataFrame holding at least the "document" column and the columns
            read by ``prepare_input``.
    """

    def __init__(self, tokenizer, df):
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        """Number of documents in the backing frame."""
        return len(self.df)

    def __getitem__(self, index):
        """
        Build the model inputs for the row at position ``index``.

        Returns:
            dict: the output of ``prepare_input`` extended with "document_id",
            "labels" (label strings) and "label_ids" (integer class ids).

        Raises:
            Exception: if the number of label ids differs from the number of
                input ids.
        """
        record = self.df.iloc[index]

        # Use the tokenizer this dataset was constructed with, not whatever global
        # named `tokenizer` happens to exist in the kernel.
        tokenized_info = prepare_input(record, self.tokenizer)
        label_item = get_labels(tokenized_info)

        # to_dict() yields native Python scalars, as the original lookup did.
        tokenized_info["document_id"] = record.to_dict()["document"]
        tokenized_info["labels"] = label_item
        # "Start_End" marks tokens whose offset span is (0, 0); they share id 0 with "O".
        tokenized_info["label_ids"] = [
            0 if item == "Start_End" else Config.label2id[item] for item in label_item
        ]

        if len(tokenized_info["label_ids"]) != len(tokenized_info["input_ids"]):
            raise Exception(
                f"Error in tokenized_info - length of label_ids and input_ids are not same: {tokenized_info}"
            )
        return tokenized_info

In [None]:
class Collate:
    """
    Collate function that pads a list of dataset items to the longest sequence in
    the batch and converts the selected fields to ``torch.long`` tensors.

    Padding follows ``tokenizer.padding_side``: "left" pads on the left, anything
    else pads on the right. Previously a left-padding tokenizer produced no
    padding at all, so ragged batches failed during tensor conversion.

    Args:
        tokenizer: object exposing ``padding_side`` and ``pad_token_id``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """
        Args:
            batch: list of dicts as produced by ``PII_Dataset.__getitem__``.

        Returns:
            dict with "document_id", "input_ids", "token_type_ids",
            "attention_mask" and "label_ids" as long tensors. Tensors stay on the
            CPU; the training loop moves them to the device.
        """
        # Only the fields the model and loss need; the remaining per-sample
        # diagnostics (offsets, text, char maps, label strings) are dropped here.
        keys = [
            "document_id",
            "input_ids",
            "token_type_ids",
            "attention_mask",
            "label_ids",
        ]
        output = {key: [sample[key] for sample in batch] for key in keys}

        batch_max = max(len(ids) for ids in output["input_ids"])

        # Fill value per padded field. Labels are padded with 0, i.e. the "O" id.
        pad_values = {
            "input_ids": self.tokenizer.pad_token_id,
            "attention_mask": 0,
            "token_type_ids": 0,
            "label_ids": 0,
        }
        pad_left = self.tokenizer.padding_side == "left"

        for key, pad_value in pad_values.items():
            padded = []
            for sequence in output[key]:
                filler = [pad_value] * (batch_max - len(sequence))
                if pad_left:
                    padded.append(filler + list(sequence))
                else:
                    padded.append(list(sequence) + filler)
            output[key] = padded

        for key in keys:
            output[key] = torch.tensor(output[key], dtype=torch.long)

        return output

In [None]:
# Training dataset and loader. Batches are assembled by Collate, which pads every
# sample to the longest sequence in its batch.
train_dataset = PII_Dataset(tokenizer, df=train_df)

train_loader = DataLoader(
    train_dataset,
    batch_size=Config.batch_size,
    shuffle=True,
    collate_fn=Collate(tokenizer),
    # num_workers=Config.num_workers,
    # pin_memory=True,
    drop_last=False,
)

In [None]:
# Checking data loader
for item in tqdm(train_loader):
    if len(item["input_ids"]) != len(item["label_ids"]):
        raise Exception(
            "Error: length of input_ids and label_ids after padding are not same."
        )
    pass

item.keys()

In [None]:
class PIIDetectionModel(torch.nn.Module):
    """
    Token classifier: pretrained transformer backbone -> dropout -> parallel
    2-layer bidirectional LSTM and GRU -> fixed blend of both -> linear head.

    Both recurrent layers use ``hidden_size // 2`` units per direction, so their
    bidirectional outputs have width ``hidden_size`` and can be blended
    element-wise before the classification head.
    """

    def __init__(self):
        super().__init__()
        arch_config = Config.model_architecture_config

        # Number of classes produced by the head. Taken from Config.num_labels so
        # the attribute agrees with self.fc (the backbone config's own num_labels
        # is never set to the PII label count).
        self.num_labels = Config.num_labels

        self.model = AutoModel.from_pretrained(
            Config.model_id,
            ignore_mismatched_sizes=True,
            config=arch_config,
            # torch_dtype = "auto"
        )

        # Match the embedding matrix to the module-level tokenizer's length.
        self.model.resize_token_embeddings(len(tokenizer))
        self.dropout = torch.nn.Dropout(arch_config.hidden_dropout_prob)

        self.bilstm = torch.nn.LSTM(
            arch_config.hidden_size,
            arch_config.hidden_size // 2,
            num_layers=2,
            dropout=arch_config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )

        self.gru = torch.nn.GRU(
            arch_config.hidden_size,
            arch_config.hidden_size // 2,
            num_layers=2,
            dropout=arch_config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )

        # Fixed (non-trainable) mixing coefficient between LSTM and GRU outputs.
        self.lstm_gru_balance_weight = torch.nn.Parameter(
            torch.tensor(0.5), requires_grad=False
        )

        self.fc = torch.nn.Linear(arch_config.hidden_size, Config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """
        Args:
            input_ids, attention_mask, token_type_ids: batch-first integer tensors
                forwarded unchanged to the backbone.

        Returns:
            Logits with a trailing dimension of ``Config.num_labels``, one vector
            per input position.
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )  # returns ['last_hidden_state', 'hidden_states']
        sequence_output = self.dropout(output[0])

        lstm_output, _ = self.bilstm(sequence_output)
        gru_output, _ = self.gru(sequence_output)

        rnn_output = (
            self.lstm_gru_balance_weight * lstm_output
            + (1 - self.lstm_gru_balance_weight) * gru_output
        )
        return self.fc(rnn_output)

In [None]:
def convert_logits_to_labels(logits: torch.Tensor) -> torch.Tensor:
    """
    Convert logits into predicted labels for token classification.

    Softmax is strictly monotonic along the class axis, so the arg-max of the raw
    logits is the predicted class; skipping the softmax avoids an extra pass and
    the ties it can create when close logits round to equal probabilities.

    Args:
        logits: tensor whose last dimension indexes the classes.

    Returns:
        Integer tensor of class ids with the last dimension removed.
    """
    return torch.argmax(logits, dim=-1)


def calculate_loss(
    logits: torch.Tensor, labels: torch.Tensor, ignore_index: int = -100
) -> torch.Tensor:
    # TODO: Loss function excluding CLS, Start and End tokens: https://chatgpt.com/c/66dbbd83-4930-8007-b247-0d73fc2ee9af
    """
    Calculate the cross-entropy loss for token classification using raw logits.

    Args:
        logits: tensor whose last dimension indexes the classes.
        labels: integer class ids, one per logits position.
        ignore_index: label value excluded from the loss. The default (-100) is
            PyTorch's own default, so existing callers are unaffected; padding or
            special-token positions can be masked by labelling them with it.

    Returns:
        Scalar tensor: mean cross-entropy over the non-ignored positions.
    """
    # reshape() also accepts non-contiguous tensors, where view() would raise.
    flat_logits = logits.reshape(-1, logits.size(-1))
    flat_labels = labels.reshape(-1)
    return torch.nn.functional.cross_entropy(
        flat_logits, flat_labels, ignore_index=ignore_index
    )


In [None]:
# Instantiate the model on the first device reported by get_devices(); the bare
# `model` below displays the module tree as the cell output.
model = PIIDetectionModel().to(Config.torch_device[0])
model

In [None]:
import torch
import torch.nn as nn

def count_trainable_parameters(model: nn.Module) -> int:
    """
    Return the total number of scalar elements across all parameters of ``model``
    that have ``requires_grad`` set.
    """
    trainable_total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_total += parameter.numel()
    return trainable_total

def estimate_parameter_memory(model: nn.Module, dtype=torch.float32) -> tuple:
    """
    Estimate the memory needed to store a model's trainable parameters.

    Only parameters with ``requires_grad=True`` are counted (see
    ``count_trainable_parameters``); frozen parameters and buffers are excluded.

    Args:
        model: module to inspect.
        dtype: storage dtype assumed for every parameter. Any torch dtype is
            accepted, including integer dtypes (e.g. ``torch.int8``).

    Returns:
        tuple: ``(num_params, total_memory_mb)`` — the trainable parameter count
        and the estimated size in megabytes (1 MB = 1024 ** 2 bytes).
    """
    num_params = count_trainable_parameters(model)

    # element_size() reports bytes per element for every dtype, whereas
    # torch.finfo only accepts floating-point dtypes.
    bytes_per_element = torch.empty((), dtype=dtype).element_size()

    total_memory_mb = (num_params * bytes_per_element) / (1024 ** 2)

    return num_params, total_memory_mb

In [None]:
# Report trainable-parameter count and estimated size per sub-module. Each line is
# labelled with the component it describes (previously every line said "GRU").
for component_name, component in [
    ("backbone", model.model),
    ("BiLSTM", model.bilstm),
    ("GRU", model.gru),
    ("classifier head", model.fc),
]:
    num_params, total_memory_mb = estimate_parameter_memory(component)
    print(f"Number of trainable parameters in {component_name}: {num_params}")
    print(f"Estimated memory for {component_name} parameters: {total_memory_mb:.3f} MB\n")

In [None]:
# # Checking data loader
# for item in tqdm(train_loader):
#     break


# # Checking the model's forward pass
# model_output = model.forward(
#     input_ids=item["input_ids"].to(Config.torch_device[0]),
#     attention_mask=item["attention_mask"].to(Config.torch_device[0]),
#     token_type_ids=item["token_type_ids"].to(Config.torch_device[0]),
# )

# # from torchviz import make_dot
# # make_dot(model_output.last_hidden_state.mean(), params=dict(custom_model.named_parameters()))


# print(
#     "logits_shape: ",
#     model_output.shape,
#     "\ninput_ids_shape:",
#     item["input_ids"].shape,
#     "\noutput_labels_shape: ",
#     item["label_ids"].shape,
# )

# # Checking loss function
# loss, predicted_labels = calculate_loss(logits=model_output, labels=item["label_ids"].to(Config.torch_device[0]))
# loss

In [None]:
from torch.amp import autocast, GradScaler

learning_rate=5e-5
epochs=1

# Primary device; Config.torch_device is a list.
train_device = Config.torch_device[0]

# Loss scaling only applies to CUDA mixed precision, so disable the scaler
# explicitly elsewhere instead of relying on its "CUDA not available" warning.
scaler = GradScaler(enabled=train_device.type == "cuda")

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    print(f"\nEpoch {epoch+1}/{epochs}")
    progress_bar = tqdm(train_loader, desc="Training", leave=False)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(train_device)
        attention_mask = batch["attention_mask"].to(train_device)
        token_type_ids = batch["token_type_ids"].to(train_device)
        labels = batch["label_ids"].to(train_device)

        # Clear gradients left over from the previous step before the new backward.
        optimizer.zero_grad(set_to_none=True)

        with autocast(device_type=train_device.type):  # Enables mixed precision
            logits = model(input_ids, attention_mask, token_type_ids)
            loss = calculate_loss(logits=logits, labels=labels)

        scaler.scale(loss).backward()  # Scales loss for better precision
        scaler.step(optimizer)  # Update model weights
        scaler.update()

        # Read the loss once: every .item() forces a device synchronisation. The
        # value is shown in the progress bar rather than printed on each step.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({"loss": f"{batch_loss:.4f}"})

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_train_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss:.4f}")

In [None]:
def evaluate_model(model, dataloader, device):
    """
    Run ``model`` over ``dataloader`` without gradients and print the mean loss
    and a per-class classification report.

    Args:
        model: module called as ``model(input_ids, attention_mask, token_type_ids)``
            and returning logits.
        dataloader: iterable of batches with "input_ids", "attention_mask",
            "token_type_ids" and "label_ids" tensors; must support ``len()``.
        device: a ``torch.device`` (or device string), or a list/tuple of devices
            such as ``Config.torch_device``, in which case the first is used.

    Returns:
        None. Results are printed.
    """
    # Callers in this notebook pass Config.torch_device, which is a list.
    if isinstance(device, (list, tuple)):
        device = device[0]

    model.eval()
    total_eval_loss = 0.0
    all_preds = []
    all_labels = []

    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)

    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["label_ids"].to(device)

            logits = model(input_ids, attention_mask, token_type_ids)
            # calculate_loss returns only the loss; predictions are derived from
            # the logits separately.
            loss = calculate_loss(logits=logits, labels=labels)
            predicted_labels = convert_logits_to_labels(logits)
            total_eval_loss += loss.item()

            # Score real tokens only: padded positions carry a filler label and
            # would otherwise inflate the counts for class 0.
            real_tokens = attention_mask.bool()
            all_preds.extend(predicted_labels[real_tokens].tolist())
            all_labels.extend(labels[real_tokens].tolist())

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_eval_loss = total_eval_loss / max(len(dataloader), 1)
    print(f"Validation Loss: {avg_eval_loss:.4f}")

    print("Classification Report:")
    print(classification_report(all_labels, all_preds, zero_division=0))


# Evaluate the model - train set
# Config.torch_device is a list of devices; tensors must be moved to a single
# device, so pass the first entry.
evaluate_model(model, train_loader, device=Config.torch_device[0])

In [None]:
# Held-out dataset and loader, built the same way as the training loader.
test_dataset = PII_Dataset(tokenizer, df=test_df)

test_loader = DataLoader(
    test_dataset,
    batch_size=Config.batch_size,
    # Evaluation metrics do not depend on sample order; keeping the length-sorted
    # order makes runs deterministic.
    shuffle=False,
    collate_fn=Collate(tokenizer),
    # num_workers=Config.num_workers,
    # pin_memory=True,
    drop_last=False,
)

# Evaluate the model - test set
# Config.torch_device is a list of devices; pass the first entry.
evaluate_model(model, test_loader, device=Config.torch_device[0])

In [None]:
# Write the recorded CUDA allocator history to disk. The snapshot API is
# CUDA-only, so skip it when the notebook ran on MPS or CPU.
if torch.cuda.is_available():
    torch.cuda.memory._dump_snapshot("pytorch_gpu_ram_history.pickle")