In [1]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [2]:
from dotenv import load_dotenv
import os
from pathlib import Path
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoConfig
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from loguru import logger



In [3]:
def get_devices() -> list:
    """
    Returns a list of available torch devices.
    Prioritizes CUDA (GPU) if available, followed by MPS (Apple Silicon), 
    and defaults to CPU if neither are available.
    """
    devices = []
    
    if torch.cuda.is_available():
        # Add all available CUDA devices
        for i in range(torch.cuda.device_count()):
            device = torch.device(f"cuda:{i}")
            devices.append(device)
            logger.info(f"Using CUDA device: {torch.cuda.get_device_name(i)} (cuda:{i})")
    
    elif torch.backends.mps.is_available():
        # If CUDA is not available, add MPS device (Apple Silicon)
        device = torch.device("mps")
        devices.append(device)
        logger.info("Using MPS (Apple Silicon) device.")
    
    else:
        # If neither CUDA nor MPS are available, default to CPU
        device = torch.device("cpu")
        devices.append(device)
        logger.info("Using CPU device.")

    return devices

In [4]:
class Config:
    # Model Config
    model_id = "microsoft/deberta-v3-base" # "microsoft/deberta-v3-large"
    model_architecture_config = AutoConfig.from_pretrained(
        model_id, output_hidden_states=True
    )

    # Training Config
    batch_size = 4
    max_length = 1024 * 2 + 256
    num_workers = 2
    learning_rate = 5e-4
    epochs = 10
    accumulation_steps = 2  # Number of steps to accumulate gradients before an update


    # Hardware Config
    torch_device = get_devices()

    # Dataset
    dataset_file_path = (
        "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    )
    split_config = {
        "test_size": 0.2,
        "shuffle": True,
        "random_state": 10,
    }
    sample_only = False
    sample_size = 32

    # Labels:
    label2id = {
        "O": 0,
        "B-EMAIL": 1,
        "B-ID_NUM": 2,
        "B-NAME_STUDENT": 3,
        "B-PHONE_NUM": 4,
        "B-STREET_ADDRESS": 5,
        "B-URL_PERSONAL": 6,
        "B-USERNAME": 7,
        "I-ID_NUM": 8,
        "I-NAME_STUDENT": 9,
        "I-PHONE_NUM": 10,
        "I-STREET_ADDRESS": 11,
        "I-URL_PERSONAL": 12,
    }
    id2label = {
        "0": "O",
        "1": "B-EMAIL",
        "2": "B-ID_NUM",
        "3": "B-NAME_STUDENT",
        "4": "B-PHONE_NUM",
        "5": "B-STREET_ADDRESS",
        "6": "B-URL_PERSONAL",
        "7": "B-USERNAME",
        "8": "I-ID_NUM",
        "9": "I-NAME_STUDENT",
        "10": "I-PHONE_NUM",
        "11": "I-STREET_ADDRESS",
        "12": "I-URL_PERSONAL",
    }
    num_labels = len(label2id)


print("torch_device: ", Config.torch_device)

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

[32m2024-09-11 16:28:17.228[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_devices[0m:[36m14[0m - [1mUsing CUDA device: Tesla P100-PCIE-16GB (cuda:0)[0m


torch_device:  [device(type='cuda', index=0)]


In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    Config.model_id,
    use_fast=True,  # to avoid warnings
    clean_up_tokenization_spaces=False,  # to avoid warnings
    max_length=Config.max_length,
)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [6]:
df = pd.read_json(Config.dataset_file_path)

if Config.sample_only:
    df = df[0 : Config.sample_size]

df.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [7]:
def replace_space(tokens_list):
    return ["[SPACE]" if x.isspace() else x for x in tokens_list]


def get_tokenized_tokens_length(text):
    return len(
        tokenizer(text, return_attention_mask=False, return_token_type_ids=False)[
            "input_ids"
        ]
    )


def data_preprocessing(df):
    df["tokens"] = df["tokens"].apply(replace_space)

    df["tokenized_tokens_length"] = df["full_text"].apply(
        lambda text: get_tokenized_tokens_length(text)
    )
    df = df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(
        drop=True
    )

    return df


df = data_preprocessing(df=df)
df

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,tokenized_tokens_length
0,13147,Think Twice and Make a Wise\n\nConcept Mapping...,"[Think, Twice, and, Make, a, Wise, [SPACE], Co...","[True, True, True, True, True, False, False, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",67
1,19614,Who are your target customers?\n\nPatients hav...,"[Who, are, your, target, customers, ?, [SPACE]...","[True, True, True, True, False, False, False, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",68
2,20900,Mind Mapping selection:\n\nMind mapping is us...,"[Mind, Mapping, [SPACE], selection, :, [SPACE]...","[True, True, False, False, False, False, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",76
3,11469,Construct Your Problems\n\nVisualization is on...,"[Construct, Your, Problems, [SPACE], Visualiza...","[True, True, False, False, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",81
4,18959,Reflection writing rubric it represents throug...,"[Reflection, writing, rubric, it, represents, ...","[True, True, True, True, True, True, False, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",84
...,...,...,...,...,...,...
6802,7745,Luis Gonzales Savitribai Phule Pune Universit...,"[Luis, Gonzales, [SPACE], Savitribai, Phule, P...","[True, True, False, True, True, True, True, Fa...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",2560
6803,9188,Design Thinking in Career Development and Coun...,"[Design, Thinking, in, Career, Development, an...","[True, True, True, True, True, True, True, Fal...","[O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-NAM...",2749
6804,10078,Overcoming Barriers - The Story of the Movie a...,"[Overcoming, Barriers, -, The, Story, of, the,...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",2831
6805,21720,"In this assignment, a reflective report will b...","[In, this, assignment, ,, a, reflective, repor...","[True, True, False, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",2900


In [8]:
import plotly.express as px
fig = px.histogram(x=df["tokenized_tokens_length"])
fig.show()

In [9]:
if not Config.sample_only:
    Config.split_config["stratify"] = pd.cut(
        df["tokenized_tokens_length"], bins=10, labels=False
    )

train_df, test_df = train_test_split(df, **Config.split_config)
train_df.reset_index(inplace=True)
train_df = train_df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(drop=True)
test_df.reset_index(inplace=True)
train_df.shape, test_df.shape

((5445, 7), (1362, 7))

In [10]:
def prepare_input(row, tokenizer):
    processed_text_tokens_list = []
    char_map = []
    label_char_map = {}

    for index in range(len(row["tokens"])):
        token = unidecode(row["tokens"][index])

        whitespace = row["trailing_whitespace"][index]
        label = row["labels"][index]

        processed_text_tokens_list.append(token)
        char_map.extend([index] * len(token))

        label_char_map[index] = label

        if whitespace:
            processed_text_tokens_list.append(" ")
            char_map.append(-1)

    # Now, we tokenize the concatenated 'text' and return offsets mappings along with 'char_map'.
    processed_text = "".join(processed_text_tokens_list)
    tokenized = tokenizer(
        processed_text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=Config.max_length,
    )

    length = len(tokenized.input_ids)

    tokenized_info = {
        **tokenized,
        "processed_text": processed_text,
        "length": length,
        "char_map": char_map,  # Now includes mapping to original tokens
        "label_char_map": label_char_map,
    }
    return tokenized_info


def get_labels(tokenized_info):
    label_list = []
    offset_map = tokenized_info["offset_mapping"]
    for index, offset_map_item in enumerate(offset_map):
        if offset_map_item == (0, 0):
            label_list.extend(["Start_End"])
            continue

        char_map_item = tokenized_info["char_map"][
            offset_map_item[0] : offset_map_item[1]
        ]
        char_map_item_filtered = [element for element in char_map_item if element != -1]

        label_item = set(
            [
                tokenized_info["label_char_map"][element]
                for element in char_map_item_filtered
            ]
        )

        if len(label_item) != 1:
            if tokenized_info["input_ids"][index] in [507]:
                label_item = "O"

            else:
                raise Exception(
                    "\n"
                    f"Token ID: {tokenized_info['input_ids'][index]}\n"
                    f"Token: {tokenizer.decode(tokenized_info['input_ids'][index])}\n"
                    f"Offset: {offset_map_item}\n"
                    f"Text: {tokenized_info['processed_text'][ offset_map_item[0] : offset_map_item[1] ]}\n"
                    f"Character Map: {char_map_item}\n"
                    f"Filtered Character Map {char_map_item_filtered}\n"
                    f"Labels: {label_item}"
                )

        label_list.extend(list(label_item))

    if len(label_list) != len(tokenized_info["input_ids"]):
        raise Exception("Error: Size of label_list and input_ids are not same.")
    return label_list

In [11]:
# Check - Test - Dataset
for index in tqdm(train_df.index):
    tokenized_info = prepare_input(train_df.iloc[index], tokenizer)
    label_item = get_labels(tokenized_info)

print("Awesome - Everything is fine")

100%|██████████| 5445/5445 [01:27<00:00, 62.46it/s]

Awesome - Everything is fine





In [12]:
class PII_Dataset(Dataset):
    def __init__(self, tokenizer, df):
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index].to_dict()
        row.pop("tokenized_tokens_length")

        tokenized_info = prepare_input(self.df.iloc[index], tokenizer)
        label_item = get_labels(tokenized_info)
        tokenized_info["document_id"] = row.pop("document")
        tokenized_info["labels"] = label_item
        tokenized_info["label_ids"] = [
            0 if item == "Start_End" else Config.label2id[item] for item in label_item
        ]

        if len(tokenized_info["label_ids"]) != len(tokenized_info["input_ids"]):
            raise Exception(
                f"Error in tokenized_info - length of lavel_ids and input_ids are not same: {tokenized_info}"
            )
        return tokenized_info

In [13]:
class Collate:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        # List of keys to extract from each sample
        keys = [
            "document_id",
            "input_ids",
            "token_type_ids",
            "attention_mask",
#             "offset_mapping",
#             "processed_text",
#             "length",
#             "char_map",
#             "label_char_map",
#             "labels",
            "label_ids",
        ]

        # Populate the output dictionary using a loop
        output = {key: [sample[key] for sample in batch] for key in keys}

        batch_max = max([len(ids) for ids in output["input_ids"]])

        # Add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [
                list(s) + (batch_max - len(s)) * [self.tokenizer.pad_token_id]
                for s in output["input_ids"]
            ]
            output["attention_mask"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["attention_mask"]
            ]
            output["token_type_ids"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["token_type_ids"]
            ]
#             output["offset_mapping"] = [
#                 list(s) + (batch_max - len(s)) * [(0, 0)]
#                 for s in output["offset_mapping"]
#             ]
            output["label_ids"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["label_ids"]
            ]

        # Convert to tensors and move to the specified device
        keys = ["document_id", "input_ids", "attention_mask", "token_type_ids", "label_ids"]
        for key in keys:
            output[key] = torch.tensor(output[key], dtype=torch.long) # .to(Config.torch_device[0])

        return output

In [14]:
train_dataset = PII_Dataset(tokenizer, df=train_df)

train_loader = DataLoader(
    train_dataset,
    batch_size=Config.batch_size,
    shuffle=True,
    collate_fn=Collate(tokenizer),
    # num_workers=Config.num_workers,
    pin_memory=True,
    drop_last=False,
)

In [15]:
# Checking data loader
for item in tqdm(train_loader):
    if len(item["input_ids"]) != len(item["label_ids"]):
        raise Exception(
            "Error: length of input_ids and label_ids after padding are not same."
        )
    pass

item.keys()

100%|██████████| 1362/1362 [01:32<00:00, 14.75it/s]


dict_keys(['document_id', 'input_ids', 'token_type_ids', 'attention_mask', 'label_ids'])

In [16]:
class PIIDetectionModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.num_labels = Config.model_architecture_config.num_labels

        self.model = AutoModel.from_pretrained(
            Config.model_id,
            ignore_mismatched_sizes=True,
            config=Config.model_architecture_config,
            # torch_dtype = "auto"
        )
        self.model.gradient_checkpointing_enable()
        self.model.resize_token_embeddings(len(tokenizer))
        self.dropout = torch.nn.Dropout(
            Config.model_architecture_config.hidden_dropout_prob
        )

        self.bilstm = torch.nn.LSTM(
            Config.model_architecture_config.hidden_size,
            (Config.model_architecture_config.hidden_size) // 2,
            num_layers=2,
            dropout=Config.model_architecture_config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )

        self.gru = torch.nn.GRU(
            Config.model_architecture_config.hidden_size,
            Config.model_architecture_config.hidden_size // 2,
            num_layers=2,
            dropout=Config.model_architecture_config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )
        
        self.lstm_gru_balance_weight = torch.nn.Parameter(
            torch.tensor(0.5), requires_grad=False
        )

        self.fc = torch.nn.Linear(
            Config.model_architecture_config.hidden_size, Config.num_labels
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )  # returns ['last_hidden_state', 'hidden_states']
        sequence_output = output[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, hc = self.bilstm(sequence_output)
        gru_output, _ = self.gru(sequence_output)

        rnn_output = (
            self.lstm_gru_balance_weight * lstm_output
            + (1 - self.lstm_gru_balance_weight) * gru_output
        )
        logits = self.fc(rnn_output)
        return logits

In [17]:
def convert_logits_to_labels(logits: torch.Tensor) -> torch.Tensor:
    """
    Convert logits into predicted labels for token classification.
    """
    probabilities = torch.softmax(logits, dim=-1)
    predicted_labels = torch.argmax(probabilities, dim=-1)
    return predicted_labels


def calculate_loss(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    # TODO: Loss function excluding CLS, Start and End tokens: https://chatgpt.com/c/66dbbd83-4930-8007-b247-0d73fc2ee9af
    """
    Calculate the cross-entropy loss for token classification using raw logits.
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
    return loss


In [18]:
model = PIIDetectionModel().to(Config.torch_device[0])
model

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

PIIDetectionModel(
  (model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128001, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            

In [19]:
import torch
import torch.nn as nn

def count_trainable_parameters(model: nn.Module) -> int:
    """
    Count the number of trainable parameters in a PyTorch model.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def estimate_parameter_memory(model: nn.Module, dtype=torch.float32) -> float:
    """
    Estimate the memory required to store the parameters of a PyTorch model. The estimated memory in megabytes (MB).
    """
    num_params = count_trainable_parameters(model)
    
    # Memory per element in bytes, e.g., 4 bytes for float32, 2 bytes for float16
    bytes_per_element = torch.finfo(dtype).bits // 8
    
    # Total memory in bytes
    total_memory_bytes = num_params * bytes_per_element
    
    # Convert to megabytes (MB)
    total_memory_mb = total_memory_bytes / (1024 ** 2)
    
    return num_params, total_memory_mb

In [20]:
num_params, total_memory_mb = estimate_parameter_memory(model.model)
print(f"Number of trainable parameters in GRU: {num_params}")
print(f"Estimated memory for GRU parameters: {total_memory_mb:.3f} MB\n")


num_params, total_memory_mb = estimate_parameter_memory(model.bilstm)
print(f"Number of trainable parameters in GRU: {num_params}")
print(f"Estimated memory for GRU parameters: {total_memory_mb:.3f} MB\n")


num_params, total_memory_mb = estimate_parameter_memory(model.gru)
print(f"Number of trainable parameters in GRU: {num_params}")
print(f"Estimated memory for GRU parameters: {total_memory_mb:.3f} MB\n")


num_params, total_memory_mb = estimate_parameter_memory(model.fc)
print(f"Number of trainable parameters in GRU: {num_params}")
print(f"Estimated memory for GRU parameters: {total_memory_mb:.3f} MB\n")

Number of trainable parameters in GRU: 183755520
Estimated memory for GRU parameters: 700.972 MB

Number of trainable parameters in GRU: 7090176
Estimated memory for GRU parameters: 27.047 MB

Number of trainable parameters in GRU: 5317632
Estimated memory for GRU parameters: 20.285 MB

Number of trainable parameters in GRU: 9997
Estimated memory for GRU parameters: 0.038 MB



In [21]:
# # Checking data loader
# for item in tqdm(train_loader):
#     break


# # Chechking models forward pass
# model_output = model.forward(
#     input_ids=item["input_ids"].to(Config.torch_device[0]),
#     attention_mask=item["attention_mask"].to(Config.torch_device[0]),
#     token_type_ids=item["token_type_ids"].to(Config.torch_device[0]),
# )

# # from torchviz import make_dot
# # make_dot(model_output.last_hidden_state.mean(), params=dict(custom_model.named_parameters()))


# print(
#     "logits_shape: ",
#     model_output.shape,
#     "\ninput_ids_shape:",
#     item["input_ids"].shape,
#     "\noutput_labels_shape: ",
#     item["label_ids"].shape,
# )

# # Checking loss function
# loss, predicted_labels = calculate_loss(logits=model_output, labels=item["label_ids"].to(Config.torch_device[0]))
# loss

In [22]:
scaler = torch.GradScaler()
optimizer = optim.Adam(model.parameters(), lr=Config.learning_rate)

for epoch in range(Config.epochs):
    model.train()
    total_loss = 0

    print(f"\nEpoch {epoch + 1}/{Config.epochs}")
    progress_bar = tqdm(train_loader, desc="Training", leave=False)

    # Reset gradients at the start of the epoch
    optimizer.zero_grad()

    for step, batch in enumerate(progress_bar):
        input_ids = batch["input_ids"].to(Config.torch_device[0])
        attention_mask = batch["attention_mask"].to(Config.torch_device[0])
        token_type_ids = batch["token_type_ids"].to(Config.torch_device[0])
        labels = batch["label_ids"].to(Config.torch_device[0])

        # print(f"Token Length: {len(input_ids[0])}")

        with torch.amp.autocast(device_type=Config.torch_device[0].type):  # Mixed precision
            logits = model(input_ids, attention_mask, token_type_ids)
            loss = calculate_loss(logits=logits, labels=labels)
            loss = loss / Config.accumulation_steps  # Normalize loss over accumulation steps

        scaler.scale(loss).backward()  # Scale loss and accumulate gradients

        # Step the optimizer every `accumulation_steps`
        if (step + 1) % Config.accumulation_steps == 0 or (step + 1) == len(train_loader):
            scaler.step(optimizer)  # Update model weights
            scaler.update()  # Update the scaler for next batch
            optimizer.zero_grad()  # Reset gradients after each update

        # Detach inputs and labels to avoid memory accumulation
        input_ids.detach()
        attention_mask.detach()
        token_type_ids.detach()
        labels.detach()

        total_loss += loss.item() * Config.accumulation_steps  # Accumulated loss for the batch
        progress_bar.set_postfix({"loss": f"{loss.item() * Config.accumulation_steps:.4f}"})

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{Config.epochs}, Training Loss: {avg_train_loss:.4f}")


Epoch 1/10



`torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.

                                                                          

Epoch 1/10, Training Loss: 0.0137

Epoch 2/10


                                                                          

Epoch 2/10, Training Loss: 0.0062

Epoch 3/10


                                                                          

Epoch 3/10, Training Loss: 0.0062

Epoch 4/10


                                                                          

Epoch 4/10, Training Loss: 0.0061

Epoch 5/10


                                                                          

Epoch 5/10, Training Loss: 0.0063

Epoch 6/10


                                                                          

Epoch 6/10, Training Loss: 0.0061

Epoch 7/10


                                                                          

Epoch 7/10, Training Loss: 0.0062

Epoch 8/10


                                                                          

Epoch 8/10, Training Loss: 0.0062

Epoch 9/10


                                                                          

Epoch 9/10, Training Loss: 0.0061

Epoch 10/10


                                                                          

Epoch 10/10, Training Loss: 0.0060




In [23]:
def evaluate_model(model, dataloader, device):
    model.eval()
    total_eval_loss = 0
    all_preds = []
    all_labels = []
    
    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)

    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device[0])
            attention_mask = batch["attention_mask"].to(device[0])
            token_type_ids = batch["token_type_ids"].to(device[0])
            labels = batch["label_ids"].to(device[0])

            logits = model(input_ids, attention_mask, token_type_ids)
            loss = calculate_loss(logits=logits, labels=labels)
            total_eval_loss += loss.item()
            
            predicted_labels = convert_logits_to_labels(logits=logits)
            all_preds.extend(predicted_labels.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())

    avg_eval_loss = total_eval_loss / len(dataloader)
    print(f"Validation Loss: {avg_eval_loss:.4f}")

    print("Classification Report:")
    print(classification_report(all_labels, all_preds, zero_division=0))

In [24]:
# Evaluate the model - train set
evaluate_model(model, train_loader, device= Config.torch_device)

                                                               

Validation Loss: 0.0060
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   6876310
           1       0.00      0.00      0.00       238
           2       0.00      0.00      0.00       270
           3       0.00      0.00      0.00      1606
           4       0.00      0.00      0.00         5
           6       0.00      0.00      0.00      1397
           7       0.00      0.00      0.00        15
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00      1203
          10       0.00      0.00      0.00        39
          12       0.00      0.00      0.00         7

    accuracy                           1.00   6881093
   macro avg       0.09      0.09      0.09   6881093
weighted avg       1.00      1.00      1.00   6881093



In [25]:
test_dataset = PII_Dataset(tokenizer, df=test_df)

test_loader = DataLoader(
    test_dataset,
    batch_size=Config.batch_size,
    shuffle=True,
    collate_fn=Collate(tokenizer),
    # num_workers=Config.num_workers,
    pin_memory=True,
    drop_last=False,
)

# Evaluate the model - test set
evaluate_model(model, test_loader, device= Config.torch_device)

                                                             

Validation Loss: 0.0071
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1730547
           1       0.00      0.00      0.00        27
           2       0.00      0.00      0.00        93
           3       0.00      0.00      0.00       461
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00       480
           7       0.00      0.00      0.00         6
           9       0.00      0.00      0.00       252
          11       0.00      0.00      0.00        28

    accuracy                           1.00   1731902
   macro avg       0.10      0.10      0.10   1731902
weighted avg       1.00      1.00      1.00   1731902



In [26]:
torch.save(model.state_dict(), "model_saved_state_dict")