In [1]:
!pip install unidecode



In [2]:
from dotenv import load_dotenv
import os
from pathlib import Path
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoConfig
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from loguru import logger

torch.cuda.memory._record_memory_history()



In [3]:
def get_devices() -> list:
    """
    Returns a list of available torch devices.
    Prioritizes CUDA (GPU) if available, followed by MPS (Apple Silicon), 
    and defaults to CPU if neither are available.
    """
    devices = []
    
    if torch.cuda.is_available():
        # Add all available CUDA devices
        for i in range(torch.cuda.device_count()):
            device = torch.device(f"cuda:{i}")
            devices.append(device)
            logger.info(f"Using CUDA device: {torch.cuda.get_device_name(i)} (cuda:{i})")
    
    elif torch.backends.mps.is_available():
        # If CUDA is not available, add MPS device (Apple Silicon)
        device = torch.device("mps")
        devices.append(device)
        logger.info("Using MPS (Apple Silicon) device.")
    
    else:
        # If neither CUDA nor MPS are available, default to CPU
        device = torch.device("cpu")
        devices.append(device)
        logger.info("Using CPU device.")

    return devices

In [4]:
class Config:
    # Model Config
    model_id = "microsoft/deberta-v3-base"
    model_architecture_config = AutoConfig.from_pretrained(
        model_id, output_hidden_states=True
    )

    # Training Config
    batch_size = 4
    max_length = 1024 * 2 + 256
    num_workers = 2

    # Hardware Config
    torch_device = get_devices()

    # Dataset
    dataset_file_path = (
        "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    )
    split_config = {
        "test_size": 0.2,
        "shuffle": True,
        "random_state": 10,
    }
    sample_only = False
    sample_size = 32

    # Labels:
    label2id = {
        "O": 0,
        "B-EMAIL": 1,
        "B-ID_NUM": 2,
        "B-NAME_STUDENT": 3,
        "B-PHONE_NUM": 4,
        "B-STREET_ADDRESS": 5,
        "B-URL_PERSONAL": 6,
        "B-USERNAME": 7,
        "I-ID_NUM": 8,
        "I-NAME_STUDENT": 9,
        "I-PHONE_NUM": 10,
        "I-STREET_ADDRESS": 11,
        "I-URL_PERSONAL": 12,
    }
    id2label = {
        "0": "O",
        "1": "B-EMAIL",
        "2": "B-ID_NUM",
        "3": "B-NAME_STUDENT",
        "4": "B-PHONE_NUM",
        "5": "B-STREET_ADDRESS",
        "6": "B-URL_PERSONAL",
        "7": "B-USERNAME",
        "8": "I-ID_NUM",
        "9": "I-NAME_STUDENT",
        "10": "I-PHONE_NUM",
        "11": "I-STREET_ADDRESS",
        "12": "I-URL_PERSONAL",
    }
    num_labels = len(label2id)


print("torch_device: ", Config.torch_device)

[32m2024-09-08 05:36:40.739[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_devices[0m:[36m14[0m - [1mUsing CUDA device: Tesla P100-PCIE-16GB (cuda:0)[0m


torch_device:  [device(type='cuda', index=0)]


In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    Config.model_id,
    use_fast=True,  # to avoid warnings
    clean_up_tokenization_spaces=False,  # to avoid warnings
    max_length=Config.max_length,
)



In [6]:
df = pd.read_json(Config.dataset_file_path)

if Config.sample_only:
    df = df[0 : Config.sample_size]

df.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [7]:
def replace_space(tokens_list):
    return ["[SPACE]" if x.isspace() else x for x in tokens_list]


def get_tokenized_tokens_length(text):
    return len(
        tokenizer(text, return_attention_mask=False, return_token_type_ids=False)[
            "input_ids"
        ]
    )


def data_preprocessing(df):
    df["tokens"] = df["tokens"].apply(replace_space)

    df["tokenized_tokens_length"] = df["full_text"].apply(
        lambda text: get_tokenized_tokens_length(text)
    )
    df = df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(
        drop=True
    )

    return df


df = data_preprocessing(df=df)
df

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,tokenized_tokens_length
0,13147,Think Twice and Make a Wise\n\nConcept Mapping...,"[Think, Twice, and, Make, a, Wise, [SPACE], Co...","[True, True, True, True, True, False, False, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",67
1,19614,Who are your target customers?\n\nPatients hav...,"[Who, are, your, target, customers, ?, [SPACE]...","[True, True, True, True, False, False, False, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",68
2,20900,Mind Mapping selection:\n\nMind mapping is us...,"[Mind, Mapping, [SPACE], selection, :, [SPACE]...","[True, True, False, False, False, False, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",76
3,11469,Construct Your Problems\n\nVisualization is on...,"[Construct, Your, Problems, [SPACE], Visualiza...","[True, True, False, False, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",81
4,18959,Reflection writing rubric it represents throug...,"[Reflection, writing, rubric, it, represents, ...","[True, True, True, True, True, True, False, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",84
...,...,...,...,...,...,...
6802,7745,Luis Gonzales Savitribai Phule Pune Universit...,"[Luis, Gonzales, [SPACE], Savitribai, Phule, P...","[True, True, False, True, True, True, True, Fa...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",2560
6803,9188,Design Thinking in Career Development and Coun...,"[Design, Thinking, in, Career, Development, an...","[True, True, True, True, True, True, True, Fal...","[O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-NAM...",2749
6804,10078,Overcoming Barriers - The Story of the Movie a...,"[Overcoming, Barriers, -, The, Story, of, the,...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",2831
6805,21720,"In this assignment, a reflective report will b...","[In, this, assignment, ,, a, reflective, repor...","[True, True, False, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",2900


In [8]:
df = df[df["tokenized_tokens_length"]<256]
df.shape

(242, 6)

In [9]:
if not Config.sample_only:
    Config.split_config["stratify"] = pd.cut(
        df["tokenized_tokens_length"], bins=10, labels=False
    )

train_df, test_df = train_test_split(df, **Config.split_config)
train_df.reset_index(inplace=True)
train_df = train_df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(drop=True)
test_df = test_df.sort_values(by="tokenized_tokens_length", ascending=True).reset_index(drop=True)
test_df.reset_index(inplace=True)
train_df.shape, test_df.shape

((193, 7), (49, 7))

In [10]:
def prepare_input(row, tokenizer):
    processed_text_tokens_list = []
    char_map = []
    label_char_map = {}

    for index in range(len(row["tokens"])):
        token = unidecode(row["tokens"][index])

        whitespace = row["trailing_whitespace"][index]
        label = row["labels"][index]

        processed_text_tokens_list.append(token)
        char_map.extend([index] * len(token))

        label_char_map[index] = label

        if whitespace:
            processed_text_tokens_list.append(" ")
            char_map.append(-1)

    # Now, we tokenize the concatenated 'text' and return offsets mappings along with 'char_map'.
    processed_text = "".join(processed_text_tokens_list)
    tokenized = tokenizer(
        processed_text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=Config.max_length,
    )

    length = len(tokenized.input_ids)

    tokenized_info = {
        **tokenized,
        "processed_text": processed_text,
        "length": length,
        "char_map": char_map,  # Now includes mapping to original tokens
        "label_char_map": label_char_map,
    }
    return tokenized_info


def get_labels(tokenized_info):
    label_list = []
    offset_map = tokenized_info["offset_mapping"]
    for index, offset_map_item in enumerate(offset_map):
        if offset_map_item == (0, 0):
            label_list.extend(["Start_End"])
            continue

        char_map_item = tokenized_info["char_map"][
            offset_map_item[0] : offset_map_item[1]
        ]
        char_map_item_filtered = [element for element in char_map_item if element != -1]

        label_item = set(
            [
                tokenized_info["label_char_map"][element]
                for element in char_map_item_filtered
            ]
        )

        if len(label_item) != 1:
            if tokenized_info["input_ids"][index] in [507]:
                label_item = "O"

            else:
                raise Exception(
                    "\n"
                    f"Token ID: {tokenized_info['input_ids'][index]}\n"
                    f"Token: {tokenizer.decode(tokenized_info['input_ids'][index])}\n"
                    f"Offset: {offset_map_item}\n"
                    f"Text: {tokenized_info['processed_text'][ offset_map_item[0] : offset_map_item[1] ]}\n"
                    f"Character Map: {char_map_item}\n"
                    f"Filtered Character Map {char_map_item_filtered}\n"
                    f"Labels: {label_item}"
                )

        label_list.extend(list(label_item))

    if len(label_list) != len(tokenized_info["input_ids"]):
        raise Exception("Error: Size of label_list and input_ids are not same.")
    return label_list

In [11]:
# Check - Test - Dataset
for index in tqdm(train_df.index):
    tokenized_info = prepare_input(train_df.iloc[index], tokenizer)
    label_item = get_labels(tokenized_info)

print("Awesome - Everything is fine")

100%|██████████| 193/193 [00:00<00:00, 212.22it/s]

Awesome - Everything is fine





In [12]:
class PII_Dataset(Dataset):
    def __init__(self, tokenizer, df):
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index].to_dict()
        row.pop("tokenized_tokens_length")

        tokenized_info = prepare_input(self.df.iloc[index], tokenizer)
        label_item = get_labels(tokenized_info)
        tokenized_info["document_id"] = row.pop("document")
        tokenized_info["labels"] = label_item
        tokenized_info["label_ids"] = [
            0 if item == "Start_End" else Config.label2id[item] for item in label_item
        ]

        if len(tokenized_info["label_ids"]) != len(tokenized_info["input_ids"]):
            raise Exception(
                f"Error in tokenized_info - length of lavel_ids and input_ids are not same: {tokenized_info}"
            )
        return tokenized_info

In [13]:
class Collate:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        # List of keys to extract from each sample
        keys = [
            "document_id",
            "input_ids",
            "token_type_ids",
            "attention_mask",
#             "offset_mapping",
#             "processed_text",
#             "length",
#             "char_map",
#             "label_char_map",
#             "labels",
            "label_ids",
        ]

        # Populate the output dictionary using a loop
        output = {key: [sample[key] for sample in batch] for key in keys}

        batch_max = max([len(ids) for ids in output["input_ids"]])

        # Add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [
                list(s) + (batch_max - len(s)) * [self.tokenizer.pad_token_id]
                for s in output["input_ids"]
            ]
            output["attention_mask"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["attention_mask"]
            ]
            output["token_type_ids"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["token_type_ids"]
            ]
#             output["offset_mapping"] = [
#                 list(s) + (batch_max - len(s)) * [(0, 0)]
#                 for s in output["offset_mapping"]
#             ]
            output["label_ids"] = [
                list(s) + (batch_max - len(s)) * [0] for s in output["label_ids"]
            ]

        # Convert to tensors and move to the specified device
        keys = ["document_id", "input_ids", "attention_mask", "token_type_ids", "label_ids"]
        for key in keys:
            output[key] = torch.tensor(output[key], dtype=torch.long) # .to(Config.torch_device[0])

        return output

In [14]:
train_dataset = PII_Dataset(tokenizer, df=train_df)

train_loader = DataLoader(
    train_dataset,
    batch_size=Config.batch_size,
    shuffle=True,
    collate_fn=Collate(tokenizer),
    # num_workers=Config.num_workers,
    pin_memory=True,
    drop_last=False,
)

In [15]:
# Checking data loader
for item in tqdm(train_loader):
    if len(item["input_ids"]) != len(item["label_ids"]):
        raise Exception(
            "Error: length of input_ids and label_ids after padding are not same."
        )
    pass

item.keys()

100%|██████████| 49/49 [00:01<00:00, 45.64it/s]


dict_keys(['document_id', 'input_ids', 'token_type_ids', 'attention_mask', 'label_ids'])

In [22]:
class PIIDetectionModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.num_labels = Config.model_architecture_config.num_labels

        self.model = AutoModel.from_pretrained(
            Config.model_id,
            ignore_mismatched_sizes=True,
            config=Config.model_architecture_config,
            # torch_dtype = "auto"
        )
        self.model.gradient_checkpointing_enable()
        self.model.resize_token_embeddings(len(tokenizer))
        self.dropout = torch.nn.Dropout(
            Config.model_architecture_config.hidden_dropout_prob
        )

        self.bilstm = torch.nn.LSTM(
            Config.model_architecture_config.hidden_size,
            (Config.model_architecture_config.hidden_size) // 2,
            num_layers=2,
            dropout=Config.model_architecture_config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )

        self.gru = torch.nn.GRU(
            Config.model_architecture_config.hidden_size,
            Config.model_architecture_config.hidden_size // 2,
            num_layers=2,
            dropout=Config.model_architecture_config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )
        
        self.lstm_gru_balance_weight = torch.nn.Parameter(
            torch.tensor(0.5), requires_grad=False
        )

        self.fc = torch.nn.Linear(
            Config.model_architecture_config.hidden_size, Config.num_labels
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )  # returns ['last_hidden_state', 'hidden_states']
        sequence_output = output[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, hc = self.bilstm(sequence_output)
        gru_output, _ = self.gru(sequence_output)

        rnn_output = (
            self.lstm_gru_balance_weight * lstm_output
            + (1 - self.lstm_gru_balance_weight) * gru_output
        )
        logits = self.fc(rnn_output)
        return logits

In [23]:
def convert_logits_to_labels(logits: torch.Tensor) -> torch.Tensor:
    """
    Convert logits into predicted labels for token classification.
    """
    probabilities = torch.softmax(logits, dim=-1)
    predicted_labels = torch.argmax(probabilities, dim=-1)
    return predicted_labels


def calculate_loss(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    # TODO: Loss function excluding CLS, Start and End tokens: https://chatgpt.com/c/66dbbd83-4930-8007-b247-0d73fc2ee9af
    """
    Calculate the cross-entropy loss for token classification using raw logits.
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
    return loss


In [24]:
model = PIIDetectionModel().to(Config.torch_device[0])
model

PIIDetectionModel(
  (model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128001, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            

In [25]:
import torch
import torch.nn as nn

def count_trainable_parameters(model: nn.Module) -> int:
    """
    Count the number of trainable parameters in a PyTorch model.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def estimate_parameter_memory(model: nn.Module, dtype=torch.float32) -> float:
    """
    Estimate the memory required to store the parameters of a PyTorch model. The estimated memory in megabytes (MB).
    """
    num_params = count_trainable_parameters(model)
    
    # Memory per element in bytes, e.g., 4 bytes for float32, 2 bytes for float16
    bytes_per_element = torch.finfo(dtype).bits // 8
    
    # Total memory in bytes
    total_memory_bytes = num_params * bytes_per_element
    
    # Convert to megabytes (MB)
    total_memory_mb = total_memory_bytes / (1024 ** 2)
    
    return num_params, total_memory_mb

In [26]:
num_params, total_memory_mb = estimate_parameter_memory(model.model)
print(f"Number of trainable parameters in GRU: {num_params}")
print(f"Estimated memory for GRU parameters: {total_memory_mb:.3f} MB\n")


num_params, total_memory_mb = estimate_parameter_memory(model.bilstm)
print(f"Number of trainable parameters in GRU: {num_params}")
print(f"Estimated memory for GRU parameters: {total_memory_mb:.3f} MB\n")


num_params, total_memory_mb = estimate_parameter_memory(model.gru)
print(f"Number of trainable parameters in GRU: {num_params}")
print(f"Estimated memory for GRU parameters: {total_memory_mb:.3f} MB\n")


num_params, total_memory_mb = estimate_parameter_memory(model.fc)
print(f"Number of trainable parameters in GRU: {num_params}")
print(f"Estimated memory for GRU parameters: {total_memory_mb:.3f} MB\n")

Number of trainable parameters in GRU: 183755520
Estimated memory for GRU parameters: 700.972 MB

Number of trainable parameters in GRU: 7090176
Estimated memory for GRU parameters: 27.047 MB

Number of trainable parameters in GRU: 5317632
Estimated memory for GRU parameters: 20.285 MB

Number of trainable parameters in GRU: 9997
Estimated memory for GRU parameters: 0.038 MB



In [27]:
# # Checking data loader
# for item in tqdm(train_loader):
#     break


# # Chechking models forward pass
# model_output = model.forward(
#     input_ids=item["input_ids"].to(Config.torch_device[0]),
#     attention_mask=item["attention_mask"].to(Config.torch_device[0]),
#     token_type_ids=item["token_type_ids"].to(Config.torch_device[0]),
# )

# # from torchviz import make_dot
# # make_dot(model_output.last_hidden_state.mean(), params=dict(custom_model.named_parameters()))


# print(
#     "logits_shape: ",
#     model_output.shape,
#     "\ninput_ids_shape:",
#     item["input_ids"].shape,
#     "\noutput_labels_shape: ",
#     item["label_ids"].shape,
# )

# # Checking loss function
# loss, predicted_labels = calculate_loss(logits=model_output, labels=item["label_ids"].to(Config.torch_device[0]))
# loss

In [29]:
from torch.amp import autocast, GradScaler

learning_rate = 5e-5
epochs = 1
accumulation_steps = 4  # Number of steps to accumulate gradients before an update

scaler = GradScaler()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    print(f"\nEpoch {epoch + 1}/{epochs}")
    progress_bar = tqdm(train_loader, desc="Training", leave=False)

    # Reset gradients at the start of the epoch
    optimizer.zero_grad()

    for step, batch in enumerate(progress_bar):
        input_ids = batch["input_ids"].to(Config.torch_device[0])
        attention_mask = batch["attention_mask"].to(Config.torch_device[0])
        token_type_ids = batch["token_type_ids"].to(Config.torch_device[0])
        labels = batch["label_ids"].to(Config.torch_device[0])

        print(f"Token Length: {len(input_ids[0])}")

        with autocast(device_type=Config.torch_device[0].type):  # Mixed precision
            logits = model(input_ids, attention_mask, token_type_ids)
            loss = calculate_loss(logits=logits, labels=labels)
            loss = loss / accumulation_steps  # Normalize loss over accumulation steps

        scaler.scale(loss).backward()  # Scale loss and accumulate gradients

        # Step the optimizer every `accumulation_steps`
        if (step + 1) % accumulation_steps == 0 or (step + 1) == len(train_loader):
            scaler.step(optimizer)  # Update model weights
            scaler.update()  # Update the scaler for next batch
            optimizer.zero_grad()  # Reset gradients after each update

        # Detach inputs and labels to avoid memory accumulation
        input_ids.detach()
        attention_mask.detach()
        token_type_ids.detach()
        labels.detach()

        print("loss: ", loss.item() * accumulation_steps)  # Adjust the loss back

        total_loss += loss.item() * accumulation_steps  # Accumulated loss for the batch
        progress_bar.set_postfix({"loss": f"{loss.item() * accumulation_steps:.4f}"})

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}")


Epoch 1/1


Training:   0%|          | 0/49 [00:00<?, ?it/s]

Token Length: 350


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Training:   2%|▏         | 1/49 [00:00<00:33,  1.44it/s, loss=2.6371]

loss:  2.6371052265167236
Token Length: 319


Training:   4%|▍         | 2/49 [00:01<00:25,  1.87it/s, loss=2.6248]

loss:  2.6247506141662598
Token Length: 276


Training:   6%|▌         | 3/49 [00:01<00:21,  2.16it/s, loss=2.6017]

loss:  2.6016509532928467
Token Length: 324


Training:   8%|▊         | 4/49 [00:02<00:22,  2.04it/s, loss=2.6213]

loss:  2.6212730407714844
Token Length: 220


Training:  10%|█         | 5/49 [00:02<00:18,  2.38it/s, loss=2.2671]

loss:  2.2671186923980713
Token Length: 320


Training:  12%|█▏        | 6/49 [00:02<00:18,  2.34it/s, loss=2.2679]

loss:  2.2679474353790283
Token Length: 299


Training:  14%|█▍        | 7/49 [00:03<00:17,  2.38it/s, loss=2.2789]

loss:  2.2789130210876465
Token Length: 317


Training:  16%|█▋        | 8/49 [00:03<00:17,  2.32it/s, loss=2.2697]

loss:  2.269651412963867
Token Length: 292


Training:  18%|█▊        | 9/49 [00:04<00:16,  2.36it/s, loss=1.9707]

loss:  1.9706964492797852
Token Length: 306


Training:  20%|██        | 10/49 [00:04<00:16,  2.37it/s, loss=1.9490]

loss:  1.948979377746582
Token Length: 304


Training:  22%|██▏       | 11/49 [00:04<00:15,  2.38it/s, loss=1.9792]

loss:  1.9791597127914429
Token Length: 302


Training:  24%|██▍       | 12/49 [00:05<00:15,  2.34it/s, loss=2.0036]

loss:  2.0036354064941406
Token Length: 301


Training:  27%|██▋       | 13/49 [00:05<00:15,  2.38it/s, loss=1.6854]

loss:  1.6853837966918945
Token Length: 294


Training:  29%|██▊       | 14/49 [00:06<00:14,  2.41it/s, loss=1.7192]

loss:  1.7191652059555054
Token Length: 342


Training:  31%|███       | 15/49 [00:06<00:14,  2.34it/s, loss=1.6837]

loss:  1.6836830377578735
Token Length: 275


Training:  33%|███▎      | 16/49 [00:06<00:13,  2.38it/s, loss=1.7093]

loss:  1.709290623664856
Token Length: 325


Training:  35%|███▍      | 17/49 [00:07<00:13,  2.34it/s, loss=1.4141]

loss:  1.414137601852417
Token Length: 280


Training:  37%|███▋      | 18/49 [00:07<00:12,  2.42it/s, loss=1.4125]

loss:  1.412506103515625
Token Length: 291


Training:  39%|███▉      | 19/49 [00:08<00:12,  2.44it/s, loss=1.3809]

loss:  1.3808770179748535
Token Length: 283


Training:  41%|████      | 20/49 [00:08<00:12,  2.42it/s, loss=1.3822]

loss:  1.3821861743927002
Token Length: 335


Training:  43%|████▎     | 21/49 [00:09<00:11,  2.38it/s, loss=1.1420]

loss:  1.1419973373413086
Token Length: 657


Training:  45%|████▍     | 22/49 [00:09<00:15,  1.75it/s, loss=1.2019]

loss:  1.2018520832061768
Token Length: 322


Training:  47%|████▋     | 23/49 [00:10<00:13,  1.89it/s, loss=1.1519]

loss:  1.1519324779510498
Token Length: 319


Training:  49%|████▉     | 24/49 [00:10<00:12,  1.95it/s, loss=1.0967]

loss:  1.0967382192611694
Token Length: 330


Training:  51%|█████     | 25/49 [00:11<00:11,  2.02it/s, loss=0.8952]

loss:  0.8951634168624878
Token Length: 280


Training:  53%|█████▎    | 26/49 [00:11<00:10,  2.18it/s, loss=0.9094]

loss:  0.9093732833862305
Token Length: 276


Training:  55%|█████▌    | 27/49 [00:12<00:09,  2.29it/s, loss=0.8975]

loss:  0.8975078463554382
Token Length: 251


Training:  57%|█████▋    | 28/49 [00:12<00:08,  2.40it/s, loss=0.9586]

loss:  0.9585933685302734
Token Length: 304


Training:  59%|█████▉    | 29/49 [00:12<00:08,  2.41it/s, loss=0.7064]

loss:  0.706447422504425
Token Length: 242


Training:  61%|██████    | 30/49 [00:13<00:07,  2.53it/s, loss=0.7335]

loss:  0.7334791421890259
Token Length: 304


Training:  63%|██████▎   | 31/49 [00:13<00:07,  2.50it/s, loss=0.7026]

loss:  0.7026001811027527
Token Length: 270


Training:  65%|██████▌   | 32/49 [00:14<00:06,  2.47it/s, loss=0.6906]

loss:  0.6905978918075562
Token Length: 267


Training:  67%|██████▋   | 33/49 [00:14<00:06,  2.55it/s, loss=0.5385]

loss:  0.5384976267814636
Token Length: 270


Training:  69%|██████▉   | 34/49 [00:14<00:05,  2.57it/s, loss=0.5258]

loss:  0.5257629156112671
Token Length: 234


Training:  71%|███████▏  | 35/49 [00:15<00:05,  2.70it/s, loss=0.5558]

loss:  0.5557637214660645
Token Length: 346


Training:  73%|███████▎  | 36/49 [00:15<00:05,  2.46it/s, loss=0.5575]

loss:  0.5575355291366577
Token Length: 290


Training:  76%|███████▌  | 37/49 [00:16<00:04,  2.46it/s, loss=0.4267]

loss:  0.4267157316207886
Token Length: 284


Training:  78%|███████▊  | 38/49 [00:16<00:04,  2.51it/s, loss=0.4156]

loss:  0.41561782360076904
Token Length: 338


Training:  80%|███████▉  | 39/49 [00:16<00:04,  2.40it/s, loss=0.4624]

loss:  0.462415874004364
Token Length: 313


Training:  82%|████████▏ | 40/49 [00:17<00:03,  2.32it/s, loss=0.4497]

loss:  0.4497317969799042
Token Length: 361


Training:  84%|████████▎ | 41/49 [00:17<00:03,  2.27it/s, loss=0.3414]

loss:  0.3414402902126312
Token Length: 323


Training:  86%|████████▌ | 42/49 [00:18<00:03,  2.26it/s, loss=0.3164]

loss:  0.3164200484752655
Token Length: 360


Training:  88%|████████▊ | 43/49 [00:18<00:02,  2.21it/s, loss=0.3306]

loss:  0.33059149980545044
Token Length: 315


Training:  90%|████████▉ | 44/49 [00:19<00:02,  2.20it/s, loss=0.3286]

loss:  0.3285524547100067
Token Length: 343


Training:  92%|█████████▏| 45/49 [00:19<00:01,  2.21it/s, loss=0.2760]

loss:  0.276047945022583
Token Length: 307


Training:  94%|█████████▍| 46/49 [00:20<00:01,  2.25it/s, loss=0.2912]

loss:  0.29122108221054077
Token Length: 313


Training:  96%|█████████▌| 47/49 [00:20<00:00,  2.30it/s, loss=0.3467]

loss:  0.34672462940216064
Token Length: 336


Training:  98%|█████████▊| 48/49 [00:20<00:00,  2.21it/s, loss=0.2360]

loss:  0.23596426844596863
Token Length: 133


                                                                      

loss:  0.19346950948238373
Epoch 1/1, Training Loss: 1.1742




In [None]:
# def evaluate_model(model, dataloader, device):
#     model.eval()
#     total_eval_loss = 0
#     all_preds = []
#     all_labels = []
    
#     progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)

#     with torch.no_grad():
#         for batch in progress_bar:
#             input_ids = batch["input_ids"].to(device[0])
#             attention_mask = batch["attention_mask"].to(device[0])
#             token_type_ids = batch["token_type_ids"].to(device[0])
#             labels = batch["label_ids"].to(device[0])

#             logits = model(input_ids, attention_mask, token_type_ids)
#             loss, predicted_labels = calculate_loss(logits=logits, labels=labels)
#             total_eval_loss += loss.item()

#             all_preds.extend(predicted_labels.cpu().numpy().flatten())
#             all_labels.extend(labels.cpu().numpy().flatten())

#     avg_eval_loss = total_eval_loss / len(dataloader)
#     print(f"Validation Loss: {avg_eval_loss:.4f}")

#     print("Classification Report:")
#     print(classification_report(all_labels, all_preds, zero_division=0))


# # Evaluate the model - train set
# evaluate_model(model, train_loader, device= Config.torch_device)

In [None]:
# test_dataset = PII_Dataset(tokenizer, df=test_df)

# test_loader = DataLoader(
#     test_dataset,
#     batch_size=Config.batch_size,
#     shuffle=True,
#     collate_fn=Collate(tokenizer),
#     # num_workers=Config.num_workers,
#     # pin_memory=True,
#     drop_last=False,
# )

# # Evaluate the model - test set
# evaluate_model(model, test_loader, device= Config.torch_device)

In [None]:
torch.cuda.memory._dump_snapshot("pytorch_gpu_ram_history.pickle")