In [2]:
import os
import warnings

# NOTE: this silences *every* Python warning for the rest of the session,
# including deprecation notices from the libraries imported below.
warnings.filterwarnings("ignore")

# Load the Hugging Face token.
# Prompt only when HF_TOKEN is missing or empty, so that "Restart & Run All"
# and non-interactive runs neither block on the prompt nor overwrite a token
# that is already configured in the environment.
# To replace a token entered earlier in this session, delete
# os.environ["HF_TOKEN"] and re-run this cell.
from getpass import getpass
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Enter the huggingface token: ")

from transformers import (AutoTokenizer,
                         AutoModelForCausalLM,
                         AutoConfig)
from datasets import load_dataset

In [3]:
from transformers import TextStreamer

In [1]:
import torch

# Choose the compute device first, then describe the GPU when one is present.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # Index 0 is the first visible CUDA device. Note that torch.version.cuda
    # is the CUDA toolkit version of the PyTorch build, not a property of the card.
    print("GPU name", torch.cuda.get_device_name(0))
    print("GPU version", torch.version.cuda)

GPU name Tesla T4
GPU version 12.6


In [4]:
def memory_info(device=None):
    """Print a summary of GPU memory usage, in GB.

    Four figures are printed: the total memory of the device, the memory
    currently allocated to tensors, the memory reserved by PyTorch's caching
    allocator, and the part of that reservation that is not in use
    (reserved minus allocated).

    Args:
        device: CUDA device to inspect (for example "cuda", "cuda:0" or a
            device index). When None, the current CUDA device is used.

    Returns:
        None. When CUDA is not available a short notice is printed instead
        of the report, rather than raising from the torch.cuda calls.
    """
    # The torch.cuda queries below fail on a CPU-only machine, so bail out early.
    if not torch.cuda.is_available():
        print("CUDA is not available; no GPU memory to report.")
        return

    if device is None:
        device = torch.cuda.current_device()

    total_memory = torch.cuda.get_device_properties(device).total_memory  # total memory of the device
    allocated_memory = torch.cuda.memory_allocated(device)  # currently occupied by tensors
    reserved_memory = torch.cuda.memory_reserved(device)  # held by the caching allocator

    # Free memory (within reserved): reserved by the allocator but not backing any tensor.
    free_mem = reserved_memory - allocated_memory

    print(f"Total GPU Memory: {total_memory/ 1024**3:.2f} GB")
    print(f"Allocated GPU Memory: {allocated_memory / 1024**3:.2f} GB")
    print(f"Reserved Memory: {reserved_memory / 1024**3:.2f} GB")
    print(f"Free (within reserved): {free_mem / 1024**3:.2f} GB")

memory_info()

Total GPU Memory: 14.74 GB
Allocated GPU Memory: 0.00 GB
Reserved Memory: 0.00 GB
Free (within reserved): 0.00 GB


# 1. Load the model to be trained

In [None]:
# Load the checkpoint with bfloat16 weights and use_cache=False (the
# generation key/value cache is not needed while training), then move the
# model onto the selected device.
pretrained_model = AutoModelForCausalLM.from_pretrained(
    'SanKav123/TinyBertModel', dtype=torch.bfloat16, use_cache=False
)
pretrained_model = pretrained_model.to(device)

# 2. Load Dataset

Here we subclass `torch.utils.data.Dataset` and override two of its methods, `__len__` and `__getitem__`, so that the dataset can interface with the trainer.

In [8]:
from torch.utils.data import Dataset
from datasets import load_dataset

class CustomDataset(Dataset):
    """Map-style dataset over a parquet file of tokenized samples.

    Every row is expected to provide an "input_ids" entry holding a sequence
    of integer token ids. Each sample is returned as a dict of two tensors,
    "input_ids" and "labels", with identical contents.
    """

    def __init__(self, args, split="train"):
        """Load the parquet data referenced by ``args.dataset_name``.

        Args:
            args: Object exposing a ``dataset_name`` attribute; its value is
                passed to ``load_dataset`` as ``data_files``.
            split: Name of the split to load; forwarded to ``load_dataset``.
        """
        self.args = args
        self.dataset = load_dataset(
            "parquet",
            data_files=args.dataset_name,
            split=split,
        )

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``.

        Args:
            idx: Index of the sample to fetch.

        Returns:
            dict with two int64 tensors, "input_ids" and "labels". The labels
            are an independent copy of the input ids; any shifting needed for
            next-token prediction is presumably done by the model (TODO confirm).
        """
        # Fetch the row once instead of decoding it separately for each field.
        token_ids = self.dataset[idx]["input_ids"]
        input_ids = torch.tensor(token_ids, dtype=torch.long)
        # clone() keeps the two tensors from sharing storage, as before.
        labels = input_ids.clone()

        # Return the sample as a dictionary
        return {"input_ids": input_ids, "labels": labels}


# 3. Configure Training Arguments


In [11]:
from dataclasses import dataclass, field
import transformers

@dataclass
class CustomArguments(transformers.TrainingArguments):
    """Training arguments for this notebook.

    Adds three dataset-related fields to ``transformers.TrainingArguments``
    and sets notebook-specific defaults for several of its existing fields.
    """

    # --- Dataset configuration ---
    # Path of the parquet file that holds the training samples.
    dataset_name: str = './data/packed_dataset.parquet'
    # Number of subprocesses for data preprocessing.
    num_proc: int = 1
    # Maximum sequence length.
    max_seq_length: int = 32

    # --- Core training configuration ---
    # Random seed, fixed for reproducibility.
    seed: int = 0
    # Optimizer: the AdamW implementation from PyTorch.
    optim: str = "adamw_torch"
    # Maximum number of training steps.
    max_steps: int = 30
    # Batch size per device during training.
    per_device_train_batch_size: int = 2

    # --- Other training configuration ---
    # Initial learning rate for the optimizer.
    learning_rate: float = 5e-5
    # Weight decay.
    weight_decay: float = 0
    # Number of steps in the learning-rate warmup phase.
    warmup_steps: int = 10
    # Type of learning-rate scheduler.
    lr_scheduler_type: str = "linear"
    # Gradient checkpointing, enabled to save memory.
    gradient_checkpointing: bool = True
    # Number of subprocesses for data loading.
    dataloader_num_workers: int = 2
    # bfloat16 precision for training on supported hardware.
    bf16: bool = True
    # Number of steps to accumulate gradients over before each update.
    gradient_accumulation_steps: int = 1

    # --- Logging configuration ---
    # How often (in steps) training information is logged.
    logging_steps: int = 3
    # Logging destination (e.g. WandB, TensorBoard); "none" disables reporting.
    report_to: str = "none"

    # --- Saving configuration (currently disabled) ---
    # save_strategy: str = "steps"      # can be replaced with "epoch"
    # save_steps: int = 3               # frequency of saving training checkpoints
    # save_total_limit: int = 2         # total number of checkpoints to keep

    

In [12]:
# Build a parser for CustomArguments and parse an explicit argument list
# (instead of sys.argv) that only sets the output directory where the model
# will be saved; every other field keeps its declared default.
parser = transformers.HfArgumentParser(CustomArguments)
(args,) = parser.parse_args_into_dataclasses(args=["--output_dir", "output"])

# 4. Setup the Training

In [13]:
# Build the training dataset. The parquet file named by args.dataset_name
# (default './data/packed_dataset.parquet', resolved against the current
# working directory) must already exist, otherwise loading fails.
train_dataset = CustomDataset(args, split="train")

FileNotFoundError: Unable to find '/content/./data/packed_dataset.parquet'