In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

# Load the Hugging Face token. Prompt only when it is not already present in
# the environment, so re-running this cell (or running non-interactively with
# HF_TOKEN exported) neither blocks on input nor overwrites a valid token.
from getpass import getpass
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Enter the huggingface token: ")

from transformers import (AutoTokenizer,
                         AutoModelForCausalLM,
                         AutoConfig)
from datasets import load_dataset

In [2]:
from transformers import TextStreamer

In [3]:
import torch

# Select the compute device; when a GPU is present, also report its name and
# the value of torch.version.cuda.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
    print("GPU name", torch.cuda.get_device_name(0))
    print("GPU version", torch.version.cuda)

GPU name Tesla T4
GPU version 12.6


In [4]:
def memory_info(device=None):
    """Print total, allocated, reserved and free-within-reserved GPU memory.

    Args:
        device: CUDA device to inspect (anything the ``torch.cuda`` query
            functions accept, e.g. ``"cuda"`` or ``0``). Defaults to the
            current CUDA device.

    Returns:
        None. When CUDA is not available a short notice is printed instead of
        raising, so the notebook also runs on CPU-only machines.
    """
    if not torch.cuda.is_available():
        # The torch.cuda queries below cannot be answered without a GPU.
        print("CUDA is not available; no GPU memory to report.")
        return

    if device is None:
        device = torch.cuda.current_device()

    bytes_per_unit = 1024**3
    total_memory = torch.cuda.get_device_properties(device).total_memory  # total memory of the GPU
    allocated_memory = torch.cuda.memory_allocated(device)  # currently allocated by tensors
    reserved_memory = torch.cuda.memory_reserved(device)  # reserved by the caching allocator

    # Free memory (within reserved): reserved by the allocator but not in use by tensors.
    free_mem = reserved_memory - allocated_memory

    print(f"Total GPU Memory: {total_memory / bytes_per_unit:.2f} GB")
    print(f"Allocated GPU Memory: {allocated_memory / bytes_per_unit:.2f} GB")
    print(f"Reserved Memory: {reserved_memory / bytes_per_unit:.2f} GB")
    print(f"Free (within reserved): {free_mem / bytes_per_unit:.2f} GB")

memory_info()

Total GPU Memory: 14.74 GB
Allocated GPU Memory: 0.00 GB
Reserved Memory: 0.00 GB
Free (within reserved): 0.00 GB


# 1. Load the model to be trained

In [5]:
# Load the pretrained causal LM with bfloat16 weights and move it to the selected device.
# use_cache = False is passed at load time (the training arguments defined later enable gradient checkpointing).
# NOTE(review): the weights themselves are bf16 here, so the optimizer presumably updates bf16 parameters directly — confirm this is intended rather than float32 weights with bf16 autocast.
pretrained_model = AutoModelForCausalLM.from_pretrained(
    'SanKav123/TinyBertModel',
    dtype = torch.bfloat16, 
    use_cache = False
).to(device)

# 2. Load Dataset

Here we subclass PyTorch's `Dataset` and override two of its methods, `__len__` and `__getitem__`, so that it can interface with the trainer.

In [25]:
from torch.utils.data import Dataset
from datasets import load_dataset

class CustomDataset(Dataset):
    """Map-style dataset that serves pre-tokenized rows for causal-LM training.

    Every row of the underlying Hugging Face dataset must provide an
    ``"input_ids"`` sequence; the same ids are returned as both the model
    input and the labels.
    """

    def __init__(self, args, split="train"):
        """Load ``args.dataset_name`` for the requested ``split``.

        Args:
            args: Object exposing a ``dataset_name`` attribute.
            split: Dataset split to load; defaults to ``"train"``.
        """
        self.args = args
        self.dataset = load_dataset(args.dataset_name, split=split)

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of two int64 tensors.

        Returns:
            ``{"input_ids": tensor, "labels": tensor}`` where both tensors
            hold the same token ids but do not share storage.
        """
        # Index the dataset only once per sample instead of once per field.
        token_ids = self.dataset[idx]["input_ids"]
        input_ids = torch.tensor(token_ids, dtype=torch.long)
        # Independent copy, so in-place edits of one tensor never touch the other.
        labels = input_ids.clone()

        return {"input_ids": input_ids, "labels": labels}


# 3. Configure Training Arguments


In [35]:
from dataclasses import dataclass, field
import transformers

@dataclass
class CustomArguments(transformers.TrainingArguments):
    """Training arguments for this notebook: dataset settings plus overridden defaults."""

    # --- Dataset configuration ---
    dataset_name: str = 'SanKav123/packed_dataset.parquet'  # Dataset to load
    num_proc: int = 1  # Number of subprocesses for data preprocessing
    max_seq_length: int = 32  # Maximum sequence length

    # --- Core training configuration ---
    seed: int = 0  # Random seed used for initialization (reproducibility)
    optim: str = "adamw_torch"  # Optimizer: AdamW as implemented by PyTorch
    max_steps: int = 200  # Maximum number of training steps
    per_device_train_batch_size: int = 2  # Batch size per device during training

    # --- Other training configuration ---
    learning_rate: float = 5e-5  # Initial learning rate for the optimizer
    weight_decay: float = 0  # Weight decay
    warmup_steps: int = 10  # Number of steps in the learning-rate warmup phase
    lr_scheduler_type: str = "linear"  # Type of learning-rate scheduler
    gradient_checkpointing: bool = True  # Gradient checkpointing to save memory
    dataloader_num_workers: int = 2  # Number of subprocesses for data loading
    bf16: bool = True  # Use bfloat16 precision for training on supported hardware
    gradient_accumulation_steps: int = 1  # Steps to accumulate gradients before an update

    # --- Logging configuration ---
    logging_steps: int = 3  # How often (in steps) training information is logged
    report_to: str = "none"  # Logging destination (e.g. WandB, TensorBoard)

    # --- Saving configuration (currently disabled) ---
    # save_strategy: str = field(default="steps")          # Can be replaced with "epoch"
    # save_steps: int = field(default=3)                   # Frequency of saving training checkpoint
    # save_total_limit: int = field(default=2)             # The total number of checkpoints to be saved

    

In [37]:
# Build a parser for CustomArguments and parse a fixed argument list that sets
# the output directory where the model will be saved.
parser = transformers.HfArgumentParser(CustomArguments)
(args,) = parser.parse_args_into_dataclasses(args=["--output_dir", "output"])

# 4. Setup the Training

In [28]:
# Build the training dataset from the parsed arguments (CustomDataset defaults to the "train" split).
train_dataset = CustomDataset(args = args)

In [32]:
# Check the shape of the first sample's input_ids tensor
print("Input Shape: ", train_dataset[0]["input_ids"].shape)

Input Shape:  torch.Size([32])


# 5. Run the trainer and monitor the loss

In [33]:
from transformers import Trainer, TrainingArguments, TrainerCallback

# Custom callback that records what the trainer logs (including the loss value)
class LossLoggingCallback(TrainerCallback):
    """Trainer callback that accumulates every logged dict in ``self.logs``."""

    def __init__(self):
        # Log records collected so far, in arrival order.
        self.logs = []

    def on_log(self, args, state, control, logs = None, **kwargs):
        """Store ``logs`` when provided; calls without logs are ignored."""
        if logs is None:
            return
        self.logs.append(logs)

# Create the callback instance that is handed to the trainer
loss_logging_callback = LossLoggingCallback()

In [38]:
# Create an instance of the Hugging Face Trainer from the transformers library, then call its train() method to start the training run.

trainer = Trainer(
    model = pretrained_model, 
    args = args, 
    train_dataset = train_dataset, 
    eval_dataset = None, 
    callbacks = [loss_logging_callback]
)

trainer.train()

Step,Training Loss
3,5.4692
6,5.6164
9,4.4916
12,4.9011
15,6.0023
18,6.6213
21,7.5243
24,7.1726
27,7.3762
30,7.8966


TrainOutput(global_step=200, training_loss=6.521214308738709, metrics={'train_runtime': 42.289, 'train_samples_per_second': 9.459, 'train_steps_per_second': 4.729, 'total_flos': 21202285363200.0, 'train_loss': 6.521214308738709, 'epoch': 0.06668889629876626})

In [None]:
# from huggingface_hub import HfApi
# HfApi().delete_repo("SanKav123/PreTrainedModel")

# Create the public model repository that will receive the trained model.
# exist_ok=True keeps this cell re-runnable: without it, create_repo raises
# once the repository already exists.
from huggingface_hub import create_repo
create_repo(repo_id="SanKav123/output", repo_type="model", private=False, exist_ok=True)

RepoUrl('https://huggingface.co/SanKav123/output', endpoint='https://huggingface.co', repo_type='model', repo_id='SanKav123/output')

In [52]:
# Push the trained model to the Hugging Face Hub.
trainer.push_to_hub()

Saving model checkpoint to output
Configuration saved in output/config.json
Configuration saved in output/generation_config.json
Model weights saved in output/model.safetensors
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  .../output/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  .../output/model.safetensors:   7%|6         | 41.9MB /  618MB            

CommitInfo(commit_url='https://huggingface.co/SanKav123/output/commit/37ce4836248382eac7bcb4c7b92318c05a0a5b69', commit_message='End of training', commit_description='', oid='37ce4836248382eac7bcb4c7b92318c05a0a5b69', pr_url=None, repo_url=RepoUrl('https://huggingface.co/SanKav123/output', endpoint='https://huggingface.co', repo_type='model', repo_id='SanKav123/output'), pr_revision=None, pr_num=None)

In [53]:
# Load the bert-base-uncased tokenizer. It uses WordPiece, a subword algorithm
# that resembles BPE (Byte Pair Encoding).
# NOTE(review): the added [BOS]/[EOS] tokens get ids 30522/30523 (see output),
# while the trained model's config reports bos_token_id=1 / eos_token_id=2 —
# confirm the tokenizer and model agree on special-token ids.
model_path_or_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_path_or_name,
    bos_token = "[BOS]", # Define the BOS token string
    eos_token = "[EOS]", # Define the EOS token string
    use_fast = True
)
# Inspect the underlying tokenization model through the public
# `backend_tokenizer` property rather than the private `_tokenizer` attribute.
print(tokenizer.backend_tokenizer.model)
print(f"BOS token ID: {tokenizer.bos_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/tokenizer_config.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,


WordPiece(unk_token="[UNK]", continuing_subword_prefix="##", max_input_chars_per_word=100, vocab={"[PAD]":0, "[unused0]":1, "[unused1]":2, "[unused2]":3, "[unused3]":4, ...})
BOS token ID: 30522
EOS token ID: 30523


In [56]:
# Reload the model that was pushed to the Hub; device_map="auto" decides its placement.
model_name_or_path = "SanKav123/output"
model2 = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    dtype=torch.bfloat16,  # `torch_dtype` is deprecated in favor of `dtype`
)


config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--SanKav123--output/snapshots/37ce4836248382eac7bcb4c7b92318c05a0a5b69/config.json
`torch_dtype` is deprecated! Use `dtype` instead!
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.3",
  "use_cache": false,
  "vocab_size": 32000
}



model.safetensors:   0%|          | 0.00/618M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--SanKav123--output/snapshots/37ce4836248382eac7bcb4c7b92318c05a0a5b69/model.safetensors
Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "use_cache": false
}



generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--SanKav123--output/snapshots/37ce4836248382eac7bcb4c7b92318c05a0a5b69/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "use_cache": false
}

Could not locate the custom_generate/generate.py inside SanKav123/output.


In [59]:
# NOTE(review): `inputs` is only assigned in the next cell, so on a fresh top-to-bottom run this cell raises NameError — TODO move it below that cell or remove it.
inputs

{'input_ids': tensor([[ 101, 1045, 2572, 2019, 3992, 1012, 1045, 2293,  102]],
       device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [61]:
prompt = "Can you talk about and operation"

# Tokenize the prompt and place the tensors on the device the model lives on.
# model2 is loaded with device_map="auto", so use the model's own device
# instead of the global `device` string.
inputs = tokenizer(prompt, return_tensors="pt").to(model2.device)
# Drop token_type_ids (if present) so they are not forwarded to generate().
inputs.pop("token_type_ids", None)

# Stream the generated text as it is produced, skipping the prompt and special tokens.
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

# Sample up to 64 new tokens.
outputs = model2.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=64,
    do_sample=True,
    temperature=1.0,
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


a ). you = you is a $ ) $, \ frac { ^ and - 5 -. i was. are n ( is of the. that,, \ frac { ^ -. is jacobite. that that $ [unused573] 2 ) 1 $ $ ) $ $ $ $ $ \ frac {


In [66]:
# Generate again from the same inputs, this time without passing any generation arguments.
tokens = model2.generate(**inputs)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [71]:
# Map the generated ids of the first sequence back to their token strings.
tokenizer.convert_ids_to_tokens(tokens[0])

['[CLS]',
 'can',
 'you',
 'talk',
 'about',
 'and',
 'operation',
 '[SEP]',
 'of',
 'the',
 ',',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you',
 'you']

In [72]:
# The results are not great yet: the model needs more training, and the data needs considerably more processing.