In [34]:
import os
import warnings 
warnings.filterwarnings("ignore")

from datasets import load_dataset
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer)
from transformers import LlamaConfig
from transformers import LlamaForCausalLM
from transformers import LlamaTokenizer
from transformers import TextStreamer

import torch 
def fix_torch_seed(seed: int = 42) -> None:
    """Seed PyTorch's random number generators and make cuDNN deterministic.

    Args:
        seed: Value used to seed the CPU generator and every CUDA device.

    Returns:
        None. The function only mutates global PyTorch state.
    """
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU rather than only the current one, and is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    # Give up cuDNN autotuning in exchange for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

fix_torch_seed()

In [20]:
# Choose the compute device once; GPU details are only reported when CUDA
# was actually selected.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    # torch.version.cuda is the CUDA version PyTorch was built with.
    print(f"GPU Version: {torch.version.cuda}")

GPU Name: Tesla T4
GPU Version: 12.6


# 1. Model Configuration

In [21]:
# Build a LlamaConfig with no arguments and print it so the library's default
# hyperparameters can be inspected before they are overridden.
config = LlamaConfig()
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.3",
  "use_cache": true,
  "vocab_size": 32000
}



In [22]:
# Next, update parameters to change the model architecture.
# These overrides shrink the default configuration (fewer layers, smaller
# hidden and MLP widths, fewer key/value heads).
config.num_hidden_layers = 12
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_key_value_heads = 8
# NOTE(review): in the recorded output the config still reports "head_dim": 128
# with 32 attention heads after these assignments, so the attention width is
# 32 * 128 = 4096 rather than hidden_size. Confirm this is intended; otherwise
# head_dim needs to be set together with hidden_size.
config.torch_dtype = "bfloat16"  # appears as "dtype" in the printed config
config.use_cache = False
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 12,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.3",
  "use_cache": false,
  "vocab_size": 32000
}



# 2. Weight Initialization

We have different ways of initializing weights of a model for training

- Random Weight Initialization
- Using an existing model for continued pre-training
- Downscaling an existing model
- Upscaling an existing model

# Random Weight Initialization 

Randomly initializing model weights sets all weights to values from a truncated normal distribution with mean 0 and standard deviation 0.02. Values beyond 2-sigma from the mean are set to 0.

In [28]:
# Build the model directly from the config (no pretrained checkpoint is loaded
# here), move it to `device`, and print the module tree to check layer shapes.
model = LlamaForCausalLM(config).to(device)
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (r

In [29]:
# `numel()` returns the total number of elements in a tensor, i.e. the product of all its dimensions, regardless of its shape or dimensionality.
def print_nparams(model):
    """Print the total number of parameters held by `model`.

    Every tensor yielded by ``model.parameters()`` contributes its element
    count (``numel()``) to the total. Nothing is returned.
    """
    total_params = sum(weights.numel() for weights in model.parameters())
    print(f"The total number of parameters is : {total_params}")


print_nparams(model)
# That is a lot of parameters to train.

The total number of parameters is : 342385664


In [None]:
# Reuse the tokenizer published with the pretrained LLaMA-7B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

# Stream decoded text to stdout while it is generated, leaving out the prompt
# and any special tokens.
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

# Tokenize the prompt into PyTorch tensors on the same device as the model.
input_ids = tokenizer(
    "What is the most complicated maths topics?",
    return_tensors="pt",
).to(device)

# Greedy decoding (sampling disabled) of at most 128 new tokens.
outputs = model.generate(
    **input_ids,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,
)

# The generated text is gibberish: the model has not been trained yet, so it
# needs substantial training before it shows any language understanding.

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


ा epleepleep alk alkleepleepleep alk alk alk teams });teamsweetााााा Items Items Items Items Items Items Itemsाा Items Items Itemsाााा Itemsा Items Items Items Items Items Items Items Itemsाााााा Items Items Items Items Items Items Items Items Musicalionali Musicalionali Musicalionali Musicalionali Musicalionali Musicalionali Musicalionali Musicalionali Musicalionali Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## Musical## назна## назна## назна назна назна назна назна назна назна
