# Lesson 4: Preparing your model for training

In [1]:
# Ignore insignificant warnings (ex: deprecation warnings)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set a seed value for reproducibility
import torch

def fix_torch_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

fix_torch_seed()

## Model configuration

Let's configure models based on Meta's Llama family of models. The transformers library has several tools for working with these models, refer [here](https://huggingface.co/docs/transformers/main/en/model_doc/llama).

We will start by creating a `LlamaConfig` object to configure the architecture of the model.

In [3]:
from transformers import LlamaConfig
config = LlamaConfig()
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.51.3",
  "use_cache": true,
  "vocab_size": 32000
}



Next, update parameters to change the model architecture:

In [4]:
config.num_hidden_layers = 12      # reduced from 32 to 12
config.hidden_size = 1024          # reduced 1/4 from 4096 to 1024
config.intermediate_size = 4096    # reduced 1/3 from 11008 to 4096 (dimension of MLP representations)
config.num_key_value_heads = 8     # reduced 1/4 from 32 to 8 (defaults to num_attention_heads=32)
config.torch_dtype = "bfloat16"    # for half-precision training
config.use_cache = False           # `True` is incompatible w/ gradient checkpointing
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 12,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": false,
  "vocab_size": 32000
}



## Weight initialization

In the next sections, we'll try four different ways to initialize the weights of a model for training:
1. **Random weight initialization**
2. **Using an existing model for continued pre-training**
3. **Downscaling an existing model**
4. **Upscaling an existing model**

### Random weight initialization

Randomly initializing model weights sets all weights to values from a truncated normal distribution with mean 0 and standard deviation of 0.02. Values beyond 2-sigma from the mean are set to 0.

In [5]:
from transformers import LlamaForCausalLM
model = LlamaForCausalLM(config)
print(model)

2025-06-07 10:14:16.671621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749291256.696543     128 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749291256.703524     128 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb)

In [6]:
def print_nparams(model):
    """Calculate the total number of model parameters"""
    # For each parameter tensor p (like a matrix of weights), numel() 
    # returns the number of elements in that tensor
    
    nparams = sum(p.numel() for p in model.parameters())
    print(f"The total number of parameters is: {nparams}")

print_nparams(model) 

The total number of parameters is: 342385664


In [7]:
# Take a look at a sample of the weights in a single layer.
layer_name = "model.layers.0.self_attn.q_proj.weight"

for name, param in model.named_parameters():
    if name == layer_name:
        print(f"First 30 weights of layer '{layer_name}':")
        print(param.data.view(-1)[:30])
        break

First 30 weights of layer 'model.layers.0.self_attn.q_proj.weight':
tensor([ 0.0217,  0.0204, -0.0008,  0.0087, -0.0089, -0.0291,  0.0166, -0.0086,
         0.0004,  0.0017, -0.0089, -0.0095, -0.0135, -0.0160, -0.0148, -0.0131,
         0.0104,  0.0200,  0.0348,  0.0110,  0.0082, -0.0011, -0.0233, -0.0113,
         0.0087,  0.0267, -0.0030, -0.0272, -0.0098, -0.0089])


In [8]:
# Load a tokenizer from Upstage Solar, 
# which is compatible with the Llama-2 tokenizer
from transformers import LlamaTokenizer

model_dir = "upstage/SOLAR-10.7B-v1.0"
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

# Run simple inference with prompt
from transformers import TextStreamer

prompt = "I am an engineer. I love"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

streamer = TextStreamer(
    tokenizer, 
    skip_prompt=True, 
    skip_special_tokens=True
)

outputs = model.generate(
    **inputs, 
    streamer=streamer, 
    use_cache=True, 
    max_new_tokens=128, 
    do_sample=False
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


ICK pom pom pom pom notable Heavy notable HeavyICKICKICKICK Heavy notable Heavy output notable Heavy output notable Heavy groundsICKICK groundsICK grounds Georg notable grounds grounds grounds grounds grounds grounds grounds grounds grounds很 similar很 similar很 similar grounds很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很很


> With random initialized weights, the output does not make sense. This is because the model, uptil now, is not aware of any language and just throw out random tokens.

In [9]:
# Remove the model from memory to avoid crashing the kernel
# NOTE: We're running large models in a limited environment. 
import gc
del model
del streamer
del outputs
gc.collect()

69

### Reuse general pretrained model weights

If you load an existing model, you can use it as is to continue pretraining on new data.

In [10]:
from transformers import AutoModelForCausalLM

model_name_or_path = "upstage/TinySolar-248m-4k"
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [11]:
# Remove the model from memory to avoid crashing the kernel.
# NOTE: We're running large models in a limited environment. 
del model
gc.collect()

108

### Downscaling from a general pretrained model

Here we'll downscale the tinySolar-248m-4k model from a 12 layer model to a 10 layer model.

In [12]:
from transformers import AutoTokenizer, AutoConfig

model_name_or_path = "upstage/TinySolar-248m-4k"
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [13]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb): 

In [14]:
print_nparams(model)  # 248013824 => 248M

The total number of parameters is: 248013824


We will now try to remove the middle two layers (layers 5 and 6) and update the configuration.

In [15]:
layers = model.model.layers
model.model.layers = layers[:5] + layers[-5:]

config = AutoConfig.from_pretrained(
    model_name_or_path,    
    num_hidden_layers=len(model.model.layers),
)
model.config = config

print_nparams(model)  # 217601024 => 217M

The total number of parameters is: 217601024


Clear the memory to avoid crashing the kernel.

In [16]:
# NOTE: We're running large models in a limited environment.
import gc
del model
gc.collect()

141

### Depth Upscaling from a general pretrained model

Here we are going to upscale the tinySolar-248m-4k model from 12 layers to 16 layers. Here are steps to take,
1. Configure a 16 layer model and initialize it with random weights
2. Load the 12 layer tinySolar-248m-4k model into memory
3. Copy the bottom 8 and top 8 layers from the 12 layer model and use them to overwrite the random weights of the 16 layer model
4. Copy over the embedding and classifying layers to replace the randomly initialized counterparts in the 16 layer model

In [17]:
config = LlamaConfig(
    num_hidden_layers=16,  # We want our model to have 16 final layers
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=32,
    num_key_value_heads=8,
    torch_dtype="bfloat16",
    use_cache=False 
)
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": false,
  "vocab_size": 32000
}



In [18]:
model = LlamaForCausalLM(config)
model = model.to(device="cuda", dtype=torch.bfloat16) # convert to bfloat16
print_nparams(model)  # 308839424 => 308M

The total number of parameters is: 308839424


In [19]:
model_name_or_path = "upstage/TinySolar-248m-4k"
pretrained_model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,    
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

print_nparams(pretrained_model) #  248013824 => 248M

The total number of parameters is: 248013824


In [20]:
from copy import deepcopy

# [all layers except the last 4] + [all layers from index 4 onwards]
# this result in duplicating the middle layers, effectively modifying the
# architecture for experimentation.
model.model.layers = deepcopy(pretrained_model.model.layers[:-4]) \
    + deepcopy(pretrained_model.model.layers[4:])
model.model.layers = torch.nn.ModuleList([layer.to(model.device) for 
                                          layer in model.model.layers])

# copy the embedding layer
model.model.embed_tokens = deepcopy(pretrained_model.model.embed_tokens).to(model.device)

# copy the final output projection layer
model.lm_head = deepcopy(pretrained_model.lm_head).to(model.device)

print(model.config)

LlamaConfig {
  "_attn_implementation_autoset": true,
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": false,
  "vocab_size": 32000
}



Check the number of parameters is still 308 million.

In [21]:
print_nparams(model)  # 308839424 => 308M

The total number of parameters is: 308839424


Try using the model for inference:

In [22]:
# Run simple inference to show no trained model
prompt = "I am an engineer. I love"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

streamer = TextStreamer(
    tokenizer, 
    skip_prompt=True, 
    skip_special_tokens=True,
    device = model.device
)

# with torch.no_grad():
#     output_ids = model.generate(
#         **inputs,
#         max_new_tokens=128,
#         do_sample=False
#     )

# output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
# print(output_text)
outputs = model.generate(
    **inputs, 
    streamer=streamer, 
    use_cache=True, 
    max_new_tokens=128, 
    do_sample=False
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


to work with people who are not afraid to look at the world and are not afraid to look at the world with a little bit of a twist.
I am a very humble person and I am very fortunate to have a great team of people who work hard to make sure that I am a great role model for my family and friends.
I am very fortunate to have a great team of people who are very passionate about their work and I am very fortunate to have a great team of people who are very passionate about their work and I am very fortunate to have a great team of people who are very passionate about their work and I


### Save the model to disk

Note the new model name here which reflects the 308 million parameters of the new, upscaled model. 

In [23]:
model.save_pretrained('./data/TinySolar-308m-4k-init')