# HyFlexPIM LLaMA3 Notebook

This notebook uses functions and classes from `hyflex_utils.py`.

In [1]:
from transformers import AutoTokenizer, LlamaForCausalLM, DataCollatorForLanguageModeling, get_scheduler, LlamaTokenizerFast, DataCollatorWithPadding
from datasets import load_dataset
from accelerate import Accelerator
from torch.utils.data import DataLoader
from itertools import chain
import torch
import math
from tqdm import tqdm
import os
from torch import nn
import torch.nn.functional as F
from peft import get_peft_model, LoraConfig, TaskType
import torch.nn as nn
import time
import copy
from hyflex_utils import *  # Import all necessary functions/classes
from transformers import AutoModelForCausalLM, AutoTokenizer
from functools import partial

2025-04-05 00:56:15.719608: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-05 00:56:15.746247: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" 
torch.cuda.set_device(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("Available GPUs:", torch.cuda.device_count())
print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

CUDA_VISIBLE_DEVICES: 0,1
Available GPUs: 1
Device: cuda
Current cuda device: 0
Count of using GPUs: 1


Step 1. Load the models (Run all)

In [5]:
#################################
token = "inser your own token from hugging face"  # copy from huggingface.co/settings/tokens; e.g., token = "hn~~~"
#################################


tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token=token, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token 

model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    token=token,
    torch_dtype=torch.bfloat16
).to(device)

datasets = load_dataset("ptb_text_only", "penn_treebank")
dataset_ = "ptb"

accelerator = Accelerator() 

column_names = datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]


tokenize_fn = partial(tokenize_function, tokenizer=tokenizer, text_column_name=text_column_name)


with accelerator.main_process_first():
    tokenized_datasets = datasets.map(
        tokenize_fn,
        batched=True,
        remove_columns=column_names,
        desc="Running tokenizer on dataset"
    )
    
    
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=CausalDataCollator(tokenizer), batch_size=1)
eval_dataloader = DataLoader(eval_dataset, shuffle=True, collate_fn=CausalDataCollator(tokenizer), batch_size=1)
model, train_dataloader, eval_dataloader = accelerator.prepare(model, train_dataloader, eval_dataloader)

model.to(torch.float32)

Running tokenizer on dataset:   0%|          | 0/3761 [00:00<?, ? examples/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): Lla

In [6]:
# Inference (You can skip this part)

evaluate_model(model, tokenizer, eval_dataloader)

Evaluating:   0%|          | 0/3370 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Evaluating:   3%|▎         | 91/3370 [00:04<02:25, 22.53it/s]


KeyboardInterrupt: 

In [5]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Change this if you are using a different task
    r=8,
    lora_alpha=32,
    lora_dropout=0.1, 
)

# Wrap your model with LoRA
model = get_peft_model(model, lora_config)

param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer], 'lr': 5e-6}]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters)

Step 2. Train the original model

In [None]:
# Training original model (We are going to apply SVD on the best trained model)

train_model(model, tokenizer, optimizer, train_dataloader, eval_dataloader, accelerator, epochs=1, grad_accum_steps=1)

In [None]:
#  Save the model (Not necessary, but recommend to store your best model)

save_dir = f'./model_{dataset_}'
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), f'./model_{dataset_}/finetuned_best')
print("Model weights saved successfully!")

In [6]:
# Must run this

model = model.merge_and_unload()

Step 3. SVD decomposition

In [None]:
# Replace linear layer with SVD decomposed & traniner layer

replace_linear_layer_llama(model)

In [8]:
# Run this after replace_linear_layer

model, train_dataloader, eval_dataloader = accelerator.prepare(model, train_dataloader, eval_dataloader)

In [None]:
# Inference (To check accuarcy degradation after svd, not necessary)

evaluate_model(model, tokenizer, eval_dataloader)

Step 4. Fine tuning & Gradient redistribution

In [None]:
#  You can change lr
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

#  Trainable Parameters 
trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]
print(f" Trainable Parameters Count: {len(trainable_params)}")


In [None]:
# Training SVD-ed model (w/ gradient redistribution)

train_model_gradient_saving(model, tokenizer, optimizer, train_dataloader, eval_dataloader, accelerator, epochs=1, grad_accum_steps=1)

In [None]:
# Save the trained model!

torch.save(model.state_dict(), f'./model_{dataset_}/finetuned_best_after_svd')

print("Model weights after svd saved successfully!")

In [None]:
#  Load model (optional)

# model.load_state_dict(torch.load(f'./model_{dataset_}/finetuned_best_after_svd'))

Step 5. Noise Injection simulation

In [None]:
std = 0.025  # Default is 0.025, but you can change based on your error rate
th = 25  # Default is 25 which means 25% of weights will be stored in SLC

load_gradients(model) # load gradient

model = apply_noise_to_llama(model, std, th) # Replace weights with noise injected weights. # Change clipping value in function get_clipping_value (inside hyflex_utils.py)

In [None]:
# Inference with noise injected model

evaluate_model(model, tokenizer, eval_dataloader)