# HyFlexPIM LLaMA3 Notebook

This notebook uses functions and classes from `hyflex_utils.py`.

In [None]:
from transformers import AutoTokenizer, LlamaForCausalLM, DataCollatorForLanguageModeling, get_scheduler, LlamaTokenizerFast, DataCollatorWithPadding
from datasets import load_dataset
from accelerate import Accelerator
from torch.utils.data import DataLoader
from itertools import chain
import torch
import math
from tqdm import tqdm
import os
from torch import nn
import torch.nn.functional as F
from peft import get_peft_model, LoraConfig, TaskType
import torch.nn as nn
import time
import copy
from hyflex_utils import *  # Import all necessary functions/classes
import matplotlib.pyplot as plt

In [None]:
# Restrict this process to physical GPUs 0 and 1. This is set before the first
# CUDA call below so that it is in place when CUDA gets initialized.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Keep the original preference for the second visible GPU, but fall back to
# GPU 0 (or to the CPU) instead of raising on machines with fewer devices.
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
device = torch.device("cuda" if cuda_available else "cpu")

print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("Available GPUs:", torch.cuda.device_count())
print('Device:', device)
if cuda_available:
    # current_device() raises when no usable GPU exists, so only query it here.
    print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

Step 1. Load the models (Run all)

In [None]:
# Tokenizer: the pad token is overwritten with the EOS token so that batches
# can be padded.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

# Base model, loaded in bfloat16 and moved to the device chosen above.
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    torch_dtype=torch.bfloat16,
).to(device)

# Penn Treebank text; `dataset_` is the short tag used in checkpoint paths.
datasets = load_dataset("ptb_text_only", "penn_treebank")
dataset_ = "ptb"

accelerator = Accelerator()

# Use the "text" column when present, otherwise the first column.
column_names = datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

# Tokenize every split; the raw columns are dropped from the result.
with accelerator.main_process_first():
    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
        desc="Running tokenizer on dataset",
    )

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Only the training data is shuffled; evaluation keeps a fixed order so that
# repeated evaluations iterate over the samples identically.
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=CausalDataCollator(tokenizer), batch_size=1)
eval_dataloader = DataLoader(eval_dataset, shuffle=False, collate_fn=CausalDataCollator(tokenizer), batch_size=1)
model, train_dataloader, eval_dataloader = accelerator.prepare(model, train_dataloader, eval_dataloader)

# Cast the weights to float32 for the following steps. Assigning the result
# keeps the notebook from printing the whole module tree as cell output.
model = model.to(torch.float32)

In [None]:
# Inference (You can skip this part)
# Optional check of the model on the validation dataloader before any
# fine-tuning; `evaluate_model` comes from hyflex_utils.

evaluate_model(model, tokenizer, eval_dataloader)

In [4]:
# LoRA adapter settings for causal language modelling: rank 8, alpha 32,
# dropout 0.1. Change task_type if you fine-tune for a different task.
lora_config = LoraConfig(
    lora_dropout=0.1,
    lora_alpha=32,
    r=8,
    task_type=TaskType.CAUSAL_LM,
)

# Attach the LoRA adapters to the model.
model = get_peft_model(model, lora_config)

# One AdamW parameter group holding every named parameter, lr = 5e-6.
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    dict(params=[weight for _, weight in param_optimizer], lr=5e-6),
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters)

Step 2. Train the original model

In [None]:
# Training original model (We are going to apply SVD on the best trained model)
# Runs one epoch with no gradient accumulation (grad_accum_steps=1), using the
# optimizer built in the LoRA cell; `train_model` comes from hyflex_utils.

train_model(model, tokenizer, optimizer, train_dataloader, eval_dataloader, accelerator, epochs=1, grad_accum_steps=1)

In [None]:
# Optional but recommended: persist the fine-tuned weights so the best model
# can be restored later.

save_dir = f'./model_{dataset_}'
finetuned_ckpt_path = f'./model_{dataset_}/finetuned_best'

# Create the checkpoint directory on first use, then write the state dict.
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), finetuned_ckpt_path)

print("Model weights saved successfully!")

In [5]:
# Must run this
# Folds the LoRA adapters into the base weights and rebinds `model` to the
# resulting model. This requires the LoRA cell (get_peft_model) to have been
# run first, since `merge_and_unload` is provided by the PEFT wrapper.

model = model.merge_and_unload()

Step 3. SVD decomposition

In [None]:
# Replace linear layers with SVD-decomposed, trainable layers.
# The helper comes from hyflex_utils; its return value is not used here, so it
# presumably modifies `model` in place. TODO confirm.

replace_linear_layer_llama(model)

In [7]:
# Run this after replace_linear_layer
# Hands the modified model back to the accelerator.
# NOTE(review): train_dataloader and eval_dataloader were already passed
# through accelerator.prepare in the Step 1 cell; confirm that preparing them
# a second time is intended.

model, train_dataloader, eval_dataloader = accelerator.prepare(model, train_dataloader, eval_dataloader)

In [None]:
# Inference (to check accuracy degradation after SVD; not necessary)

evaluate_model(model, tokenizer, eval_dataloader)

Step 4. Fine tuning & Gradient redistribution

In [None]:
# Fresh AdamW optimizer over the current model; tune the learning rate here.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Names of all parameters that require gradients. The printed number is the
# count of parameter tensors (one per name), not of individual weights.
trainable_params = [
    param_name
    for param_name, tensor in model.named_parameters()
    if tensor.requires_grad
]
print(f" Trainable Parameters Count: {len(trainable_params)}")


In [None]:
# Training SVD-ed model (w/ gradient redistribution)
# Runs one epoch with no gradient accumulation (grad_accum_steps=1); the helper
# comes from hyflex_utils. Judging by its name it also stores gradients for the
# later `load_gradients` calls. TODO confirm.

train_model_llama_gradient_saving(model, tokenizer, optimizer, train_dataloader, eval_dataloader, accelerator, epochs=1, grad_accum_steps=1)

In [None]:
# Save the trained model!
# The checkpoint directory is created here as well, because the earlier save
# cell is optional and may have been skipped.

save_dir = f'./model_{dataset_}'
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), f'{save_dir}/finetuned_best_after_svd')

print("Model weights after svd saved successfully!")

In [None]:
#  Load model (optional)
# Restores the checkpoint written by the "Save the trained model!" cell. The
# model must already have its linear layers replaced (Step 3) so that the
# parameter names match. Edit the path to evaluate a different checkpoint.

model.load_state_dict(torch.load(f'./model_{dataset_}/finetuned_best_after_svd'))

Step 5. Simulation loop for all SLC cases, and generating graph
 : You can change the list of thresholds (each value is an SLC rate in percent);
 e.g., thresholds = [0, 20] --> runs the simulation at a 0% SLC rate and at a 20% SLC rate, then generates the plot.

In [None]:
# Sweep over SLC rates: for each threshold, restore the trained SVD model,
# inject noise, evaluate, and finally plot inverse perplexity per threshold.
std = 0.025  # value passed to apply_noise_to_llama as the noise std
thresholds = [0, 5, 10, 30, 40, 50, 100]  # SLC rates in percent
# Same checkpoint that the "Save the trained model!" cell writes.
model_path = f'./model_{dataset_}/finetuned_best_after_svd'

accuracies = []

for th in thresholds:
    print(f"\n==== [Threshold: {th}%] ====")
    # Start every run from the saved weights so that the noise injected for
    # the previous threshold does not carry over.
    model.load_state_dict(torch.load(model_path))
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    load_gradients(model)
    model = apply_noise_to_llama(model, std, th)
    loss, ppl = evaluate_llama(model, eval_dataloader, tokenizer)
    # Inverse perplexity is used as the "higher is better" score for the plot.
    accuracies.append(1 / ppl)
    print(f"[Threshold={th}%] Perplexity: {ppl:.2f}")


accuracy_percent = [a * 100 for a in accuracies]

plt.figure(figsize=(8, 5))
plt.bar([str(t) + "%" for t in thresholds], accuracy_percent, color='skyblue')
plt.title("Noise Injection Threshold vs Inverse Perplexity (LLaMA3)")
plt.xlabel("Noise Injection Threshold (%)")
plt.ylabel("1 / Perplexity × 100")
plt.ylim(0, max(accuracy_percent) * 1.1)
plt.grid(axis='y')
plt.tight_layout()
plt.show()


Step 5-. Noise injection simulation with a single threshold

In [None]:
# Single-threshold variant of the simulation: inject noise once with the
# settings below, then evaluate in the next cell.
std = 0.025  # Default is 0.025, but you can change it based on your error rate
th = 25  # Default is 25, which means 25% of the weights will be stored in SLC

# Load the gradients for the model (helper from hyflex_utils; presumably the
# ones stored during the gradient-saving training run. TODO confirm).
load_gradients(model) # load gradient

# Replace the weights with noise-injected weights. To change the clipping
# value, edit get_clipping_value inside hyflex_utils.py.
model = apply_noise_to_llama(model, std, th) # Replace weights with noise injected weights.

In [None]:
# Inference with the noise-injected model
# Evaluates on the validation dataloader so the result can be compared with
# the earlier evaluation runs.

evaluate_model(model, tokenizer, eval_dataloader)