In [1]:
import torch
import pyro
import tyxe

import random
import functools
import copy

import numpy as np

from pyro.infer import SVI, TraceMeanField_ELBO, Trace_ELBO

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

from torch.utils.data import Dataset, DataLoader, ConcatDataset, TensorDataset

from datasets import load_dataset  # Added to load SuperNI dataset

from typing import Optional, List
from model.mle_prior import MLEPrior


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
import torch

# Check if CUDA is available
print("CUDA Available:", torch.cuda.is_available())

# Get the current device index
current_device = torch.cuda.current_device()
print("Current Device Index:", current_device)

# Get the name of the current device
device_name = torch.cuda.get_device_name(current_device)
print("Current Device Name:", device_name)

# Get the number of GPUs
num_gpus = torch.cuda.device_count()
print("Number of GPUs:", num_gpus)

# List all GPUs
for device_id in range(num_gpus):
    print(f"Device {device_id}: {torch.cuda.get_device_name(device_id)}")


CUDA Available: True
Current Device Index: 0
Current Device Name: NVIDIA A100-SXM4-80GB
Number of GPUs: 1
Device 0: NVIDIA A100-SXM4-80GB


### Task1 -QA LoRA+EVCL

In [3]:
def compute_fisher_info(
    model, 
    data_loader, 
    prev_fisher_info=None, 
    ewc_gamma=1.0, 
    num_epochs=1, 
    head_modules=None, 
    n_samples=None
):

    fisher = {}
    
    # Initialize Fisher matrix for LoRA parameters, excluding head modules if provided
    for name, param in model.named_parameters():
        if 'lora' in name and (head_modules is None or not any(name.startswith(head) for head in head_modules)):
            fisher[name] = torch.zeros_like(param).to(DEVICE)
    
    # Save the model's current training state and set to eval
    old_training_state = model.training
    model.eval()
    
    scaler = GradScaler(device='cuda')

    batch_count = 0

    for epoch in range(num_epochs):
        print(f"Starting Epoch {epoch + 1}/{num_epochs}")
        for i, batch in enumerate(data_loader):
            if n_samples is not None and batch_count >= n_samples:
                break

            print(f"Processing batch {batch_count + 1}")
            model.zero_grad()
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            try:
                # with autocast(device_type='cuda'):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
            # scaler.scale(loss).backward()
            except RuntimeError as e:
                print(f"Error in batch {batch_count + 1}: {e}")
                break

            # Accumulate Fisher information for LoRA parameters
            for name, param in model.named_parameters():
                if 'lora' in name and param.grad is not None and (head_modules is None or not any(name.startswith(head) for head in head_modules)):
                    fisher[name] += param.grad.data ** 2

            print(f"Completed batch {batch_count + 1}")
            batch_count += 1

    # Normalize Fisher information by the number of processed batches or samples
    normalization_factor = batch_count if n_samples is None else min(batch_count, n_samples)
    for name in fisher:
        fisher[name] = fisher[name] / normalization_factor

    # Integrate previous Fisher information with EWC scaling
    if prev_fisher_info is not None:
        for name in fisher:
            if name in prev_fisher_info:
                fisher[name] += ewc_gamma * prev_fisher_info[name]

    # Restore the model's original training state
    model.train(old_training_state)
    
    return fisher

# Function to get variational posterior means
def get_variational_posterior_means(model):
    posterior_means = {}
    for name, module in model.named_modules():
        if hasattr(module, 'lora_A'):
            for key in module.lora_A:
                param_name = f"{name}.lora_A.{key}"
                loc_name = f"{param_name}_loc"
                if loc_name in pyro.get_param_store():
                    lora_A_loc = pyro.param(loc_name).detach().clone()
                    # Add '.weight' to the parameter name
                    posterior_means[f"{param_name}.weight"] = lora_A_loc
        if hasattr(module, 'lora_B'):
            for key in module.lora_B:
                param_name = f"{name}.lora_B.{key}"
                loc_name = f"{param_name}_loc"
                if loc_name in pyro.get_param_store():
                    lora_B_loc = pyro.param(loc_name).detach().clone()
                    # Add '.weight' to the parameter name
                    posterior_means[f"{param_name}.weight"] = lora_B_loc
    return posterior_means

In [4]:
from peft.tuners.lora import LoraLayer

In [5]:
import os
import torch
import zipfile
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from accelerate import init_empty_weights
from datasets import Dataset
from huggingface_hub import login
from peft.tuners.lora import LoraLayer
from pyro.nn.module import to_pyro_module_


def initialize_lora():
    login("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")
    # Set environment variable to manage memory fragmentation
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    
     
    # Specify directories and the path to the zip file
    offload_dir = os.path.expanduser("llama_offload_evcl/")
     
    os.makedirs(offload_dir, exist_ok=True)
     
    # Extract only the specified JSON file from the zip archive
    os.chdir('/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/SSR/Latest_Weights/QA_Weights')
    target_file = "task024_cosmosqa_answer_generation.json"
     
    # Load tokenizer from Hugging Face
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
    tokenizer.pad_token = tokenizer.eos_token


    # Load the model with accelerate's offloading and device map auto-setup
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Meta-Llama-3-8B",
            device_map="auto",
            # max_memory=max_memory,
            offload_folder=offload_dir,
            load_in_8bit=True,
            llm_int8_enable_fp32_cpu_offload=True
        )
     
    # Configure LoRA with reduced rank
    lora_config = LoraConfig(
        r=4,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)

    #printing the trainable parameters
    model.print_trainable_parameters()

    # for name, param in model.named_parameters():
    #     if 'lora' in name:
    #         print(name)

    return model, tokenizer

    

In [6]:
print("Loading base model...")
model,tokenizer=initialize_lora()

Loading base model...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 1,703,936 || all params: 8,031,965,184 || trainable%: 0.0212


In [7]:
os.chdir('/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/SSR/Latest_Weights/QA_Weights')
target_file = "task024_cosmosqa_answer_generation.json"

with open(target_file, 'r', encoding='utf-8-sig') as f:
    json_data = json.load(f)

instances = json_data['Instances'][0:2223]
input_texts = [str(instance['input']) for instance in instances]
output_texts = [str(instance['output'][0]) if instance['output'] else "" for instance in instances]

# Create Hugging Face Dataset
ds = Dataset.from_dict({'input': input_texts, 'output': output_texts})

# Tokenize the dataset
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["output"],
            truncation=True,
            padding="max_length",
            max_length=512
        )
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["attention_mask"] = model_inputs.get("attention_mask", None)
    return model_inputs

# Apply tokenization and set format
tokenized_datasets = ds.map(tokenize_function, batched=True, remove_columns=["input", "output"])
tokenized_datasets.set_format("torch")

# Split dataset into train and eval
train_size = int(0.9 * len(tokenized_datasets))
train_dataset = tokenized_datasets.select(range(train_size))
eval_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))

# Create DataLoaders
batch_size = 8  
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

# Define data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/2223 [00:00<?, ? examples/s]



In [8]:
def save_trained_model(model, tokenizer, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    model.save_pretrained(output_dir)

    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to {output_dir}")

In [9]:
def evaluate_model(model, eval_loader):
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            outputs = model(input_ids, labels=labels, attention_mask=attention_mask)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1
    avg_loss = total_loss / num_batches
    print(f"Evaluation Loss: {avg_loss:.4f}")

In [10]:
import pyro.distributions as dist
import pyro.poutine as poutine

def run_lora_evcl_1(
    num_epochs: int = 3,
    base_model_name: str = "meta-llama/Meta-Llama-3-8B",
    batch_size: int = 2,
    learning_rate: float = 1e-5,
    logging_steps: int = 100,
    eval_steps: int = 200,
    save_steps: int = 500,
    output_dir: str = "finetuned-weights-LoRA-EVCL",
):


    for name, param in model.named_parameters():
        if 'lora' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False  # Freeze non-LoRA parameters

    def bayesian_guide(input_ids, attention_mask, labels):
        # Define variational distributions over the LoRA parameters
        for name, module in model.named_modules():
            if hasattr(module, 'lora_A'):
                for key in module.lora_A:
                    param_name = f"{name}.lora_A.{key}"
                    lora_A_param = module.lora_A[key].weight
                    device = lora_A_param.device

                    # Ensure initial values are leaf tensors with requires_grad=True
                    loc_init = lora_A_param.detach().clone().to(device).requires_grad_()
                    scale_init = (0.1 * torch.ones_like(lora_A_param)).to(device).requires_grad_()

                    loc = pyro.param(
                        f"{param_name}_loc",
                        loc_init
                    )
                    scale = pyro.param(
                        f"{param_name}_scale",
                        scale_init,
                        constraint=dist.constraints.positive
                    )
                    pyro.sample(
                        param_name,
                        dist.Normal(loc, scale).to_event(lora_A_param.dim())
                    )
            if hasattr(module, 'lora_B'):
                for key in module.lora_B:
                    param_name = f"{name}.lora_B.{key}"
                    lora_B_param = module.lora_B[key].weight
                    device = lora_B_param.device

                    # Ensure initial values are leaf tensors with requires_grad=True
                    loc_init = lora_B_param.detach().clone().to(device).requires_grad_()
                    scale_init = (0.1 * torch.ones_like(lora_B_param)).to(device).requires_grad_()

                    loc = pyro.param(
                        f"{param_name}_loc",
                        loc_init
                    )
                    scale = pyro.param(
                        f"{param_name}_scale",
                        scale_init,
                        constraint=dist.constraints.positive
                    )
                    pyro.sample(
                        param_name,
                        dist.Normal(loc, scale).to_event(lora_B_param.dim())
                    )
                    
    def bayesian_model(input_ids, attention_mask, labels):
        # Define a function to sample and substitute LoRA parameters
        def model_with_sampled_lora():
            # Sample LoRA parameters and set them in the model
            for name, module in model.named_modules():
                if hasattr(module, 'lora_A'):
                    for key in module.lora_A:
                        param_name = f"{name}.lora_A.{key}"
                        lora_A_module = module.lora_A[key]
                        device = lora_A_module.weight.device
    
                        # Sample from the prior
                        sampled_weight = pyro.sample(
                            param_name,
                            dist.Normal(
                                lora_A_module.weight.detach().to(device),
                                (0.1 * torch.ones_like(lora_A_module.weight)).to(device)
                            ).to_event(lora_A_module.weight.dim())
                        )
    
                        # Assign the sampled weight to the module
                        with torch.no_grad():
                            lora_A_module.weight.copy_(sampled_weight)
    
                if hasattr(module, 'lora_B'):
                    for key in module.lora_B:
                        param_name = f"{name}.lora_B.{key}"
                        lora_B_module = module.lora_B[key]
                        device = lora_B_module.weight.device
    
                        # Sample from the prior
                        sampled_weight = pyro.sample(
                            param_name,
                            dist.Normal(
                                lora_B_module.weight.detach().to(device),
                                (0.1 * torch.ones_like(lora_B_module.weight)).to(device)
                            ).to_event(lora_B_module.weight.dim())
                        )
    
                        # Assign the sampled weight to the module
                        with torch.no_grad():
                            lora_B_module.weight.copy_(sampled_weight)
    
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            return loss
    
        # Use the modified model with sampled LoRA parameters
        return model_with_sampled_lora()


    # Set up SVI
    pyro.clear_param_store()
    optim = pyro.optim.Adam({"lr": learning_rate})
    elbo = TraceMeanField_ELBO()
    svi = SVI(bayesian_model, bayesian_guide, optim, loss=elbo)

    
    # Training loop for Task 1
    print(f"Training on Task 1...")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for num_batches, batch in enumerate(train_loader, 1):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            loss = svi.step(input_ids, attention_mask, labels)
            total_loss += loss

            # Logging
            if num_batches % logging_steps == 0:
                avg_loss = total_loss / num_batches
                print(f"Epoch {epoch}, Step {num_batches}, Loss: {avg_loss}")

            # Evaluation
            if num_batches % eval_steps == 0:
                evaluate_model(model, eval_loader)

            # Save checkpoints
            if num_batches % save_steps == 0:
                save_trained_model(model, tokenizer, output_dir)

        avg_epoch_loss = total_loss / num_batches
        print(f"Task 1 Epoch {epoch} completed. Average Loss: {avg_epoch_loss}")

    # Save the final trained model after Task 1
    save_trained_model(model, tokenizer, output_dir)
    
    return model


In [11]:
print(os.getcwd())
os.chdir('/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/')
print(os.getcwd())

/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/SSR/Latest_Weights/QA_Weights
/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting


In [12]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

if __name__ == '__main__':
    model=run_lora_evcl_1(
        num_epochs=3,
        base_model_name="meta-llama/Meta-Llama-3-8B",
        batch_size=2,
        learning_rate=1e-5,
        logging_steps=100,
        eval_steps=200,
        save_steps=500,
        output_dir="finetuned-weights-LoRA-EVCL",
    )

Training on Task 1...
Epoch 0, Step 100, Loss: 852033.34875
Epoch 0, Step 200, Loss: 852008.9590625
Evaluation Loss: 15.9033
Task 1 Epoch 0 completed. Average Loss: 852002.40025
Epoch 1, Step 100, Loss: 851907.44625
Epoch 1, Step 200, Loss: 851867.4834375
Evaluation Loss: 12.5456
Task 1 Epoch 1 completed. Average Loss: 851876.74775
Epoch 2, Step 100, Loss: 851841.92625
Epoch 2, Step 200, Loss: 851931.7409375
Evaluation Loss: 11.8873
Task 1 Epoch 2 completed. Average Loss: 851916.49525
Model and tokenizer saved to finetuned-weights-LoRA-EVCL


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B",
            device_map="auto",
            offload_folder='/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/llama_offload_evcl',
            load_in_8bit=True,
            llm_int8_enable_fp32_cpu_offload=True)

lora_model_path = "/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/finetuned-weights-LoRA-EVCL"
model = PeftModel.from_pretrained(base_model, lora_model_path)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
for name, param in model.named_parameters():
    if 'lora' in name:
        param.requires_grad = True

for name, param in model.named_parameters():
    if 'lora' in name:
        print(f"{name}: requires_grad={param.requires_grad}")

base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: requires_grad=True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: requires_grad=True
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: requires_grad=True
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: requires_grad=True
base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: requires_grad=True
base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: requires_grad=True
base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: requires_grad=True
base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: requires_grad=True
base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: requires_grad=True
base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: requires_grad=True
base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: requires_grad=True

In [14]:
from torch.amp import autocast, GradScaler
prev_fisher_info = None
prev_params = None
ewc_gamma = 1.0  

fisher_info = compute_fisher_info(
    model=model,
    data_loader=train_loader,
    prev_fisher_info=prev_fisher_info,
    ewc_gamma=ewc_gamma,
    num_epochs=1,  
    head_modules=None,  
    n_samples=None  
)


Starting Epoch 1/1
Processing batch 1
Completed batch 1
Processing batch 2
Completed batch 2
Processing batch 3
Completed batch 3
Processing batch 4
Completed batch 4
Processing batch 5
Completed batch 5
Processing batch 6
Completed batch 6
Processing batch 7
Completed batch 7
Processing batch 8
Completed batch 8
Processing batch 9
Completed batch 9
Processing batch 10
Completed batch 10
Processing batch 11
Completed batch 11
Processing batch 12
Completed batch 12
Processing batch 13
Completed batch 13
Processing batch 14
Completed batch 14
Processing batch 15
Completed batch 15
Processing batch 16
Completed batch 16
Processing batch 17
Completed batch 17
Processing batch 18
Completed batch 18
Processing batch 19
Completed batch 19
Processing batch 20
Completed batch 20
Processing batch 21
Completed batch 21
Processing batch 22
Completed batch 22
Processing batch 23
Completed batch 23
Processing batch 24
Completed batch 24
Processing batch 25
Completed batch 25
Processing batch 26
Comp

In [15]:
for name, fisher_matrix in fisher_info.items():
    print(f"Layer: {name}, Fisher Info Mean: {fisher_matrix.mean().item()}")

Layer: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight, Fisher Info Mean: 0.0010734123643487692
Layer: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight, Fisher Info Mean: 0.0008731464622542262
Layer: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight, Fisher Info Mean: 0.04731832444667816
Layer: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight, Fisher Info Mean: 0.13373570144176483
Layer: base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight, Fisher Info Mean: 0.00014075599028728902
Layer: base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight, Fisher Info Mean: 0.0007652479689568281
Layer: base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight, Fisher Info Mean: 0.027964968234300613
Layer: base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight, Fisher Info Mean: 0.25694388151168823
Layer: base_model.model.model.layers.2.self_attn.q_pro

In [16]:
prev_posterior_means = get_variational_posterior_means(model)
torch.save(prev_posterior_means, f'posterior_means_task_{1}.pt')

In [17]:
prev_posterior_means

{'base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight': tensor([[-5.1930e-03,  1.4078e-02, -6.3000e-03,  ...,  1.2557e-02,
           5.9880e-03,  1.5532e-02],
         [ 2.9913e-03,  8.8218e-03,  1.2394e-02,  ..., -1.2724e-02,
          -1.2496e-02, -7.3794e-03],
         [-2.3136e-06,  6.4947e-03,  4.9435e-03,  ...,  3.8091e-03,
           1.7560e-03,  1.9770e-03],
         [-1.5878e-02,  8.6499e-03, -3.1998e-03,  ..., -7.6022e-04,
           2.6676e-03,  4.7972e-03]], device='cuda:0'),
 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight': tensor([[ 4.6435e-05,  3.8183e-05, -2.6476e-04, -7.5030e-05],
         [-1.0958e-04, -4.0557e-04,  8.1174e-05, -3.8721e-04],
         [-3.3745e-05,  2.2645e-04,  4.1145e-04, -1.3003e-04],
         ...,
         [-3.2613e-04,  4.0567e-04,  1.5837e-04, -6.0654e-04],
         [-4.2906e-04,  1.9606e-04,  1.9673e-04, -6.2774e-04],
         [-3.5415e-04,  2.8062e-04,  5.5335e-04, -2.8751e-04]], device='cuda:0'),
 '

### Task 2: QA+QG EVCL

In [18]:
import pyro.distributions as dist
import pyro.poutine as poutine
import torch
from pyro.infer import SVI, TraceMeanField_ELBO

def run_lora_evcl_2(
    num_epochs: int = 3,
    batch_size: int = 2,
    learning_rate: float = 1e-5,
    logging_steps: int = 100,
    eval_steps: int = 200,
    save_steps: int = 500,
    output_dir: str = "finetuned-weights-LoRA-EVCL-2",
    prev_fisher_info: dict = None,            
    prev_posterior_means: dict = None,        
    ewc_lambda: float = 0.0,                  
    synthetic_data_loader=None,               # Synthetic data from Task 1
    combined_loader=None,                     # Data loader for Task 2
    eval_loader=None,                         # Evaluation data loader
    tokenizer=None,
    model=None
):
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(DEVICE)

    # Ensure all parameters require gradients
    for name, param in model.named_parameters():
        if 'lora' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False  # Freeze non-LoRA parameters

    def bayesian_guide(input_ids, attention_mask, labels):
        # Define variational distributions over the LoRA parameters
        for name, module in model.named_modules():
            if hasattr(module, 'lora_A'):
                for key in module.lora_A:
                    param_name = f"{name}.lora_A.{key}"
                    lora_A_param = module.lora_A[key].weight
                    device = lora_A_param.device

                    # Ensure initial values are leaf tensors with requires_grad=True
                    loc_init = lora_A_param.detach().clone().to(device).requires_grad_()
                    scale_init = (0.1 * torch.ones_like(lora_A_param)).to(device).requires_grad_()

                    loc = pyro.param(
                        f"{param_name}_loc",
                        loc_init
                    )
                    scale = pyro.param(
                        f"{param_name}_scale",
                        scale_init,
                        constraint=dist.constraints.positive
                    )
                    pyro.sample(
                        param_name,
                        dist.Normal(loc, scale).to_event(lora_A_param.dim())
                    )
            if hasattr(module, 'lora_B'):
                for key in module.lora_B:
                    param_name = f"{name}.lora_B.{key}"
                    lora_B_param = module.lora_B[key].weight
                    device = lora_B_param.device

                    # Ensure initial values are leaf tensors with requires_grad=True
                    loc_init = lora_B_param.detach().clone().to(device).requires_grad_()
                    scale_init = (0.1 * torch.ones_like(lora_B_param)).to(device).requires_grad_()

                    loc = pyro.param(
                        f"{param_name}_loc",
                        loc_init
                    )
                    scale = pyro.param(
                        f"{param_name}_scale",
                        scale_init,
                        constraint=dist.constraints.positive
                    )
                    pyro.sample(
                        param_name,
                        dist.Normal(loc, scale).to_event(lora_B_param.dim())
                    )
                        
    def bayesian_model(input_ids, attention_mask, labels):
        # pyro.module("model", model)  # Removed

        # Define a function to sample and substitute LoRA parameters
        def model_with_sampled_lora():
            # Sample LoRA parameters and set them in the model
            for name, module in model.named_modules():
                if hasattr(module, 'lora_A'):
                    for key in module.lora_A:
                        param_name = f"{name}.lora_A.{key}"
                        lora_A_module = module.lora_A[key]
                        device = lora_A_module.weight.device

                        # Use posterior mean from Task 1 as prior mean
                        prior_mean = prev_posterior_means.get(param_name, lora_A_module.weight.detach().clone()).to(device)
                        prior_std = (0.1 * torch.ones_like(lora_A_module.weight)).to(device)

                        # Sample from the prior
                        sampled_weight = pyro.sample(
                            param_name,
                            dist.Normal(
                                prior_mean,
                                prior_std
                            ).to_event(lora_A_module.weight.dim())
                        )

                        # Assign the sampled weight to the module
                        with torch.no_grad():
                            lora_A_module.weight.copy_(sampled_weight)

                if hasattr(module, 'lora_B'):
                    for key in module.lora_B:
                        param_name = f"{name}.lora_B.{key}"
                        lora_B_module = module.lora_B[key]
                        device = lora_B_module.weight.device

                        # Use posterior mean from Task 1 as prior mean
                        prior_mean = prev_posterior_means.get(param_name, lora_B_module.weight.detach().clone()).to(device)
                        prior_std = (0.1 * torch.ones_like(lora_B_module.weight)).to(device)

                        # Sample from the prior
                        sampled_weight = pyro.sample(
                            param_name,
                            dist.Normal(
                                prior_mean,
                                prior_std
                            ).to_event(lora_B_module.weight.dim())
                        )

                        # Assign the sampled weight to the module
                        with torch.no_grad():
                            lora_B_module.weight.copy_(sampled_weight)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Add EWC penalty if previous Fisher info and posterior means are provided
            if prev_fisher_info is not None and prev_posterior_means is not None and ewc_lambda > 0.0:
                ewc_penalty = 0.0
                for name, param in model.named_parameters():
                    if 'lora' in name and name in prev_fisher_info:
                        fisher = prev_fisher_info[name].to(DEVICE)
                        prev_mean = prev_posterior_means[name].to(DEVICE)
                        ewc_penalty += (fisher * (param - prev_mean) ** 2).sum()
                loss += ewc_lambda * ewc_penalty

            return loss

        # Use the modified model with sampled LoRA parameters
        return model_with_sampled_lora()

    # Set up SVI
    pyro.clear_param_store()
    optim = pyro.optim.Adam({"lr": learning_rate})
    elbo = TraceMeanField_ELBO()
    svi = SVI(bayesian_model, bayesian_guide, optim, loss=elbo)

    # Training loop
    print("Training on new task with EWC and synthetic data from previous task...")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for num_batches, batch in enumerate(combined_loader, 1):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            loss = svi.step(input_ids, attention_mask, labels)
            total_loss += loss

            # Logging
            if num_batches % logging_steps == 0:
                avg_loss = total_loss / num_batches
                print(f"Epoch {epoch + 1}, Step {num_batches}, Loss: {avg_loss}")

            # Evaluation
            if num_batches % eval_steps == 0:
                evaluate_model(model, eval_loader)

            # Save checkpoints
            if num_batches % save_steps == 0:
                save_trained_model(model, tokenizer, output_dir)

        avg_epoch_loss = total_loss / num_batches
        print(f"Epoch {epoch + 1} completed. Average Loss: {avg_epoch_loss}")

    # Save the final trained model after the task
    save_trained_model(model, tokenizer, output_dir)
    pyro.get_param_store().save('pyro_param_store_task2.pt')
    return model


In [19]:
os.chdir('/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/SSR/Latest_Weights/QA_QG_ Weights')
target_file = "task074_squad1.1_question_generation.json"

with open(target_file, 'r', encoding='utf-8-sig') as f:
    json_data = json.load(f)

instances = json_data['Instances'][0:2500]
input_texts = [str(instance['input']) for instance in instances]
output_texts = [str(instance['output'][0]) if instance['output'] else "" for instance in instances]

# Create Hugging Face Dataset
ds = Dataset.from_dict({'input': input_texts, 'output': output_texts})

# Tokenize the dataset
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["output"],
            truncation=True,
            padding="max_length",
            max_length=512
        )
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["attention_mask"] = model_inputs.get("attention_mask", None)
    return model_inputs

# Apply tokenization and set format
tokenized_datasets = ds.map(tokenize_function, batched=True, remove_columns=["input", "output"])
tokenized_datasets.set_format("torch")

# Split dataset into train and eval
train_size = int(0.9 * len(tokenized_datasets))
train_dataset = tokenized_datasets.select(range(train_size))
eval_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))

# Create DataLoaders
batch_size = 8  
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader_2 = DataLoader(eval_dataset, batch_size=batch_size)

# Define data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [34]:
!pip install json_repair

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




#### Synthetic Data

In [20]:
import json_repair 
os.chdir('/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/SSR/Synthethic_Data_Generation')
target_file = "qa.train.final_sampled.jsonl"

with open(target_file, 'r', encoding='utf-8-sig') as f:
    json_data = json_repair.loads(f.read())

instances = json_data
input_texts = [str(instance['input']) for instance in instances]
output_texts = [str(instance['output'][0]) if instance['output'] else "" for instance in instances]

# Create Hugging Face Dataset
ds = Dataset.from_dict({'input': input_texts, 'output': output_texts})
tokenized_datasets = ds.map(tokenize_function, batched=True, remove_columns=["input", "output"])
tokenized_datasets.set_format("torch")
train_size = int(1.0 * len(tokenized_datasets))
synthetic_train_dataset = tokenized_datasets.select(range(train_size))
batch_size = 8  
synthetic_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


Map:   0%|          | 0/201 [00:00<?, ? examples/s]

In [21]:
print(os.getcwd())
os.chdir('/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/')
print(os.getcwd())

/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting/SSR/Synthethic_Data_Generation
/home/kakde2/cs-546/Iterative-SSR-and-EVCL-Catastrophic-Forgetting


In [22]:
from torch.utils.data import ConcatDataset, DataLoader

# Combine datasets
if synthetic_loader_1 is not None:
    combined_dataset = ConcatDataset([train_loader_2.dataset, synthetic_loader_1.dataset])
    combined_loader = DataLoader(combined_dataset, batch_size=batch_size, shuffle=True)
else:
    combined_loader = train_loader_2

In [23]:
ewc_lambda = 100.0
model_task_2=run_lora_evcl_2(
    num_epochs=3,
    batch_size=2,
    learning_rate=1e-5,
    logging_steps=100,
    eval_steps=200,
    save_steps=500,
    output_dir="finetuned-weights-LoRA-EVCL-Task2",
    prev_fisher_info=fisher_info,
    prev_posterior_means=prev_posterior_means,
    ewc_lambda=ewc_lambda,
    synthetic_data_loader=synthetic_loader_1,
    combined_loader=combined_loader,
    eval_loader=eval_loader,
    tokenizer=tokenizer,
    model=model
)

Training on new task with EWC and synthetic data from previous task...
Epoch 1, Step 100, Loss: 843486.794375
Epoch 1, Step 200, Loss: 847730.5909375
Evaluation Loss: 12.8290
Epoch 1, Step 300, Loss: 849076.033125
Epoch 1, Step 400, Loss: 849807.50875
Evaluation Loss: 15.4223
Epoch 1, Step 500, Loss: 850279.091625
Model and tokenizer saved to finetuned-weights-LoRA-EVCL-Task2
Epoch 1 completed. Average Loss: 850449.5449600355
Epoch 2, Step 100, Loss: 851994.78625
Epoch 2, Step 200, Loss: 852064.1734375
Evaluation Loss: 10.8797
Epoch 2, Step 300, Loss: 852052.5872916667
Epoch 2, Step 400, Loss: 852069.47703125
Evaluation Loss: 14.5681
Epoch 2, Step 500, Loss: 852065.829125
Model and tokenizer saved to finetuned-weights-LoRA-EVCL-Task2
Epoch 2 completed. Average Loss: 852064.1171181173
Epoch 3, Step 100, Loss: 851905.533125
Epoch 3, Step 200, Loss: 851910.8978125
Evaluation Loss: 15.9732
Epoch 3, Step 300, Loss: 851894.6825
Epoch 3, Step 400, Loss: 851918.4740625
Evaluation Loss: 15.9056