In [1]:
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
import pyro
import pyro.distributions as dist
from tqdm import tqdm

# Load the JSON file
file_path = "/home/pranav24/cs-546-project/SSR/Latest_Weights/QA_Weights/task024_cosmosqa_answer_generation.json"
with open(file_path, "r") as f:
    data = json.load(f)

# Extract input-output pairs from JSON
instances = data["Instances"][2500:5000]
inputs = [instance["input"] for instance in instances]
outputs = [instance["output"][0] for instance in instances]

# Split the data into train and test sets
train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
    inputs, outputs, test_size=0.2, random_state=42
)

# Convert data to Hugging Face Dataset format
train_ds = Dataset.from_dict({"input": train_inputs, "output": train_outputs})
test_ds = Dataset.from_dict({"input": test_inputs, "output": test_outputs})

# Tokenizer setup
base_model_path = "meta-llama/Meta-Llama-3-8B"  # Replace with actual model path
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Check if tokenizer has a padding token, if not, set the eos_token as padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    labels = tokenizer(
        examples["output"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize datasets
tokenized_train_ds = train_ds.map(tokenize_function, batched=True, remove_columns=["input", "output"])
tokenized_test_ds = test_ds.map(tokenize_function, batched=True, remove_columns=["input", "output"])

# Convert datasets to PyTorch format
tokenized_train_ds.set_format("torch")
tokenized_test_ds.set_format("torch")

# Create DataLoaders
batch_size = 16  # Adjust as needed
train_loader = DataLoader(tokenized_train_ds, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(tokenized_test_ds, batch_size=batch_size)

# Define the model and load weights
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
fine_tuned_weights_path = "/home/pranav24/cs-546-project/finetuned-weights-LoRA-EVCL-Final-Task2_EVCL_best"

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, fine_tuned_weights_path)
pyro.get_param_store().load('pyro_param_store_task2_vcl_best.pt')

# Ensure compatibility with the unchanged part of the code
DEVICE = model.device

# Generate predictions
predictions = []
references = []
sampled_weights_log = []  # Store sampled weights

print("Generating predictions:")

for i in tqdm(range(0, len(test_inputs), batch_size)):  # Loop in batches
    batch_inputs = test_inputs[i:i + batch_size]
    batch_references = test_outputs[i:i + batch_size]

    # Tokenize the inputs in a batch
    inputs_tokenized = tokenizer(batch_inputs, padding=True, truncation=True, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            # Apply Pyro parameters to LoRA layers
            for name, module in model.named_modules():
                if hasattr(module, "lora_A"):
                    for key in module.lora_A:
                        loc = pyro.param(f"{name}.lora_A.{key}_loc")
                        scale = pyro.param(f"{name}.lora_A.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_A.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_A[key].weight.data.copy_(sampled_weight)

                if hasattr(module, "lora_B"):
                    for key in module.lora_B:
                        loc = pyro.param(f"{name}.lora_B.{key}_loc")
                        scale = pyro.param(f"{name}.lora_B.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_B.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_B[key].weight.data.copy_(sampled_weight)

            # Generate predictions using the tokenized inputs
            generated_ids = model.generate(
                input_ids=inputs_tokenized["input_ids"],
                attention_mask=inputs_tokenized["attention_mask"],
                max_length=1000,  # Adjust as needed
                num_return_sequences=1,
                do_sample=True  # Optional: Sampling for diverse generations
            )

        # Decode generated IDs
        batch_predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        predictions.extend(batch_predictions)
        references.extend(batch_references)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  state = torch.load(input_file, map_location)


Generating predictions:


  0%|          | 0/32 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  with torch.cuda.amp.autocast():
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|▎         | 1/32 [02:18<1:11:26, 138.28s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|▋         | 2/32 [04:37<1:09:28, 138.96s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_sid

In [5]:
# test_inputs=["Context: Then I drove all around to the used DVD stores to get season 2. ( which I found for 20 bucks! ) I have been watching a disk a night, and not going to bed until 3 am because I knew that the new season was starting soon and I didn't want to be behind. I just finished season 2 and was on the edge of my seat the whole last episode. Yesterday I taped what was on tv. Question: What does the narrator think about the price of the DVD set purchased?"]
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
import pyro
import pyro.distributions as dist
from tqdm import tqdm

test_inputs=["Context: Good Old War and person L : I saw both of these bands Wednesday night , and they both blew me away . seriously . Good Old War is acoustic and makes me smile . I really can not help but be happy when I listen to them ; I think it 's the fact that they seemed so happy themselves when they played . Question: In the future , will this person go to see other bands play ?"]
batch_size=1
final_answer=[]

base_model_path = "meta-llama/Meta-Llama-3-8B"
# fine_tuned_weights_path = "/home/pranav24/cs-546-project/finetuned-weights-LoRA-EVCL-Final-Task1_VCL_best"

fine_tuned_weights_path="/home/pranav24/cs-546-project/finetuned-weights-LoRA-EVCL-Test-Task1_VCL"
pyro.get_param_store().load('pyro_param_store_task1_vcl.pt')

tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Check if tokenizer has a padding token, if not, set the eos_token as padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    labels = tokenizer(
        examples["output"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs
# pyro.clear_param_store()

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, fine_tuned_weights_path)
# pyro.get_param_store().load('pyro_param_store_task1_vcl_best.pt')
# pyro.get_param_store().load('pyro_param_store_task1_vcl_best.pt')

# Ensure compatibility with the unchanged part of the code
DEVICE = model.device

sampled_weights_log=[]
for i in tqdm(range(0, len(test_inputs), batch_size)):  # Loop in batches
    batch_inputs = test_inputs[i:i + batch_size]
    # batch_references = test_outputs[i:i + batch_size]

    # Tokenize the inputs in a batch
    inputs_tokenized = tokenizer(batch_inputs, padding=True, truncation=True, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            # Apply Pyro parameters to LoRA layers
            for name, module in model.named_modules():
                if hasattr(module, "lora_A"):
                    for key in module.lora_A:
                        loc = pyro.param(f"{name}.lora_A.{key}_loc")
                        scale = pyro.param(f"{name}.lora_A.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_A.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_A[key].weight.data.copy_(sampled_weight)

                if hasattr(module, "lora_B"):
                    for key in module.lora_B:
                        loc = pyro.param(f"{name}.lora_B.{key}_loc")
                        scale = pyro.param(f"{name}.lora_B.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_B.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_B[key].weight.data.copy_(sampled_weight)

            # Generate predictions using the tokenized inputs
            print(sampled_weights_log)
            generated_ids = model.generate(
                input_ids=inputs_tokenized["input_ids"],
                attention_mask=inputs_tokenized["attention_mask"],
                max_length=512,  # Adjust as needed
                num_return_sequences=1,
                do_sample=True  # Optional: Sampling for diverse generations
            )

        # Decode generated IDs
        final_batch_predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        final_answer.extend(final_batch_predictions)
        # predictions.extend(batch_predictions)
        # references.extend(batch_references)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  with torch.cuda.amp.autocast():
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[('base_model.model.model.layers.0.self_attn.q_proj', 'default', array([[ 0.01508446,  0.00691732, -0.00206139, ..., -0.01874033,
         0.00093975, -0.01867228],
       [-0.01415629,  0.01682051,  0.01093652, ..., -0.03167468,
         0.01108208, -0.00818059],
       [ 0.00769036,  0.00204184,  0.01396344, ...,  0.00184097,
        -0.00175042, -0.0193276 ],
       ...,
       [-0.01807272, -0.00940934, -0.00976349, ..., -0.00526288,
        -0.0085222 , -0.01160066],
       [-0.01728981, -0.03943449, -0.00465248, ...,  0.00386294,
         0.01211404,  0.00012855],
       [-0.01855789, -0.01124079, -0.01837003, ..., -0.00469926,
         0.00820015, -0.0033732 ]], dtype=float32)), ('base_model.model.model.layers.0.self_attn.q_proj', 'default', array([[-0.00938235, -0.01070148, -0.00369583, ...,  0.01296223,
        -0.00733977,  0.0155929 ],
       [-0.00110161,  0.00825712,  0.01850528, ...,  0.00538029,
         0.00203215,  0.00292506],
       [ 0.01198409, -0.01477975,  0.0007

100%|██████████| 1/1 [01:14<00:00, 74.74s/it]


In [6]:
final_answer

["Context: Good Old War and person L : I saw both of these bands Wednesday night, and they both blew me away. seriously. Good Old War is acoustic and makes me smile. I really can not help but be happy when I listen to them ; I think it's the fact that they seemed so happy themselves when they played. Question: In the future, will this person go to see other bands play? \nQuestion: What is the reason this person seems so happy?They are happy with their life. They are happy with their life, and that is what makes them happy. They are not happy because they are at a concert. They are happy because they are there with their friends. Related: They are happy because they are there with their friends. They are happy because they are there with their friends, and they are happy because they are there with their friends. They are happy because they are there with their friends, and they are happy because they are there with their friends. They are happy because they are there with their friends

In [13]:
import json

# Create a dictionary to store the data
data = {
    "batch_predictions": batch_predictions,
    "predictions": predictions,
    "references": references
}

# Save to a JSON file
with open("predictions_EVCL_Task2.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

print("Data saved to predictions.json")



Data saved to predictions.json


In [23]:
file_path = "/home/pranav24/cs-546-project/SSR/Latest_Weights/QA_Weights/task024_cosmosqa_answer_generation.json"
with open(file_path, "r") as f:
    data = json.load(f)

# Extract input-output pairs from JSON
instances = data["Instances"][2500:5000]
inputs = [instance["input"] for instance in instances]
outputs = [instance["output"][0] for instance in instances]

# Split the data into train and test sets
train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
    inputs, outputs, test_size=0.2, random_state=42
)

In [25]:
print(test_inputs[0])

Context: When Derek joined her , the door man unhooked the rope to let them through , still watching Lucy with undisguised lust as they walked in . A look of utter confusion shown on Derek 's features as they were granted entrance into the club . He draped an arm over Lucy 's shoulders and they made their way through the crowds of people . " So how 'd you do that ?. 
Question: What may be the reason for Derek 's confusion ?
